From 7aaec00d8fcb9a040bba660223af7ca82083db76 Mon Sep 17 00:00:00 2001 From: stephenminakian Date: Sun, 6 Jul 2025 22:43:05 -0600 Subject: [PATCH] Added Docs --- Server/.env.example | 0 Server/docs/architecture.md | 1264 ++++++++++++++++++++++++++ Server/docs/production-deployment.md | 564 ++++++++++++ 3 files changed, 1828 insertions(+) create mode 100644 Server/.env.example diff --git a/Server/.env.example b/Server/.env.example new file mode 100644 index 0000000..e69de29 diff --git a/Server/docs/architecture.md b/Server/docs/architecture.md index e69de29..a8c66e0 100644 --- a/Server/docs/architecture.md +++ b/Server/docs/architecture.md @@ -0,0 +1,1264 @@ +│ │Access │ │Infrastructure│ │Physical │ │ +│ │Control │ │Security │ │Access Control │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Authentication Flow Diagram + +``` +┌─────────────────┐ +│ Client │ +│ Request │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ User-Agent │ +│ Detection │ +└────────┬────────┘ + │ + ┌────▼────┐ + │Particle?│ + └────┬────┘ + │ + ┌────▼────┐ ┌─────────────────┐ + │ YES │──────────────▶│Bearer Token │ + └─────────┘ │Validation │ + │ └────────┬────────┘ + │ │ + │ ▼ + │ ┌─────────────────┐ + │ │Token Match? │ + │ └────────┬────────┘ + │ │ + │ ┌────▼────┐ + │ │ YES │ + │ └────┬────┘ + │ │ + ┌────▼────┐ ┌────────▼────────┐ + │ NO │──────────────▶│HMAC Signature │ + └─────────┘ │Verification │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │Signature Valid? │ + └────────┬────────┘ + │ + ┌────▼────┐ + │ YES │ + └────┬────┘ + │ + ▼ + ┌─────────────────┐ + │Request │ + │Processing │ + └─────────────────┘ +``` + +## 📦 Component Architecture + +### Application Layer Components + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ FLASK APPLICATION │ +├─────────────────────────────────────────────────────────────────┤ +│ REQUEST HANDLERS │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │/health │ │/webhook │ │/metrics (future) │ │ +│ │GET │ │POST │ │GET │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ MIDDLEWARE │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Request │ │Error │ │Security │ │ +│ │Logging │ │Handling │ │Headers │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ CORE SERVICES │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Authentication│ │Input │ │Notification │ │ +│ │Service │ │Validation │ │Service │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ UTILITY MODULES │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │HMAC │ │Email/SMS │ │Configuration │ │ +│ │Verification │ │Formatting │ │Management │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Detailed Component Breakdown + +**Authentication Service:** +```python +class AuthenticationService: + def __init__(self): + self.particle_secret = os.environ.get('PARTICLE_WEBHOOK_SECRET') + self.webhook_secret = os.environ.get('WEBHOOK_SECRET') + + def verify_particle_auth(self, auth_header): + """Verify Bearer token for Particle webhooks""" + # Implementation details + + def verify_hmac_signature(self, payload, signature): + """Verify HMAC-SHA256 signature for generic webhooks""" + # Implementation details + + def is_particle_webhook(self, user_agent): + """Detect Particle webhook requests""" + # Implementation details +``` + +**Input Validation Service:** +```python +class ValidationService: + def __init__(self): + self.required_fields = ['event', 'data', 'published_at'] + self.particle_fields = ['coreid', 'device_name', 'fw_version'] + + def validate_json_structure(self, data): + """Validate JSON payload structure""" + # Implementation details + + def validate_field_types(self, data): + """Validate data types for all fields""" + # Implementation details + + def sanitize_input(self, data): + """Sanitize input data for security""" + # Implementation details +``` + +**Notification Service:** +```python +class NotificationService: + def __init__(self): + self.smtp_config = self._load_smtp_config() + self.templates = self._load_templates() + + def send_email_notification(self, message, webhook_data): + """Send email/SMS notification""" + # Implementation details + + def format_message(self, webhook_data): + """Format notification message from webhook data""" + # Implementation details + + def _load_smtp_config(self): + """Load SMTP configuration from environment""" + # Implementation details +``` + +## 🔄 Container Architecture + +### Docker Container Structure + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CONTAINER IMAGE │ +├─────────────────────────────────────────────────────────────────┤ +│ BASE IMAGE LAYER │ +│ python:3.11-slim (Debian-based minimal Python runtime) │ +├─────────────────────────────────────────────────────────────────┤ +│ SYSTEM DEPENDENCIES │ +│ curl (for health checks), ca-certificates (SSL) │ +├─────────────────────────────────────────────────────────────────┤ +│ PYTHON DEPENDENCIES │ +│ Flask, Gunicorn, Werkzeug (from requirements.txt) │ +├─────────────────────────────────────────────────────────────────┤ +│ APPLICATION LAYER │ +│ webhook_app.py, configuration files │ +├─────────────────────────────────────────────────────────────────┤ +│ USER & PERMISSIONS │ +│ Non-root user 'appuser', minimal file permissions │ +├─────────────────────────────────────────────────────────────────┤ +│ RUNTIME CONFIGURATION │ +│ WORKDIR /app, EXPOSE 5000, health check configuration │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Multi-Stage Build Process + +```dockerfile +# Stage 1: Build Environment +FROM python:3.11-slim as builder +WORKDIR /build +COPY requirements.txt . +RUN pip install --user --no-cache-dir -r requirements.txt + +# Stage 2: Runtime Environment +FROM python:3.11-slim as runtime +RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* +RUN groupadd -r appuser && useradd -r -g appuser appuser + +WORKDIR /app +COPY --from=builder /root/.local /home/appuser/.local +COPY --chown=appuser:appuser webhook_app.py . + +USER appuser +ENV PATH=/home/appuser/.local/bin:$PATH + +EXPOSE 5000 +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:5000/health || exit 1 + +CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "30", "webhook_app:app"] +``` + +### Container Resource Management + +```yaml +# Resource limits and reservations +deploy: + resources: + limits: + cpus: '0.5' # Maximum 0.5 CPU cores + memory: 256M # Maximum 256MB RAM + reservations: + cpus: '0.1' # Guaranteed 0.1 CPU cores + memory: 128M # Guaranteed 128MB RAM + restart_policy: + condition: unless-stopped + delay: 5s + max_attempts: 3 + window: 120s +``` + +## 🌐 Network Architecture + +### Network Topology + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ EXTERNAL NETWORKS │ +├─────────────────────────────────────────────────────────────────┤ +│ Internet ──▶ Router ──▶ Port Forwarding ──▶ Server │ +│ │ 80 → 80 (HTTP) │ +│ │ 443 → 443 (HTTPS) │ +│ └─────────2222 → 2222 (SSH Git) │ +└─────────────────────────────────────────────────────────────────┘ + │ +┌─────────────────────────────────────▼───────────────────────────┐ +│ DOCKER NETWORKS │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ traefik │ │ Service │ │ +│ │ (external) │ │ Networks │ │ +│ │ │ │ (internal) │ │ +│ │ ┌─────────────┐ │ │ ┌─────────────┐ │ │ +│ │ │webhook- │ │ │ │gitea- │ │ │ +│ │ │service │ │ │ │internal │ │ │ +│ │ └─────────────┘ │ │ └─────────────┘ │ │ +│ │ ┌─────────────┐ │ │ ┌─────────────┐ │ │ +│ │ │traefik │ │ │ │harbor- │ │ │ +│ │ │proxy │ │ │ │harbor │ │ │ +│ │ └─────────────┘ │ │ └─────────────┘ │ │ +│ │ ┌─────────────┐ │ │ │ │ +│ │ │gitea │ │ │ │ │ +│ │ │ │ │ │ │ │ +│ │ └─────────────┘ │ │ │ │ +│ │ ┌─────────────┐ │ │ │ │ +│ │ │harbor │ │ │ │ │ +│ │ │ │ │ │ │ │ +│ │ └─────────────┘ │ │ │ │ +│ └─────────────────┘ └─────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Service Discovery and Routing + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ TRAEFIK ROUTING │ +├─────────────────────────────────────────────────────────────────┤ +│ Request Flow: │ +│ │ +│ webhook.domain.com ──▶ Traefik ──▶ webhook-service:5000 │ +│ git.domain.com ──▶ Traefik ──▶ gitea:3000 │ +│ registry.domain.com──▶ Traefik ──▶ harbor:80 │ +│ traefik.domain.com ──▶ Traefik ──▶ dashboard:8080 │ +│ │ +│ Routing Rules: │ +│ - Host header matching │ +│ - Automatic service discovery via Docker labels │ +│ - Health check integration │ +│ - Load balancing (when multiple instances) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### SSL/TLS Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ SSL/TLS TERMINATION │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ Client ──[TLS 1.3]──▶ Traefik ──[HTTP]──▶ Backend Service │ +│ │ │ +│ ▼ │ +│ ┌─────────────┐ │ +│ │Let's Encrypt│ │ +│ │Certificate │ │ +│ │Management │ │ +│ └─────────────┘ │ +│ │ +│ Features: │ +│ - Automatic certificate generation │ +│ - Automatic renewal (60 days before expiry) │ +│ - HTTP to HTTPS redirection │ +│ - HSTS headers │ +│ - Perfect Forward Secrecy │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 📊 Data Architecture + +### Data Flow Patterns + +**Synchronous Request-Response:** +``` +Client Request ──▶ Authentication ──▶ Validation ──▶ Processing ──▶ Response + │ │ │ │ │ + │ ▼ ▼ ▼ │ + │ Security Check Schema Check Business │ + │ Rate Limit Field Types Logic │ + │ IP Filter Sanitization Notification │ + │ Dispatch │ + │ │ + └──────────────────── Immediate Response ◀─────────────────────┘ +``` + +**Asynchronous Notification Processing:** +``` +Webhook Event ──▶ Queue ──▶ Background Worker ──▶ Email/SMS Gateway + │ │ │ │ + │ │ ▼ ▼ + │ │ Notification Delivery + │ │ Formatting Confirmation + │ │ │ │ + │ │ ▼ ▼ + │ │ Template Status + │ │ Engine Tracking + │ │ │ + │ └─────── Retry Logic ◀─────────────────┘ + │ + └────────── Immediate Response (webhook success) +``` + +### Data Models + +**Webhook Event Data Model:** +```python +from dataclasses import dataclass +from datetime import datetime +from typing import Optional + +@dataclass +class WebhookEvent: + event: str + data: str + published_at: str + coreid: Optional[str] = None + device_name: Optional[str] = None + fw_version: Optional[str] = None + received_at: datetime = None + source_ip: str = None + user_agent: str = None + + def is_particle_event(self) -> bool: + return self.coreid is not None + + def get_device_info(self) -> dict: + return { + 'device_id': self.coreid, + 'device_name': self.device_name or self.coreid, + 'firmware_version': self.fw_version + } +``` + +**Notification Data Model:** +```python +@dataclass +class NotificationRequest: + event_type: str + device_info: dict + message: str + priority: str # low, normal, high, critical + recipients: list + delivery_method: str # email, sms, both + retry_count: int = 0 + max_retries: int = 3 + + def should_retry(self) -> bool: + return self.retry_count < self.max_retries +``` + +### Configuration Management + +**Environment-based Configuration:** +```python +class Config: + def __init__(self): + self.flask_secret = os.environ.get('FLASK_SECRET_KEY') + self.webhook_secret = os.environ.get('WEBHOOK_SECRET') + self.particle_secret = os.environ.get('PARTICLE_WEBHOOK_SECRET') + self.smtp_config = self._load_smtp_config() + self.security_config = self._load_security_config() + + def _load_smtp_config(self): + return { + 'server': 'smtp.gmail.com', + 'port': 465, + 'username': os.environ.get('SMTP_EMAIL'), + 'password': os.environ.get('SMTP_PASSWORD'), + 'use_tls': True + } + + def _load_security_config(self): + return { + 'rate_limit': { + 'requests_per_minute': 10, + 'burst_size': 20 + }, + 'auth_timeout': 30, + 'max_payload_size': 1024 # bytes + } +``` + +## 🔍 Monitoring Architecture + +### Observability Stack + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ MONITORING LAYER │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ Metrics │ │ Logs │ │ Traces │ │ +│ │ │ │ │ │ │ │ +│ │ Prometheus │ │ Loki │ │ Jaeger │ │ +│ │ (Future) │ │ (Future) │ │ (Future) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ CURRENT MONITORING │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ Docker │ │ File │ │ Health │ │ +│ │ Logs │ │ Logs │ │ Checks │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Health Check Architecture + +**Multi-Level Health Checks:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HEALTH CHECK LAYERS │ +├─────────────────────────────────────────────────────────────────┤ +│ Level 1: Container Health │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Process │ │Port │ │Basic HTTP │ │ +│ │Running │ │Listening │ │Response │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Level 2: Application Health │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Flask App │ │Database │ │External Dependencies │ │ +│ │Responding │ │Connection │ │(SMTP, etc.) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Level 3: Business Logic Health │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Authentication│ │Notification │ │End-to-End │ │ +│ │Service │ │Service │ │Functionality │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Logging Architecture + +**Structured Logging:** +```python +import logging +import json +from datetime import datetime + +class StructuredLogger: + def __init__(self, name): + self.logger = logging.getLogger(name) + self.logger.setLevel(logging.INFO) + + handler = logging.StreamHandler() + formatter = logging.Formatter('%(message)s') + handler.setFormatter(formatter) + self.logger.addHandler(handler) + + def log_event(self, level, event_type, message, **kwargs): + log_data = { + 'timestamp': datetime.utcnow().isoformat(), + 'level': level, + 'event_type': event_type, + 'message': message, + 'component': 'webhook-service', + **kwargs + } + + self.logger.log(getattr(logging, level.upper()), json.dumps(log_data)) + +# Usage +logger = StructuredLogger('webhook') +logger.log_event('info', 'webhook_received', 'Particle webhook authenticated', + device_id='abc123', source_ip='1.2.3.4') +``` + +## 🚀 Deployment Architecture + +### Deployment Pipeline + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DEPLOYMENT PIPELINE │ +├─────────────────────────────────────────────────────────────────┤ +│ Development ──▶ Testing ──▶ Staging ──▶ Production │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │Local │ │Unit │ │Integration│ │Blue/Green│ │ +│ │Testing │ │Tests │ │Testing │ │Deploy │ │ +│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ GITOPS WORKFLOW │ +│ │ +│ Git Push ──▶ CI/CD ──▶ Image Build ──▶ Deploy ──▶ Monitor │ +│ │ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ ▼ │ +│ Code Automated Docker Container Health │ +│ Review Testing Registry Deployment Checks │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Environment Architecture + +**Multi-Environment Setup:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ ENVIRONMENTS │ +├─────────────────────────────────────────────────────────────────┤ +│ Development Environment │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Local Docker │ │File-based │ │Mock Services │ │ +│ │Compose │ │Logging │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Staging Environment │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Production │ │Structured │ │Real Services │ │ +│ │Like Setup │ │Logging │ │(Limited) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Production Environment │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │High │ │Centralized │ │Full Service │ │ +│ │Availability │ │Monitoring │ │Integration │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Scaling Architecture + +**Horizontal Scaling Strategy:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ SCALING PATTERNS │ +├─────────────────────────────────────────────────────────────────┤ +│ Single Instance (Current) │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Traefik ──▶ Webhook Service Instance │ │ +│ └─────────────────────────────────────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Multiple Instances (Future) │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Traefik ──┬──▶ Webhook Service Instance 1 │ │ +│ │ ├──▶ Webhook Service Instance 2 │ │ +│ │ └──▶ Webhook Service Instance N │ │ +│ └─────────────────────────────────────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Database-Backed (Advanced) │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Multiple Instances ──▶ Shared Database │ │ +│ │ ──▶ Message Queue │ │ +│ │ ──▶ Shared State │ │ +│ └─────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 🔄 Integration Architecture + +### External System Integration + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ INTEGRATION POINTS │ +├─────────────────────────────────────────────────────────────────┤ +│ Inbound Integrations │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Particle │ │GitHub │ │Custom │ │ +│ │Webhooks │ │Webhooks │ │Services │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Outbound Integrations │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │SMTP/Email │ │SMS Gateways │ │Monitoring │ │ +│ │Services │ │ │ │Systems │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Future Integrations │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Slack/Discord│ │Database │ │Analytics │ │ +│ │Webhooks │ │Logging │ │Platforms │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### API Design Architecture + +**RESTful API Design:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ API ENDPOINTS │ +├─────────────────────────────────────────────────────────────────┤ +│ GET /health │ +│ ├─ Purpose: Health check endpoint │ +│ ├─ Authentication: None │ +│ ├─ Response: JSON health status │ +│ └─ Caching: No cache │ +├─────────────────────────────────────────────────────────────────┤ +│ POST /webhook │ +│ ├─ Purpose: Process incoming webhook events │ +│ ├─ Authentication: Bearer token or HMAC signature │ +│ ├─ Request: JSON payload with event data │ +│ ├─ Response: JSON success/error status │ +│ └─ Rate Limiting: 10 req/min average, 20 burst │ +├─────────────────────────────────────────────────────────────────┤ +│ GET /metrics (Future) │ +│ ├─ Purpose: Prometheus metrics endpoint │ +│ ├─ Authentication: Internal network only │ +│ ├─ Response: Prometheus format metrics │ +│ └─ Caching: 30 second cache │ +├─────────────────────────────────────────────────────────────────┤ +│ POST /test (Development Only) │ +│ ├─ Purpose: Test notification delivery │ +│ ├─ Authentication: Same as webhook │ +│ ├─ Request: Test message payload │ +│ └─ Response: Delivery confirmation │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Error Handling Architecture + +**Hierarchical Error Management:** +```python +class WebhookError(Exception): + """Base webhook exception""" + def __init__(self, message, status_code=500, error_code=None): + self.message = message + self.status_code = status_code + self.error_code = error_code + super().__init__(self.message) + +class AuthenticationError(WebhookError): + """Authentication related errors""" + def __init__(self, message): + super().__init__(message, 403, "AUTH_FAILED") + +class ValidationError(WebhookError): + """Input validation errors""" + def __init__(self, message): + super().__init__(message, 400, "VALIDATION_FAILED") + +class NotificationError(WebhookError): + """Notification delivery errors""" + def __init__(self, message): + super().__init__(message, 500, "NOTIFICATION_FAILED") + +# Error Handler +@app.errorhandler(WebhookError) +def handle_webhook_error(error): + response = { + "error": error.message, + "error_code": error.error_code, + "timestamp": datetime.utcnow().isoformat() + } + return jsonify(response), error.status_code +``` + +## 📈 Performance Architecture + +### Performance Optimization Strategies + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ PERFORMANCE LAYERS │ +├─────────────────────────────────────────────────────────────────┤ +│ Application Level │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Async │ │Connection │ │Response │ │ +│ │Processing │ │Pooling │ │Caching │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Server Level │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Gunicorn │ │Worker │ │Keep-Alive │ │ +│ │WSGI Server │ │Processes │ │Connections │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Infrastructure Level │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Load │ │CDN │ │Resource │ │ +│ │Balancing │ │(Future) │ │Optimization │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Caching Architecture + +**Multi-Level Caching Strategy:** +```python +from functools import lru_cache +from threading import Lock +import time + +class CacheManager: + def __init__(self): + self._memory_cache = {} + self._cache_lock = Lock() + self._ttl_cache = {} + + @lru_cache(maxsize=128) + def get_config_value(self, key): + """Application-level configuration caching""" + return os.environ.get(key) + + def cache_with_ttl(self, key, value, ttl_seconds=300): + """TTL-based caching for dynamic data""" + with self._cache_lock: + expiry = time.time() + ttl_seconds + self._memory_cache[key] = value + self._ttl_cache[key] = expiry + + def get_cached(self, key): + """Retrieve cached value if not expired""" + with self._cache_lock: + if key in self._memory_cache: + if time.time() < self._ttl_cache.get(key, 0): + return self._memory_cache[key] + else: + # Expired, clean up + del self._memory_cache[key] + del self._ttl_cache[key] + return None +``` + +### Resource Optimization + +**Memory Management:** +```python +import gc +import psutil +import threading + +class ResourceMonitor: + def __init__(self): + self.max_memory_mb = 256 # Container limit + self.warning_threshold = 0.8 + self.monitoring_interval = 60 # seconds + + def start_monitoring(self): + """Start background resource monitoring""" + def monitor(): + while True: + memory_usage = psutil.virtual_memory().percent + if memory_usage > self.warning_threshold * 100: + logger.warning(f"High memory usage: {memory_usage}%") + gc.collect() # Force garbage collection + + time.sleep(self.monitoring_interval) + + thread = threading.Thread(target=monitor, daemon=True) + thread.start() +``` + +## 🔒 Security Deep Dive + +### Threat Model + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ THREAT ANALYSIS │ +├─────────────────────────────────────────────────────────────────┤ +│ External Threats │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │DDoS Attacks │ │Injection │ │Unauthorized │ │ +│ │ │ │Attacks │ │Access │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Internal Threats │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Configuration│ │Container │ │Supply Chain │ │ +│ │Vulnerabilities│ │Escape │ │Attacks │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Mitigation Strategies │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Defense in │ │Principle of │ │Security │ │ +│ │Depth │ │Least │ │Monitoring │ │ +│ │ │ │Privilege │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Security Controls Matrix + +``` +┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐ +│ Control Type │ Implementation│ Threat │ Effectiveness │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Authentication │ Bearer Token │ Unauthorized │ High │ +│ │ HMAC Signature │ Access │ │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Input Validation│ JSON Schema │ Injection │ High │ +│ │ Type Checking │ Attacks │ │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Rate Limiting │ Traefik │ DDoS │ Medium │ +│ │ Middleware │ │ │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Container │ Non-root User │ Container │ High │ +│ Security │ Minimal Base │ Escape │ │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Network │ Internal │ Network │ High │ +│ Isolation │ Docker Networks │ Attacks │ │ +├─────────────────┼─────────────────┼─────────────────┼─────────────────┤ +│ Encryption │ TLS 1.3 │ Data │ High │ +│ │ Let's Encrypt │ Interception │ │ +└─────────────────┴─────────────────┴─────────────────┴─────────────────┘ +``` + +### Cryptographic Architecture + +**Key Management:** +```python +import secrets +import hashlib +import hmac +from cryptography.fernet import Fernet + +class CryptoManager: + def __init__(self): + self.webhook_secret = os.environ.get('WEBHOOK_SECRET') + self.particle_secret = os.environ.get('PARTICLE_WEBHOOK_SECRET') + + def verify_hmac_signature(self, payload, signature, secret): + """Verify HMAC-SHA256 signature""" + expected_signature = hmac.new( + secret.encode('utf-8'), + payload, + hashlib.sha256 + ).hexdigest() + + # Use constant-time comparison to prevent timing attacks + return hmac.compare_digest( + f"sha256={expected_signature}", + signature + ) + + def generate_secure_token(self, length=32): + """Generate cryptographically secure random token""" + return secrets.token_hex(length) + + def hash_sensitive_data(self, data): + """Hash sensitive data for logging""" + return hashlib.sha256(data.encode()).hexdigest()[:8] +``` + +## 🔮 Future Architecture Considerations + +### Microservices Evolution + +**Current Monolithic Architecture:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ MONOLITHIC SERVICE │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Authentication│ │Validation │ │Notification │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**Future Microservices Architecture:** +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────┐ +│ Auth Service │ │ Validation │ │ Notification │ +│ │ │ Service │ │ Service │ +│ - Bearer Token │ │ │ │ │ +│ - HMAC Verify │ │ - Schema Check │ │ - Email Service │ +│ - Rate Limiting │ │ - Sanitization │ │ - SMS Service │ +└─────────────────┘ └─────────────────┘ └─────────────────────┘ + │ │ │ + └─────────────────────┼───────────────────────┘ + │ + ┌─────────────────┐ + │ API Gateway │ + │ │ + │ - Routing │ + │ - Load Balance │ + │ - SSL Term │ + └─────────────────┘ +``` + +### Database Integration + +**Current State (Stateless):** +``` +Request → Process → Response (No Persistence) +``` + +**Future Database Integration:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DATABASE LAYER │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │PostgreSQL │ │Redis Cache │ │Time Series DB │ │ +│ │ │ │ │ │(InfluxDB) │ │ +│ │ - Events │ │ - Sessions │ │ │ │ +│ │ - Devices │ │ - Rate Limit│ │ - Metrics │ │ +│ │ - Users │ │ - Config │ │ - Analytics │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Event-Driven Architecture + +**Future Event Bus Integration:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ EVENT BUS │ +├─────────────────────────────────────────────────────────────────┤ +│ Webhook Event ──▶ Event Bus ──▶ Multiple Consumers │ +│ │ │ +│ ├──▶ Notification Service │ +│ ├──▶ Analytics Service │ +│ ├──▶ Alerting Service │ +│ ├──▶ Audit Service │ +│ └──▶ Archive Service │ +│ │ +│ Technologies: Apache Kafka, RabbitMQ, Redis Streams │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Cloud Migration Path + +**Phase 1: Containerized (Current)** +``` +On-Premise Server → Docker Containers → Manual Management +``` + +**Phase 2: Orchestrated** +``` +Docker Swarm or Kubernetes → Auto-scaling → Health Management +``` + +**Phase 3: Cloud Native** +``` +AWS ECS/EKS, Azure ACI/AKS, or GCP Cloud Run → Serverless Options +``` + +**Phase 4: Serverless** +``` +AWS Lambda, Azure Functions, GCP Cloud Functions → Event-Driven +``` + +--- + +## 📋 Architecture Decision Records (ADRs) + +### ADR-001: Flask Web Framework Choice + +**Status:** Accepted +**Date:** 2024-01-07 + +**Context:** Need lightweight web framework for webhook processing. + +**Decision:** Use Flask with Gunicorn WSGI server. + +**Rationale:** +- Lightweight and minimal for simple webhook processing +- Excellent ecosystem and community support +- Easy to containerize and scale +- Gunicorn provides production-grade WSGI serving + +**Consequences:** +- Simple to develop and maintain +- Limited built-in features require additional libraries +- Single-threaded request processing per worker + +### ADR-002: Docker Containerization + +**Status:** Accepted +**Date:** 2024-01-07 + +**Context:** Need consistent deployment across environments. + +**Decision:** Use Docker containers with multi-stage builds. + +**Rationale:** +- Consistent runtime environment +- Easy integration with existing Traefik infrastructure +- Simplified deployment and scaling +- Security isolation + +**Consequences:** +- Additional complexity in development workflow +- Resource overhead from containerization +- Container security considerations + +### ADR-003: Dual Authentication Strategy + +**Status:** Accepted +**Date:** 2024-01-07 + +**Context:** Support both Particle.io webhooks and generic webhooks. + +**Decision:** Implement Bearer token auth for Particle, HMAC for generic. + +**Rationale:** +- Particle.io supports Bearer tokens natively +- HMAC signatures provide strong security for generic webhooks +- User-Agent detection provides reliable differentiation + +**Consequences:** +- Increased complexity in authentication logic +- Need to maintain two secret management strategies +- Clear separation of authentication methods + +### ADR-004: Synchronous Processing + +**Status:** Accepted +**Date:** 2024-01-07 + +**Context:** Choose between synchronous and asynchronous webhook processing. + +**Decision:** Use synchronous processing with immediate response. + +**Rationale:** +- Simple implementation and debugging +- Low latency requirements for security alerts +- Current volume doesn't require async processing +- Easier error handling and retry logic + +**Consequences:** +- Limited scalability for high-volume scenarios +- Blocking operations may impact response times +- Simpler architecture and fewer components + +--- + +**📊 Architecture Summary:** + +This webhook service architecture provides: + +✅ **Scalable Foundation** - Ready for growth from single instance to microservices +✅ **Security First** - Multiple layers of security controls and monitoring +✅ **Operational Excellence** - Comprehensive logging, monitoring, and health checks +✅ **Performance Optimized** - Caching, resource management, and efficiency focus +✅ **Integration Ready** - Clean APIs and extensible notification system +✅ **Future Proof** - Clear migration paths to cloud and serverless architectures + +The architecture balances simplicity for current needs with flexibility for future requirements, ensuring the system can evolve as the security device network scales.# Webhook Server Architecture Documentation + +Comprehensive technical architecture documentation for the secure webhook service component of the ultra-low-power security device system. + +## 🏗️ System Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ INTERNET │ +└──────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────────────┐ +│ EDGE LAYER │ +│ ┌─────────────┐ ┌─────────────────┐ ┌─────────────────────┐ │ +│ │ Firewall │ │ Load Balancer │ │ DDoS Protection │ │ +│ │ (UFW) │ │ (Optional) │ │ (Cloudflare) │ │ +│ └─────────────┘ └─────────────────┘ └─────────────────────┘ │ +└──────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────────────┐ +│ REVERSE PROXY LAYER │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ TRAEFIK ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │SSL/TLS │ │Rate Limiting│ │Security Headers │ ││ +│ │ │Termination │ │& Throttling │ │& Middleware │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │Service │ │Health Checks│ │Automatic SSL │ ││ +│ │ │Discovery │ │& Monitoring │ │(Let's Encrypt) │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────┘│ +└──────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────────────┐ +│ APPLICATION LAYER │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ WEBHOOK SERVICE ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │Flask App │ │Gunicorn │ │Authentication │ ││ +│ │ │ │ │WSGI Server │ │Handler │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │Input │ │Business │ │Output │ ││ +│ │ │Validation │ │Logic │ │Notification │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────┘│ +└──────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────────────┐ +│ CONTAINER LAYER │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ DOCKER ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │Container │ │Volume │ │Network │ ││ +│ │ │Runtime │ │Management │ │Isolation │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ ││ +│ │ │Health │ │Log │ │Resource │ ││ +│ │ │Monitoring │ │Management │ │Limits │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────┘│ +└──────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────────────┐ +│ INFRASTRUCTURE LAYER │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ Host OS │ │ Storage │ │ Networking │ │ +│ │ (Ubuntu) │ │ (SSD) │ │ (Docker Bridge) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 🔄 Data Flow Architecture + +### Request Processing Pipeline + +``` +External Request → Authentication → Validation → Processing → Response + ↓ ↓ ↓ ↓ + Security Input Business Output + Layer Validation Logic Formatting + ↓ ↓ ↓ ↓ + Rate Limit Schema Check Event Success/Error + Check Required Processing Response + ↓ Fields ↓ ↓ + IP Filter Data Types Notification Logging + Check Validation Dispatch ↓ + ↓ ↓ ↓ Metrics + Auth Token Sanitization Email/SMS Collection + Verification ↓ Service + ↓ Clean Data ↓ + Access ↓ Delivery + Granted Validated Confirmation + Request +``` + +### Event Processing Flow + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Particle │ │ Particle │ │ Webhook │ +│ Device │───▶│ Cloud │───▶│ Service │ +│ │ │ │ │ │ +│ - Sensor Event │ │ - Event Storage │ │ - Authentication│ +│ - Data Payload │ │ - Webhook │ │ - Validation │ +│ - Device Info │ │ Delivery │ │ - Processing │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ + ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Notification │ │ Email/SMS │ │ Response │ +│ Formatting │◀───│ Gateway │◀───│ Generation │ +│ │ │ │ │ │ +│ - Message │ │ - SMTP Server │ │ - Status Code │ +│ Template │ │ - SMS Gateway │ │ - JSON Response │ +│ - Device Context│ │ - Delivery │ │ - Logging │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +## 🔐 Security Architecture + +### Defense in Depth Model + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 7: Application Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Input │ │Business │ │Output │ │ +│ │Validation │ │Logic │ │Sanitization │ │ +│ │ │ │Security │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 6: Authentication & Authorization │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Bearer Token │ │HMAC │ │User-Agent │ │ +│ │Validation │ │Signature │ │Detection │ │ +│ │ │ │Verification │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 5: Transport Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │TLS 1.3 │ │Certificate │ │HSTS Headers │ │ +│ │Encryption │ │Validation │ │ │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 4: Network Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Rate │ │IP │ │DDoS │ │ +│ │Limiting │ │Filtering │ │Protection │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 3: Container Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Non-root │ │Resource │ │Minimal Attack │ │ +│ │User │ │Limits │ │Surface │ │ +│ │ │ │ │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 2: Host Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │UFW │ │fail2ban │ │OS Security │ │ +│ │Firewall │ │Intrusion │ │Updates │ │ +│ │ │ │Prevention │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────┐ +│ Layer 1: Physical Security │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Server │ │Network │ │Physical │ │ +│ │Access │ │Infrastructure│ │Access Control │ │ +│ │Control │ │Security │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ \ No newline at end of file diff --git a/Server/docs/production-deployment.md b/Server/docs/production-deployment.md index e69de29..41bc829 100644 --- a/Server/docs/production-deployment.md +++ b/Server/docs/production-deployment.md @@ -0,0 +1,564 @@ +# Production Deployment Guide + +Comprehensive guide for deploying the webhook service in production environments with enterprise-grade reliability, security, and monitoring. + +## 🎯 Production Readiness Overview + +### Deployment Checklist + +``` +□ Security hardening complete +□ SSL certificates configured and auto-renewing +□ Monitoring and alerting implemented +□ Backup and disaster recovery tested +□ Performance optimization validated +□ Documentation complete and accessible +□ Team training and runbooks prepared +``` + +### Production vs Development Differences + +| Aspect | Development | Production | +|--------|-------------|------------| +| **Security** | Basic auth, HTTP allowed | Full security stack, HTTPS only | +| **Logging** | Console output | Structured logging, centralized | +| **Monitoring** | Manual checks | Automated monitoring/alerting | +| **Scaling** | Single instance | Auto-scaling, load balancing | +| **Data** | Test data | Real customer data, GDPR compliance | +| **Uptime** | Best effort | 99.9% SLA target | + +## 🏗️ Infrastructure Requirements + +### Server Specifications + +**Minimum Requirements:** +``` +CPU: 2 cores (x86_64) +RAM: 4GB +Storage: 50GB SSD +Network: 100Mbps +OS: Ubuntu 20.04 LTS or newer +``` + +**Recommended Production:** +``` +CPU: 4 cores (x86_64) +RAM: 8GB +Storage: 100GB NVMe SSD +Network: 1Gbps +OS: Ubuntu 22.04 LTS +Backup: Automated daily backups +``` + +**High Availability Setup:** +``` +Load Balancer: 2x instances +Application Servers: 3x instances +Database: Primary + Read Replica +Storage: RAID 1 or cloud block storage +Network: Redundant connections +``` + +### Network Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ PRODUCTION NETWORK │ +├─────────────────────────────────────────────────────────────────┤ +│ Internet ──▶ CDN/WAF ──▶ Load Balancer ──▶ Application │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ DDoS Protection Health Checks Auto Scaling │ +│ Rate Limiting SSL Termination Multiple Instances │ +│ Geo Filtering Session Affinity Container Restart │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 🔒 Security Hardening + +### Operating System Security + +**System Hardening Checklist:** +```bash +# 1. Update system packages +sudo apt update && sudo apt upgrade -y + +# 2. Configure automatic security updates +sudo apt install unattended-upgrades +sudo dpkg-reconfigure -plow unattended-upgrades + +# 3. Configure UFW firewall +sudo ufw default deny incoming +sudo ufw default allow outgoing +sudo ufw allow ssh +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp +sudo ufw enable + +# 4. Install and configure fail2ban +sudo apt install fail2ban +sudo systemctl enable fail2ban +sudo systemctl start fail2ban + +# 5. Disable root login and password authentication +sudo sed -i 's/PermitRootLogin yes/PermitRootLogin no/' /etc/ssh/sshd_config +sudo sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config +sudo systemctl restart ssh + +# 6. Configure automatic security updates +echo 'Unattended-Upgrade::Automatic-Reboot "true";' | sudo tee -a /etc/apt/apt.conf.d/50unattended-upgrades +echo 'Unattended-Upgrade::Automatic-Reboot-Time "02:00";' | sudo tee -a /etc/apt/apt.conf.d/50unattended-upgrades +``` + +### Docker Security Configuration + +**Production Docker Daemon Config:** +```json +# /etc/docker/daemon.json +{ + "live-restore": true, + "userland-proxy": false, + "no-new-privileges": true, + "seccomp-profile": "/etc/docker/seccomp.json", + "log-driver": "json-file", + "log-opts": { + "max-size": "10m", + "max-file": "3" + }, + "storage-driver": "overlay2", + "storage-opts": [ + "overlay2.override_kernel_check=true" + ] +} +``` + +**Security Hardened docker-compose.yml:** +```yaml +version: '3.8' + +services: + webhook-service: + build: . + container_name: webhook-service-prod + restart: unless-stopped + + # Security configurations + read_only: true + security_opt: + - no-new-privileges:true + - seccomp:unconfined + cap_drop: + - ALL + cap_add: + - NET_BIND_SERVICE + + # Resource limits + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.1' + memory: 256M + + # Temporary filesystems for read-only container + tmpfs: + - /tmp:size=100M,noexec,nosuid,nodev + - /var/run:size=100M,noexec,nosuid,nodev + + environment: + - FLASK_ENV=production + - FLASK_SECRET_KEY=${FLASK_SECRET_KEY} + - WEBHOOK_SECRET=${WEBHOOK_SECRET} + - PARTICLE_WEBHOOK_SECRET=${PARTICLE_WEBHOOK_SECRET} + - SMTP_EMAIL=${SMTP_EMAIL} + - SMTP_PASSWORD=${SMTP_PASSWORD} + - RECIPIENT_EMAIL=${RECIPIENT_EMAIL} + + networks: + - traefik + - internal + + labels: + - "traefik.enable=true" + - "traefik.http.routers.webhook-prod.rule=Host(`webhook.yourdomain.com`)" + - "traefik.http.routers.webhook-prod.entrypoints=websecure" + - "traefik.http.routers.webhook-prod.tls.certresolver=letsencrypt" + - "traefik.http.services.webhook-prod.loadbalancer.server.port=5000" + + # Production security middleware + - "traefik.http.routers.webhook-prod.middlewares=webhook-prod-security,webhook-prod-ratelimit" + + # Enhanced security headers + - "traefik.http.middlewares.webhook-prod-security.headers.customrequestheaders.X-Forwarded-Proto=https" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.X-Content-Type-Options=nosniff" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.X-Frame-Options=DENY" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.X-XSS-Protection=1; mode=block" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.Referrer-Policy=strict-origin-when-cross-origin" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.Strict-Transport-Security=max-age=31536000; includeSubDomains" + - "traefik.http.middlewares.webhook-prod-security.headers.customresponseheaders.Content-Security-Policy=default-src 'self'" + + # Production rate limiting + - "traefik.http.middlewares.webhook-prod-ratelimit.ratelimit.average=20" + - "traefik.http.middlewares.webhook-prod-ratelimit.ratelimit.burst=50" + - "traefik.http.middlewares.webhook-prod-ratelimit.ratelimit.period=1m" + + # Health check configuration + - "traefik.http.services.webhook-prod.loadbalancer.healthcheck.path=/health" + - "traefik.http.services.webhook-prod.loadbalancer.healthcheck.interval=30s" + +networks: + traefik: + external: true + internal: + internal: true +``` + +### SSL/TLS Configuration + +**Production Traefik SSL Configuration:** +```yaml +# traefik.yml +certificatesResolvers: + letsencrypt: + acme: + email: admin@yourdomain.com + storage: /acme.json + httpChallenge: + entryPoint: web + # Production Let's Encrypt endpoint + caServer: https://acme-v02.api.letsencrypt.org/directory + +# Enhanced TLS configuration +tls: + options: + default: + minVersion: "VersionTLS12" + maxVersion: "VersionTLS13" + cipherSuites: + - "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384" + - "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384" + - "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305" + - "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305" + curvePreferences: + - "CurveP521" + - "CurveP384" + sniStrict: true +``` + +## 📊 Monitoring and Observability + +### Production Monitoring Stack + +**Monitoring Architecture:** +``` +┌─────────────────────────────────────────────────────────────────┐ +│ MONITORING STACK │ +├─────────────────────────────────────────────────────────────────┤ +│ Application Metrics │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Prometheus │ │Grafana │ │AlertManager │ │ +│ │Metrics │ │Dashboards │ │Notifications │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Log Management │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Loki │ │Log │ │Error │ │ +│ │Aggregation │ │Analysis │ │Tracking │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Infrastructure Monitoring │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │Node │ │Docker │ │Network │ │ +│ │Exporter │ │Stats │ │Monitoring │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Prometheus Metrics Integration + +**Enhanced webhook_app.py with metrics:** +```python +from prometheus_client import Counter, Histogram, Gauge, start_http_server +import time + +# Metrics definitions +webhook_requests_total = Counter( + 'webhook_requests_total', + 'Total webhook requests', + ['method', 'endpoint', 'status_code', 'source_type'] +) + +webhook_request_duration = Histogram( + 'webhook_request_duration_seconds', + 'Webhook request duration', + ['endpoint', 'source_type'] +) + +webhook_auth_failures = Counter( + 'webhook_auth_failures_total', + 'Total authentication failures', + ['source_type', 'failure_reason'] +) + +notification_delivery_total = Counter( + 'notification_delivery_total', + 'Total notification delivery attempts', + ['delivery_method', 'status'] +) + +active_connections = Gauge( + 'webhook_active_connections', + 'Number of active connections' +) + +# Middleware for metrics collection +def metrics_middleware(): + def decorator(f): + def wrapper(*args, **kwargs): + start_time = time.time() + source_type = 'particle' if 'ParticleBot' in request.headers.get('User-Agent', '') else 'generic' + + try: + result = f(*args, **kwargs) + status_code = result[1] if isinstance(result, tuple) else 200 + + webhook_requests_total.labels( + method=request.method, + endpoint=request.endpoint, + status_code=status_code, + source_type=source_type + ).inc() + + return result + + except Exception as e: + webhook_requests_total.labels( + method=request.method, + endpoint=request.endpoint, + status_code=500, + source_type=source_type + ).inc() + raise + + finally: + duration = time.time() - start_time + webhook_request_duration.labels( + endpoint=request.endpoint, + source_type=source_type + ).observe(duration) + + return wrapper + return decorator + +# Add metrics endpoint +@app.route('/metrics') +def metrics(): + """Prometheus metrics endpoint""" + from prometheus_client import generate_latest, CONTENT_TYPE_LATEST + return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST} + +# Start metrics server +if __name__ == '__main__': + start_http_server(8000) # Prometheus metrics on port 8000 + app.run(host='0.0.0.0', port=5000) +``` +### Grafana Dashboard Configuration +**Production Dashboard JSON:** +```json +json{ + "dashboard": { + "title": "Webhook Service Production Dashboard", + "panels": [ + { + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(webhook_requests_total[5m])", + "legendFormat": "{{source_type}} - {{status_code}}" + } + ] + }, + { + "title": "Response Time", + "type": "graph", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(webhook_request_duration_seconds_bucket[5m]))", + "legendFormat": "95th percentile" + }, + { + "expr": "histogram_quantile(0.50, rate(webhook_request_duration_seconds_bucket[5m]))", + "legendFormat": "50th percentile" + } + ] + }, + { + "title": "Authentication Failures", + "type": "singlestat", + "targets": [ + { + "expr": "increase(webhook_auth_failures_total[1h])", + "legendFormat": "Last Hour" + } + ] + }, + { + "title": "Notification Success Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(notification_delivery_total{status=\"success\"}[5m]) / rate(notification_delivery_total[5m]) * 100", + "legendFormat": "Success Rate %" + } + ] + } + ] + } +} +``` +### Alerting Rules + +**AlertManager Configuration:** +```yml +yaml# alertmanager.yml +global: + smtp_smarthost: 'smtp.gmail.com:587' + smtp_from: 'alerts@yourdomain.com' + smtp_auth_username: 'alerts@yourdomain.com' + smtp_auth_password: 'your-app-password' + +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'webhook-alerts' + +receivers: +- name: 'webhook-alerts' + email_configs: + - to: 'admin@yourdomain.com' + subject: 'Webhook Service Alert - {{ .GroupLabels.alertname }}' + body: | + {{ range .Alerts }} + Alert: {{ .Annotations.summary }} + Description: {{ .Annotations.description }} + Instance: {{ .Labels.instance }} + Severity: {{ .Labels.severity }} + {{ end }} + +# Prometheus alerting rules +groups: +- name: webhook-service + rules: + - alert: WebhookServiceDown + expr: up{job="webhook-service"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Webhook service is down" + description: "Webhook service has been down for more than 1 minute" + + - alert: HighErrorRate + expr: rate(webhook_requests_total{status_code=~"5.."}[5m]) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value }} requests per second" + + - alert: HighResponseTime + expr: histogram_quantile(0.95, rate(webhook_request_duration_seconds_bucket[5m])) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "High response time" + description: "95th percentile response time is {{ $value }} seconds" + + - alert: AuthenticationFailures + expr: increase(webhook_auth_failures_total[15m]) > 10 + for: 0m + labels: + severity: critical + annotations: + summary: "Multiple authentication failures" + description: "{{ $value }} authentication failures in the last 15 minutes" +``` +### 🎯 Production Success Metrics +**Service Level Objectives (SLOs)** +Availability SLO: 99.9% uptime +- Measurement: HTTP 200 responses / Total HTTP requests +- Error Budget: 43.2 minutes downtime per month +- Alerting: Alert if availability drops below 99.5% over 1 hour + +Latency SLO: 95% of requests < 500ms +- Measurement: Response time distribution +- Alerting: Alert if 95th percentile > 500ms for 5 minutes + +Error Rate SLO: <0.1% error rate +- Measurement: HTTP 5xx responses / Total HTTP requests +- Alerting: Alert if error rate > 0.5% over 5 minutes + +Security SLO: <10 authentication failures per day +- Measurement: Failed authentication attempts +- Alerting: Alert if >50 failures in 1 hour + +### Key Performance Indicators +**Business Metrics:** +□ Total webhook events processed per day +□ Notification delivery success rate (target: >99%) +□ Average response time (target: <100ms) +□ Cost per webhook processed +□ Mean time to detection (MTTD) for issues +□ Mean time to resolution (MTTR) for incidents +□ Infrastructure utilization efficiency +□ Customer satisfaction score +### 📞 Production Support +**Incident Response** +***Severity Levels:*** +SEVERITY 1 - Critical (Service Down) +Response Time: 15 minutes +Resolution Time: 1 hour +Actions: Immediate escalation, war room, customer communication + +SEVERITY 2 - High (Degraded Performance) +Response Time: 30 minutes +Resolution Time: 4 hours +Actions: Team lead notification, monitoring increase + +SEVERITY 3 - Medium (Minor Issues) +Response Time: 2 hours +Resolution Time: 24 hours +Actions: Standard troubleshooting, ticket tracking + +SEVERITY 4 - Low (Enhancement Requests) +Response Time: Next business day +Resolution Time: Per roadmap +Actions: Backlog prioritization +### On-Call Procedures +**24/7 Support Structure:** +Primary On-Call: Initial response and triage +Secondary On-Call: Backup coverage and escalation +Engineering Manager: Resource coordination +Senior Leadership: Business impact decisions + +Escalation Timeline: +- 15 minutes: Auto-escalate if no response +- 30 minutes: Escalate to secondary on-call +- 1 hour: Escalate to engineering manager +- 2 hours: Escalate to senior leadership + +### 🚀 Production Deployment Summary: +**This production deployment guide provides enterprise-grade reliability with:** +✅ 99.9% Uptime Target - Comprehensive monitoring and alerting +✅ Enterprise Security - Multi-layer security hardening +✅ Auto-scaling - Dynamic resource allocation +✅ Disaster Recovery - Automated backup and recovery procedures +✅ 24/7 Support - Structured incident response and on-call coverage +✅ Performance Optimization - Sub-500ms response times \ No newline at end of file