🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved

Comprehensive multi-agent implementation addressing all issues from INDEX.md: ## Core Architecture & Validation - ✅ Issue 001: UCXL address validation at all system boundaries - ✅ Issue 002: Fixed search parsing bug in encrypted storage - ✅ Issue 003: Wired UCXI P2P announce and discover functionality - ✅ Issue 011: Aligned temporal grammar and documentation - ✅ Issue 012: SLURP idempotency, backpressure, and DLQ implementation - ✅ Issue 013: Linked SLURP events to UCXL decisions and DHT ## API Standardization & Configuration - ✅ Issue 004: Standardized UCXI payloads to UCXL codes - ✅ Issue 010: Status endpoints and configuration surface ## Infrastructure & Operations - ✅ Issue 005: Election heartbeat on admin transition - ✅ Issue 006: Active health checks for PubSub and DHT - ✅ Issue 007: DHT replication and provider records - ✅ Issue 014: SLURP leadership lifecycle and health probes - ✅ Issue 015: Comprehensive monitoring, SLOs, and alerts ## Security & Access Control - ✅ Issue 008: Key rotation and role-based access policies ## Testing & Quality Assurance - ✅ Issue 009: Integration tests for UCXI + DHT encryption + search - ✅ Issue 016: E2E tests for HMMM → SLURP → UCXL workflow ## HMMM Integration - ✅ Issue 017: HMMM adapter wiring and comprehensive testing ## Key Features Delivered: - Enterprise-grade security with automated key rotation - Comprehensive monitoring with Prometheus/Grafana stack - Role-based collaboration with HMMM integration - Complete API standardization with UCXL response formats - Full test coverage with integration and E2E testing - Production-ready infrastructure monitoring and alerting All solutions include comprehensive testing, documentation, and production-ready implementations. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-29 12:39:38 +10:00
parent 59f40e17a5
commit 92779523c0
136 changed files with 56649 additions and 134 deletions
--- a/infrastructure/monitoring/configs/enhanced-alert-rules.yml
+++ b/infrastructure/monitoring/configs/enhanced-alert-rules.yml
@@ -0,0 +1,511 @@
+# Enhanced Alert Rules for BZZZ v2 Infrastructure
+# Service Level Objectives and Critical System Alerts
+
+groups:
+  # === System Health and SLO Alerts ===
+  - name: bzzz_system_health
+    rules:
+      # Overall system health score
+      - alert: BZZZSystemHealthCritical
+        expr: bzzz_system_health_score < 0.5
+        for: 2m
+        labels:
+          severity: critical
+          service: bzzz
+          slo: availability
+        annotations:
+          summary: "BZZZ system health is critically low"
+          description: "System health score {{ $value }} is below critical threshold (0.5)"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-health-critical"
+      
+      - alert: BZZZSystemHealthDegraded
+        expr: bzzz_system_health_score < 0.8
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          slo: availability
+        annotations:
+          summary: "BZZZ system health is degraded"
+          description: "System health score {{ $value }} is below warning threshold (0.8)"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-health-degraded"
+      
+      # Component health monitoring
+      - alert: BZZZComponentUnhealthy
+        expr: bzzz_component_health_score < 0.7
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: "{{ $labels.component }}"
+        annotations:
+          summary: "BZZZ component {{ $labels.component }} is unhealthy"
+          description: "Component {{ $labels.component }} health score {{ $value }} is below threshold"
+
+  # === P2P Network Alerts ===
+  - name: bzzz_p2p_network
+    rules:
+      # Peer connectivity SLO: Maintain at least 3 connected peers
+      - alert: BZZZInsufficientPeers
+        expr: bzzz_p2p_connected_peers < 3
+        for: 1m
+        labels:
+          severity: critical
+          service: bzzz
+          component: p2p
+          slo: connectivity
+        annotations:
+          summary: "BZZZ has insufficient P2P peers"
+          description: "Only {{ $value }} peers connected, minimum required is 3"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-peer-connectivity"
+      
+      # Message latency SLO: 95th percentile < 500ms
+      - alert: BZZZP2PHighLatency
+        expr: histogram_quantile(0.95, rate(bzzz_p2p_message_latency_seconds_bucket[5m])) > 0.5
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: p2p
+          slo: latency
+        annotations:
+          summary: "BZZZ P2P message latency is high"
+          description: "95th percentile latency {{ $value }}s exceeds 500ms SLO"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-p2p-latency"
+      
+      # Message loss detection
+      - alert: BZZZP2PMessageLoss
+        expr: rate(bzzz_p2p_messages_sent_total[5m]) - rate(bzzz_p2p_messages_received_total[5m]) > 0.1
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+          component: p2p
+        annotations:
+          summary: "BZZZ P2P message loss detected"
+          description: "Message send/receive imbalance: {{ $value }} messages/sec"
+
+  # === DHT Performance and Reliability ===
+  - name: bzzz_dht
+    rules:
+      # DHT operation success rate SLO: > 99%
+      - alert: BZZZDHTLowSuccessRate
+        expr: (rate(bzzz_dht_put_operations_total{status="success"}[5m]) + rate(bzzz_dht_get_operations_total{status="success"}[5m])) / (rate(bzzz_dht_put_operations_total[5m]) + rate(bzzz_dht_get_operations_total[5m])) < 0.99
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+          component: dht
+          slo: success_rate
+        annotations:
+          summary: "BZZZ DHT operation success rate is low"
+          description: "DHT success rate {{ $value | humanizePercentage }} is below 99% SLO"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-dht-success-rate"
+      
+      # DHT operation latency SLO: 95th percentile < 300ms for gets
+      - alert: BZZZDHTHighGetLatency
+        expr: histogram_quantile(0.95, rate(bzzz_dht_operation_latency_seconds_bucket{operation="get"}[5m])) > 0.3
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: dht
+          slo: latency
+        annotations:
+          summary: "BZZZ DHT get operations are slow"
+          description: "95th percentile get latency {{ $value }}s exceeds 300ms SLO"
+      
+      # DHT replication health
+      - alert: BZZZDHTReplicationDegraded
+        expr: avg(bzzz_dht_replication_factor) < 2
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: dht
+          slo: durability
+        annotations:
+          summary: "BZZZ DHT replication is degraded"
+          description: "Average replication factor {{ $value }} is below target of 3"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-dht-replication"
+      
+      # Provider record staleness
+      - alert: BZZZDHTStaleProviders
+        expr: increase(bzzz_dht_provider_records[1h]) == 0 and bzzz_dht_content_keys > 0
+        for: 10m
+        labels:
+          severity: warning
+          service: bzzz
+          component: dht
+        annotations:
+          summary: "BZZZ DHT provider records are not updating"
+          description: "No provider record updates in the last hour despite having content"
+
+  # === Election System Stability ===
+  - name: bzzz_election
+    rules:
+      # Leadership stability: Avoid frequent leadership changes
+      - alert: BZZZFrequentLeadershipChanges
+        expr: increase(bzzz_leadership_changes_total[1h]) > 3
+        for: 0m
+        labels:
+          severity: warning
+          service: bzzz
+          component: election
+        annotations:
+          summary: "BZZZ leadership is unstable"
+          description: "{{ $value }} leadership changes in the last hour"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-leadership-instability"
+      
+      # Election timeout
+      - alert: BZZZElectionInProgress
+        expr: bzzz_election_state{state="electing"} == 1
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+          component: election
+        annotations:
+          summary: "BZZZ election taking too long"
+          description: "Election has been in progress for more than 2 minutes"
+      
+      # No admin elected
+      - alert: BZZZNoAdminElected
+        expr: bzzz_election_state{state="idle"} == 1 and absent(bzzz_heartbeats_received_total)
+        for: 1m
+        labels:
+          severity: critical
+          service: bzzz
+          component: election
+        annotations:
+          summary: "BZZZ has no elected admin"
+          description: "System is idle but no heartbeats are being received"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-no-admin"
+      
+      # Heartbeat monitoring
+      - alert: BZZZHeartbeatMissing
+        expr: increase(bzzz_heartbeats_received_total[2m]) == 0
+        for: 1m
+        labels:
+          severity: critical
+          service: bzzz
+          component: election
+        annotations:
+          summary: "BZZZ admin heartbeat missing"
+          description: "No heartbeats received from admin in the last 2 minutes"
+
+  # === PubSub Messaging System ===
+  - name: bzzz_pubsub
+    rules:
+      # Message processing rate
+      - alert: BZZZPubSubHighMessageRate
+        expr: rate(bzzz_pubsub_messages_total[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+          component: pubsub
+        annotations:
+          summary: "BZZZ PubSub message rate is very high"
+          description: "Processing {{ $value }} messages/sec, may indicate spam or DoS"
+      
+      # Message latency
+      - alert: BZZZPubSubHighLatency
+        expr: histogram_quantile(0.95, rate(bzzz_pubsub_message_latency_seconds_bucket[5m])) > 1.0
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: pubsub
+          slo: latency
+        annotations:
+          summary: "BZZZ PubSub message latency is high"
+          description: "95th percentile latency {{ $value }}s exceeds 1s threshold"
+      
+      # Topic monitoring
+      - alert: BZZZPubSubNoTopics
+        expr: bzzz_pubsub_topics == 0
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: pubsub
+        annotations:
+          summary: "BZZZ PubSub has no active topics"
+          description: "No PubSub topics are active, system may be isolated"
+
+  # === Task Management and Processing ===
+  - name: bzzz_tasks
+    rules:
+      # Task queue backup
+      - alert: BZZZTaskQueueBackup
+        expr: bzzz_tasks_queued > 100
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: tasks
+        annotations:
+          summary: "BZZZ task queue is backing up"
+          description: "{{ $value }} tasks are queued, may indicate processing issues"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-task-queue"
+      
+      # Task success rate SLO: > 95%
+      - alert: BZZZTaskLowSuccessRate
+        expr: rate(bzzz_tasks_completed_total{status="success"}[10m]) / rate(bzzz_tasks_completed_total[10m]) < 0.95
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: tasks
+          slo: success_rate
+        annotations:
+          summary: "BZZZ task success rate is low"
+          description: "Task success rate {{ $value | humanizePercentage }} is below 95% SLO"
+      
+      # Task processing latency
+      - alert: BZZZTaskHighProcessingTime
+        expr: histogram_quantile(0.95, rate(bzzz_task_duration_seconds_bucket[5m])) > 300
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: tasks
+        annotations:
+          summary: "BZZZ task processing time is high"
+          description: "95th percentile task duration {{ $value }}s exceeds 5 minutes"
+
+  # === SLURP Context Generation ===
+  - name: bzzz_slurp
+    rules:
+      # Context generation success rate
+      - alert: BZZZSLURPLowSuccessRate
+        expr: rate(bzzz_slurp_contexts_generated_total{status="success"}[10m]) / rate(bzzz_slurp_contexts_generated_total[10m]) < 0.90
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: slurp
+        annotations:
+          summary: "SLURP context generation success rate is low"
+          description: "Success rate {{ $value | humanizePercentage }} is below 90%"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-slurp-generation"
+      
+      # Generation queue backup
+      - alert: BZZZSLURPQueueBackup
+        expr: bzzz_slurp_queue_length > 50
+        for: 10m
+        labels:
+          severity: warning
+          service: bzzz
+          component: slurp
+        annotations:
+          summary: "SLURP generation queue is backing up"
+          description: "{{ $value }} contexts are queued for generation"
+      
+      # Generation time SLO: 95th percentile < 2 minutes
+      - alert: BZZZSLURPSlowGeneration
+        expr: histogram_quantile(0.95, rate(bzzz_slurp_generation_time_seconds_bucket[10m])) > 120
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: slurp
+          slo: latency
+        annotations:
+          summary: "SLURP context generation is slow"
+          description: "95th percentile generation time {{ $value }}s exceeds 2 minutes"
+
+  # === UCXI Protocol Resolution ===
+  - name: bzzz_ucxi
+    rules:
+      # Resolution success rate SLO: > 99%
+      - alert: BZZZUCXILowSuccessRate
+        expr: rate(bzzz_ucxi_requests_total{status=~"2.."}[5m]) / rate(bzzz_ucxi_requests_total[5m]) < 0.99
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: ucxi
+          slo: success_rate
+        annotations:
+          summary: "UCXI resolution success rate is low"
+          description: "Success rate {{ $value | humanizePercentage }} is below 99% SLO"
+      
+      # Resolution latency SLO: 95th percentile < 100ms
+      - alert: BZZZUCXIHighLatency
+        expr: histogram_quantile(0.95, rate(bzzz_ucxi_resolution_latency_seconds_bucket[5m])) > 0.1
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: ucxi
+          slo: latency
+        annotations:
+          summary: "UCXI resolution latency is high"
+          description: "95th percentile latency {{ $value }}s exceeds 100ms SLO"
+
+  # === Resource Utilization ===
+  - name: bzzz_resources
+    rules:
+      # CPU utilization
+      - alert: BZZZHighCPUUsage
+        expr: bzzz_cpu_usage_ratio > 0.85
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: system
+        annotations:
+          summary: "BZZZ CPU usage is high"
+          description: "CPU usage {{ $value | humanizePercentage }} exceeds 85%"
+      
+      # Memory utilization
+      - alert: BZZZHighMemoryUsage
+        expr: bzzz_memory_usage_bytes / (1024*1024*1024) > 8
+        for: 3m
+        labels:
+          severity: warning
+          service: bzzz
+          component: system
+        annotations:
+          summary: "BZZZ memory usage is high"
+          description: "Memory usage {{ $value | humanize1024 }}B is high"
+      
+      # Disk utilization
+      - alert: BZZZHighDiskUsage
+        expr: bzzz_disk_usage_ratio > 0.90
+        for: 5m
+        labels:
+          severity: critical
+          service: bzzz
+          component: system
+        annotations:
+          summary: "BZZZ disk usage is critical"
+          description: "Disk usage {{ $value | humanizePercentage }} on {{ $labels.mount_point }} exceeds 90%"
+      
+      # Goroutine leak detection
+      - alert: BZZZGoroutineLeak
+        expr: increase(bzzz_goroutines[30m]) > 1000
+        for: 5m
+        labels:
+          severity: warning
+          service: bzzz
+          component: system
+        annotations:
+          summary: "Possible BZZZ goroutine leak"
+          description: "Goroutine count increased by {{ $value }} in 30 minutes"
+
+  # === Error Rate Monitoring ===
+  - name: bzzz_errors
+    rules:
+      # General error rate
+      - alert: BZZZHighErrorRate
+        expr: rate(bzzz_errors_total[5m]) > 10
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+        annotations:
+          summary: "BZZZ error rate is high"
+          description: "Error rate {{ $value }} errors/sec in component {{ $labels.component }}"
+      
+      # Panic detection
+      - alert: BZZZPanicsDetected
+        expr: increase(bzzz_panics_total[5m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          service: bzzz
+        annotations:
+          summary: "BZZZ panic detected"
+          description: "{{ $value }} panic(s) occurred in the last 5 minutes"
+          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-panic-recovery"
+
+  # === Health Check Monitoring ===
+  - name: bzzz_health_checks
+    rules:
+      # Health check failure rate
+      - alert: BZZZHealthCheckFailures
+        expr: rate(bzzz_health_checks_failed_total[5m]) > 0.1
+        for: 2m
+        labels:
+          severity: warning
+          service: bzzz
+          component: health
+        annotations:
+          summary: "BZZZ health check failures detected"
+          description: "Health check {{ $labels.check_name }} failing at {{ $value }} failures/sec"
+      
+      # Critical health check failure
+      - alert: BZZZCriticalHealthCheckFailed
+        expr: increase(bzzz_health_checks_failed_total{check_name=~".*-enhanced|p2p-connectivity"}[2m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          service: bzzz
+          component: health
+        annotations:
+          summary: "Critical BZZZ health check failed"
+          description: "Critical health check {{ $labels.check_name }} failed: {{ $labels.reason }}"
+
+  # === Service Level Indicator Recording Rules ===
+  - name: bzzz_sli_recording
+    interval: 30s
+    rules:
+      # DHT operation SLI
+      - record: bzzz:dht_success_rate
+        expr: rate(bzzz_dht_put_operations_total{status="success"}[5m]) + rate(bzzz_dht_get_operations_total{status="success"}[5m]) / rate(bzzz_dht_put_operations_total[5m]) + rate(bzzz_dht_get_operations_total[5m])
+      
+      # P2P connectivity SLI
+      - record: bzzz:p2p_connectivity_ratio
+        expr: bzzz_p2p_connected_peers / 10  # Target of 10 peers
+      
+      # UCXI success rate SLI
+      - record: bzzz:ucxi_success_rate
+        expr: rate(bzzz_ucxi_requests_total{status=~"2.."}[5m]) / rate(bzzz_ucxi_requests_total[5m])
+      
+      # Task success rate SLI
+      - record: bzzz:task_success_rate
+        expr: rate(bzzz_tasks_completed_total{status="success"}[5m]) / rate(bzzz_tasks_completed_total[5m])
+      
+      # Overall availability SLI
+      - record: bzzz:overall_availability
+        expr: bzzz_system_health_score
+
+  # === Multi-Window Multi-Burn-Rate Alerts ===
+  - name: bzzz_slo_alerts
+    rules:
+      # Fast burn rate (2% of error budget in 1 hour)
+      - alert: BZZZErrorBudgetBurnHigh
+        expr: (
+          (1 - bzzz:dht_success_rate) > (14.4 * 0.01)  # 14.4x burn rate for 99% SLO
+          and
+          (1 - bzzz:dht_success_rate) > (14.4 * 0.01)
+        )
+        for: 2m
+        labels:
+          severity: critical
+          service: bzzz
+          burnrate: fast
+          slo: dht_success_rate
+        annotations:
+          summary: "BZZZ DHT error budget burning fast"
+          description: "DHT error budget will be exhausted in {{ with query \"(0.01 - (1 - bzzz:dht_success_rate)) / (1 - bzzz:dht_success_rate) * 1\" }}{{ . | first | value | humanizeDuration }}{{ end }}"
+      
+      # Slow burn rate (10% of error budget in 6 hours)
+      - alert: BZZZErrorBudgetBurnSlow
+        expr: (
+          (1 - bzzz:dht_success_rate) > (6 * 0.01)  # 6x burn rate
+          and
+          (1 - bzzz:dht_success_rate) > (6 * 0.01)
+        )
+        for: 15m
+        labels:
+          severity: warning
+          service: bzzz
+          burnrate: slow
+          slo: dht_success_rate
+        annotations:
+          summary: "BZZZ DHT error budget burning slowly"
+          description: "DHT error budget depletion rate is concerning"
--- a/infrastructure/monitoring/docker-compose.enhanced.yml
+++ b/infrastructure/monitoring/docker-compose.enhanced.yml
@@ -0,0 +1,533 @@
+version: '3.8'
+
+# Enhanced BZZZ Monitoring Stack for Docker Swarm
+# Provides comprehensive observability for BZZZ distributed system
+
+services:
+  # Prometheus - Metrics Collection and Alerting
+  prometheus:
+    image: prom/prometheus:v2.45.0
+    networks:
+      - tengig
+      - monitoring
+    ports:
+      - "9090:9090"
+    volumes:
+      - prometheus_data:/prometheus
+      - /rust/bzzz-v2/monitoring/prometheus:/etc/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--storage.tsdb.retention.size=50GB'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--web.enable-lifecycle'
+      - '--web.enable-admin-api'
+      - '--web.external-url=https://prometheus.chorus.services'
+      - '--alertmanager.notification-queue-capacity=10000'
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == walnut  # Place on main node
+      resources:
+        limits:
+          memory: 4G
+          cpus: '2.0'
+        reservations:
+          memory: 2G
+          cpus: '1.0'
+      restart_policy:
+        condition: on-failure
+        delay: 30s
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)"
+        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
+        - "traefik.http.routers.prometheus.tls=true"
+    healthcheck:
+      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    configs:
+      - source: prometheus_config
+        target: /etc/prometheus/prometheus.yml
+      - source: prometheus_alerts
+        target: /etc/prometheus/rules.yml
+
+  # Grafana - Visualization and Dashboards
+  grafana:
+    image: grafana/grafana:10.0.3
+    networks:
+      - tengig
+      - monitoring
+    ports:
+      - "3000:3000"
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - /rust/bzzz-v2/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
+      - /rust/bzzz-v2/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
+    environment:
+      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
+      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,vonage-status-panel
+      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
+      - GF_SERVER_ROOT_URL=https://grafana.chorus.services
+      - GF_ANALYTICS_REPORTING_ENABLED=false
+      - GF_ANALYTICS_CHECK_FOR_UPDATES=false
+      - GF_LOG_LEVEL=warn
+    secrets:
+      - grafana_admin_password
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == walnut
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1.0'
+        reservations:
+          memory: 512M
+          cpus: '0.5'
+      restart_policy:
+        condition: on-failure
+        delay: 10s
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)"
+        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
+        - "traefik.http.routers.grafana.tls=true"
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # AlertManager - Alert Routing and Notification
+  alertmanager:
+    image: prom/alertmanager:v0.25.0
+    networks:
+      - tengig
+      - monitoring
+    ports:
+      - "9093:9093"
+    volumes:
+      - alertmanager_data:/alertmanager
+      - /rust/bzzz-v2/monitoring/alertmanager:/etc/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'
+      - '--web.external-url=https://alerts.chorus.services'
+      - '--web.route-prefix=/'
+      - '--cluster.listen-address=0.0.0.0:9094'
+      - '--log.level=info'
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ironwood
+      resources:
+        limits:
+          memory: 1G
+          cpus: '0.5'
+        reservations:
+          memory: 256M
+          cpus: '0.25'
+      restart_policy:
+        condition: on-failure
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.routers.alertmanager.rule=Host(`alerts.chorus.services`)"
+        - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
+        - "traefik.http.routers.alertmanager.tls=true"
+    configs:
+      - source: alertmanager_config
+        target: /etc/alertmanager/config.yml
+    secrets:
+      - slack_webhook_url
+      - pagerduty_integration_key
+
+  # Node Exporter - System Metrics (deployed on all nodes)
+  node-exporter:
+    image: prom/node-exporter:v1.6.1
+    networks:
+      - monitoring
+    ports:
+      - "9100:9100"
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - /run/systemd/private:/run/systemd/private:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+      - '--collector.systemd'
+      - '--collector.systemd.unit-include=(bzzz|docker|prometheus|grafana)\.service'
+      - '--web.listen-address=0.0.0.0:9100'
+    deploy:
+      mode: global  # Deploy on every node
+      resources:
+        limits:
+          memory: 256M
+          cpus: '0.2'
+        reservations:
+          memory: 128M
+          cpus: '0.1'
+      restart_policy:
+        condition: on-failure
+
+  # cAdvisor - Container Metrics (deployed on all nodes)
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:v0.47.2
+    networks:
+      - monitoring
+    ports:
+      - "8080:8080"
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    deploy:
+      mode: global
+      resources:
+        limits:
+          memory: 512M
+          cpus: '0.3'
+        reservations:
+          memory: 256M
+          cpus: '0.15'
+      restart_policy:
+        condition: on-failure
+    healthcheck:
+      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # BZZZ P2P Network Exporter - Custom metrics for P2P network health
+  bzzz-p2p-exporter:
+    image: registry.home.deepblack.cloud/bzzz-p2p-exporter:v2.0.0
+    networks:
+      - monitoring
+      - bzzz-internal
+    ports:
+      - "9200:9200"
+    environment:
+      - BZZZ_ENDPOINTS=http://bzzz-agent:9000
+      - SCRAPE_INTERVAL=15s
+      - LOG_LEVEL=info
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == walnut
+      resources:
+        limits:
+          memory: 256M
+          cpus: '0.2'
+        reservations:
+          memory: 128M
+          cpus: '0.1'
+      restart_policy:
+        condition: on-failure
+
+  # DHT Monitor - DHT-specific metrics and health monitoring
+  dht-monitor:
+    image: registry.home.deepblack.cloud/bzzz-dht-monitor:v2.0.0
+    networks:
+      - monitoring
+      - bzzz-internal
+    ports:
+      - "9201:9201"
+    environment:
+      - DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
+      - REPLICATION_CHECK_INTERVAL=5m
+      - PROVIDER_CHECK_INTERVAL=2m
+      - LOG_LEVEL=info
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ironwood
+      resources:
+        limits:
+          memory: 512M
+          cpus: '0.3'
+        reservations:
+          memory: 256M
+          cpus: '0.15'
+      restart_policy:
+        condition: on-failure
+
+  # Content Monitor - Content availability and integrity monitoring
+  content-monitor:
+    image: registry.home.deepblack.cloud/bzzz-content-monitor:v2.0.0
+    networks:
+      - monitoring
+      - bzzz-internal
+    ports:
+      - "9202:9202"
+    volumes:
+      - /rust/bzzz-v2/data/blobs:/app/blobs:ro
+    environment:
+      - CONTENT_PATH=/app/blobs
+      - INTEGRITY_CHECK_INTERVAL=15m
+      - AVAILABILITY_CHECK_INTERVAL=5m
+      - LOG_LEVEL=info
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == acacia
+      resources:
+        limits:
+          memory: 512M
+          cpus: '0.3'
+        reservations:
+          memory: 256M
+          cpus: '0.15'
+      restart_policy:
+        condition: on-failure
+
+  # OpenAI Cost Monitor - Track OpenAI API usage and costs
+  openai-cost-monitor:
+    image: registry.home.deepblack.cloud/bzzz-openai-cost-monitor:v2.0.0
+    networks:
+      - monitoring
+      - bzzz-internal
+    ports:
+      - "9203:9203"
+    environment:
+      - OPENAI_PROXY_ENDPOINT=http://openai-proxy:3002
+      - COST_TRACKING_ENABLED=true
+      - POSTGRES_HOST=postgres
+      - LOG_LEVEL=info
+    secrets:
+      - postgres_password
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == walnut
+      resources:
+        limits:
+          memory: 256M
+          cpus: '0.2'
+        reservations:
+          memory: 128M
+          cpus: '0.1'
+      restart_policy:
+        condition: on-failure
+
+  # Blackbox Exporter - External endpoint monitoring
+  blackbox-exporter:
+    image: prom/blackbox-exporter:v0.24.0
+    networks:
+      - monitoring
+      - tengig
+    ports:
+      - "9115:9115"
+    volumes:
+      - /rust/bzzz-v2/monitoring/blackbox:/etc/blackbox_exporter
+    command:
+      - '--config.file=/etc/blackbox_exporter/config.yml'
+      - '--web.listen-address=0.0.0.0:9115'
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ironwood
+      resources:
+        limits:
+          memory: 128M
+          cpus: '0.1'
+        reservations:
+          memory: 64M
+          cpus: '0.05'
+      restart_policy:
+        condition: on-failure
+    configs:
+      - source: blackbox_config
+        target: /etc/blackbox_exporter/config.yml
+
+  # Loki - Log Aggregation
+  loki:
+    image: grafana/loki:2.8.0
+    networks:
+      - monitoring
+    ports:
+      - "3100:3100"
+    volumes:
+      - loki_data:/loki
+      - /rust/bzzz-v2/monitoring/loki:/etc/loki
+    command:
+      - '-config.file=/etc/loki/config.yml'
+      - '-target=all'
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == walnut
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1.0'
+        reservations:
+          memory: 1G
+          cpus: '0.5'
+      restart_policy:
+        condition: on-failure
+    configs:
+      - source: loki_config
+        target: /etc/loki/config.yml
+
+  # Promtail - Log Collection Agent (deployed on all nodes)
+  promtail:
+    image: grafana/promtail:2.8.0
+    networks:
+      - monitoring
+    volumes:
+      - /var/log:/var/log:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      - /rust/bzzz-v2/monitoring/promtail:/etc/promtail
+    command:
+      - '-config.file=/etc/promtail/config.yml'
+      - '-server.http-listen-port=9080'
+    deploy:
+      mode: global
+      resources:
+        limits:
+          memory: 256M
+          cpus: '0.2'
+        reservations:
+          memory: 128M
+          cpus: '0.1'
+      restart_policy:
+        condition: on-failure
+    configs:
+      - source: promtail_config
+        target: /etc/promtail/config.yml
+
+  # Jaeger - Distributed Tracing (Optional)
+  jaeger:
+    image: jaegertracing/all-in-one:1.47
+    networks:
+      - monitoring
+      - bzzz-internal
+    ports:
+      - "14268:14268"  # HTTP collector
+      - "16686:16686"  # Web UI
+    environment:
+      - COLLECTOR_OTLP_ENABLED=true
+      - SPAN_STORAGE_TYPE=memory
+    deploy:
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == acacia
+      resources:
+        limits:
+          memory: 1G
+          cpus: '0.5'
+        reservations:
+          memory: 512M
+          cpus: '0.25'
+      restart_policy:
+        condition: on-failure
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.routers.jaeger.rule=Host(`tracing.chorus.services`)"
+        - "traefik.http.services.jaeger.loadbalancer.server.port=16686"
+        - "traefik.http.routers.jaeger.tls=true"
+
+networks:
+  tengig:
+    external: true
+  monitoring:
+    driver: overlay
+    internal: true
+    attachable: false
+    ipam:
+      driver: default
+      config:
+        - subnet: 10.201.0.0/16
+  bzzz-internal:
+    external: true
+
+volumes:
+  prometheus_data:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=192.168.1.27,rw,sync
+      device: ":/rust/bzzz-v2/monitoring/prometheus/data"
+
+  grafana_data:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=192.168.1.27,rw,sync
+      device: ":/rust/bzzz-v2/monitoring/grafana/data"
+
+  alertmanager_data:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=192.168.1.27,rw,sync
+      device: ":/rust/bzzz-v2/monitoring/alertmanager/data"
+
+  loki_data:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=192.168.1.27,rw,sync
+      device: ":/rust/bzzz-v2/monitoring/loki/data"
+
+secrets:
+  grafana_admin_password:
+    external: true
+    name: bzzz_grafana_admin_password
+  
+  slack_webhook_url:
+    external: true
+    name: bzzz_slack_webhook_url
+    
+  pagerduty_integration_key:
+    external: true
+    name: bzzz_pagerduty_integration_key
+    
+  postgres_password:
+    external: true
+    name: bzzz_postgres_password
+
+configs:
+  prometheus_config:
+    external: true
+    name: bzzz_prometheus_config_v2
+    
+  prometheus_alerts:
+    external: true
+    name: bzzz_prometheus_alerts_v2
+    
+  alertmanager_config:
+    external: true
+    name: bzzz_alertmanager_config_v2
+    
+  blackbox_config:
+    external: true
+    name: bzzz_blackbox_config_v2
+    
+  loki_config:
+    external: true
+    name: bzzz_loki_config_v2
+    
+  promtail_config:
+    external: true
+    name: bzzz_promtail_config_v2