Prepare for v2 development: Add MCP integration and future development planning

- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification
- Add MCP integration design and implementation foundation
- Add infrastructure and deployment configurations
- Update system architecture for v2 evolution

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
anthonyrawlins committed 2025-08-07 14:38:22 +10:00
parent 5f94288fbb
commit 065dddf8d5
41 changed files with 14970 additions and 161 deletions

View File

@@ -0,0 +1,339 @@
# BZZZ v2 Prometheus Alert Rules
groups:
# P2P Network Health Rules
- name: p2p-network
rules:
- alert: P2PNetworkPartition
expr: bzzz_p2p_connected_peers < 2
for: 5m
labels:
severity: critical
component: p2p
annotations:
summary: "P2P network partition detected"
description: "Node {{ $labels.instance }} has less than 2 peers connected for more than 5 minutes"
- alert: P2PHighLatency
expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_p2p_message_duration_seconds_bucket[5m]))) > 5
for: 2m
labels:
severity: warning
component: p2p
annotations:
summary: "High P2P message latency"
description: "95th percentile P2P message latency is {{ $value }}s on {{ $labels.instance }}"
- alert: P2PMessageDropRate
expr: rate(bzzz_p2p_messages_dropped_total[5m]) > 0.1
for: 2m
labels:
severity: warning
component: p2p
annotations:
summary: "High P2P message drop rate"
description: "P2P message drop rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
# DHT Network Rules
- name: dht-network
rules:
- alert: DHTBootstrapNodeDown
expr: up{job="dht-bootstrap"} == 0
for: 1m
labels:
severity: critical
component: dht
annotations:
summary: "DHT bootstrap node is down"
description: "DHT bootstrap node {{ $labels.instance }} has been down for more than 1 minute"
- alert: DHTRoutingTableSize
expr: bzzz_dht_routing_table_size < 10
for: 5m
labels:
severity: warning
component: dht
annotations:
summary: "DHT routing table is small"
description: "DHT routing table size is {{ $value }} on {{ $labels.instance }}, indicating poor network connectivity"
- alert: DHTLookupFailureRate
expr: rate(bzzz_dht_lookup_failures_total[5m]) / rate(bzzz_dht_lookups_total[5m]) > 0.2
for: 2m
labels:
severity: warning
component: dht
annotations:
summary: "High DHT lookup failure rate"
description: "DHT lookup failure rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
# Content Store Rules
- name: content-store
rules:
- alert: ContentStoreDiskUsage
expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 85
for: 5m
labels:
severity: warning
component: content-store
disk_usage: "{{ $value | humanize }}"
annotations:
summary: "Content store disk usage is high"
description: "Content store disk usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
- alert: ContentStoreDiskFull
expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 95
for: 1m
labels:
severity: critical
component: content-store
disk_usage: "{{ $value | humanize }}"
annotations:
summary: "Content store disk is nearly full"
description: "Content store disk usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
- alert: ContentReplicationFailed
expr: increase(bzzz_content_replication_failures_total[10m]) > 5
for: 5m
labels:
severity: warning
component: content-store
annotations:
summary: "Content replication failures detected"
description: "{{ $value }} content replication failures in the last 10 minutes on {{ $labels.instance }}"
- alert: BLAKE3HashCollision
expr: increase(bzzz_blake3_hash_collisions_total[1h]) > 0
for: 0m
labels:
severity: critical
component: content-store
annotations:
summary: "BLAKE3 hash collision detected"
description: "BLAKE3 hash collision detected on {{ $labels.instance }} - immediate investigation required"
# OpenAI Integration Rules
- name: openai-integration
rules:
- alert: OpenAIHighCost
expr: bzzz_openai_cost_daily_usd > 100
for: 0m
labels:
severity: warning
component: openai-cost
current_cost: "{{ $value }}"
cost_threshold: "100"
cost_period: "daily"
annotations:
summary: "OpenAI daily cost exceeds threshold"
description: "Daily OpenAI cost is ${{ $value }}, exceeding the $100 threshold"
- alert: OpenAICriticalCost
expr: bzzz_openai_cost_daily_usd > 500
for: 0m
labels:
severity: critical
component: openai-cost
current_cost: "{{ $value }}"
cost_threshold: "500"
cost_period: "daily"
annotations:
summary: "OpenAI daily cost critically high"
description: "Daily OpenAI cost is ${{ $value }}, which is critically high - consider rate limiting"
- alert: OpenAIRateLimitHit
expr: increase(bzzz_openai_rate_limit_hits_total[5m]) > 10
for: 1m
labels:
severity: warning
component: openai-cost
annotations:
summary: "OpenAI rate limit frequently hit"
description: "OpenAI rate limit hit {{ $value }} times in the last 5 minutes"
- alert: OpenAIProxyDown
expr: up{job="openai-proxy"} == 0
for: 2m
labels:
severity: critical
component: service-health
annotations:
summary: "OpenAI proxy is down"
description: "OpenAI proxy service is down on {{ $labels.instance }}"
# MCP Server Rules
- name: mcp-server
rules:
- alert: MCPServerDown
expr: up{job="mcp-server"} == 0
for: 2m
labels:
severity: critical
component: service-health
annotations:
summary: "MCP server is down"
description: "MCP server is down on {{ $labels.instance }}"
- alert: MCPHighResponseTime
expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_mcp_request_duration_seconds_bucket[5m]))) > 10
for: 5m
labels:
severity: warning
component: service-health
annotations:
summary: "MCP server high response time"
description: "95th percentile MCP response time is {{ $value }}s on {{ $labels.instance }}"
- alert: MCPConnectionLimit
expr: bzzz_mcp_active_connections / bzzz_mcp_max_connections > 0.8
for: 2m
labels:
severity: warning
component: service-health
annotations:
summary: "MCP server connection limit approaching"
description: "MCP server connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
# Conversation Threading Rules
- name: conversation-threading
rules:
- alert: ConversationThreadLag
expr: bzzz_conversation_lamport_clock_lag_seconds > 30
for: 2m
labels:
severity: warning
component: conversation
annotations:
summary: "Conversation thread lag detected"
description: "Lamport clock lag is {{ $value }}s on {{ $labels.instance }}, indicating thread synchronization issues"
- alert: ConversationStorageFailure
expr: increase(bzzz_conversation_storage_failures_total[5m]) > 3
for: 1m
labels:
severity: critical
component: conversation
annotations:
summary: "Conversation storage failures"
description: "{{ $value }} conversation storage failures in the last 5 minutes on {{ $labels.instance }}"
# System Resource Rules
- name: system-resources
rules:
- alert: NodeDown
expr: up{job="node-exporter"} == 0
for: 1m
labels:
severity: critical
component: system
annotations:
summary: "Node is down"
description: "Node {{ $labels.instance }} has been down for more than 1 minute"
- alert: HighCPUUsage
expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
for: 5m
labels:
severity: warning
component: resources
resource_type: "cpu"
usage_percent: "{{ $value | humanize }}"
threshold: "80"
annotations:
summary: "High CPU usage"
description: "CPU usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
component: resources
resource_type: "memory"
usage_percent: "{{ $value | humanize }}"
threshold: "85"
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
- alert: DiskSpaceLow
expr: (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 < 15
for: 5m
labels:
severity: warning
component: resources
resource_type: "disk"
usage_percent: "{{ 100 - $value | humanize }}"
threshold: "85"
annotations:
summary: "Low disk space"
description: "Disk space is {{ 100 - $value | humanizePercentage }} full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
# Database Rules
- name: database
rules:
- alert: PostgreSQLDown
expr: up{job="postgres"} == 0
for: 1m
labels:
severity: critical
component: service-health
annotations:
summary: "PostgreSQL is down"
description: "PostgreSQL database is down on {{ $labels.instance }}"
- alert: PostgreSQLHighConnections
expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections > 0.8
for: 2m
labels:
severity: warning
component: service-health
annotations:
summary: "PostgreSQL connection limit approaching"
description: "PostgreSQL connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
- alert: RedisDown
expr: up{job="redis"} == 0
for: 1m
labels:
severity: critical
component: service-health
annotations:
summary: "Redis is down"
description: "Redis cache is down on {{ $labels.instance }}"
# Security Rules
- name: security
rules:
- alert: UnauthorizedP2PConnection
expr: increase(bzzz_p2p_unauthorized_connections_total[5m]) > 5
for: 1m
labels:
severity: warning
component: security
security_type: "unauthorized_connection"
annotations:
summary: "Unauthorized P2P connection attempts"
description: "{{ $value }} unauthorized P2P connection attempts in the last 5 minutes on {{ $labels.instance }}"
- alert: SuspiciousContentRequest
expr: increase(bzzz_content_suspicious_requests_total[5m]) > 10
for: 2m
labels:
severity: warning
component: security
security_type: "suspicious_content"
annotations:
summary: "Suspicious content requests detected"
description: "{{ $value }} suspicious content requests in the last 5 minutes on {{ $labels.instance }}"
- alert: FailedAuthentication
expr: increase(bzzz_auth_failures_total[5m]) > 20
for: 1m
labels:
severity: warning
component: security
security_type: "authentication_failure"
annotations:
summary: "High authentication failure rate"
description: "{{ $value }} authentication failures in the last 5 minutes on {{ $labels.instance }}"

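The thresholds above can be exercised before deployment with `promtool test rules`. A minimal unit-test sketch for the P2PNetworkPartition rule, assuming the groups above are saved as `rules.yml`; the test file name and instance label are illustrative:

```yaml
# bzzz-rules_test.yml (illustrative) -- run with: promtool test rules bzzz-rules_test.yml
rule_files:
  - rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A node stuck at a single connected peer for 10 minutes
      - series: 'bzzz_p2p_connected_peers{instance="walnut:9000"}'
        values: '1x10'
    alert_rule_test:
      - eval_time: 10m
        alertname: P2PNetworkPartition
        exp_alerts:
          - exp_labels:
              severity: critical
              component: p2p
              instance: walnut:9000
            exp_annotations:
              summary: "P2P network partition detected"
              description: "Node walnut:9000 has fewer than 2 peers connected for more than 5 minutes"
```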
View File

@@ -0,0 +1,255 @@
# AlertManager Configuration for BZZZ v2
global:
smtp_smarthost: 'localhost:587'
smtp_from: 'alerts@deepblack.cloud'
smtp_require_tls: true
resolve_timeout: 5m
# Template files
templates:
- '/etc/alertmanager/templates/*.tmpl'
# Route configuration
route:
group_by: ['cluster', 'alertname', 'service']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'default'
routes:
# Critical P2P network issues
- match:
severity: critical
component: p2p
receiver: 'p2p-critical'
group_wait: 10s
repeat_interval: 5m
# DHT network issues
- match:
component: dht
receiver: 'dht-alerts'
group_wait: 1m
repeat_interval: 30m
# Content store issues
- match:
component: content-store
receiver: 'storage-alerts'
group_wait: 2m
repeat_interval: 1h
# OpenAI cost alerts
- match:
component: openai-cost
receiver: 'cost-alerts'
group_wait: 5m
repeat_interval: 6h
# Service health alerts
- match:
component: service-health
receiver: 'service-alerts'
group_wait: 1m
repeat_interval: 15m
# Resource exhaustion
- match:
severity: warning
component: resources
receiver: 'resource-alerts'
group_wait: 5m
repeat_interval: 2h
# Security alerts
- match:
component: security
receiver: 'security-alerts'
group_wait: 30s
repeat_interval: 1h
# Inhibition rules
inhibit_rules:
# Silence warning if critical alert is firing
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['cluster', 'service', 'instance']
# Silence service alerts if node is down
- source_match:
alertname: 'NodeDown'
target_match:
component: 'service-health'
equal: ['instance']
# Receiver configurations
receivers:
# Default receiver
- name: 'default'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-monitoring'
title: 'BZZZ v2 Alert'
text: |
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
*Instance:* {{ .Labels.instance }}
*Service:* {{ .Labels.service }}
{{ end }}
send_resolved: true
# Critical P2P network alerts
- name: 'p2p-critical'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-critical'
title: '🚨 CRITICAL P2P Network Issue'
text: |
{{ range .Alerts }}
*CRITICAL P2P ALERT*
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Node:* {{ .Labels.instance }}
*Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
*Immediate Action Required*
{{ end }}
send_resolved: true
pagerduty_configs:
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
# DHT network alerts
- name: 'dht-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-dht'
title: '🔗 DHT Network Alert'
text: |
{{ range .Alerts }}
*DHT Network Issue*
*Alert:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Bootstrap Node:* {{ .Labels.instance }}
*Peers Connected:* {{ .Labels.peer_count | default "unknown" }}
{{ end }}
send_resolved: true
# Storage alerts
- name: 'storage-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-storage'
title: '💾 Content Store Alert'
text: |
{{ range .Alerts }}
*Storage Alert*
*Issue:* {{ .Annotations.summary }}
*Details:* {{ .Annotations.description }}
*Node:* {{ .Labels.instance }}
*Usage:* {{ .Labels.disk_usage | default "unknown" }}%
{{ end }}
send_resolved: true
# OpenAI cost alerts
- name: 'cost-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-costs'
title: '💰 OpenAI Cost Alert'
text: |
{{ range .Alerts }}
*Cost Alert*
*Alert:* {{ .Annotations.summary }}
*Current Cost:* ${{ .Labels.current_cost | default "unknown" }}
*Threshold:* ${{ .Labels.cost_threshold | default "unknown" }}
*Period:* {{ .Labels.cost_period | default "daily" }}
*Action:* {{ .Annotations.description }}
{{ end }}
send_resolved: true
email_configs:
- to: 'finance@deepblack.cloud'
subject: 'BZZZ v2 OpenAI Cost Alert'
body: |
OpenAI usage has exceeded cost thresholds.
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Current Cost: ${{ .Labels.current_cost }}
Threshold: ${{ .Labels.cost_threshold }}
{{ end }}
# Service health alerts
- name: 'service-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-services'
title: '🔧 Service Health Alert'
text: |
{{ range .Alerts }}
*Service Health Issue*
*Service:* {{ .Labels.service }}
*Alert:* {{ .Annotations.summary }}
*Node:* {{ .Labels.instance }}
*Status:* {{ .Labels.status | default "unknown" }}
*Description:* {{ .Annotations.description }}
{{ end }}
send_resolved: true
# Resource alerts
- name: 'resource-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-resources'
title: '⚡ Resource Alert'
text: |
{{ range .Alerts }}
*Resource Warning*
*Resource:* {{ .Labels.resource_type | default "unknown" }}
*Node:* {{ .Labels.instance }}
*Alert:* {{ .Annotations.summary }}
*Current Usage:* {{ .Labels.usage_percent | default "unknown" }}%
*Threshold:* {{ .Labels.threshold | default "unknown" }}%
{{ end }}
send_resolved: true
# Security alerts
- name: 'security-alerts'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#bzzz-security'
title: '🔒 Security Alert'
text: |
{{ range .Alerts }}
*SECURITY ALERT*
*Type:* {{ .Labels.security_type | default "unknown" }}
*Alert:* {{ .Annotations.summary }}
*Source:* {{ .Labels.instance }}
*Details:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
{{ end }}
send_resolved: true
email_configs:
- to: 'security@deepblack.cloud'
subject: 'BZZZ v2 Security Alert'
body: |
Security alert triggered in BZZZ v2 cluster.
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Severity: {{ .Labels.severity }}
Source: {{ .Labels.instance }}
Details: {{ .Annotations.description }}
{{ end }}

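The Slack webhook URLs above are placeholders. Rather than baking real webhook URLs into the config, AlertManager's slack_config also accepts api_url_file, so the secret can be mounted separately (a minimal sketch; the secrets path is hypothetical):

```yaml
receivers:
  - name: 'default'
    slack_configs:
      - api_url_file: /run/secrets/slack_webhook_url  # webhook mounted as a Docker secret instead of inlined
        channel: '#bzzz-monitoring'
        send_resolved: true
```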
View File

@@ -0,0 +1,216 @@
# Prometheus Configuration for BZZZ v2 Monitoring
global:
scrape_interval: 30s
scrape_timeout: 10s
evaluation_interval: 30s
external_labels:
cluster: 'deepblack-cloud'
environment: 'production'
rule_files:
- "/etc/prometheus/rules.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
metrics_path: /metrics
scrape_interval: 15s
# System metrics from node exporters
- job_name: 'node-exporter'
static_configs:
- targets:
- 'walnut:9100'
- 'ironwood:9100'
- 'acacia:9100'
metrics_path: /metrics
scrape_interval: 15s
# Container metrics from cAdvisor
- job_name: 'cadvisor'
static_configs:
- targets:
- 'walnut:8080'
- 'ironwood:8080'
- 'acacia:8080'
metrics_path: /metrics
scrape_interval: 30s
# BZZZ v2 Application Services
- job_name: 'bzzz-agent'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 9000
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: __tmp_service_name
- source_labels: [__tmp_service_name]
regex: bzzz-v2_bzzz-agent
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_node_id]
target_label: node_id
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 15s
# MCP Server Metrics
- job_name: 'mcp-server'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 3001
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_mcp-server
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
# OpenAI Proxy Metrics
- job_name: 'openai-proxy'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 3002
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_openai-proxy
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
# Content Resolver Metrics
- job_name: 'content-resolver'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 3003
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_content-resolver
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
# DHT Bootstrap Nodes
- job_name: 'dht-bootstrap'
static_configs:
- targets:
- 'walnut:9101'
- 'ironwood:9102'
- 'acacia:9103'
labels:
service: 'dht-bootstrap'
metrics_path: /metrics
scrape_interval: 15s
# P2P Network Metrics
- job_name: 'bzzz-p2p-exporter'
static_configs:
- targets: ['bzzz-p2p-exporter:9200']
metrics_path: /metrics
scrape_interval: 30s
# DHT Network Monitoring
- job_name: 'dht-monitor'
static_configs:
- targets: ['dht-monitor:9201']
metrics_path: /metrics
scrape_interval: 60s
# Content Store Monitoring
- job_name: 'content-monitor'
static_configs:
- targets: ['content-monitor:9202']
metrics_path: /metrics
scrape_interval: 300s # 5 minutes for storage checks
# OpenAI Cost Monitoring
- job_name: 'openai-cost-monitor'
static_configs:
- targets: ['openai-cost-monitor:9203']
metrics_path: /metrics
scrape_interval: 60s
# Database Metrics (PostgreSQL; note: requires a postgres_exporter, as PostgreSQL does not expose Prometheus metrics directly)
- job_name: 'postgres'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 5432
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_postgres
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
params:
dbname: [bzzz_v2]
# Cache Metrics (Redis; note: requires a redis_exporter, as Redis does not expose Prometheus metrics directly)
- job_name: 'redis'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 6379
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_redis
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
# Traefik Load Balancer Metrics
- job_name: 'traefik'
static_configs:
- targets: ['traefik:8080']
metrics_path: /metrics
scrape_interval: 30s
# Conversation Management Metrics
- job_name: 'conversation-manager'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 8090
relabel_configs:
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
regex: bzzz-v2_conversation-manager
action: keep
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
target_label: service
metrics_path: /metrics
scrape_interval: 30s
# External Service Monitoring (Webhook endpoints)
- job_name: 'external-health'
static_configs:
- targets:
- 'bzzz.deepblack.cloud'
- 'mcp.deepblack.cloud'
- 'resolve.deepblack.cloud'
- 'openai.deepblack.cloud'
metrics_path: /health
scheme: https  # these hosts are published through Traefik with TLS enabled
scrape_interval: 60s
scrape_timeout: 10s
# Remote write configuration for long-term storage (optional)
# remote_write:
# - url: "https://prometheus-remote-write.example.com/api/v1/write"
# basic_auth:
# username: "bzzz-cluster"
# password_file: "/etc/prometheus/remote-write-password"

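The rule_files entry above currently loads the alert rules; recording rules can live alongside them. A sketch of a recording group that pre-computes the p95 latencies the alerts query, assuming the services expose standard *_bucket histogram series:

```yaml
groups:
  - name: bzzz-latency-recording
    rules:
      # 95th percentile P2P message latency per instance
      - record: bzzz:p2p_message_duration_seconds:p95
        expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_p2p_message_duration_seconds_bucket[5m])))
      # 95th percentile MCP request latency per instance
      - record: bzzz:mcp_request_duration_seconds:p95
        expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_mcp_request_duration_seconds_bucket[5m])))
```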
View File

@@ -0,0 +1,372 @@
version: '3.8'
services:
# Prometheus for metrics collection
prometheus:
image: prom/prometheus:v2.48.0
networks:
- tengig
- monitoring
ports:
- "9090:9090"
volumes:
- /rust/bzzz-v2/config/prometheus:/etc/prometheus:ro
- /rust/bzzz-v2/data/prometheus:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--storage.tsdb.retention.size=50GB'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
- '--web.external-url=https://prometheus.deepblack.cloud'
configs:
- source: prometheus_config
target: /etc/prometheus/prometheus.yml
- source: prometheus_rules
target: /etc/prometheus/rules.yml
deploy:
replicas: 1
placement:
constraints:
- node.hostname == walnut
resources:
limits:
memory: 4G
cpus: '2.0'
reservations:
memory: 2G
cpus: '1.0'
labels:
- "traefik.enable=true"
- "traefik.http.routers.prometheus.rule=Host(`prometheus.deepblack.cloud`)"
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.prometheus.tls=true"
# Grafana for visualization
grafana:
image: grafana/grafana:10.2.0
networks:
- tengig
- monitoring
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
- GF_SERVER_ROOT_URL=https://grafana.deepblack.cloud
- GF_SERVER_DOMAIN=grafana.deepblack.cloud
- GF_ANALYTICS_REPORTING_ENABLED=false
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel
volumes:
- /rust/bzzz-v2/data/grafana:/var/lib/grafana
- /rust/bzzz-v2/config/grafana/provisioning:/etc/grafana/provisioning:ro
secrets:
- grafana_admin_password
deploy:
replicas: 1
placement:
constraints:
- node.hostname == walnut
resources:
limits:
memory: 2G
cpus: '1.0'
reservations:
memory: 1G
cpus: '0.5'
labels:
- "traefik.enable=true"
- "traefik.http.routers.grafana.rule=Host(`grafana.deepblack.cloud`)"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.grafana.tls=true"
# AlertManager for alerting
alertmanager:
image: prom/alertmanager:v0.26.0
networks:
- tengig
- monitoring
ports:
- "9093:9093"
volumes:
- /rust/bzzz-v2/data/alertmanager:/alertmanager
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=https://alerts.deepblack.cloud'
configs:
- source: alertmanager_config
target: /etc/alertmanager/config.yml
deploy:
replicas: 1
placement:
constraints:
- node.hostname == ironwood
resources:
limits:
memory: 1G
cpus: '0.5'
reservations:
memory: 512M
cpus: '0.25'
labels:
- "traefik.enable=true"
- "traefik.http.routers.alertmanager.rule=Host(`alerts.deepblack.cloud`)"
- "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
- "traefik.http.routers.alertmanager.tls=true"
# Node Exporter for system metrics
node-exporter:
image: prom/node-exporter:v1.6.1
networks:
- monitoring
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- /etc/hostname:/etc/nodename:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
- '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.5'
reservations:
memory: 128M
cpus: '0.25'
# cAdvisor for container metrics
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
networks:
- monitoring
ports:
- "8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
command:
- '--housekeeping_interval=10s'
- '--docker_only=true'
- '--disable_metrics=percpu,process,sched,tcp,udp,disk,diskIO,accelerator,hugetlb,referenced_memory,cpu_topology,resctrl'
deploy:
mode: global
resources:
limits:
memory: 512M
cpus: '0.5'
reservations:
memory: 256M
cpus: '0.25'
# BZZZ P2P Metrics Exporter
bzzz-p2p-exporter:
image: registry.home.deepblack.cloud/bzzz/p2p-exporter:v2.0.0
networks:
- monitoring
- bzzz-internal
ports:
- "9200:9200"
environment:
- BZZZ_AGENT_ENDPOINTS=http://bzzz-v2_bzzz-agent:9000
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
- METRICS_PORT=9200
- SCRAPE_INTERVAL=30s
deploy:
replicas: 1
placement:
constraints:
- node.hostname == acacia
resources:
limits:
memory: 512M
cpus: '0.5'
# DHT Network Monitor
dht-monitor:
image: registry.home.deepblack.cloud/bzzz/dht-monitor:v2.0.0
networks:
- monitoring
- bzzz-internal
ports:
- "9201:9201"
environment:
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
- MONITOR_PORT=9201
- PEER_CHECK_INTERVAL=60s
deploy:
replicas: 1
resources:
limits:
memory: 256M
cpus: '0.25'
# Content Store Monitor
content-monitor:
image: registry.home.deepblack.cloud/bzzz/content-monitor:v2.0.0
networks:
- monitoring
- bzzz-internal
ports:
- "9202:9202"
environment:
- CONTENT_STORE_PATH=/rust/bzzz-v2/data/blobs
- MONITOR_PORT=9202
- CHECK_INTERVAL=300s
volumes:
- /rust/bzzz-v2/data/blobs:/data/blobs:ro
deploy:
replicas: 1
resources:
limits:
memory: 256M
cpus: '0.25'
# OpenAI Cost Monitor
openai-cost-monitor:
image: registry.home.deepblack.cloud/bzzz/openai-cost-monitor:v2.0.0
networks:
- monitoring
- bzzz-internal
ports:
- "9203:9203"
environment:
- POSTGRES_HOST=bzzz-v2_postgres
- POSTGRES_DB=bzzz_v2
- POSTGRES_USER=bzzz
- MONITOR_PORT=9203
- COST_ALERT_THRESHOLD=100.00
secrets:
- postgres_password
deploy:
replicas: 1
resources:
limits:
memory: 256M
cpus: '0.25'
# Log aggregation with Loki
loki:
image: grafana/loki:2.9.0
networks:
- monitoring
ports:
- "3100:3100"
volumes:
- /rust/bzzz-v2/data/loki:/loki
command: -config.file=/etc/loki/local-config.yaml
configs:
- source: loki_config
target: /etc/loki/local-config.yaml
deploy:
replicas: 1
placement:
constraints:
- node.hostname == acacia
resources:
limits:
memory: 2G
cpus: '1.0'
reservations:
memory: 1G
cpus: '0.5'
# Promtail for log shipping
promtail:
image: grafana/promtail:2.9.0
networks:
- monitoring
volumes:
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /rust/bzzz-v2/logs:/app/logs:ro
command: -config.file=/etc/promtail/config.yml
configs:
- source: promtail_config
target: /etc/promtail/config.yml
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.25'
# Jaeger for distributed tracing
jaeger:
image: jaegertracing/all-in-one:1.49
networks:
- tengig
- monitoring
ports:
- "16686:16686"
- "14268:14268"
environment:
- COLLECTOR_OTLP_ENABLED=true
- SPAN_STORAGE_TYPE=badger
- BADGER_EPHEMERAL=false
- BADGER_DIRECTORY_VALUE=/badger/data
- BADGER_DIRECTORY_KEY=/badger/key
volumes:
- /rust/bzzz-v2/data/jaeger:/badger
deploy:
replicas: 1
placement:
constraints:
- node.hostname == ironwood
resources:
limits:
memory: 1G
cpus: '0.5'
labels:
- "traefik.enable=true"
- "traefik.http.routers.jaeger.rule=Host(`tracing.deepblack.cloud`)"
- "traefik.http.services.jaeger.loadbalancer.server.port=16686"
- "traefik.http.routers.jaeger.tls=true"
networks:
tengig:
external: true
monitoring:
driver: overlay
attachable: true
bzzz-internal:
external: true
secrets:
grafana_admin_password:
external: true
name: bzzz_grafana_admin_password
postgres_password:
external: true
name: bzzz_postgres_password
configs:
prometheus_config:
external: true
name: bzzz_prometheus_config
prometheus_rules:
external: true
name: bzzz_prometheus_rules
alertmanager_config:
external: true
name: bzzz_alertmanager_config
loki_config:
external: true
name: bzzz_loki_config
promtail_config:
external: true
name: bzzz_promtail_config
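The Grafana service mounts /rust/bzzz-v2/config/grafana/provisioning, but the provisioning files themselves are not part of this commit excerpt. A minimal datasource provisioning sketch that wires Grafana to the Prometheus and Loki services in this stack; the file name is illustrative and the URLs assume the overlay-network service names above:

```yaml
# provisioning/datasources/datasources.yml (illustrative)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
```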