# bzzz/infrastructure/monitoring/docker-compose.enhanced.yml

version: '3.8'

# Enhanced BZZZ Monitoring Stack for Docker Swarm
# Provides comprehensive observability for BZZZ distributed system

services:
  # Prometheus - metrics collection and alerting.
  # Single replica pinned to walnut. TSDB data is kept in the prometheus_data
  # volume; prometheus.yml and rules.yml are supplied as Swarm configs.
  prometheus:
    image: prom/prometheus:v2.45.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9090:9090"
    volumes:
      - prometheus_data:/prometheus
      # Host directory mounted at /etc/prometheus; the configs at the bottom
      # of this service target files inside this same directory.
      - /rust/bzzz-v2/monitoring/prometheus:/etc/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Retention limits: 30 days of age and 50GB of on-disk size.
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # NOTE(review): the lifecycle and admin HTTP APIs are enabled while
      # port 9090 is published and a Traefik router is defined below with no
      # auth middleware in this file - confirm access is restricted elsewhere.
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
      - '--web.external-url=https://prometheus.chorus.services'
      - '--alertmanager.notification-queue-capacity=10000'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut  # Place on main node
      resources:
        limits:
          memory: 4G
          cpus: '2.0'
        reservations:
          memory: 2G
          cpus: '1.0'
      restart_policy:
        condition: on-failure
        delay: 30s
      labels:
        - "traefik.enable=true"
        # This service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick the internal
        # "monitoring" network address and the route fails intermittently.
        # Assumes Traefik is attached to tengig - TODO confirm.
        - "traefik.docker.network=tengig"
        - "traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)"
        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
        - "traefik.http.routers.prometheus.tls=true"
    healthcheck:
      # Probes Prometheus' /-/healthy endpoint from inside the container.
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    configs:
      - source: prometheus_config
        target: /etc/prometheus/prometheus.yml
      - source: prometheus_alerts
        target: /etc/prometheus/rules.yml

  # Grafana - visualization and dashboards.
  # Single replica pinned to walnut. State is kept in the grafana_data volume;
  # dashboard and datasource provisioning directories come from the host.
  grafana:
    image: grafana/grafana:10.0.3
    networks:
      - tengig
      - monitoring
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - /rust/bzzz-v2/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
      - /rust/bzzz-v2/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
    environment:
      # Admin password is read from the Swarm secret file, not passed inline.
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,vonage-status-panel
      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
      - GF_SERVER_ROOT_URL=https://grafana.chorus.services
      - GF_ANALYTICS_REPORTING_ENABLED=false
      - GF_ANALYTICS_CHECK_FOR_UPDATES=false
      - GF_LOG_LEVEL=warn
    secrets:
      - grafana_admin_password
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 512M
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 10s
      labels:
        - "traefik.enable=true"
        # This service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick the internal
        # "monitoring" network address and the route fails intermittently.
        # Assumes Traefik is attached to tengig - TODO confirm.
        - "traefik.docker.network=tengig"
        - "traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
        - "traefik.http.routers.grafana.tls=true"
    healthcheck:
      # Probes Grafana's /api/health endpoint from inside the container.
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # AlertManager - alert routing and notification.
  # Single replica pinned to ironwood. State is kept in the alertmanager_data
  # volume; config.yml is supplied as a Swarm config, and the Slack and
  # PagerDuty credentials are made available as Swarm secrets.
  alertmanager:
    image: prom/alertmanager:v0.25.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9093:9093"
    volumes:
      - alertmanager_data:/alertmanager
      # Host directory mounted at /etc/alertmanager; the config below targets
      # a file inside this same directory.
      - /rust/bzzz-v2/monitoring/alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=https://alerts.chorus.services'
      - '--web.route-prefix=/'
      - '--cluster.listen-address=0.0.0.0:9094'
      - '--log.level=info'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        # This service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick the internal
        # "monitoring" network address and the route fails intermittently.
        # Assumes Traefik is attached to tengig - TODO confirm.
        - "traefik.docker.network=tengig"
        - "traefik.http.routers.alertmanager.rule=Host(`alerts.chorus.services`)"
        - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
        - "traefik.http.routers.alertmanager.tls=true"
    healthcheck:
      # Same probe style as the prometheus service; the route prefix is "/",
      # so the health endpoint is served at /-/healthy.
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/config.yml
    secrets:
      - slack_webhook_url
      - pagerduty_integration_key

  # Node Exporter - host/system metrics, one task per Swarm node.
  # Host /proc, /sys and / are mounted read-only and passed to the exporter
  # through the --path.* flags.
  node-exporter:
    image: prom/node-exporter:v1.6.1
    networks:
      - monitoring
    ports:
      # Publish directly on each node (host mode) instead of through the
      # ingress routing mesh: with a global service, an ingress-published
      # port load-balances across all nodes, so <node>:9100 would return an
      # arbitrary node's metrics.
      - target: 9100
        published: 9100
        protocol: tcp
        mode: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      # NOTE(review): the systemd collector talks to systemd over D-Bus and
      # only the private systemd socket is mounted here - confirm the
      # collector can actually connect with the image's default user.
      - /run/systemd/private:/run/systemd/private:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # "$$" is the Compose escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.systemd'
      # Only report the bzzz, docker, prometheus and grafana systemd units.
      - '--collector.systemd.unit-include=(bzzz|docker|prometheus|grafana)\.service'
      - '--web.listen-address=0.0.0.0:9100'
    deploy:
      mode: global  # Deploy on every node
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure

  # cAdvisor - container metrics, one task per Swarm node.
  # Host paths are mounted read-only so cAdvisor can inspect the local
  # Docker daemon state, cgroups and disks.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    networks:
      - monitoring
    ports:
      # Publish directly on each node (host mode) instead of through the
      # ingress routing mesh: with a global service, an ingress-published
      # port load-balances across all nodes, so <node>:8080 would return an
      # arbitrary node's container metrics.
      - target: 8080
        published: 8080
        protocol: tcp
        mode: host
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.15'
      restart_policy:
        condition: on-failure
    healthcheck:
      # Probes cAdvisor's /healthz endpoint from inside the container.
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

  # BZZZ P2P network exporter: custom metrics for P2P network health.
  # Attached to the monitoring and bzzz-internal networks, published on 9200.
  bzzz-p2p-exporter:
    image: "registry.home.deepblack.cloud/bzzz-p2p-exporter:v2.0.0"
    environment:
      BZZZ_ENDPOINTS: "http://bzzz-agent:9000"
      SCRAPE_INTERVAL: "15s"
      LOG_LEVEL: "info"
    ports:
      - '9200:9200'
    networks:
      - monitoring
      - bzzz-internal
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on walnut only.
          - "node.hostname == walnut"
      resources:
        reservations:
          cpus: "0.1"
          memory: 128M
        limits:
          cpus: "0.2"
          memory: 256M

  # DHT monitor: DHT-specific metrics and health monitoring.
  # Attached to the monitoring and bzzz-internal networks, published on 9201.
  dht-monitor:
    image: "registry.home.deepblack.cloud/bzzz-dht-monitor:v2.0.0"
    environment:
      # Comma-separated host:port bootstrap list handed to the monitor.
      DHT_BOOTSTRAP_NODES: "walnut:9101,ironwood:9102,acacia:9103"
      REPLICATION_CHECK_INTERVAL: "5m"
      PROVIDER_CHECK_INTERVAL: "2m"
      LOG_LEVEL: "info"
    ports:
      - '9201:9201'
    networks:
      - monitoring
      - bzzz-internal
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on ironwood only.
          - "node.hostname == ironwood"
      resources:
        reservations:
          cpus: "0.15"
          memory: 256M
        limits:
          cpus: "0.3"
          memory: 512M

  # Content monitor: content availability and integrity monitoring.
  # Reads the blob store from a read-only host mount; published on 9202.
  content-monitor:
    image: "registry.home.deepblack.cloud/bzzz-content-monitor:v2.0.0"
    environment:
      # Points at the container side of the blob mount declared below.
      CONTENT_PATH: "/app/blobs"
      INTEGRITY_CHECK_INTERVAL: "15m"
      AVAILABILITY_CHECK_INTERVAL: "5m"
      LOG_LEVEL: "info"
    volumes:
      - "/rust/bzzz-v2/data/blobs:/app/blobs:ro"
    ports:
      - '9202:9202'
    networks:
      - monitoring
      - bzzz-internal
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on acacia only.
          - "node.hostname == acacia"
      resources:
        reservations:
          cpus: "0.15"
          memory: 256M
        limits:
          cpus: "0.3"
          memory: 512M

  # OpenAI cost monitor: tracks OpenAI API usage and costs.
  # Attached to the monitoring and bzzz-internal networks, published on 9203.
  openai-cost-monitor:
    image: "registry.home.deepblack.cloud/bzzz-openai-cost-monitor:v2.0.0"
    environment:
      OPENAI_PROXY_ENDPOINT: "http://openai-proxy:3002"
      COST_TRACKING_ENABLED: "true"
      POSTGRES_HOST: "postgres"
      LOG_LEVEL: "info"
    secrets:
      # NOTE(review): no environment variable here points at this secret's
      # file - presumably the image reads it from /run/secrets directly;
      # verify against the image.
      - postgres_password
    ports:
      - '9203:9203'
    networks:
      - monitoring
      - bzzz-internal
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on walnut only.
          - "node.hostname == walnut"
      resources:
        reservations:
          cpus: "0.1"
          memory: 128M
        limits:
          cpus: "0.2"
          memory: 256M

  # Blackbox exporter: external endpoint monitoring.
  # Attached to monitoring and tengig; listens and is published on 9115.
  blackbox-exporter:
    image: "prom/blackbox-exporter:v0.24.0"
    command:
      - "--config.file=/etc/blackbox_exporter/config.yml"
      - "--web.listen-address=0.0.0.0:9115"
    configs:
      # Swarm config delivered at the path given to --config.file above.
      - source: blackbox_config
        target: /etc/blackbox_exporter/config.yml
    volumes:
      - "/rust/bzzz-v2/monitoring/blackbox:/etc/blackbox_exporter"
    ports:
      - '9115:9115'
    networks:
      - monitoring
      - tengig
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on ironwood only.
          - "node.hostname == ironwood"
      resources:
        reservations:
          cpus: "0.05"
          memory: 64M
        limits:
          cpus: "0.1"
          memory: 128M

  # Loki: log aggregation.
  # Runs with -target=all as a single replica on walnut; data is kept in the
  # loki_data volume and the service is published on 3100.
  loki:
    image: "grafana/loki:2.8.0"
    command:
      - "-config.file=/etc/loki/config.yml"
      - "-target=all"
    configs:
      # Swarm config delivered at the path given to -config.file above.
      - source: loki_config
        target: /etc/loki/config.yml
    volumes:
      - "loki_data:/loki"
      - "/rust/bzzz-v2/monitoring/loki:/etc/loki"
    ports:
      - '3100:3100'
    networks:
      - monitoring
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          # Scheduled on walnut only.
          - "node.hostname == walnut"
      resources:
        reservations:
          cpus: "0.5"
          memory: 1G
        limits:
          cpus: "1.0"
          memory: 2G

  # Promtail: log collection agent, one task per Swarm node.
  # Host /var/log and the Docker container log directory are mounted
  # read-only; no port is published.
  promtail:
    image: "grafana/promtail:2.8.0"
    command:
      - "-config.file=/etc/promtail/config.yml"
      - "-server.http-listen-port=9080"
    configs:
      # Swarm config delivered at the path given to -config.file above.
      - source: promtail_config
        target: /etc/promtail/config.yml
    volumes:
      - "/var/log:/var/log:ro"
      - "/var/lib/docker/containers:/var/lib/docker/containers:ro"
      - "/rust/bzzz-v2/monitoring/promtail:/etc/promtail"
    networks:
      - monitoring
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      resources:
        reservations:
          cpus: "0.1"
          memory: 128M
        limits:
          cpus: "0.2"
          memory: 256M

  # Jaeger - distributed tracing (optional).
  # All-in-one image, single replica pinned to acacia, with in-memory span
  # storage. The HTTP collector (14268) and web UI (16686) are published.
  jaeger:
    image: jaegertracing/all-in-one:1.47
    networks:
      # tengig is added so Traefik can reach the UI the same way it reaches
      # the other Traefik-routed services in this file, which all attach to
      # tengig. Assumes Traefik is attached to tengig - TODO confirm.
      - tengig
      - monitoring
      - bzzz-internal
    ports:
      - "14268:14268"  # HTTP collector
      - "16686:16686"  # Web UI
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      # NOTE(review): spans are held in memory and no trace cap is set in
      # this file; consider bounding it (e.g. MEMORY_MAX_TRACES) so the 1G
      # memory limit below is not hit - confirm against Jaeger docs.
      - SPAN_STORAGE_TYPE=memory
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == acacia
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        # The service is attached to several networks; pin the one Traefik
        # must use to reach it so it does not pick an unreachable address.
        - "traefik.docker.network=tengig"
        - "traefik.http.routers.jaeger.rule=Host(`tracing.chorus.services`)"
        - "traefik.http.services.jaeger.loadbalancer.server.port=16686"
        - "traefik.http.routers.jaeger.tls=true"

# Networks used by the stack: two pre-existing external networks plus one
# stack-owned internal overlay for monitoring traffic.
networks:
  # Created outside this stack.
  tengig:
    external: true

  # Created outside this stack.
  bzzz-internal:
    external: true

  # Stack-owned overlay; internal and not attachable by standalone
  # containers, with a fixed subnet.
  monitoring:
    driver: overlay
    attachable: false
    internal: true
    ipam:
      driver: default
      config:
        - subnet: "10.201.0.0/16"

# Named volumes. Each one uses the local driver with NFS options, mounting a
# directory exported by 192.168.1.27 read-write with sync.
volumes:
  # Mounted at /prometheus by the prometheus service.
  # NOTE(review): upstream Prometheus documentation states that NFS is not
  # supported for its local storage - confirm this placement is intended.
  prometheus_data:
    driver: local
    driver_opts:
      type: nfs
      device: ':/rust/bzzz-v2/monitoring/prometheus/data'
      o: "addr=192.168.1.27,rw,sync"

  # Mounted at /var/lib/grafana by the grafana service.
  grafana_data:
    driver: local
    driver_opts:
      type: nfs
      device: ':/rust/bzzz-v2/monitoring/grafana/data'
      o: "addr=192.168.1.27,rw,sync"

  # Mounted at /alertmanager by the alertmanager service.
  alertmanager_data:
    driver: local
    driver_opts:
      type: nfs
      device: ':/rust/bzzz-v2/monitoring/alertmanager/data'
      o: "addr=192.168.1.27,rw,sync"

  # Mounted at /loki by the loki service.
  loki_data:
    driver: local
    driver_opts:
      type: nfs
      device: ':/rust/bzzz-v2/monitoring/loki/data'
      o: "addr=192.168.1.27,rw,sync"

# Swarm secrets. All are external: they must already exist in the swarm
# under the given "name" before the stack is deployed.
secrets:
  # Used by the grafana service.
  grafana_admin_password:
    name: bzzz_grafana_admin_password
    external: true

  # Used by the alertmanager service.
  slack_webhook_url:
    name: bzzz_slack_webhook_url
    external: true

  # Used by the alertmanager service.
  pagerduty_integration_key:
    name: bzzz_pagerduty_integration_key
    external: true

  # Used by the openai-cost-monitor service.
  postgres_password:
    name: bzzz_postgres_password
    external: true

# Swarm configs. All are external: they must already exist in the swarm
# under the given "name" before the stack is deployed.
configs:
  # Mounted as /etc/prometheus/prometheus.yml in the prometheus service.
  prometheus_config:
    name: bzzz_prometheus_config_v2
    external: true

  # Mounted as /etc/prometheus/rules.yml in the prometheus service.
  prometheus_alerts:
    name: bzzz_prometheus_alerts_v2
    external: true

  # Mounted as /etc/alertmanager/config.yml in the alertmanager service.
  alertmanager_config:
    name: bzzz_alertmanager_config_v2
    external: true

  # Mounted as /etc/blackbox_exporter/config.yml in the blackbox-exporter
  # service.
  blackbox_config:
    name: bzzz_blackbox_config_v2
    external: true

  # Mounted as /etc/loki/config.yml in the loki service.
  loki_config:
    name: bzzz_loki_config_v2
    external: true

  # Mounted as /etc/promtail/config.yml in the promtail service.
  promtail_config:
    name: bzzz_promtail_config_v2
    external: true