version: '3.8'

# Enhanced BZZZ Monitoring Stack for Docker Swarm
# Provides comprehensive observability for the BZZZ distributed system.
#
# Placement strategy: stateful/heavy services are pinned to specific nodes
# (walnut / ironwood / acacia); per-node agents (node-exporter, cadvisor,
# promtail) run in global mode on every node.

services:
  # Prometheus - Metrics Collection and Alerting
  prometheus:
    image: prom/prometheus:v2.45.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9090:9090"
    volumes:
      - prometheus_data:/prometheus
      - /rust/bzzz-v2/monitoring/prometheus:/etc/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      # NOTE(review): the admin API exposes destructive endpoints (TSDB
      # deletion/snapshot) and this service is published through Traefik —
      # confirm an auth middleware protects it, or drop this flag.
      - '--web.enable-admin-api'
      - '--web.external-url=https://prometheus.chorus.services'
      - '--alertmanager.notification-queue-capacity=10000'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut  # Place on main node
      resources:
        limits:
          memory: 4G
          cpus: '2.0'
        reservations:
          memory: 2G
          cpus: '1.0'
      restart_policy:
        condition: on-failure
        delay: 30s
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)"
        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
        - "traefik.http.routers.prometheus.tls=true"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    configs:
      - source: prometheus_config
        target: /etc/prometheus/prometheus.yml
      - source: prometheus_alerts
        target: /etc/prometheus/rules.yml

  # Grafana - Visualization and Dashboards
  grafana:
    image: grafana/grafana:10.0.3
    networks:
      - tengig
      - monitoring
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - /rust/bzzz-v2/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
      - /rust/bzzz-v2/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
    environment:
      # Grafana's "__FILE" suffix reads the value from the secret file.
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,vonage-status-panel
      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
      - GF_SERVER_ROOT_URL=https://grafana.chorus.services
      - GF_ANALYTICS_REPORTING_ENABLED=false
      - GF_ANALYTICS_CHECK_FOR_UPDATES=false
      - GF_LOG_LEVEL=warn
    secrets:
      - grafana_admin_password
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 512M
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 10s
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
        - "traefik.http.routers.grafana.tls=true"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # AlertManager - Alert Routing and Notification
  alertmanager:
    image: prom/alertmanager:v0.25.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9093:9093"
    volumes:
      - alertmanager_data:/alertmanager
      - /rust/bzzz-v2/monitoring/alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=https://alerts.chorus.services'
      - '--web.route-prefix=/'
      - '--cluster.listen-address=0.0.0.0:9094'
      - '--log.level=info'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.alertmanager.rule=Host(`alerts.chorus.services`)"
        - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
        - "traefik.http.routers.alertmanager.tls=true"
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/config.yml
    secrets:
      - slack_webhook_url
      - pagerduty_integration_key

  # Node Exporter - System Metrics (deployed on all nodes)
  node-exporter:
    image: prom/node-exporter:v1.6.1
    networks:
      - monitoring
    ports:
      # Host-mode publishing: with the short syntax ("9100:9100") a global
      # service is reached through the Swarm ingress mesh, so scraping
      # <node>:9100 returns a *random* node's metrics. mode:host binds the
      # port on each node directly, giving Prometheus true per-node targets.
      - target: 9100
        published: 9100
        protocol: tcp
        mode: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      # Required by --collector.systemd to query unit states over D-Bus.
      - /run/systemd/private:/run/systemd/private:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # "$$" escapes "$" for Compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.systemd'
      - '--collector.systemd.unit-include=(bzzz|docker|prometheus|grafana)\.service'
      - '--web.listen-address=0.0.0.0:9100'
    deploy:
      mode: global  # Deploy on every node
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure

  # cAdvisor - Container Metrics (deployed on all nodes)
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    networks:
      - monitoring
    ports:
      # Host-mode publishing for the same per-node scraping reason as
      # node-exporter above.
      - target: 8080
        published: 8080
        protocol: tcp
        mode: host
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.15'
      restart_policy:
        condition: on-failure
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

  # BZZZ P2P Network Exporter - Custom metrics for P2P network health
  bzzz-p2p-exporter:
    image: registry.home.deepblack.cloud/bzzz-p2p-exporter:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9200:9200"
    environment:
      - BZZZ_ENDPOINTS=http://bzzz-agent:9000
      - SCRAPE_INTERVAL=15s
      - LOG_LEVEL=info
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure

  # DHT Monitor - DHT-specific metrics and health monitoring
  dht-monitor:
    image: registry.home.deepblack.cloud/bzzz-dht-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9201:9201"
    environment:
      - DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
      - REPLICATION_CHECK_INTERVAL=5m
      - PROVIDER_CHECK_INTERVAL=2m
      - LOG_LEVEL=info
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.15'
      restart_policy:
        condition: on-failure

  # Content Monitor - Content availability and integrity monitoring
  content-monitor:
    image: registry.home.deepblack.cloud/bzzz-content-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9202:9202"
    volumes:
      - /rust/bzzz-v2/data/blobs:/app/blobs:ro
    environment:
      - CONTENT_PATH=/app/blobs
      - INTEGRITY_CHECK_INTERVAL=15m
      - AVAILABILITY_CHECK_INTERVAL=5m
      - LOG_LEVEL=info
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == acacia
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.15'
      restart_policy:
        condition: on-failure

  # OpenAI Cost Monitor - Track OpenAI API usage and costs
  openai-cost-monitor:
    image: registry.home.deepblack.cloud/bzzz-openai-cost-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9203:9203"
    environment:
      - OPENAI_PROXY_ENDPOINT=http://openai-proxy:3002
      - COST_TRACKING_ENABLED=true
      - POSTGRES_HOST=postgres
      - LOG_LEVEL=info
    secrets:
      - postgres_password
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure

  # Blackbox Exporter - External endpoint monitoring
  blackbox-exporter:
    image: prom/blackbox-exporter:v0.24.0
    networks:
      - monitoring
      - tengig
    ports:
      - "9115:9115"
    volumes:
      - /rust/bzzz-v2/monitoring/blackbox:/etc/blackbox_exporter
    command:
      - '--config.file=/etc/blackbox_exporter/config.yml'
      - '--web.listen-address=0.0.0.0:9115'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 128M
          cpus: '0.1'
        reservations:
          memory: 64M
          cpus: '0.05'
      restart_policy:
        condition: on-failure
    configs:
      - source: blackbox_config
        target: /etc/blackbox_exporter/config.yml

  # Loki - Log Aggregation
  loki:
    image: grafana/loki:2.8.0
    networks:
      - monitoring
    ports:
      - "3100:3100"
    volumes:
      - loki_data:/loki
      - /rust/bzzz-v2/monitoring/loki:/etc/loki
    command:
      - '-config.file=/etc/loki/config.yml'
      - '-target=all'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
    configs:
      - source: loki_config
        target: /etc/loki/config.yml

  # Promtail - Log Collection Agent (deployed on all nodes)
  promtail:
    image: grafana/promtail:2.8.0
    networks:
      - monitoring
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /rust/bzzz-v2/monitoring/promtail:/etc/promtail
    command:
      - '-config.file=/etc/promtail/config.yml'
      - '-server.http-listen-port=9080'
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
    configs:
      - source: promtail_config
        target: /etc/promtail/config.yml

  # Jaeger - Distributed Tracing (Optional)
  jaeger:
    image: jaegertracing/all-in-one:1.47
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "14268:14268"  # HTTP collector
      - "16686:16686"  # Web UI
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      # NOTE(review): in-memory span storage — all traces are lost on
      # restart; switch to badger/elasticsearch if retention matters.
      - SPAN_STORAGE_TYPE=memory
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == acacia
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.jaeger.rule=Host(`tracing.chorus.services`)"
        - "traefik.http.services.jaeger.loadbalancer.server.port=16686"
        - "traefik.http.routers.jaeger.tls=true"

networks:
  tengig:
    external: true
  monitoring:
    driver: overlay
    internal: true
    attachable: false
    ipam:
      driver: default
      config:
        - subnet: 10.201.0.0/16
  bzzz-internal:
    external: true

volumes:
  # All stateful data lives on the NFS server at 192.168.1.27 so services
  # can be rescheduled without losing history.
  prometheus_data:
    driver: local
    driver_opts:
      type: nfs
      o: addr=192.168.1.27,rw,sync
      device: ":/rust/bzzz-v2/monitoring/prometheus/data"
  grafana_data:
    driver: local
    driver_opts:
      type: nfs
      o: addr=192.168.1.27,rw,sync
      device: ":/rust/bzzz-v2/monitoring/grafana/data"
  alertmanager_data:
    driver: local
    driver_opts:
      type: nfs
      o: addr=192.168.1.27,rw,sync
      device: ":/rust/bzzz-v2/monitoring/alertmanager/data"
  loki_data:
    driver: local
    driver_opts:
      type: nfs
      o: addr=192.168.1.27,rw,sync
      device: ":/rust/bzzz-v2/monitoring/loki/data"

secrets:
  grafana_admin_password:
    external: true
    name: bzzz_grafana_admin_password
  slack_webhook_url:
    external: true
    name: bzzz_slack_webhook_url
  pagerduty_integration_key:
    external: true
    name: bzzz_pagerduty_integration_key
  postgres_password:
    external: true
    name: bzzz_postgres_password

configs:
  prometheus_config:
    external: true
    name: bzzz_prometheus_config_v2
  prometheus_alerts:
    external: true
    name: bzzz_prometheus_alerts_v2
  alertmanager_config:
    external: true
    name: bzzz_alertmanager_config_v2
  blackbox_config:
    external: true
    name: bzzz_blackbox_config_v2
  loki_config:
    external: true
    name: bzzz_loki_config_v2
  promtail_config:
    external: true
    name: bzzz_promtail_config_v2