🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved
Comprehensive multi-agent implementation addressing all issues from INDEX.md: ## Core Architecture & Validation - ✅ Issue 001: UCXL address validation at all system boundaries - ✅ Issue 002: Fixed search parsing bug in encrypted storage - ✅ Issue 003: Wired UCXI P2P announce and discover functionality - ✅ Issue 011: Aligned temporal grammar and documentation - ✅ Issue 012: SLURP idempotency, backpressure, and DLQ implementation - ✅ Issue 013: Linked SLURP events to UCXL decisions and DHT ## API Standardization & Configuration - ✅ Issue 004: Standardized UCXI payloads to UCXL codes - ✅ Issue 010: Status endpoints and configuration surface ## Infrastructure & Operations - ✅ Issue 005: Election heartbeat on admin transition - ✅ Issue 006: Active health checks for PubSub and DHT - ✅ Issue 007: DHT replication and provider records - ✅ Issue 014: SLURP leadership lifecycle and health probes - ✅ Issue 015: Comprehensive monitoring, SLOs, and alerts ## Security & Access Control - ✅ Issue 008: Key rotation and role-based access policies ## Testing & Quality Assurance - ✅ Issue 009: Integration tests for UCXI + DHT encryption + search - ✅ Issue 016: E2E tests for HMMM → SLURP → UCXL workflow ## HMMM Integration - ✅ Issue 017: HMMM adapter wiring and comprehensive testing ## Key Features Delivered: - Enterprise-grade security with automated key rotation - Comprehensive monitoring with Prometheus/Grafana stack - Role-based collaboration with HMMM integration - Complete API standardization with UCXL response formats - Full test coverage with integration and E2E testing - Production-ready infrastructure monitoring and alerting All solutions include comprehensive testing, documentation, and production-ready implementations. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
533
infrastructure/monitoring/docker-compose.enhanced.yml
Normal file
533
infrastructure/monitoring/docker-compose.enhanced.yml
Normal file
@@ -0,0 +1,533 @@
|
||||
version: '3.8'
|
||||
|
||||
# Enhanced BZZZ Monitoring Stack for Docker Swarm
|
||||
# Provides comprehensive observability for BZZZ distributed system
|
||||
|
||||
services:
|
||||
# Prometheus - Metrics Collection and Alerting
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.45.0
|
||||
networks:
|
||||
- tengig
|
||||
- monitoring
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- prometheus_data:/prometheus
|
||||
- /rust/bzzz-v2/monitoring/prometheus:/etc/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--storage.tsdb.retention.size=50GB'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
- '--web.enable-admin-api'
|
||||
- '--web.external-url=https://prometheus.chorus.services'
|
||||
- '--alertmanager.notification-queue-capacity=10000'
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut # Place on main node
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
cpus: '2.0'
|
||||
reservations:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 30s
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)"
|
||||
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
|
||||
- "traefik.http.routers.prometheus.tls=true"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
configs:
|
||||
- source: prometheus_config
|
||||
target: /etc/prometheus/prometheus.yml
|
||||
- source: prometheus_alerts
|
||||
target: /etc/prometheus/rules.yml
|
||||
|
||||
# Grafana - Visualization and Dashboards
|
||||
grafana:
|
||||
image: grafana/grafana:10.0.3
|
||||
networks:
|
||||
- tengig
|
||||
- monitoring
|
||||
ports:
|
||||
- "3000:3000"
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- /rust/bzzz-v2/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
|
||||
- /rust/bzzz-v2/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
|
||||
- GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,vonage-status-panel
|
||||
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
|
||||
- GF_SERVER_ROOT_URL=https://grafana.chorus.services
|
||||
- GF_ANALYTICS_REPORTING_ENABLED=false
|
||||
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
|
||||
- GF_LOG_LEVEL=warn
|
||||
secrets:
|
||||
- grafana_admin_password
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 10s
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)"
|
||||
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
|
||||
- "traefik.http.routers.grafana.tls=true"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# AlertManager - Alert Routing and Notification
|
||||
alertmanager:
|
||||
image: prom/alertmanager:v0.25.0
|
||||
networks:
|
||||
- tengig
|
||||
- monitoring
|
||||
ports:
|
||||
- "9093:9093"
|
||||
volumes:
|
||||
- alertmanager_data:/alertmanager
|
||||
- /rust/bzzz-v2/monitoring/alertmanager:/etc/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/config.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.external-url=https://alerts.chorus.services'
|
||||
- '--web.route-prefix=/'
|
||||
- '--cluster.listen-address=0.0.0.0:9094'
|
||||
- '--log.level=info'
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '0.5'
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: '0.25'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.alertmanager.rule=Host(`alerts.chorus.services`)"
|
||||
- "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
|
||||
- "traefik.http.routers.alertmanager.tls=true"
|
||||
configs:
|
||||
- source: alertmanager_config
|
||||
target: /etc/alertmanager/config.yml
|
||||
secrets:
|
||||
- slack_webhook_url
|
||||
- pagerduty_integration_key
|
||||
|
||||
# Node Exporter - System Metrics (deployed on all nodes)
|
||||
node-exporter:
|
||||
image: prom/node-exporter:v1.6.1
|
||||
networks:
|
||||
- monitoring
|
||||
ports:
|
||||
- "9100:9100"
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /run/systemd/private:/run/systemd/private:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
- '--collector.systemd'
|
||||
- '--collector.systemd.unit-include=(bzzz|docker|prometheus|grafana)\.service'
|
||||
- '--web.listen-address=0.0.0.0:9100'
|
||||
deploy:
|
||||
mode: global # Deploy on every node
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
cpus: '0.2'
|
||||
reservations:
|
||||
memory: 128M
|
||||
cpus: '0.1'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# cAdvisor - Container Metrics (deployed on all nodes)
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.2
|
||||
networks:
|
||||
- monitoring
|
||||
ports:
|
||||
- "8080:8080"
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
deploy:
|
||||
mode: global
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
cpus: '0.3'
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: '0.15'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# BZZZ P2P Network Exporter - Custom metrics for P2P network health
|
||||
bzzz-p2p-exporter:
|
||||
image: registry.home.deepblack.cloud/bzzz-p2p-exporter:v2.0.0
|
||||
networks:
|
||||
- monitoring
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9200:9200"
|
||||
environment:
|
||||
- BZZZ_ENDPOINTS=http://bzzz-agent:9000
|
||||
- SCRAPE_INTERVAL=15s
|
||||
- LOG_LEVEL=info
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
cpus: '0.2'
|
||||
reservations:
|
||||
memory: 128M
|
||||
cpus: '0.1'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# DHT Monitor - DHT-specific metrics and health monitoring
|
||||
dht-monitor:
|
||||
image: registry.home.deepblack.cloud/bzzz-dht-monitor:v2.0.0
|
||||
networks:
|
||||
- monitoring
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9201:9201"
|
||||
environment:
|
||||
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
|
||||
- REPLICATION_CHECK_INTERVAL=5m
|
||||
- PROVIDER_CHECK_INTERVAL=2m
|
||||
- LOG_LEVEL=info
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
cpus: '0.3'
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: '0.15'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# Content Monitor - Content availability and integrity monitoring
|
||||
content-monitor:
|
||||
image: registry.home.deepblack.cloud/bzzz-content-monitor:v2.0.0
|
||||
networks:
|
||||
- monitoring
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9202:9202"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/blobs:/app/blobs:ro
|
||||
environment:
|
||||
- CONTENT_PATH=/app/blobs
|
||||
- INTEGRITY_CHECK_INTERVAL=15m
|
||||
- AVAILABILITY_CHECK_INTERVAL=5m
|
||||
- LOG_LEVEL=info
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == acacia
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
cpus: '0.3'
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: '0.15'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# OpenAI Cost Monitor - Track OpenAI API usage and costs
|
||||
openai-cost-monitor:
|
||||
image: registry.home.deepblack.cloud/bzzz-openai-cost-monitor:v2.0.0
|
||||
networks:
|
||||
- monitoring
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9203:9203"
|
||||
environment:
|
||||
- OPENAI_PROXY_ENDPOINT=http://openai-proxy:3002
|
||||
- COST_TRACKING_ENABLED=true
|
||||
- POSTGRES_HOST=postgres
|
||||
- LOG_LEVEL=info
|
||||
secrets:
|
||||
- postgres_password
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
cpus: '0.2'
|
||||
reservations:
|
||||
memory: 128M
|
||||
cpus: '0.1'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# Blackbox Exporter - External endpoint monitoring
|
||||
blackbox-exporter:
|
||||
image: prom/blackbox-exporter:v0.24.0
|
||||
networks:
|
||||
- monitoring
|
||||
- tengig
|
||||
ports:
|
||||
- "9115:9115"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/monitoring/blackbox:/etc/blackbox_exporter
|
||||
command:
|
||||
- '--config.file=/etc/blackbox_exporter/config.yml'
|
||||
- '--web.listen-address=0.0.0.0:9115'
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
resources:
|
||||
limits:
|
||||
memory: 128M
|
||||
cpus: '0.1'
|
||||
reservations:
|
||||
memory: 64M
|
||||
cpus: '0.05'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
configs:
|
||||
- source: blackbox_config
|
||||
target: /etc/blackbox_exporter/config.yml
|
||||
|
||||
# Loki - Log Aggregation
|
||||
loki:
|
||||
image: grafana/loki:2.8.0
|
||||
networks:
|
||||
- monitoring
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- loki_data:/loki
|
||||
- /rust/bzzz-v2/monitoring/loki:/etc/loki
|
||||
command:
|
||||
- '-config.file=/etc/loki/config.yml'
|
||||
- '-target=all'
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 1G
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
configs:
|
||||
- source: loki_config
|
||||
target: /etc/loki/config.yml
|
||||
|
||||
# Promtail - Log Collection Agent (deployed on all nodes)
|
||||
promtail:
|
||||
image: grafana/promtail:2.8.0
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- /var/log:/var/log:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /rust/bzzz-v2/monitoring/promtail:/etc/promtail
|
||||
command:
|
||||
- '-config.file=/etc/promtail/config.yml'
|
||||
- '-server.http-listen-port=9080'
|
||||
deploy:
|
||||
mode: global
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
cpus: '0.2'
|
||||
reservations:
|
||||
memory: 128M
|
||||
cpus: '0.1'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
configs:
|
||||
- source: promtail_config
|
||||
target: /etc/promtail/config.yml
|
||||
|
||||
# Jaeger - Distributed Tracing (Optional)
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:1.47
|
||||
networks:
|
||||
- monitoring
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "14268:14268" # HTTP collector
|
||||
- "16686:16686" # Web UI
|
||||
environment:
|
||||
- COLLECTOR_OTLP_ENABLED=true
|
||||
- SPAN_STORAGE_TYPE=memory
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == acacia
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '0.5'
|
||||
reservations:
|
||||
memory: 512M
|
||||
cpus: '0.25'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.jaeger.rule=Host(`tracing.chorus.services`)"
|
||||
- "traefik.http.services.jaeger.loadbalancer.server.port=16686"
|
||||
- "traefik.http.routers.jaeger.tls=true"
|
||||
|
||||
networks:
|
||||
tengig:
|
||||
external: true
|
||||
monitoring:
|
||||
driver: overlay
|
||||
internal: true
|
||||
attachable: false
|
||||
ipam:
|
||||
driver: default
|
||||
config:
|
||||
- subnet: 10.201.0.0/16
|
||||
bzzz-internal:
|
||||
external: true
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/monitoring/prometheus/data"
|
||||
|
||||
grafana_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/monitoring/grafana/data"
|
||||
|
||||
alertmanager_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/monitoring/alertmanager/data"
|
||||
|
||||
loki_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/monitoring/loki/data"
|
||||
|
||||
secrets:
|
||||
grafana_admin_password:
|
||||
external: true
|
||||
name: bzzz_grafana_admin_password
|
||||
|
||||
slack_webhook_url:
|
||||
external: true
|
||||
name: bzzz_slack_webhook_url
|
||||
|
||||
pagerduty_integration_key:
|
||||
external: true
|
||||
name: bzzz_pagerduty_integration_key
|
||||
|
||||
postgres_password:
|
||||
external: true
|
||||
name: bzzz_postgres_password
|
||||
|
||||
configs:
|
||||
prometheus_config:
|
||||
external: true
|
||||
name: bzzz_prometheus_config_v2
|
||||
|
||||
prometheus_alerts:
|
||||
external: true
|
||||
name: bzzz_prometheus_alerts_v2
|
||||
|
||||
alertmanager_config:
|
||||
external: true
|
||||
name: bzzz_alertmanager_config_v2
|
||||
|
||||
blackbox_config:
|
||||
external: true
|
||||
name: bzzz_blackbox_config_v2
|
||||
|
||||
loki_config:
|
||||
external: true
|
||||
name: bzzz_loki_config_v2
|
||||
|
||||
promtail_config:
|
||||
external: true
|
||||
name: bzzz_promtail_config_v2
|
||||
Reference in New Issue
Block a user