Files
CHORUS/docker/docker-compose.yml
anthonyrawlins 859e5e1e02 fix: P2P connectivity broken - containers isolated at 0 peers
Current state: All 9 CHORUS containers show "📊 Status: 0 connected peers"
and "No winner found in election". P2P connectivity is completely broken.

Issues:
- An attempted fix to libp2p AutoRelay did not restore connectivity
- Elections cannot receive candidacy or votes due to isolation
- Task Execution Engine (v0.5.0) implementation completed but P2P regressed

Status: Need to compare with pre-Task-Engine baseline to identify root cause
Next: Checkout working version before d1252ad to find what broke connectivity

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-25 16:41:08 +10:00

652 lines
19 KiB
YAML

# Compose file format version used by this stack definition.
version: "3.9"
# Service definitions: chorus, whoosh, postgres, redis, prometheus, grafana,
# backbeat-pulse, backbeat-reverb and backbeat-nats.
services:
chorus:
  # CHORUS agent service; the image is pinned to the v0.5.4-p2p-fix tag.
  image: anthonyrawlins/chorus:v0.5.4-p2p-fix

  environment:
    # Licensing. Marked REQUIRED/CRITICAL by the original author, who notes
    # that CHORUS will not start without it. The ID is read from a secret file.
    - CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id
    - CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster}
    - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-https://kaching.chorus.services/api}

    # Agent identity and capacity.
    # CHORUS_AGENT_ID defaults to empty; per the original note it is
    # auto-generated when not provided.
    - CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-}
    - CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer}
    - CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3}
    - CHORUS_CAPABILITIES=general_development,task_coordination,admin_election

    # Listening ports and bind address inside the container.
    - CHORUS_API_PORT=8080
    - CHORUS_HEALTH_PORT=8081
    - CHORUS_P2P_PORT=9000
    - CHORUS_BIND_ADDRESS=0.0.0.0

    # Scaling optimizations (WHOOSH issue #7).
    # mDNS is switched off for container/swarm environments.
    - CHORUS_MDNS_ENABLED=false
    # Outbound dial rate limit, intended to prevent connection storms.
    - CHORUS_DIALS_PER_SEC=5
    # Cap on concurrent DHT queries.
    - CHORUS_MAX_CONCURRENT_DHT=16

    # Election stability windows (medium-risk fix 2.1).
    # Minimum time between elections, to prevent churn.
    - CHORUS_ELECTION_MIN_TERM=30s
    # Minimum time before a healthy leader may be challenged.
    - CHORUS_LEADER_MIN_TERM=45s

    # Runtime assignment inputs (medium-risk fix 2.2). All four are optional
    # and default to empty: WHOOSH assignment endpoint, task slot, task ID
    # and node ID.
    - ASSIGN_URL=${ASSIGN_URL:-}
    - TASK_SLOT=${TASK_SLOT:-}
    - TASK_ID=${TASK_ID:-}
    - NODE_ID=${NODE_ID:-}

    # Bootstrap pool. The JSON path matches the `chorus_bootstrap` config
    # target mounted below; the CSV variable is the fallback.
    - BOOTSTRAP_JSON=/config/bootstrap.json
    - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-}

    # AI provider selection; ResetData is the default provider.
    - CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata}
    - RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://models.au-syd.resetdata.ai/v1}
    - RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key
    - RESETDATA_MODEL=${RESETDATA_MODEL:-meta/llama-3.1-8b-instruct}
    # Ollama is the alternative provider; its default endpoint relies on the
    # host.docker.internal entry declared under extra_hosts.
    - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434}

    # Model selection.
    - CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct}
    - CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct}

    # Logging.
    - LOG_LEVEL=${LOG_LEVEL:-info}
    - LOG_FORMAT=${LOG_FORMAT:-structured}

    # BACKBEAT integration. The agent ID defaults to empty; per the original
    # note it is then derived from CHORUS_AGENT_ID.
    - CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true}
    - CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production}
    - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-}
    - CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222}

    # Prompt sourcing; both paths live in the read-only prompts mount below.
    - CHORUS_PROMPTS_DIR=/etc/chorus/prompts
    - CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md
    - CHORUS_ROLE=${CHORUS_ROLE:-arbiter}

  # Docker secrets backing the *_FILE variables above.
  secrets:
    - chorus_license_id
    - resetdata_api_key

  # Bootstrap peer list, surfaced at the path BOOTSTRAP_JSON points to.
  configs:
    - source: chorus_bootstrap
      target: /config/bootstrap.json

  volumes:
    # Persistent agent data.
    - chorus_data:/app/data
    # Role YAMLs and defaults.md, mounted read-only from the host.
    - /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro

  # Only the P2P port is published; the host-side port is overridable.
  ports:
    - "${CHORUS_P2P_PORT:-9000}:9000"

  # CHORUS is internal-only, so it joins chorus_net and carries no Traefik labels.
  networks:
    - chorus_net

  # Host alias used by the default OLLAMA_ENDPOINT.
  # NOTE(review): confirm `host-gateway` is honoured by the deployment method
  # used for this stack (it is an engine feature for standalone containers).
  extra_hosts:
    - "host.docker.internal:host-gateway"

  # Liveness probe against the health port (CHORUS_HEALTH_PORT=8081).
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:8081/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s

  # Rotating JSON logs: three files of at most 10 MB each.
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "3"
      tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"

  deploy:
    mode: replicated
    replicas: ${CHORUS_REPLICAS:-9}
    # Keep tasks off the node named acacia and spread them across hostnames.
    placement:
      constraints:
        - node.hostname != acacia
      preferences:
        - spread: node.hostname
    resources:
      limits:
        cpus: "${CHORUS_CPU_LIMIT:-1.0}"
        memory: "${CHORUS_MEMORY_LIMIT:-1G}"
      reservations:
        cpus: "0.1"
        memory: 128M
    # At most three restarts within a 120s window, 5s apart.
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
      window: 120s
    # Rolling update: one task at a time, new task started before the old one
    # is stopped; the rollout pauses on failure.
    update_config:
      parallelism: 1
      delay: 10s
      failure_action: pause
      order: start-first
whoosh:
  # WHOOSH service: exposed through Traefik at whoosh.chorus.services and
  # published directly on port 8800 via the swarm ingress mesh.
  image: anthonyrawlins/whoosh:scaling-v1.0.0
  ports:
    - target: 8080
      published: 8800
      protocol: tcp
      mode: ingress
  environment:
    # Database configuration (password is read from a secret file)
    WHOOSH_DATABASE_DB_HOST: postgres
    WHOOSH_DATABASE_DB_PORT: 5432
    WHOOSH_DATABASE_DB_NAME: whoosh
    WHOOSH_DATABASE_DB_USER: whoosh
    WHOOSH_DATABASE_DB_PASSWORD_FILE: /run/secrets/whoosh_db_password
    WHOOSH_DATABASE_DB_SSL_MODE: disable
    WHOOSH_DATABASE_DB_AUTO_MIGRATE: "true"
    # Server configuration
    WHOOSH_SERVER_LISTEN_ADDR: ":8080"
    WHOOSH_SERVER_READ_TIMEOUT: "30s"
    WHOOSH_SERVER_WRITE_TIMEOUT: "30s"
    WHOOSH_SERVER_SHUTDOWN_TIMEOUT: "30s"
    # GITEA configuration
    WHOOSH_GITEA_BASE_URL: https://gitea.chorus.services
    WHOOSH_GITEA_TOKEN_FILE: /run/secrets/gitea_token
    WHOOSH_GITEA_WEBHOOK_TOKEN_FILE: /run/secrets/webhook_token
    WHOOSH_GITEA_WEBHOOK_PATH: /webhooks/gitea
    # Auth configuration
    WHOOSH_AUTH_JWT_SECRET_FILE: /run/secrets/jwt_secret
    WHOOSH_AUTH_SERVICE_TOKENS_FILE: /run/secrets/service_tokens
    WHOOSH_AUTH_JWT_EXPIRY: "24h"
    # Logging
    WHOOSH_LOGGING_LEVEL: debug
    WHOOSH_LOGGING_ENVIRONMENT: production
    # Redis configuration
    WHOOSH_REDIS_ENABLED: "true"
    WHOOSH_REDIS_HOST: redis
    WHOOSH_REDIS_PORT: 6379
    WHOOSH_REDIS_PASSWORD_FILE: /run/secrets/redis_password
    WHOOSH_REDIS_DATABASE: 0
    # Scaling system configuration
    WHOOSH_SCALING_KACHING_URL: "https://kaching.chorus.services"
    WHOOSH_SCALING_BACKBEAT_URL: "http://backbeat-pulse:8080"
    WHOOSH_SCALING_CHORUS_URL: "http://chorus:9000"
    # BACKBEAT integration configuration (temporarily disabled)
    WHOOSH_BACKBEAT_ENABLED: "false"
    WHOOSH_BACKBEAT_CLUSTER_ID: "chorus-production"
    WHOOSH_BACKBEAT_AGENT_ID: "whoosh"
    WHOOSH_BACKBEAT_NATS_URL: "nats://backbeat-nats:4222"
  # Secrets backing the *_FILE variables above
  secrets:
    - whoosh_db_password
    - gitea_token
    - webhook_token
    - jwt_secret
    - service_tokens
    - redis_password
  volumes:
    # Gives the container access to the host's Docker API socket.
    # NOTE(review): presumably needed by the scaling system - confirm.
    - /var/run/docker.sock:/var/run/docker.sock
  deploy:
    replicas: 2
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
      window: 120s
    update_config:
      parallelism: 1
      delay: 10s
      failure_action: pause
      monitor: 60s
      order: start-first
    # rollback_config:
    #   parallelism: 1
    #   delay: 0s
    #   failure_action: pause
    #   monitor: 60s
    #   order: stop-first
    placement:
      constraints:
        - node.hostname != acacia
      preferences:
        - spread: node.hostname
    resources:
      limits:
        memory: 256M
        cpus: '0.5'
      reservations:
        memory: 128M
        cpus: '0.25'
    labels:
      - traefik.enable=true
      # The service is attached to two networks; tell Traefik which one to use.
      - traefik.docker.network=tengig
      - traefik.http.routers.whoosh.rule=Host(`whoosh.chorus.services`)
      - traefik.http.routers.whoosh.tls=true
      - traefik.http.routers.whoosh.tls.certresolver=letsencryptresolver
      # Fixed: the entrypoints and passhostheader labels were declared under
      # the name `photoprism`, creating a stray router and a second service
      # instead of configuring the whoosh router/service.
      - traefik.http.routers.whoosh.entrypoints=web,web-secured
      - traefik.http.services.whoosh.loadbalancer.server.port=8080
      - traefik.http.services.whoosh.loadbalancer.passhostheader=true
      # Fixed: every `$` in the hash is doubled so Compose passes it through
      # literally instead of treating it as variable interpolation.
      # The hash is a placeholder, and no router references the `whoosh-auth`
      # middleware here, so basic auth is defined but not enforced.
      - traefik.http.middlewares.whoosh-auth.basicauth.users=admin:$$2y$$10$$example_hash
  networks:
    - tengig
    - chorus_net
  # Health probe: runs the application binary in health-check mode.
  healthcheck:
    test: ["CMD", "/app/whoosh", "--health-check"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
postgres:
  # PostgreSQL 15 (Alpine) holding the `whoosh` database.
  image: postgres:15-alpine

  environment:
    POSTGRES_DB: whoosh
    POSTGRES_USER: whoosh
    # Password comes from the whoosh_db_password secret, not from an inline value.
    POSTGRES_PASSWORD_FILE: /run/secrets/whoosh_db_password
    # Passed to initdb: host connections authenticate with SCRAM-SHA-256.
    POSTGRES_INITDB_ARGS: "--auth-host=scram-sha-256"

  secrets:
    - whoosh_db_password

  # Database files live on the whoosh_postgres_data volume.
  volumes:
    - whoosh_postgres_data:/var/lib/postgresql/data

  # Reachable only on the internal overlay network; no ports are published.
  networks:
    - chorus_net

  # Ready once pg_isready can reach the whoosh database as the whoosh user.
  healthcheck:
    test:
      - CMD-SHELL
      - pg_isready -h localhost -p 5432 -U whoosh -d whoosh
    interval: 30s
    timeout: 10s
    retries: 5
    start_period: 40s

  deploy:
    replicas: 1
    placement:
      preferences:
        - spread: node.hostname
    resources:
      limits:
        cpus: '1.0'
        memory: 512M
      reservations:
        cpus: '0.5'
        memory: 256M
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
      window: 120s
redis:
  # Redis 7 (Alpine) with append-only persistence and password authentication.
  image: redis:7-alpine
  # `$$` escapes Compose interpolation so the shell inside the container
  # performs the substitution and reads the password from the secret file.
  command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes'
  secrets:
    - redis_password
  volumes:
    - whoosh_redis_data:/data
  deploy:
    replicas: 1
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
      window: 120s
    placement:
      preferences:
        - spread: node.hostname
    resources:
      limits:
        memory: 128M
        cpus: '0.25'
      reservations:
        memory: 64M
        cpus: '0.1'
  networks:
    - chorus_net
  # Authenticated PING. The command substitution is double-quoted (matching
  # the server command above) so a password containing whitespace or glob
  # characters is passed to `-a` as a single argument.
  healthcheck:
    test:
      - CMD
      - sh
      - "-c"
      - 'redis-cli --no-auth-warning -a "$$(cat /run/secrets/redis_password)" ping'
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 30s
prometheus:
  # Prometheus server, reachable via Traefik at prometheus.chorus.services
  # and directly on published port 9099.
  # NOTE(review): `latest` is a floating tag; consider pinning a version so
  # redeployments are reproducible.
  image: prom/prometheus:latest
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/usr/share/prometheus/console_libraries'
    - '--web.console.templates=/usr/share/prometheus/consoles'
  volumes:
    # Scrape configuration, mounted read-only at the path given to --config.file.
    - /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    # TSDB storage (--storage.tsdb.path); the same host directory also holds
    # the prometheus.yml mounted above.
    - /rust/containers/CHORUS/monitoring/prometheus:/prometheus
  ports:
    - "9099:9090" # Expose Prometheus UI
  deploy:
    replicas: 1
    labels:
      - traefik.enable=true
      # The service is attached to both chorus_net and tengig; name the
      # network Traefik must use to reach it instead of leaving it to chance.
      - traefik.docker.network=tengig
      - traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)
      - traefik.http.routers.prometheus.entrypoints=web,web-secured
      - traefik.http.routers.prometheus.tls=true
      - traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver
      - traefik.http.services.prometheus.loadbalancer.server.port=9090
  networks:
    - chorus_net
    - tengig
  # Readiness probe against Prometheus' /-/ready endpoint.
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/ready"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s
grafana:
  # Grafana dashboards, reachable via Traefik at grafana.chorus.services
  # and directly on published port 3300.
  # NOTE(review): `latest` is a floating tag; consider pinning a version.
  image: grafana/grafana:latest
  # Runs as UID/GID 1000; the bind-mounted data directory must be writable
  # by that user.
  user: "1000:1000"
  environment:
    # NOTE(review): falls back to the password `admin` when
    # GRAFANA_ADMIN_PASSWORD is unset, on a service that is routed publicly.
    # Use a strong password in production.
    - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    - GF_SERVER_ROOT_URL=https://grafana.chorus.services
  volumes:
    - /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana
  ports:
    - "3300:3000" # Expose Grafana UI
  deploy:
    replicas: 1
    labels:
      - traefik.enable=true
      # The service is attached to both chorus_net and tengig; name the
      # network Traefik must use to reach it instead of leaving it to chance.
      - traefik.docker.network=tengig
      - traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)
      - traefik.http.routers.grafana.entrypoints=web,web-secured
      - traefik.http.routers.grafana.tls=true
      - traefik.http.routers.grafana.tls.certresolver=letsencryptresolver
      - traefik.http.services.grafana.loadbalancer.server.port=3000
  networks:
    - chorus_net
    - tengig
  # Health probe against Grafana's /api/health endpoint.
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s
# BACKBEAT Pulse Service - Leader-elected tempo broadcaster
# REQ: BACKBEAT-REQ-001 - Single BeatFrame publisher per cluster
# REQ: BACKBEAT-OPS-001 - One replica prefers leadership
backbeat-pulse:
  image: anthonyrawlins/backbeat-pulse:v1.0.5
  # Folded scalar: the lines below are joined into one command line.
  # Admin API listens on 8080, Raft binds 9000, state is kept under /data.
  command: >
    ./pulse
    -cluster=chorus-production
    -admin-port=8080
    -raft-bind=0.0.0.0:9000
    -data-dir=/data
    -nats=nats://backbeat-nats:4222
    -tempo=2
    -bar-length=8
    -log-level=info
  # Internal service ports (not externally exposed - routed via Traefik)
  expose:
    - "8080" # Admin API
    - "9000" # Raft communication
  # REQ: BACKBEAT-OPS-002 - Health probes for liveness/readiness
  # TCP connect check against the admin port.
  healthcheck:
    test: ["CMD", "nc", "-z", "localhost", "8080"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 60s
  deploy:
    replicas: 1 # Single leader with automatic failover
    restart_policy:
      condition: on-failure
      delay: 30s # Wait longer for NATS to be ready
      max_attempts: 5
      window: 120s
    # NOTE(review): `start-first` briefly runs the old and new task side by
    # side during an update - confirm that is acceptable given the
    # single-publisher requirement above.
    update_config:
      parallelism: 1
      delay: 30s # Wait for leader election
      failure_action: pause
      monitor: 60s
      order: start-first
    placement:
      preferences:
        - spread: node.hostname
    resources:
      limits:
        memory: 256M
        cpus: '0.5'
      reservations:
        memory: 128M
        cpus: '0.25'
    # Traefik routing for admin API
    labels:
      - traefik.enable=true
      # The service is attached to both chorus_net and tengig; name the
      # network Traefik must use to reach it instead of leaving it to chance.
      - traefik.docker.network=tengig
      - traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`)
      - traefik.http.routers.backbeat-pulse.tls=true
      - traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver
      - traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080
  networks:
    - chorus_net
    - tengig # External network for Traefik
  # Container logging
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "3"
      tag: "backbeat-pulse/{{.Name}}/{{.ID}}"
# BACKBEAT Reverb Service - StatusClaim aggregator
# REQ: BACKBEAT-REQ-020 - Subscribe to INT-B and group by window_id
# REQ: BACKBEAT-OPS-001 - Reverb can scale stateless
backbeat-reverb:
  image: anthonyrawlins/backbeat-reverb:v1.0.2
  # Folded scalar: the lines below are joined into one command line.
  command: >
    ./reverb
    -cluster=chorus-production
    -nats=nats://backbeat-nats:4222
    -bar-length=8
    -log-level=info
  # Internal service ports (not externally exposed - routed via Traefik)
  expose:
    - "8080" # Admin API
  # REQ: BACKBEAT-OPS-002 - Health probes for orchestration (temporarily disabled for testing)
  # healthcheck:
  #   test: ["CMD", "nc", "-z", "localhost", "8080"]
  #   interval: 30s
  #   timeout: 10s
  #   retries: 3
  #   start_period: 60s
  deploy:
    replicas: 2 # Stateless, can scale horizontally
    restart_policy:
      condition: on-failure
      delay: 10s
      max_attempts: 3
      window: 120s
    update_config:
      parallelism: 1
      delay: 15s
      failure_action: pause
      monitor: 45s
      order: start-first
    placement:
      preferences:
        - spread: node.hostname
    resources:
      limits:
        memory: 512M # Larger for window aggregation
        cpus: '1.0'
      reservations:
        memory: 256M
        cpus: '0.5'
    # Traefik routing for admin API
    labels:
      - traefik.enable=true
      # The service is attached to both chorus_net and tengig; name the
      # network Traefik must use to reach it instead of leaving it to chance.
      - traefik.docker.network=tengig
      - traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`)
      - traefik.http.routers.backbeat-reverb.tls=true
      - traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver
      - traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080
  networks:
    - chorus_net
    - tengig # External network for Traefik
  # Container logging
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "3"
      tag: "backbeat-reverb/{{.Name}}/{{.ID}}"
# NATS message broker for BACKBEAT (a dedicated instance in this stack;
# an existing one may be used instead)
# REQ: BACKBEAT-INT-001 - Topics via NATS for at-least-once delivery
backbeat-nats:
  image: nats:2.9-alpine

  # Start the server with JetStream enabled.
  command:
    - "--jetstream"

  # Internal overlay network only; other services address it as
  # nats://backbeat-nats:4222.
  networks:
    - chorus_net

  # Rotating JSON logs: three files of at most 10 MB each.
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "3"
      tag: "nats/{{.Name}}/{{.ID}}"

  deploy:
    replicas: 1
    placement:
      preferences:
        - spread: node.hostname
    resources:
      limits:
        cpus: '0.5'
        memory: 256M
      reservations:
        cpus: '0.25'
        memory: 128M
    restart_policy:
      condition: on-failure
      delay: 10s
      max_attempts: 3
      window: 120s
# KACHING services are deployed separately in their own stack
# License validation will access https://kaching.chorus.services/api
# Named volumes. All use the local driver; most are bind mounts onto host
# directories under /rust/containers.
volumes:
  # CHORUS agent data (mounted at /app/data by the chorus service).
  chorus_data:
    driver: local

  # PostgreSQL data directory for WHOOSH.
  whoosh_postgres_data:
    driver: local
    driver_opts:
      device: /rust/containers/WHOOSH/postgres
      o: bind
      type: none

  # Redis data for WHOOSH.
  whoosh_redis_data:
    driver: local
    driver_opts:
      device: /rust/containers/WHOOSH/redis
      o: bind
      type: none

  # NOTE(review): the three monitoring volumes below are not referenced by
  # any service in this file (prometheus and grafana bind-mount the host
  # paths directly), and prometheus_data / prometheus_config point at the
  # same host directory - confirm whether they are still needed.
  prometheus_data:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/prometheus
      o: bind
      type: none

  prometheus_config:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/prometheus
      o: bind
      type: none

  grafana_data:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/grafana
      o: bind
      type: none
# Networks used by the stack.
networks:
  # Internal overlay network shared by all services; `attachable` lets
  # standalone containers join it as well.
  chorus_net:
    attachable: true
    driver: overlay

  # Pre-existing network created outside this stack; the Traefik-routed
  # services attach to it.
  tengig:
    external: true
# Config objects distributed to services.
configs:
  # Bootstrap peer list, loaded from bootstrap.json next to this file and
  # mounted into the chorus service at /config/bootstrap.json.
  chorus_bootstrap:
    file: ./bootstrap.json
# Secrets. All are external: they must already exist in the cluster before
# the stack is deployed. The key is the name services reference; `name` is
# the name of the pre-created secret.
secrets:
  # --- CHORUS ---
  chorus_license_id:
    name: chorus_license_id
    external: true
  resetdata_api_key:
    name: resetdata_api_key
    external: true

  # --- WHOOSH / PostgreSQL ---
  whoosh_db_password:
    name: whoosh_db_password
    external: true
  gitea_token:
    name: gitea_token
    external: true

  # The next four are stored under a `whoosh_`-prefixed name in the cluster.
  webhook_token:
    name: whoosh_webhook_token
    external: true
  jwt_secret:
    name: whoosh_jwt_secret
    external: true
  service_tokens:
    name: whoosh_service_tokens
    external: true
  redis_password:
    name: whoosh_redis_password
    external: true