---
# CHORUS stack definition (Compose file format 3.9, Swarm `deploy:` keys).
#
# Services defined here:
#   chorus          - agent workers (replicated, P2P on 9000/tcp+udp)
#   swoosh          - API published on 8800 and routed by Traefik
#   postgres, redis - data stores, credentials read from Swarm secrets
#   prometheus, grafana - monitoring, routed by Traefik
#   backbeat-pulse, backbeat-reverb, backbeat-nats - BACKBEAT services and
#                     the NATS server they are pointed at
#   shepherd        - updates services labelled `shepherd.autodeploy=true`
#   hmmm-monitor    - monitor pointed at the swoosh API
#
# `${VAR:-default}` values are substituted at deploy time; `$$` is an escaped
# `$` that reaches the container's shell unchanged.
version: "3.9"

services:
  chorus:
    image: localhost:5000/chorus:march8-evidence-20260226-2
    environment:
      # --- Licensing / identity ---
      - CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id
      - CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster}
      - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-http://host.docker.internal:8099}
      - CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-}
      - CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer}
      - CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3}
      - CHORUS_CAPABILITIES=general_development,task_coordination,admin_election
      # --- Listeners (8081 is the port probed by the healthcheck below) ---
      - CHORUS_API_PORT=8080
      - CHORUS_HEALTH_PORT=8081
      - CHORUS_P2P_PORT=9000
      - CHORUS_BIND_ADDRESS=0.0.0.0
      - CHORUS_MDNS_ENABLED=false
      - CHORUS_DIALS_PER_SEC=5
      - CHORUS_MAX_CONCURRENT_DHT=16
      - CHORUS_ELECTION_MIN_TERM=120s
      - CHORUS_LEADER_MIN_TERM=240s
      # --- Task assignment / bootstrap ---
      - ASSIGN_URL=${ASSIGN_URL:-}
      - TASK_SLOT=${TASK_SLOT:-}
      - TASK_ID=${TASK_ID:-}
      - NODE_ID=${NODE_ID:-}
      # NOTE(review): the container variable is WHOOSH_* but the deploy-time
      # variable read is SWOOSH_* - presumably intentional; confirm.
      - WHOOSH_API_BASE_URL=${SWOOSH_API_BASE_URL:-http://swoosh:8080}
      - WHOOSH_API_ENABLED=true
      - BOOTSTRAP_JSON=/config/bootstrap.json
      - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-}
      # --- AI providers ---
      - CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata}
      - RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://app.resetdata.ai/api/v1}
      - RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key
      - RESETDATA_MODEL=${RESETDATA_MODEL:-openai/gpt-oss-120b}
      - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434}
      - CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct}
      - CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct}
      # --- LightRAG ---
      - CHORUS_LIGHTRAG_ENABLED=${CHORUS_LIGHTRAG_ENABLED:-true}
      - CHORUS_LIGHTRAG_BASE_URL=${CHORUS_LIGHTRAG_BASE_URL:-http://host.docker.internal:9621}
      - CHORUS_LIGHTRAG_TIMEOUT=${CHORUS_LIGHTRAG_TIMEOUT:-30s}
      # NOTE(review): the fallback is a placeholder string, not a real key.
      # Set CHORUS_LIGHTRAG_API_KEY at deploy time; consider moving it to a
      # Swarm secret like the other credentials in this file.
      - CHORUS_LIGHTRAG_API_KEY=${CHORUS_LIGHTRAG_API_KEY:-your-secure-api-key-here}
      - CHORUS_LIGHTRAG_DEFAULT_MODE=${CHORUS_LIGHTRAG_DEFAULT_MODE:-hybrid}
      # --- Logging ---
      - LOG_LEVEL=${LOG_LEVEL:-info}
      - LOG_FORMAT=${LOG_FORMAT:-structured}
      # --- BACKBEAT / transport telemetry ---
      - CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true}
      - CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production}
      - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-}
      - CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222}
      - CHORUS_TRANSPORT_TELEMETRY_INTERVAL=${CHORUS_TRANSPORT_TELEMETRY_INTERVAL:-30s}
      - CHORUS_TRANSPORT_TELEMETRY_SUBJECT=${CHORUS_TRANSPORT_TELEMETRY_SUBJECT:-chorus.telemetry.transport}
      - CHORUS_TRANSPORT_METRICS_NATS_URL=${CHORUS_TRANSPORT_METRICS_NATS_URL:-}
      - CHORUS_TRANSPORT_MODE=${CHORUS_TRANSPORT_MODE:-quic_only}
      # --- Prompts / role ---
      - CHORUS_PROMPTS_DIR=/etc/chorus/prompts
      - CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md
      - CHORUS_ROLE=${CHORUS_ROLE:-arbiter}
    secrets:
      - chorus_license_id
      - resetdata_api_key
    configs:
      - source: chorus_bootstrap
        target: /config/bootstrap.json
    volumes:
      - chorus_data:/app/data
      - /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro
      - /rust/containers/CHORUS/models.yaml:/app/configs/models.yaml:ro
    ports:
      # The same host port is published for both TCP and UDP.
      - "${CHORUS_P2P_PORT:-9000}:9000/tcp"
      - "${CHORUS_P2P_PORT:-9000}:9000/udp"
    deploy:
      labels:
        - shepherd.autodeploy=true
      mode: replicated
      replicas: ${CHORUS_REPLICAS:-20}
      update_config:
        parallelism: 1
        delay: 10s
        failure_action: pause
        order: start-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          cpus: "${CHORUS_CPU_LIMIT:-1.0}"
          memory: "${CHORUS_MEMORY_LIMIT:-4G}"
        reservations:
          cpus: "0.2"
          memory: 128M
      placement:
        preferences:
          - spread: node.hostname
    networks:
      - tengig
      - chorus_ipvlan
    extra_hosts:
      # Makes the host.docker.internal URLs used above resolvable.
      - "host.docker.internal:host-gateway"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8081/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Increased from 10s to allow P2P mesh formation (15s bootstrap + margin)
      start_period: 30s

  swoosh:
    image: anthonyrawlins/swoosh:1.0.2
    ports:
      - target: 8080
        published: 8800
        protocol: tcp
        mode: ingress
    environment:
      - SWOOSH_LISTEN_ADDR=:8080
      - SWOOSH_WAL_DIR=/data/wal
      - SWOOSH_SNAPSHOT_PATH=/data/snapshots/latest.json
    volumes:
      - swoosh_data:/data
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      update_config:
        parallelism: 1
        delay: 10s
        failure_action: pause
        monitor: 60s
        order: start-first
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'
      labels:
        - traefik.enable=true
        - traefik.docker.network=tengig
        - traefik.http.routers.swoosh.rule=Host(`swoosh.chorus.services`)
        - traefik.http.routers.swoosh.entrypoints=web,web-secured
        - traefik.http.routers.swoosh.tls=true
        - traefik.http.routers.swoosh.tls.certresolver=letsencryptresolver
        - traefik.http.services.swoosh.loadbalancer.server.port=8080
        - shepherd.autodeploy=true
        - traefik.http.services.swoosh.loadbalancer.passhostheader=true
    networks:
      - tengig
      - chorus_ipvlan
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "-O"
        - "/dev/null"
        - "http://localhost:8080/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  postgres:
    image: postgres:15-alpine
    environment:
      - POSTGRES_DB=whoosh
      - POSTGRES_USER=whoosh
      - POSTGRES_PASSWORD_FILE=/run/secrets/whoosh_db_password
      - POSTGRES_INITDB_ARGS=--auth-host=scram-sha-256
    secrets:
      - whoosh_db_password
    volumes:
      - whoosh_postgres_data:/var/lib/postgresql/data
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      # placement:
      #   constraints:
      #     - node.hostname == ironwood
      resources:
        limits:
          memory: 512M
          cpus: '1.0'
        reservations:
          memory: 256M
          cpus: '0.5'
    networks:
      - tengig
      - chorus_ipvlan
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U whoosh -d whoosh"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 40s

  redis:
    image: redis:7-alpine
    # The password is read from the mounted secret when the container starts.
    command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes'
    secrets:
      - redis_password
    volumes:
      - whoosh_redis_data:/data
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 128M
          cpus: '0.25'
        reservations:
          memory: 64M
          cpus: '0.1'
    networks:
      - chorus_ipvlan
    healthcheck:
      # The command substitution is double-quoted, as in `command` above, so
      # a password containing whitespace or glob characters is passed to
      # redis-cli as a single argument instead of being word-split.
      test:
        - "CMD"
        - "sh"
        - "-c"
        - 'redis-cli --no-auth-warning -a "$$(cat /run/secrets/redis_password)" ping'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    volumes:
      # NOTE(review): the host directory mounted as the TSDB path (/prometheus)
      # is the same directory that holds prometheus.yml - confirm this sharing
      # is intended.
      - /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - /rust/containers/CHORUS/monitoring/prometheus:/prometheus
      - /rust/containers/CHORUS/observability/prometheus/alerts:/etc/prometheus/alerts:ro
    ports:
      - "9099:9090"
    deploy:
      replicas: 1
      labels:
        - traefik.enable=true
        # Attached to two networks below; pin the one Traefik should use,
        # matching the swoosh service.
        - traefik.docker.network=tengig
        - traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)
        - traefik.http.routers.prometheus.entrypoints=web,web-secured
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver
        - traefik.http.services.prometheus.loadbalancer.server.port=9090
        - shepherd.autodeploy=true
    networks:
      - chorus_ipvlan
      - tengig
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:9090/-/ready"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  grafana:
    image: grafana/grafana:latest
    user: "1000:1000"
    environment:
      # NOTE(review): falls back to the password `admin` when
      # GRAFANA_ADMIN_PASSWORD is unset, and this service is routed publicly
      # via Traefik - make sure the variable is always set at deploy time.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_SERVER_ROOT_URL=https://grafana.chorus.services
    volumes:
      - /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana
    ports:
      - "3300:3000"
    deploy:
      replicas: 1
      labels:
        - traefik.enable=true
        # Attached to two networks below; pin the one Traefik should use,
        # matching the swoosh service.
        - traefik.docker.network=tengig
        - traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)
        - traefik.http.routers.grafana.entrypoints=web,web-secured
        - traefik.http.routers.grafana.tls=true
        - traefik.http.routers.grafana.tls.certresolver=letsencryptresolver
        - traefik.http.services.grafana.loadbalancer.server.port=3000
        - shepherd.autodeploy=true
    networks:
      - chorus_ipvlan
      - tengig
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:3000/api/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  backbeat-pulse:
    image: docker.io/anthonyrawlins/backbeat-pulse:latest
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    command: >
      ./pulse
      -cluster=chorus-production
      -admin-port=8080
      -raft-bind=0.0.0.0:9000
      -data-dir=/data
      -nats=nats://backbeat-nats:4222
      -tempo=2
      -bar-length=8
      -log-level=info
    expose:
      - "8080"
      - "9000"
    healthcheck:
      # TCP connect check against the admin port (8080).
      test: ["CMD", "nc", "-z", "localhost", "8080"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 30s
        max_attempts: 5
        window: 120s
      update_config:
        parallelism: 1
        delay: 30s
        failure_action: pause
        monitor: 60s
        order: start-first
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'
      labels:
        - traefik.enable=true
        # Attached to two networks below; pin the one Traefik should use,
        # matching the swoosh service.
        - traefik.docker.network=tengig
        - traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`)
        - traefik.http.routers.backbeat-pulse.tls=true
        - traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver
        - traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080
    networks:
      - chorus_ipvlan
      - tengig
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "backbeat-pulse/{{.Name}}/{{.ID}}"

  backbeat-reverb:
    image: docker.io/anthonyrawlins/backbeat-reverb:latest
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    command: >
      ./reverb
      -cluster=chorus-production
      -nats=nats://backbeat-nats:4222
      -bar-length=8
      -log-level=info
    expose:
      - "8080"
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 120s
      update_config:
        parallelism: 1
        delay: 15s
        failure_action: pause
        monitor: 45s
        order: start-first
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 512M
          cpus: '1.0'
        reservations:
          memory: 256M
          cpus: '0.5'
      labels:
        - traefik.enable=true
        # Attached to two networks below; pin the one Traefik should use,
        # matching the swoosh service.
        - traefik.docker.network=tengig
        - traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`)
        - traefik.http.routers.backbeat-reverb.tls=true
        - traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver
        - traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080
    networks:
      - chorus_ipvlan
      - tengig
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "backbeat-reverb/{{.Name}}/{{.ID}}"

  backbeat-nats:
    image: nats:2.9-alpine
    command: ["--jetstream"]
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 120s
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'
    networks:
      - chorus_ipvlan
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "nats/{{.Name}}/{{.ID}}"

  shepherd:
    image: containrrr/shepherd:latest
    environment:
      SLEEP_TIME: "5m"
      # Only services carrying this label are selected.
      FILTER_SERVICES: "label=shepherd.autodeploy=true"
      WITH_REGISTRY_AUTH: "true"
      ROLLBACK_ON_FAILURE: "true"
      TZ: "UTC"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    deploy:
      replicas: 1
      restart_policy:
        condition: any
      placement:
        constraints:
          - node.role == manager

  hmmm-monitor:
    image: docker.io/anthonyrawlins/hmmm-monitor:latest
    environment:
      - WHOOSH_API_BASE_URL=http://swoosh:8080
    ports:
      - "9001:9001"
    deploy:
      labels:
        - shepherd.autodeploy=true
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          memory: 128M
          cpus: '0.25'
        reservations:
          memory: 64M
          cpus: '0.1'
    networks:
      - chorus_ipvlan
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "hmmm-monitor/{{.Name}}/{{.ID}}"

# Named volumes. Except for chorus_data, each is a bind of a host directory.
# NOTE(review): prometheus_data, prometheus_config, grafana_data and whoosh_ui
# are not referenced by any service visible in this file (prometheus and
# grafana bind-mount host paths directly), and prometheus_data and
# prometheus_config point at the same device - confirm whether they are still
# needed.
volumes:
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/CHORUS/monitoring/prometheus
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/CHORUS/monitoring/prometheus
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/CHORUS/monitoring/grafana
  chorus_data:
    driver: local
  swoosh_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/SWOOSH/data
  whoosh_postgres_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/WHOOSH/postgres
  whoosh_redis_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/WHOOSH/redis
  whoosh_ui:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /rust/containers/WHOOSH/ui

# Both networks are declared external, so they must exist before deployment.
networks:
  tengig:
    external: true
  chorus_ipvlan:
    external: true

configs:
  chorus_bootstrap:
    file: ./bootstrap.json

# All secrets are external; `name` is the name looked up in the swarm, which
# for several entries differs from the key used inside this file.
# NOTE(review): gitea_token, webhook_token, jwt_secret and service_tokens are
# not referenced by any service visible in this file.
secrets:
  chorus_license_id:
    external: true
    name: chorus_license_id
  resetdata_api_key:
    external: true
    name: resetdata_api_key_v2
  whoosh_db_password:
    external: true
    name: whoosh_db_password
  gitea_token:
    external: true
    name: gitea_token
  webhook_token:
    external: true
    name: whoosh_webhook_token
  jwt_secret:
    external: true
    name: whoosh_jwt_secret_v4
  service_tokens:
    external: true
    name: whoosh_service_tokens
  redis_password:
    external: true
    name: whoosh_redis_password