# CHORUS/docker/docker-compose.yml

# Compose file format version declared for this stack definition.
version: "3.9"

services:
  chorus:
    image: anthonyrawlins/chorus:v0.5.4-p2p-fix

    # Environment for the CHORUS agent. Entries using the ":-" form read the
    # variable from the deploy environment and fall back to the text after
    # ":-" (an empty fallback yields an empty value).
    environment:
      # -- Licensing: required, CHORUS will not start without it --
      - CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id
      - CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster}
      - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-https://kaching.chorus.services/api}

      # -- Agent settings --
      # The agent ID is auto-generated when not provided.
      - CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-}
      - CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer}
      - CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3}
      - CHORUS_CAPABILITIES=general_development,task_coordination,admin_election

      # -- Network listeners --
      - CHORUS_API_PORT=8080
      - CHORUS_HEALTH_PORT=8081
      - CHORUS_P2P_PORT=9000
      - CHORUS_BIND_ADDRESS=0.0.0.0

      # -- Scaling optimizations (as per WHOOSH issue #7) --
      # mDNS is disabled for container/swarm environments.
      - CHORUS_MDNS_ENABLED=false
      # Rate limit on outbound connections, to prevent storms.
      - CHORUS_DIALS_PER_SEC=5
      # Limit on concurrent DHT queries.
      - CHORUS_MAX_CONCURRENT_DHT=16

      # -- Election stability windows (medium-risk fix 2.1) --
      # Minimum time between elections, to prevent churn.
      - CHORUS_ELECTION_MIN_TERM=30s
      # Minimum time before a healthy leader may be challenged.
      - CHORUS_LEADER_MIN_TERM=45s

      # -- Runtime assignment system (medium-risk fix 2.2); all optional --
      # WHOOSH assignment endpoint.
      - ASSIGN_URL=${ASSIGN_URL:-}
      # Task slot identifier.
      - TASK_SLOT=${TASK_SLOT:-}
      # Task identifier.
      - TASK_ID=${TASK_ID:-}
      # Node identifier.
      - NODE_ID=${NODE_ID:-}

      # -- Bootstrap pool (JSON and CSV are both supported) --
      # Optional JSON bootstrap config; path matches the config target below.
      - BOOTSTRAP_JSON=/config/bootstrap.json
      # CSV fallback.
      - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-}

      # -- AI provider selection --
      - CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata}

      # ResetData (default provider)
      - RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://models.au-syd.resetdata.ai/v1}
      - RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key
      - RESETDATA_MODEL=${RESETDATA_MODEL:-meta/llama-3.1-8b-instruct}

      # Ollama (alternative provider)
      - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434}

      # -- Models --
      - CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct}
      - CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct}

      # -- Logging --
      - LOG_LEVEL=${LOG_LEVEL:-info}
      - LOG_FORMAT=${LOG_FORMAT:-structured}

      # -- BACKBEAT --
      - CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true}
      - CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production}
      # Auto-generated from CHORUS_AGENT_ID when left empty.
      - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-}
      - CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222}

      # -- Prompt sourcing (paths inside the mounted prompts directory) --
      - CHORUS_PROMPTS_DIR=/etc/chorus/prompts
      - CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md
      - CHORUS_ROLE=${CHORUS_ROLE:-arbiter}

    # Docker secrets holding the sensitive values referenced by the *_FILE
    # variables above.
    secrets:
      - chorus_license_id
      - resetdata_api_key

    # Configuration files injected into the container
    configs:
      - source: chorus_bootstrap
        target: /config/bootstrap.json

    volumes:
      # Persistent data storage
      - chorus_data:/app/data
      # Prompts directory (role YAMLs and defaults.md), mounted read-only
      - /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro

    ports:
      # P2P communication; the published port is configurable, the container
      # port is fixed at 9000.
      - "${CHORUS_P2P_PORT:-9000}:9000"

    networks:
      - chorus_net

    # Host resolution for external services
    extra_hosts:
      - "host.docker.internal:host-gateway"

    # Probes the health endpoint on port 8081 (CHORUS_HEALTH_PORT above).
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8081/health"
      start_period: 10s
      interval: 30s
      timeout: 10s
      retries: 3

    # Container logging: size-capped, rotated JSON log files
    logging:
      driver: 'json-file'
      options:
        max-size: '10m'
        max-file: '3'
        tag: '{{.ImageName}}/{{.Name}}/{{.ID}}'

    # Scheduling, rollout, restart and resource settings
    deploy:
      mode: replicated
      replicas: ${CHORUS_REPLICAS:-9}
      placement:
        # Never schedule on the host named "acacia"; otherwise spread
        # replicas across hostnames.
        constraints:
          - node.hostname != acacia
        preferences:
          - spread: node.hostname
      resources:
        reservations:
          cpus: "0.1"
          memory: 128M
        limits:
          cpus: "${CHORUS_CPU_LIMIT:-1.0}"
          memory: "${CHORUS_MEMORY_LIMIT:-1G}"
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      # Roll one task at a time, starting the new task before stopping the
      # old one, and pause the rollout on failure.
      update_config:
        order: start-first
        parallelism: 1
        delay: 10s
        failure_action: pause
      # CHORUS is internal-only, no Traefik labels needed

  # WHOOSH service: published on port 8800 (container port 8080) and routed
  # by Traefik as whoosh.chorus.services.
  whoosh:
    image: anthonyrawlins/whoosh:scaling-v1.0.0
    ports:
      - target: 8080
        published: 8800
        protocol: tcp
        mode: ingress
    # All values are quoted so they reach the container as the exact strings
    # written here (no YAML number/boolean retyping).
    environment:
      # Database configuration
      WHOOSH_DATABASE_DB_HOST: postgres
      WHOOSH_DATABASE_DB_PORT: "5432"
      WHOOSH_DATABASE_DB_NAME: whoosh
      WHOOSH_DATABASE_DB_USER: whoosh
      WHOOSH_DATABASE_DB_PASSWORD_FILE: /run/secrets/whoosh_db_password
      WHOOSH_DATABASE_DB_SSL_MODE: disable
      WHOOSH_DATABASE_DB_AUTO_MIGRATE: "true"

      # Server configuration
      WHOOSH_SERVER_LISTEN_ADDR: ":8080"
      WHOOSH_SERVER_READ_TIMEOUT: "30s"
      WHOOSH_SERVER_WRITE_TIMEOUT: "30s"
      WHOOSH_SERVER_SHUTDOWN_TIMEOUT: "30s"

      # GITEA configuration
      WHOOSH_GITEA_BASE_URL: https://gitea.chorus.services
      WHOOSH_GITEA_TOKEN_FILE: /run/secrets/gitea_token
      WHOOSH_GITEA_WEBHOOK_TOKEN_FILE: /run/secrets/webhook_token
      WHOOSH_GITEA_WEBHOOK_PATH: /webhooks/gitea

      # Auth configuration
      WHOOSH_AUTH_JWT_SECRET_FILE: /run/secrets/jwt_secret
      WHOOSH_AUTH_SERVICE_TOKENS_FILE: /run/secrets/service_tokens
      WHOOSH_AUTH_JWT_EXPIRY: "24h"

      # Logging
      WHOOSH_LOGGING_LEVEL: debug
      WHOOSH_LOGGING_ENVIRONMENT: production

      # Redis configuration
      WHOOSH_REDIS_ENABLED: "true"
      WHOOSH_REDIS_HOST: redis
      WHOOSH_REDIS_PORT: "6379"
      WHOOSH_REDIS_PASSWORD_FILE: /run/secrets/redis_password
      WHOOSH_REDIS_DATABASE: "0"

      # Scaling system configuration
      WHOOSH_SCALING_KACHING_URL: "https://kaching.chorus.services"
      WHOOSH_SCALING_BACKBEAT_URL: "http://backbeat-pulse:8080"
      WHOOSH_SCALING_CHORUS_URL: "http://chorus:9000"

      # BACKBEAT integration configuration (temporarily disabled)
      WHOOSH_BACKBEAT_ENABLED: "false"
      WHOOSH_BACKBEAT_CLUSTER_ID: "chorus-production"
      WHOOSH_BACKBEAT_AGENT_ID: "whoosh"
      WHOOSH_BACKBEAT_NATS_URL: "nats://backbeat-nats:4222"

    # Secrets backing the *_FILE variables above (mounted under /run/secrets)
    secrets:
      - whoosh_db_password
      - gitea_token
      - webhook_token
      - jwt_secret
      - service_tokens
      - redis_password
    volumes:
      # Gives the container access to the local Docker daemon socket.
      # NOTE(review): if WHOOSH uses this for Swarm service management it
      # would need to run on a manager node; the placement below does not
      # enforce that - confirm.
      - /var/run/docker.sock:/var/run/docker.sock
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      update_config:
        parallelism: 1
        delay: 10s
        failure_action: pause
        monitor: 60s
        order: start-first
      # rollback_config:
      #   parallelism: 1
      #   delay: 0s
      #   failure_action: pause
      #   monitor: 60s
      #   order: stop-first
      placement:
        constraints:
          - node.hostname != acacia
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'
      labels:
        - traefik.enable=true
        # Route via the external tengig network; this service is attached to
        # two networks.
        - traefik.docker.network=tengig
        - traefik.http.routers.whoosh.rule=Host(`whoosh.chorus.services`)
        - traefik.http.routers.whoosh.tls=true
        - traefik.http.routers.whoosh.tls.certresolver=letsencryptresolver
        # Router and service names must be "whoosh" so these settings apply to
        # this service's own router/service rather than defining another one.
        - traefik.http.routers.whoosh.entrypoints=web,web-secured
        - traefik.http.services.whoosh.loadbalancer.server.port=8080
        - traefik.http.services.whoosh.loadbalancer.passhostheader=true
        # Every "$" in the password hash is doubled so Compose variable
        # interpolation passes a literal "$" through. The hash is a
        # placeholder, and this middleware is only defined here - no router
        # label attaches it.
        - traefik.http.middlewares.whoosh-auth.basicauth.users=admin:$$2y$$10$$example_hash
    networks:
      - tengig
      - chorus_net
    # Runs the service binary's own health-check mode.
    healthcheck:
      test: ["CMD", "/app/whoosh", "--health-check"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # PostgreSQL database for WHOOSH (database and user are both "whoosh").
  postgres:
    image: postgres:15-alpine
    environment:
      POSTGRES_DB: whoosh
      POSTGRES_USER: whoosh
      # Password is read from the Docker secret rather than set inline.
      POSTGRES_PASSWORD_FILE: /run/secrets/whoosh_db_password
      POSTGRES_INITDB_ARGS: "--auth-host=scram-sha-256"
    secrets:
      - whoosh_db_password
    volumes:
      # Database files live on the named volume.
      - whoosh_postgres_data:/var/lib/postgresql/data
    networks:
      - chorus_net
    # Healthy once pg_isready reports the whoosh database accepts connections.
    healthcheck:
      test:
        - "CMD-SHELL"
        - "pg_isready -h localhost -p 5432 -U whoosh -d whoosh"
      start_period: 40s
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      replicas: 1
      placement:
        preferences:
          - spread: node.hostname
      resources:
        reservations:
          cpus: "0.5"
          memory: 256M
        limits:
          cpus: "1.0"
          memory: 512M
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s


  # Redis instance for WHOOSH, password-protected via a Docker secret and
  # started with append-only persistence.
  redis:
    image: redis:7-alpine
    # "$$" is the Compose escape for a literal "$", so the shell inside the
    # container performs the $(cat ...) substitution and reads the password
    # from the mounted secret at start-up.
    command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes'
    secrets:
      - redis_password
    volumes:
      - whoosh_redis_data:/data
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 128M
          cpus: '0.25'
        reservations:
          memory: 64M
          cpus: '0.1'
    networks:
      - chorus_net
    # Authenticated PING. The command substitution is double-quoted, matching
    # the server command above, so a password containing whitespace or glob
    # characters is passed to redis-cli as a single argument.
    healthcheck:
      test: ["CMD", "sh", "-c", 'redis-cli --no-auth-warning -a "$$(cat /run/secrets/redis_password)" ping']
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s


  # Prometheus: published directly on port 9099 and routed by Traefik as
  # prometheus.chorus.services.
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    volumes:
      # Host bind mounts: the config file is read-only; the TSDB storage path
      # is the host directory that also contains that config file.
      - /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - /rust/containers/CHORUS/monitoring/prometheus:/prometheus
    ports:
      - "9099:9090" # Expose Prometheus UI
    deploy:
      replicas: 1
      labels:
        - traefik.enable=true
        # This service is attached to two networks; name the one Traefik must
        # use so it does not pick the chorus_net address.
        - traefik.docker.network=tengig
        - traefik.http.routers.prometheus.rule=Host(`prometheus.chorus.services`)
        - traefik.http.routers.prometheus.entrypoints=web,web-secured
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver
        - traefik.http.services.prometheus.loadbalancer.server.port=9090
    networks:
      - chorus_net
      - tengig
    # Polls Prometheus' readiness endpoint.
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # Grafana: published directly on port 3300 and routed by Traefik as
  # grafana.chorus.services.
  grafana:
    image: grafana/grafana:latest
    # Runs as UID/GID 1000.
    user: "1000:1000"
    environment:
      # Falls back to "admin" when GRAFANA_ADMIN_PASSWORD is unset - use a
      # strong password in production, since this service is reachable through
      # Traefik and a published port.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_SERVER_ROOT_URL=https://grafana.chorus.services
    volumes:
      # Host bind mount for Grafana's data directory.
      - /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana
    ports:
      - "3300:3000" # Expose Grafana UI
    deploy:
      replicas: 1
      labels:
        - traefik.enable=true
        # This service is attached to two networks; name the one Traefik must
        # use so it does not pick the chorus_net address.
        - traefik.docker.network=tengig
        - traefik.http.routers.grafana.rule=Host(`grafana.chorus.services`)
        - traefik.http.routers.grafana.entrypoints=web,web-secured
        - traefik.http.routers.grafana.tls=true
        - traefik.http.routers.grafana.tls.certresolver=letsencryptresolver
        - traefik.http.services.grafana.loadbalancer.server.port=3000
    networks:
      - chorus_net
      - tengig
    # Polls Grafana's health API.
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # BACKBEAT Pulse Service - Leader-elected tempo broadcaster
  # REQ: BACKBEAT-REQ-001 - Single BeatFrame publisher per cluster
  # REQ: BACKBEAT-OPS-001 - One replica prefers leadership
  backbeat-pulse:
    image: anthonyrawlins/backbeat-pulse:v1.0.5
    # Folded scalar: the lines below are joined with spaces into one command
    # line. The admin port and Raft bind port match the "expose" entries.
    command: >
      ./pulse
      -cluster=chorus-production
      -admin-port=8080
      -raft-bind=0.0.0.0:9000
      -data-dir=/data
      -nats=nats://backbeat-nats:4222
      -tempo=2
      -bar-length=8
      -log-level=info

    # Internal service ports (not externally exposed - routed via Traefik)
    expose:
      - "8080"  # Admin API
      - "9000"  # Raft communication

    # REQ: BACKBEAT-OPS-002 - Health probes for liveness/readiness
    # Checks only that the admin port accepts TCP connections.
    healthcheck:
      test: ["CMD", "nc", "-z", "localhost", "8080"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

    deploy:
      replicas: 1              # Single leader with automatic failover
      restart_policy:
        condition: on-failure
        delay: 30s             # Wait longer for NATS to be ready
        max_attempts: 5
        window: 120s
      update_config:
        parallelism: 1
        delay: 30s             # Wait for leader election
        failure_action: pause
        monitor: 60s
        order: start-first
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'
      # Traefik routing for admin API
      labels:
        - traefik.enable=true
        # This service is attached to two networks; name the one Traefik must
        # use so it does not pick the chorus_net address.
        - traefik.docker.network=tengig
        - traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`)
        - traefik.http.routers.backbeat-pulse.tls=true
        - traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver
        - traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080

    networks:
      - chorus_net
      - tengig              # External network for Traefik

    # Container logging
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "backbeat-pulse/{{.Name}}/{{.ID}}"

  # BACKBEAT Reverb Service - StatusClaim aggregator
  # REQ: BACKBEAT-REQ-020 - Subscribe to INT-B and group by window_id
  # REQ: BACKBEAT-OPS-001 - Reverb can scale stateless
  backbeat-reverb:
    image: anthonyrawlins/backbeat-reverb:v1.0.2
    # Folded scalar: the lines below are joined with spaces into one command
    # line.
    command: >
      ./reverb
      -cluster=chorus-production
      -nats=nats://backbeat-nats:4222
      -bar-length=8
      -log-level=info

    # Internal service ports (not externally exposed - routed via Traefik)
    expose:
      - "8080"  # Admin API

    # REQ: BACKBEAT-OPS-002 - Health probes for orchestration (temporarily disabled for testing)
    # healthcheck:
    #   test: ["CMD", "nc", "-z", "localhost", "8080"]
    #   interval: 30s
    #   timeout: 10s
    #   retries: 3
    #   start_period: 60s

    deploy:
      replicas: 2              # Stateless, can scale horizontally
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 120s
      update_config:
        parallelism: 1
        delay: 15s
        failure_action: pause
        monitor: 45s
        order: start-first
      placement:
        preferences:
          - spread: node.hostname
      resources:
        limits:
          memory: 512M         # Larger for window aggregation
          cpus: '1.0'
        reservations:
          memory: 256M
          cpus: '0.5'
      # Traefik routing for admin API
      labels:
        - traefik.enable=true
        # This service is attached to two networks; name the one Traefik must
        # use so it does not pick the chorus_net address.
        - traefik.docker.network=tengig
        - traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`)
        - traefik.http.routers.backbeat-reverb.tls=true
        - traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver
        - traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080

    networks:
      - chorus_net
      - tengig              # External network for Traefik

    # Container logging
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        tag: "backbeat-reverb/{{.Name}}/{{.ID}}"

  # NATS message broker for BACKBEAT - use an existing instance or deploy
  # this dedicated one.
  # REQ: BACKBEAT-INT-001 - Topics via NATS for at-least-once delivery
  backbeat-nats:
    image: nats:2.9-alpine
    # Start the server with the JetStream flag.
    command:
      - "--jetstream"
    networks:
      - chorus_net
    # Container logging: size-capped, rotated JSON log files
    logging:
      driver: 'json-file'
      options:
        max-size: '10m'
        max-file: '3'
        tag: 'nats/{{.Name}}/{{.ID}}'
    deploy:
      replicas: 1
      placement:
        preferences:
          - spread: node.hostname
      resources:
        reservations:
          cpus: "0.25"
          memory: 128M
        limits:
          cpus: "0.5"
          memory: 256M
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 120s

  # KACHING services are deployed separately in their own stack
  # License validation will access https://kaching.chorus.services/api

# Persistent volumes. Except for chorus_data, each one is a local-driver bind
# of a fixed host directory.
volumes:
  # The three monitoring volumes below are not referenced by any service
  # visible in this file; the prometheus and grafana services bind-mount the
  # same host paths directly. prometheus_data and prometheus_config point at
  # the same directory.
  prometheus_data:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/prometheus
      o: bind
      type: "none"
  prometheus_config:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/prometheus
      o: bind
      type: "none"
  grafana_data:
    driver: local
    driver_opts:
      device: /rust/containers/CHORUS/monitoring/grafana
      o: bind
      type: "none"

  # Plain local volume, mounted by the chorus service at /app/data.
  chorus_data:
    driver: local

  # WHOOSH database and cache storage.
  whoosh_postgres_data:
    driver: local
    driver_opts:
      device: /rust/containers/WHOOSH/postgres
      o: bind
      type: "none"
  whoosh_redis_data:
    driver: local
    driver_opts:
      device: /rust/containers/WHOOSH/redis
      o: bind
      type: "none"


# Networks used by the stack
networks:
  # Pre-existing network managed outside this file; services with Traefik
  # labels attach to it.
  tengig:
    external: true

  # Attachable overlay network for CHORUS communication between services.
  chorus_net:
    attachable: true
    driver: overlay


# Config objects; chorus_bootstrap is loaded from the bootstrap.json file
# next to this compose file and mounted by the chorus service.
configs:
  chorus_bootstrap:
    file: "./bootstrap.json"

# Secrets: all are external (they must already exist outside this file).
# The key is the name services use; "name" is the externally created secret.
secrets:
  # CHORUS agent secrets
  chorus_license_id:
    name: chorus_license_id
    external: true
  resetdata_api_key:
    name: resetdata_api_key
    external: true

  # WHOOSH secrets; several map a short key to a whoosh_-prefixed secret.
  whoosh_db_password:
    name: whoosh_db_password
    external: true
  gitea_token:
    name: gitea_token
    external: true
  webhook_token:
    name: whoosh_webhook_token
    external: true
  jwt_secret:
    name: whoosh_jwt_secret
    external: true
  service_tokens:
    name: whoosh_service_tokens
    external: true
  redis_password:
    name: whoosh_redis_password
    external: true