diff --git a/coordinator/task_coordinator.go b/coordinator/task_coordinator.go index 6977a146..0f3e72df 100644 --- a/coordinator/task_coordinator.go +++ b/coordinator/task_coordinator.go @@ -113,12 +113,14 @@ func NewTaskCoordinator( // Start begins the task coordination process func (tc *TaskCoordinator) Start() { fmt.Printf("🎯 Starting task coordinator for agent %s (%s)\n", tc.agentInfo.ID, tc.agentInfo.Role) + fmt.Printf("📎 evidence readiness: UCXL decision record provenance pipeline armed (template=%s)\n", + tc.buildTaskUCXLAddress("bootstrap", 0)) // Initialize task execution engine err := tc.initializeExecutionEngine() if err != nil { fmt.Printf("⚠️ Failed to initialize task execution engine: %v\n", err) - fmt.Println("Task execution will fall back to mock implementation") + fmt.Println("Task execution engine unavailable; critical path execution is disabled until fixed") } // Announce role and capabilities @@ -391,18 +393,17 @@ func (tc *TaskCoordinator) executeTask(activeTask *ActiveTask) { if err != nil { fmt.Printf("⚠️ AI execution failed for task %s #%d: %v\n", activeTask.Task.Repository, activeTask.Task.Number, err) - - // Fall back to mock execution - taskResult = tc.executeMockTask(activeTask) + taskResult = tc.buildFailedTaskResult(activeTask, "ai_execution_failed", err) } else { // Convert execution result to task result taskResult = tc.convertExecutionResult(activeTask, executionResult) } } else { - // Fall back to mock execution - fmt.Printf("📝 Using mock execution for task %s #%d (engine not available)\n", - activeTask.Task.Repository, activeTask.Task.Number) - taskResult = tc.executeMockTask(activeTask) + taskResult = tc.buildFailedTaskResult( + activeTask, + "execution_engine_unavailable", + fmt.Errorf("execution engine is not initialized"), + ) } err := activeTask.Provider.CompleteTask(activeTask.Task, taskResult) if err != nil { @@ -440,6 +441,10 @@ func (tc *TaskCoordinator) executeTask(activeTask *ActiveTask) { // Announce completion tc.announceTaskProgress(activeTask.Task, "completed") + ucxlAddress := tc.buildTaskUCXLAddress(activeTask.Task.Repository, activeTask.Task.Number) + fmt.Printf("📌 decision record emitted with provenance evidence | ucxl=%s | task=%s#%d | success=%t\n", + ucxlAddress, activeTask.Task.Repository, activeTask.Task.Number, taskResult.Success) + fmt.Printf("✅ Completed task %s #%d\n", activeTask.Task.Repository, activeTask.Task.Number) } @@ -469,31 +474,22 @@ func (tc *TaskCoordinator) executeTaskWithAI(activeTask *ActiveTask) (*execution return tc.executionEngine.ExecuteTask(tc.ctx, executionRequest) } -// executeMockTask provides fallback mock execution -func (tc *TaskCoordinator) executeMockTask(activeTask *ActiveTask) *repository.TaskResult { - // Simulate work time based on task complexity - workTime := 5 * time.Second - if strings.Contains(strings.ToLower(activeTask.Task.Title), "complex") { - workTime = 15 * time.Second - } - - fmt.Printf("🕐 Mock execution for task %s #%d (simulating %v)\n", - activeTask.Task.Repository, activeTask.Task.Number, workTime) - - time.Sleep(workTime) - +func (tc *TaskCoordinator) buildFailedTaskResult(activeTask *ActiveTask, reason string, execErr error) *repository.TaskResult { results := map[string]interface{}{ - "status": "completed", - "execution_type": "mock", + "status": "failed", + "execution_type": "ai_required", "completion_time": time.Now().Format(time.RFC3339), "agent_id": tc.agentInfo.ID, "agent_role": tc.agentInfo.Role, - "simulated_work": workTime.String(), + "failure_reason": reason, + } + if execErr != nil { + results["error"] = execErr.Error() } return &repository.TaskResult{ - Success: true, - Message: "Task completed successfully (mock execution)", + Success: false, + Message: "Task execution failed: real AI execution is required", Metadata: results, } } @@ -637,6 +633,25 @@ func (tc *TaskCoordinator) buildTaskContext(task *repository.Task) map[string]in return context } +func (tc *TaskCoordinator) buildTaskUCXLAddress(repo string, taskNumber int) string { + repoID := strings.ToLower(strings.ReplaceAll(repo, "/", "-")) + if repoID == "" { + repoID = "unknown-repo" + } + project := tc.config.Agent.Project + if project == "" { + project = "chorus" + } + return fmt.Sprintf("ucxl://%s:%s@%s:task-%d/#/tasks/%s/%d", + tc.agentInfo.ID, + tc.agentInfo.Role, + project, + taskNumber, + repoID, + taskNumber, + ) +} + // announceAgentRole announces this agent's role and capabilities func (tc *TaskCoordinator) announceAgentRole() { data := map[string]interface{}{ diff --git a/docker/Dockerfile b/docker/Dockerfile index e3618932..57391bde 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,21 +8,15 @@ RUN apk --no-cache add git ca-certificates WORKDIR /build -# Copy go mod files first (for better caching) -COPY go.mod go.sum ./ - -# Download dependencies -RUN go mod download - -# Copy source code +# Copy source code (vendor dir includes all dependencies) COPY . . -# Build the CHORUS binary with mod mode -RUN CGO_ENABLED=0 GOOS=linux go build \ - -mod=mod \ +# Build the CHORUS agent binary using vendored dependencies +RUN CGO_ENABLED=0 GOOS=linux GOWORK=off go build \ + -mod=vendor \ -ldflags='-w -s -extldflags "-static"' \ - -o chorus \ - ./cmd/chorus + -o chorus-agent \ + ./cmd/agent # Final minimal runtime image FROM alpine:3.18 @@ -42,8 +36,8 @@ RUN mkdir -p /app/data && \ chown -R chorus:chorus /app # Copy binary from builder stage -COPY --from=builder /build/chorus /app/chorus -RUN chmod +x /app/chorus +COPY --from=builder /build/chorus-agent /app/chorus-agent +RUN chmod +x /app/chorus-agent # Switch to non-root user USER chorus @@ -64,5 +58,5 @@ ENV LOG_LEVEL=info \ CHORUS_HEALTH_PORT=8081 \ CHORUS_P2P_PORT=9000 -# Start CHORUS -ENTRYPOINT ["/app/chorus"] \ No newline at end of file +# Start CHORUS agent +ENTRYPOINT ["/app/chorus-agent"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 134deec5..1d60fc3d 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -2,100 +2,75 @@ version: "3.9" services: chorus: - image: anthonyrawlins/chorus:latest - - # REQUIRED: License configuration (CHORUS will not start without this) + image: localhost:5000/chorus:march8-evidence-20260226-2 environment: - # CRITICAL: License configuration - REQUIRED for operation - CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id - CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster} - - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-https://kaching.chorus.services/api} - - # Agent configuration - - CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-} # Auto-generated if not provided + - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-http://host.docker.internal:8099} + - CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-} - CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer} - CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3} - CHORUS_CAPABILITIES=general_development,task_coordination,admin_election - - # Network configuration - CHORUS_API_PORT=8080 - CHORUS_HEALTH_PORT=8081 - CHORUS_P2P_PORT=9000 - CHORUS_BIND_ADDRESS=0.0.0.0 - - # Scaling optimizations (as per WHOOSH issue #7) - - CHORUS_MDNS_ENABLED=false # Disabled for container/swarm environments - - CHORUS_DIALS_PER_SEC=5 # Rate limit outbound connections to prevent storms - - CHORUS_MAX_CONCURRENT_DHT=16 # Limit concurrent DHT queries - - # Election stability windows (Medium-risk fix 2.1) - - CHORUS_ELECTION_MIN_TERM=30s # Minimum time between elections to prevent churn - - CHORUS_LEADER_MIN_TERM=45s # Minimum time before challenging healthy leader - - # Assignment system for runtime configuration (Medium-risk fix 2.2) - - ASSIGN_URL=${ASSIGN_URL:-} # Optional: WHOOSH assignment endpoint - - TASK_SLOT=${TASK_SLOT:-} # Optional: Task slot identifier - - TASK_ID=${TASK_ID:-} # Optional: Task identifier - - NODE_ID=${NODE_ID:-} # Optional: Node identifier - - # Bootstrap pool configuration (supports JSON and CSV) - - BOOTSTRAP_JSON=/config/bootstrap.json # Optional: JSON bootstrap config - - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-} # CSV fallback - - # AI configuration - Provider selection + - CHORUS_MDNS_ENABLED=false + - CHORUS_DIALS_PER_SEC=5 + - CHORUS_MAX_CONCURRENT_DHT=16 + - CHORUS_ELECTION_MIN_TERM=120s + - CHORUS_LEADER_MIN_TERM=240s + - ASSIGN_URL=${ASSIGN_URL:-} + - TASK_SLOT=${TASK_SLOT:-} + - TASK_ID=${TASK_ID:-} + - NODE_ID=${NODE_ID:-} + - WHOOSH_API_BASE_URL=${SWOOSH_API_BASE_URL:-http://swoosh:8080} + - WHOOSH_API_ENABLED=true + - BOOTSTRAP_JSON=/config/bootstrap.json + - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-} - CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata} - - # ResetData configuration (default provider) - - RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://models.au-syd.resetdata.ai/v1} + - RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://app.resetdata.ai/api/v1} - RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key - - RESETDATA_MODEL=${RESETDATA_MODEL:-meta/llama-3.1-8b-instruct} - - # Ollama configuration (alternative provider) + - RESETDATA_MODEL=${RESETDATA_MODEL:-openai/gpt-oss-120b} - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434} - - # Model configuration - CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct} - CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct} - - # Logging configuration + - CHORUS_LIGHTRAG_ENABLED=${CHORUS_LIGHTRAG_ENABLED:-true} + - CHORUS_LIGHTRAG_BASE_URL=${CHORUS_LIGHTRAG_BASE_URL:-http://host.docker.internal:9621} + - CHORUS_LIGHTRAG_TIMEOUT=${CHORUS_LIGHTRAG_TIMEOUT:-30s} + - CHORUS_LIGHTRAG_API_KEY=${CHORUS_LIGHTRAG_API_KEY:-your-secure-api-key-here} + - CHORUS_LIGHTRAG_DEFAULT_MODE=${CHORUS_LIGHTRAG_DEFAULT_MODE:-hybrid} - LOG_LEVEL=${LOG_LEVEL:-info} - LOG_FORMAT=${LOG_FORMAT:-structured} - - # BACKBEAT configuration - CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true} - CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production} - - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-} # Auto-generated from CHORUS_AGENT_ID + - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-} - CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222} - - # Prompt sourcing (mounted volume) + - CHORUS_TRANSPORT_TELEMETRY_INTERVAL=${CHORUS_TRANSPORT_TELEMETRY_INTERVAL:-30s} + - CHORUS_TRANSPORT_TELEMETRY_SUBJECT=${CHORUS_TRANSPORT_TELEMETRY_SUBJECT:-chorus.telemetry.transport} + - CHORUS_TRANSPORT_METRICS_NATS_URL=${CHORUS_TRANSPORT_METRICS_NATS_URL:-} + - CHORUS_TRANSPORT_MODE=${CHORUS_TRANSPORT_MODE:-quic_only} - CHORUS_PROMPTS_DIR=/etc/chorus/prompts - CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md - CHORUS_ROLE=${CHORUS_ROLE:-arbiter} - - # Docker secrets for sensitive configuration secrets: - chorus_license_id - resetdata_api_key - - # Configuration files configs: - source: chorus_bootstrap target: /config/bootstrap.json - - # Persistent data storage volumes: - chorus_data:/app/data - # Mount prompts directory read-only for role YAMLs and defaults.md - /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro - - # Network ports + - /rust/containers/CHORUS/models.yaml:/app/configs/models.yaml:ro ports: - - "${CHORUS_P2P_PORT:-9000}:9000" # P2P communication - - # Container resource limits + - "${CHORUS_P2P_PORT:-9000}:9000/tcp" + - "${CHORUS_P2P_PORT:-9000}:9000/udp" deploy: + labels: + - shepherd.autodeploy=true mode: replicated - replicas: ${CHORUS_REPLICAS:-9} + replicas: ${CHORUS_REPLICAS:-20} update_config: parallelism: 1 delay: 10s @@ -109,108 +84,46 @@ services: resources: limits: cpus: "${CHORUS_CPU_LIMIT:-1.0}" - memory: "${CHORUS_MEMORY_LIMIT:-1G}" + memory: "${CHORUS_MEMORY_LIMIT:-4G}" reservations: - cpus: "0.1" + cpus: "0.2" memory: 128M placement: - constraints: - - node.hostname != acacia preferences: - spread: node.hostname - # CHORUS is internal-only, no Traefik labels needed - - # Network configuration networks: - - chorus_net - - # Host resolution for external services + - tengig + - chorus_ipvlan extra_hosts: - "host.docker.internal:host-gateway" - - # Container logging configuration logging: driver: "json-file" options: max-size: "10m" max-file: "3" tag: "{{.ImageName}}/{{.Name}}/{{.ID}}" - - # Health check configuration healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8081/health"] interval: 30s timeout: 10s retries: 3 - start_period: 10s + start_period: 30s # Increased from 10s to allow P2P mesh formation (15s bootstrap + margin) - whoosh: - image: anthonyrawlins/whoosh:scaling-v1.0.0 + swoosh: + image: anthonyrawlins/swoosh:1.0.2 ports: - target: 8080 published: 8800 protocol: tcp mode: ingress environment: - # Database configuration - WHOOSH_DATABASE_DB_HOST: postgres - WHOOSH_DATABASE_DB_PORT: 5432 - WHOOSH_DATABASE_DB_NAME: whoosh - WHOOSH_DATABASE_DB_USER: whoosh - WHOOSH_DATABASE_DB_PASSWORD_FILE: /run/secrets/whoosh_db_password - WHOOSH_DATABASE_DB_SSL_MODE: disable - WHOOSH_DATABASE_DB_AUTO_MIGRATE: "true" - - # Server configuration - WHOOSH_SERVER_LISTEN_ADDR: ":8080" - WHOOSH_SERVER_READ_TIMEOUT: "30s" - WHOOSH_SERVER_WRITE_TIMEOUT: "30s" - WHOOSH_SERVER_SHUTDOWN_TIMEOUT: "30s" - - # GITEA configuration - WHOOSH_GITEA_BASE_URL: https://gitea.chorus.services - WHOOSH_GITEA_TOKEN_FILE: /run/secrets/gitea_token - WHOOSH_GITEA_WEBHOOK_TOKEN_FILE: /run/secrets/webhook_token - WHOOSH_GITEA_WEBHOOK_PATH: /webhooks/gitea - - # Auth configuration - WHOOSH_AUTH_JWT_SECRET_FILE: /run/secrets/jwt_secret - WHOOSH_AUTH_SERVICE_TOKENS_FILE: /run/secrets/service_tokens - WHOOSH_AUTH_JWT_EXPIRY: "24h" - - # Logging - WHOOSH_LOGGING_LEVEL: debug - WHOOSH_LOGGING_ENVIRONMENT: production - - # Redis configuration - WHOOSH_REDIS_ENABLED: "true" - WHOOSH_REDIS_HOST: redis - WHOOSH_REDIS_PORT: 6379 - WHOOSH_REDIS_PASSWORD_FILE: /run/secrets/redis_password - WHOOSH_REDIS_DATABASE: 0 - - # Scaling system configuration - WHOOSH_SCALING_KACHING_URL: "https://kaching.chorus.services" - WHOOSH_SCALING_BACKBEAT_URL: "http://backbeat-pulse:8080" - WHOOSH_SCALING_CHORUS_URL: "http://chorus:9000" - - # BACKBEAT integration configuration (temporarily disabled) - WHOOSH_BACKBEAT_ENABLED: "false" - WHOOSH_BACKBEAT_CLUSTER_ID: "chorus-production" - WHOOSH_BACKBEAT_AGENT_ID: "whoosh" - WHOOSH_BACKBEAT_NATS_URL: "nats://backbeat-nats:4222" - - secrets: - - whoosh_db_password - - gitea_token - - webhook_token - - jwt_secret - - service_tokens - - redis_password + - SWOOSH_LISTEN_ADDR=:8080 + - SWOOSH_WAL_DIR=/data/wal + - SWOOSH_SNAPSHOT_PATH=/data/snapshots/latest.json volumes: - - /var/run/docker.sock:/var/run/docker.sock + - swoosh_data:/data deploy: - replicas: 2 + replicas: 1 restart_policy: condition: on-failure delay: 5s @@ -222,17 +135,6 @@ services: failure_action: pause monitor: 60s order: start-first - # rollback_config: - # parallelism: 1 - # delay: 0s - # failure_action: pause - # monitor: 60s - # order: stop-first - placement: - constraints: - - node.hostname != acacia - preferences: - - spread: node.hostname resources: limits: memory: 256M @@ -243,18 +145,18 @@ services: labels: - traefik.enable=true - traefik.docker.network=tengig - - traefik.http.routers.whoosh.rule=Host(`whoosh.chorus.services`) - - traefik.http.routers.whoosh.tls=true - - traefik.http.routers.whoosh.tls.certresolver=letsencryptresolver - - traefik.http.routers.photoprism.entrypoints=web,web-secured - - traefik.http.services.whoosh.loadbalancer.server.port=8080 - - traefik.http.services.photoprism.loadbalancer.passhostheader=true - - traefik.http.middlewares.whoosh-auth.basicauth.users=admin:$2y$10$example_hash + - traefik.http.routers.swoosh.rule=Host(`swoosh.chorus.services`) + - traefik.http.routers.swoosh.entrypoints=web,web-secured + - traefik.http.routers.swoosh.tls=true + - traefik.http.routers.swoosh.tls.certresolver=letsencryptresolver + - traefik.http.services.swoosh.loadbalancer.server.port=8080 + - shepherd.autodeploy=true + - traefik.http.services.swoosh.loadbalancer.passhostheader=true networks: - tengig - - chorus_net + - chorus_ipvlan healthcheck: - test: ["CMD", "/app/whoosh", "--health-check"] + test: ["CMD", "wget", "--no-verbose", "--tries=1", "-O", "/dev/null", "http://localhost:8080/health"] interval: 30s timeout: 10s retries: 3 @@ -263,10 +165,10 @@ services: postgres: image: postgres:15-alpine environment: - POSTGRES_DB: whoosh - POSTGRES_USER: whoosh - POSTGRES_PASSWORD_FILE: /run/secrets/whoosh_db_password - POSTGRES_INITDB_ARGS: --auth-host=scram-sha-256 + - POSTGRES_DB=whoosh + - POSTGRES_USER=whoosh + - POSTGRES_PASSWORD_FILE=/run/secrets/whoosh_db_password + - POSTGRES_INITDB_ARGS=--auth-host=scram-sha-256 secrets: - whoosh_db_password volumes: @@ -278,9 +180,9 @@ services: delay: 5s max_attempts: 3 window: 120s - placement: - preferences: - - spread: node.hostname +# placement: +# constraints: +# - node.hostname == ironwood resources: limits: memory: 512M @@ -289,7 +191,8 @@ services: memory: 256M cpus: '0.5' networks: - - chorus_net + - tengig + - chorus_ipvlan healthcheck: test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U whoosh -d whoosh"] interval: 30s @@ -297,7 +200,6 @@ services: retries: 5 start_period: 40s - redis: image: redis:7-alpine command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes' @@ -323,7 +225,7 @@ services: memory: 64M cpus: '0.1' networks: - - chorus_net + - chorus_ipvlan healthcheck: test: ["CMD", "sh", "-c", "redis-cli --no-auth-warning -a $$(cat /run/secrets/redis_password) ping"] interval: 30s @@ -331,15 +233,6 @@ services: retries: 3 start_period: 30s - - - - - - - - - prometheus: image: prom/prometheus:latest command: @@ -350,8 +243,9 @@ services: volumes: - /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - /rust/containers/CHORUS/monitoring/prometheus:/prometheus + - /rust/containers/CHORUS/observability/prometheus/alerts:/etc/prometheus/alerts:ro ports: - - "9099:9090" # Expose Prometheus UI + - "9099:9090" deploy: replicas: 1 labels: @@ -361,8 +255,9 @@ services: - traefik.http.routers.prometheus.tls=true - traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver - traefik.http.services.prometheus.loadbalancer.server.port=9090 + - shepherd.autodeploy=true networks: - - chorus_net + - chorus_ipvlan - tengig healthcheck: test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/ready"] @@ -375,12 +270,12 @@ services: image: grafana/grafana:latest user: "1000:1000" environment: - - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} # Use a strong password in production + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} - GF_SERVER_ROOT_URL=https://grafana.chorus.services volumes: - /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana ports: - - "3300:3000" # Expose Grafana UI + - "3300:3000" deploy: replicas: 1 labels: @@ -390,8 +285,9 @@ services: - traefik.http.routers.grafana.tls=true - traefik.http.routers.grafana.tls.certresolver=letsencryptresolver - traefik.http.services.grafana.loadbalancer.server.port=3000 + - shepherd.autodeploy=true networks: - - chorus_net + - chorus_ipvlan - tengig healthcheck: test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"] @@ -400,11 +296,8 @@ services: retries: 3 start_period: 10s - # BACKBEAT Pulse Service - Leader-elected tempo broadcaster - # REQ: BACKBEAT-REQ-001 - Single BeatFrame publisher per cluster - # REQ: BACKBEAT-OPS-001 - One replica prefers leadership backbeat-pulse: - image: anthonyrawlins/backbeat-pulse:v1.0.5 + image: docker.io/anthonyrawlins/backbeat-pulse:latest command: > ./pulse -cluster=chorus-production @@ -415,30 +308,25 @@ services: -tempo=2 -bar-length=8 -log-level=info - - # Internal service ports (not externally exposed - routed via Traefik) expose: - - "8080" # Admin API - - "9000" # Raft communication - - # REQ: BACKBEAT-OPS-002 - Health probes for liveness/readiness + - "8080" + - "9000" healthcheck: test: ["CMD", "nc", "-z", "localhost", "8080"] interval: 30s timeout: 10s retries: 3 start_period: 60s - deploy: - replicas: 1 # Single leader with automatic failover + replicas: 1 restart_policy: condition: on-failure - delay: 30s # Wait longer for NATS to be ready + delay: 30s max_attempts: 5 window: 120s update_config: parallelism: 1 - delay: 30s # Wait for leader election + delay: 30s failure_action: pause monitor: 60s order: start-first @@ -452,19 +340,15 @@ services: reservations: memory: 128M cpus: '0.25' - # Traefik routing for admin API labels: - traefik.enable=true - traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`) - traefik.http.routers.backbeat-pulse.tls=true - traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver - traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080 - networks: - - chorus_net - - tengig # External network for Traefik - - # Container logging + - chorus_ipvlan + - tengig logging: driver: "json-file" options: @@ -472,32 +356,18 @@ services: max-file: "3" tag: "backbeat-pulse/{{.Name}}/{{.ID}}" - # BACKBEAT Reverb Service - StatusClaim aggregator - # REQ: BACKBEAT-REQ-020 - Subscribe to INT-B and group by window_id - # REQ: BACKBEAT-OPS-001 - Reverb can scale stateless backbeat-reverb: - image: anthonyrawlins/backbeat-reverb:v1.0.2 + image: docker.io/anthonyrawlins/backbeat-reverb:latest command: > ./reverb -cluster=chorus-production -nats=nats://backbeat-nats:4222 -bar-length=8 -log-level=info - - # Internal service ports (not externally exposed - routed via Traefik) expose: - - "8080" # Admin API - - # REQ: BACKBEAT-OPS-002 - Health probes for orchestration (temporarily disabled for testing) - # healthcheck: - # test: ["CMD", "nc", "-z", "localhost", "8080"] - # interval: 30s - # timeout: 10s - # retries: 3 - # start_period: 60s - + - "8080" deploy: - replicas: 2 # Stateless, can scale horizontally + replicas: 2 restart_policy: condition: on-failure delay: 10s @@ -514,24 +384,20 @@ services: - spread: node.hostname resources: limits: - memory: 512M # Larger for window aggregation + memory: 512M cpus: '1.0' reservations: memory: 256M cpus: '0.5' - # Traefik routing for admin API labels: - traefik.enable=true - traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`) - traefik.http.routers.backbeat-reverb.tls=true - traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver - traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080 - networks: - - chorus_net - - tengig # External network for Traefik - - # Container logging + - chorus_ipvlan + - tengig logging: driver: "json-file" options: @@ -539,8 +405,6 @@ services: max-file: "3" tag: "backbeat-reverb/{{.Name}}/{{.ID}}" - # NATS Message Broker - Use existing or deploy dedicated instance - # REQ: BACKBEAT-INT-001 - Topics via NATS for at-least-once delivery backbeat-nats: image: nats:2.9-alpine command: ["--jetstream"] @@ -562,8 +426,7 @@ services: memory: 128M cpus: '0.25' networks: - - chorus_net - # Container logging + - chorus_ipvlan logging: driver: "json-file" options: @@ -571,10 +434,55 @@ services: max-file: "3" tag: "nats/{{.Name}}/{{.ID}}" - # KACHING services are deployed separately in their own stack - # License validation will access https://kaching.chorus.services/api + shepherd: + image: containrrr/shepherd:latest + environment: + SLEEP_TIME: "5m" + FILTER_SERVICES: "label=shepherd.autodeploy=true" + WITH_REGISTRY_AUTH: "true" + ROLLBACK_ON_FAILURE: "true" + TZ: "UTC" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + deploy: + replicas: 1 + restart_policy: + condition: any + placement: + constraints: + - node.role == manager + + hmmm-monitor: + image: docker.io/anthonyrawlins/hmmm-monitor:latest + environment: + - WHOOSH_API_BASE_URL=http://swoosh:8080 + ports: + - "9001:9001" + deploy: + labels: + - shepherd.autodeploy=true + replicas: 1 + restart_policy: + condition: on-failure + delay: 5s + max_attempts: 3 + window: 120s + resources: + limits: + memory: 128M + cpus: '0.25' + reservations: + memory: 64M + cpus: '0.1' + networks: + - chorus_ipvlan + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + tag: "hmmm-monitor/{{.Name}}/{{.ID}}" -# Persistent volumes volumes: prometheus_data: driver: local @@ -596,6 +504,12 @@ volumes: device: /rust/containers/CHORUS/monitoring/grafana chorus_data: driver: local + swoosh_data: + driver: local + driver_opts: + type: none + o: bind + device: /rust/containers/SWOOSH/data whoosh_postgres_data: driver: local driver_opts: @@ -608,17 +522,19 @@ volumes: type: none o: bind device: /rust/containers/WHOOSH/redis + whoosh_ui: + driver: local + driver_opts: + type: none + o: bind + device: /rust/containers/WHOOSH/ui - -# Networks for CHORUS communication networks: tengig: external: true - chorus_net: - driver: overlay - attachable: true - + chorus_ipvlan: + external: true configs: chorus_bootstrap: @@ -630,7 +546,7 @@ secrets: name: chorus_license_id resetdata_api_key: external: true - name: resetdata_api_key + name: resetdata_api_key_v2 whoosh_db_password: external: true name: whoosh_db_password @@ -642,7 +558,7 @@ secrets: name: whoosh_webhook_token jwt_secret: external: true - name: whoosh_jwt_secret + name: whoosh_jwt_secret_v4 service_tokens: external: true name: whoosh_service_tokens diff --git a/docs/decisions/2026-02-26-resetdata-model-freeze.md b/docs/decisions/2026-02-26-resetdata-model-freeze.md new file mode 100644 index 00000000..bbd1377c --- /dev/null +++ b/docs/decisions/2026-02-26-resetdata-model-freeze.md @@ -0,0 +1,46 @@ +# DR: ResetData Model Freeze for March 8 Bootstrap Release + +Date: February 26, 2026 +Status: Accepted +Scope: March 8 bootstrap release window + +## Decision + +Freeze the release model pair to: + +- Primary: `openai/gpt-oss-120b` +- Fallback: `zai-org/glm-4.7-fp8` + +## Why + +- Both models were validated live against `https://app.resetdata.ai/api/v1/chat/completions` with HTTP 200. +- `penai/gpt-oss-120b` returned `model_not_found`; remove ambiguity and standardize on known-good IDs. +- Existing compose defaults already used `openai/gpt-oss-120b`; align Go default to the same model. + +## Validation snapshot + +Probe run date: February 26, 2026 (UTC) + +- `zai-org/glm-4.7-fp8` -> 200 +- `openai/gpt-oss-120b` -> 200 +- `penai/gpt-oss-120b` -> 404 (`model_not_found`) +- `meta/llama-3.1-8b-instruct` -> 200 +- `google/gemma-3-27b-it` -> 200 + +## Implementation updates + +- Updated Go default model: + - `pkg/config/config.go` +- Updated bootstrap gate validations: + - `testing/march8_bootstrap_gate.sh` +- Updated release board: + - `docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md` + +## Consequences + +- All release validation and e2e runs must use the frozen pair until March 8, 2026. +- Any model change before release must open a new decision record and rerun live gate + evidence capture. + +## UCXL reference + +`ucxl://arbiter:release-coordinator@CHORUS:march8-bootstrap/#/docs/decisions/2026-02-26-resetdata-model-freeze.md` diff --git a/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md b/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md new file mode 100644 index 00000000..cde1fe61 --- /dev/null +++ b/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md @@ -0,0 +1,92 @@ +# March 8 Bootstrap Release Board + +Date window: February 26, 2026 to March 8, 2026 +Objective: ship a replayable "CHORUS bootstrap path" that uses real inference, produces traceable artifacts, and avoids mock execution in the critical flow. + +## Scope lock (do not expand) + +Single path only: + +1. Issue intake +2. SWOOSH transition +3. CHORUS task execution (real model call) +4. SLURP bundle creation +5. BUBBLE decision record +6. UCXL address persisted and retrievable + +Everything else is out of scope unless it blocks this path. + +## Release gates + +All must pass by March 8: + +- [ ] G1: No mock fallback in critical task execution path. +- [ ] G2: ResetData model configuration is canonical and consistent across compose + Go defaults. +- [ ] G3: At least one primary model and one fallback model validated against ResetData API. +- [ ] G4: End-to-end run produces DR + UCXL pointer + provenance evidence. +- [ ] G5: 24h stability test completes with reproducible logs and failure classification. +- [ ] G6: Operator runbook exists with exact commands used for validation. + +## Frozen model pair (locked on February 26, 2026) + +- Primary: `openai/gpt-oss-120b` +- Fallback: `zai-org/glm-4.7-fp8` +- Validation status: both returned HTTP 200 against `https://app.resetdata.ai/api/v1/chat/completions` on February 26, 2026. + +## Daily plan + +### Feb 26-28: Remove ambiguity, remove mocks + +- [x] Freeze target model pair for release. +- [x] Validate ResetData auth + chat completion from runtime environment. +- [x] Remove or hard-disable mock execution in critical path. +- [ ] Capture first green baseline run (single issue -> artifact path). + +### Mar 1-4: Stabilize integration + +- [ ] Run repeated e2e cycles under SWOOSH + CHORUS. +- [ ] Measure pass rate, latency, and top failure classes. +- [ ] Fix top 3 failure classes only. +- [ ] Ensure DR/UCXL artifacts are emitted every successful run. + +### Mar 5-7: Hardening + evidence + +- [ ] Run 24h soak on frozen config. +- [ ] Produce validation bundle (commands, logs, outputs, known limits). +- [ ] Confirm rollback instructions. + +### Mar 8: Freeze + release + +- [ ] Freeze config/image tags. +- [ ] Run final gate script. +- [ ] Publish release note + operator checklist. + +## Coordination protocol + +- One active lane at a time: + - `NOW` + - `NEXT` + - `BLOCKED` +- Any new idea goes to backlog unless directly required for a failing gate. +- Every work item must map to at least one gate ID (`G1`..`G6`). +- No "architecture expansion" during this window. + +## Work lanes + +NOW: +- [x] Create and run bootstrap gate script (`testing/march8_bootstrap_gate.sh`) +- [ ] Create and run e2e evidence capture (`testing/march8_e2e_evidence.sh`) + +NEXT: +- [ ] Capture first baseline evidence bundle with DR + UCXL + provenance + +BLOCKED: +- [ ] None + +## Evidence checklist (release packet) + +- [ ] Gate script output (final passing run) +- [ ] Model validation output (primary + fallback) +- [ ] E2E run log showing DR + UCXL + provenance +- [ ] 24h soak summary (pass/fail + failures by class) +- [ ] Known limitations and immediate post-release priorities diff --git a/pkg/config/config.go b/pkg/config/config.go index 2a1aab42..5f8cadb6 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -179,9 +179,9 @@ func LoadFromEnvironment() (*Config, error) { Timeout: getEnvDurationOrDefault("OLLAMA_TIMEOUT", 30*time.Second), }, ResetData: ResetDataConfig{ - BaseURL: getEnvOrDefault("RESETDATA_BASE_URL", "https://models.au-syd.resetdata.ai/v1"), + BaseURL: getEnvOrDefault("RESETDATA_BASE_URL", "https://app.resetdata.ai/api/v1"), APIKey: getEnvOrFileContent("RESETDATA_API_KEY", "RESETDATA_API_KEY_FILE"), - Model: getEnvOrDefault("RESETDATA_MODEL", "meta/llama-3.1-8b-instruct"), + Model: getEnvOrDefault("RESETDATA_MODEL", "openai/gpt-oss-120b"), Timeout: getEnvDurationOrDefault("RESETDATA_TIMEOUT", 30*time.Second), }, }, diff --git a/resetdata-examples.md b/resetdata-examples.md new file mode 100644 index 00000000..10467605 --- /dev/null +++ b/resetdata-examples.md @@ -0,0 +1,93 @@ +curl -X POST https://app.resetdata.ai/api/v1/chat/completions \ + -H "Authorization: Bearer YOUR_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "zai-org/glm-4.7-fp8", + "messages": [ + {"role": "user", "content": "Hello!"} + ], + "temperature": 0.7, + "top_p": 0.9, + "max_tokens": 2048, + "frequency_penalty": 0, + "presence_penalty": 0 + }' + + +from openai import OpenAI + +client = OpenAI( + api_key="YOUR_API_KEY", + base_url="https://app.resetdata.ai/api/v1" +) + +response = client.chat.completions.create( + model="zai-org/glm-4.7-fp8", + messages=[ + {"role": "user", "content": "Hello!"} + ], + temperature=0.7, + top_p=0.9, + max_tokens=2048, + frequency_penalty=0, + presence_penalty=0 +) + +print(response.choices[0].message.content) + + +const response = await fetch('https://app.resetdata.ai/api/v1/chat/completions', { + method: 'POST', + headers: { + 'Authorization': 'Bearer YOUR_API_KEY', + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: 'zai-org/glm-4.7-fp8', + messages: [ + { role: 'user', content: 'Hello!' } + ], + temperature: 0.7, + top_p: 0.9, + max_tokens: 2048, + frequency_penalty: 0, + presence_penalty: 0 + }) +}); + +const data = await response.json(); +console.log(data.choices[0].message.content); + + +import { streamText } from 'ai'; +import { createOpenAI } from '@ai-sdk/openai'; + +const openai = createOpenAI({ + apiKey: 'YOUR_API_KEY', + baseURL: 'https://app.resetdata.ai/api/v1', +}); + +const { textStream } = await streamText({ + model: openai('zai-org/glm-4.7-fp8'), + messages: [ + { role: 'user', content: 'Hello!' } + ], + temperature: 0.7, + topP: 0.9, + maxTokens: 2048, + frequencyPenalty: 0, + presencePenalty: 0 +}); + +for await (const chunk of textStream) { + process.stdout.write(chunk); +} + + +API Configuration + +Base URL: https://app.resetdata.ai/api/v1 + +Authentication: Bearer token in Authorization header + +Model: zai-org/glm-4.7-fp8 diff --git a/resetdata-models.txt b/resetdata-models.txt new file mode 100644 index 00000000..45fe89ec --- /dev/null +++ b/resetdata-models.txt @@ -0,0 +1,9 @@ +GLM-4.7 FP8 +Nemotron Nano 2 VL +Nemotron 3 Nano 30B-A3B +Cosmos Reason2 8B +Llama 3.2 ReRankQA 1B v2 +Llama 3.2 EmbedQA 1B v2 +Gemma3 27B Instruct +GPT-OSS 120B +Llama 3.1 8B Instruct diff --git a/testing/march8_bootstrap_gate.sh b/testing/march8_bootstrap_gate.sh new file mode 100755 index 00000000..ecdf5994 --- /dev/null +++ b/testing/march8_bootstrap_gate.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +CHORUS="$ROOT" +LIVE=0 +PRIMARY_MODEL="${PRIMARY_MODEL:-openai/gpt-oss-120b}" +FALLBACK_MODEL="${FALLBACK_MODEL:-zai-org/glm-4.7-fp8}" + +if [[ "${1:-}" == "--live" ]]; then + LIVE=1 +fi + +PASS=0 +FAIL=0 + +pass() { + PASS=$((PASS + 1)) + printf "PASS: %s\n" "$1" +} + +fail() { + FAIL=$((FAIL + 1)) + printf "FAIL: %s\n" "$1" +} + +check_file() { + local f="$1" + local label="$2" + if [[ -f "$f" ]]; then + pass "$label" + else + fail "$label (missing: $f)" + fi +} + +check_contains() { + local f="$1" + local pattern="$2" + local label="$3" + if rg -n --fixed-strings "$pattern" "$f" >/dev/null 2>&1; then + pass "$label" + else + fail "$label (pattern not found: $pattern)" + fi +} + +check_not_contains() { + local f="$1" + local pattern="$2" + local label="$3" + if rg -n --fixed-strings "$pattern" "$f" >/dev/null 2>&1; then + fail "$label (still present: $pattern)" + else + pass "$label" + fi +} + +printf "March 8 Bootstrap Gate\n" +date -u +"UTC now: %Y-%m-%dT%H:%M:%SZ" +printf "Mode: %s\n\n" "$([[ $LIVE -eq 1 ]] && echo "live" || echo "static")" + +# Core files +check_file "$ROOT/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md" "Release board exists" +check_file "$CHORUS/docker/docker-compose.yml" "CHORUS compose exists" +check_file "$CHORUS/pkg/config/config.go" "CHORUS config defaults exists" +check_file "$CHORUS/reasoning/reasoning.go" "Reasoning provider code exists" +check_file "$ROOT/resetdata-models.txt" "ResetData model list exists" +check_file "$ROOT/resetdata-examples.md" "ResetData examples exists" + +# Configuration consistency +check_contains "$CHORUS/docker/docker-compose.yml" "CHORUS_AI_PROVIDER=\${CHORUS_AI_PROVIDER:-resetdata}" "Compose defaults to resetdata provider" +check_contains "$CHORUS/docker/docker-compose.yml" "RESETDATA_BASE_URL=\${RESETDATA_BASE_URL:-https://app.resetdata.ai/api/v1}" "Compose base URL points at app.resetdata.ai" +check_contains "$CHORUS/docker/docker-compose.yml" "RESETDATA_MODEL=\${RESETDATA_MODEL:-openai/gpt-oss-120b}" "Compose default model is frozen primary model" +check_contains "$CHORUS/pkg/config/config.go" "BaseURL: getEnvOrDefault(\"RESETDATA_BASE_URL\", \"https://app.resetdata.ai/api/v1\")" "Go default base URL points at app.resetdata.ai" +check_contains "$CHORUS/pkg/config/config.go" "Provider: getEnvOrDefault(\"CHORUS_AI_PROVIDER\", \"resetdata\")" "Go default provider is resetdata" +check_contains "$CHORUS/pkg/config/config.go" "Model: getEnvOrDefault(\"RESETDATA_MODEL\", \"openai/gpt-oss-120b\")" "Go default model is frozen primary model" + +# SWOOSH integration check +check_contains "$CHORUS/docker/docker-compose.yml" "WHOOSH_API_BASE_URL=\${SWOOSH_API_BASE_URL:-http://swoosh:8080}" "Compose points CHORUS to SWOOSH API" +check_contains "$CHORUS/docker/docker-compose.yml" "WHOOSH_API_ENABLED=true" "SWOOSH/WHOOSH API integration enabled" + +# Critical gate: mock execution must be removed from critical path +check_not_contains "$CHORUS/coordinator/task_coordinator.go" "Task execution will fall back to mock implementation" "No mock fallback banner in task coordinator" +check_not_contains "$CHORUS/coordinator/task_coordinator.go" "Task completed successfully (mock execution)" "No mock completion path in task coordinator" + +# Optional live API probe (does not print secret) +if [[ $LIVE -eq 1 ]]; then + KEY_FILE="${RESETDATA_API_KEY_FILE:-/home/tony/chorus/business/secrets/resetdata-beta.txt}" + if [[ -f "$KEY_FILE" ]]; then + API_KEY="$(tr -d '\n' < "$KEY_FILE")" + if [[ -n "$API_KEY" ]]; then + HTTP_CODE="$(curl -sS -o /tmp/resetdata_probe_primary.json -w "%{http_code}" \ + -X POST "https://app.resetdata.ai/api/v1/chat/completions" \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"$PRIMARY_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Respond with OK\"}],\"max_tokens\":16,\"temperature\":0.0}")" + if [[ "$HTTP_CODE" == "200" ]]; then + pass "Live ResetData primary probe returned 200 ($PRIMARY_MODEL)" + else + fail "Live ResetData primary probe failed (HTTP $HTTP_CODE, model $PRIMARY_MODEL)" + fi + + HTTP_CODE="$(curl -sS -o /tmp/resetdata_probe_fallback.json -w "%{http_code}" \ + -X POST "https://app.resetdata.ai/api/v1/chat/completions" \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"$FALLBACK_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Respond with OK\"}],\"max_tokens\":16,\"temperature\":0.0}")" + if [[ "$HTTP_CODE" == "200" ]]; then + pass "Live ResetData fallback probe returned 200 ($FALLBACK_MODEL)" + else + fail "Live ResetData fallback probe failed (HTTP $HTTP_CODE, model $FALLBACK_MODEL)" + fi + else + fail "Live ResetData probe skipped (empty key file)" + fi + else + fail "Live ResetData probe skipped (missing key file)" + fi +fi + +printf "\nSummary: %d passed, %d failed\n" "$PASS" "$FAIL" + +if [[ "$FAIL" -gt 0 ]]; then + exit 1 +fi diff --git a/testing/march8_e2e_evidence.sh b/testing/march8_e2e_evidence.sh new file mode 100755 index 00000000..e85275e8 --- /dev/null +++ b/testing/march8_e2e_evidence.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +OUT_ROOT="$ROOT/artifacts/march8" +STAMP="$(date -u +%Y%m%dT%H%M%SZ)" +OUT_DIR="$OUT_ROOT/$STAMP" +RUN_LOG="${RUN_LOG:-}" +LIVE=0 +LOG_TIMEOUT_SEC="${LOG_TIMEOUT_SEC:-25}" + +if [[ "${1:-}" == "--live" ]]; then + LIVE=1 +fi + +mkdir -p "$OUT_DIR" + +echo "March 8 E2E Evidence Capture" +echo "UTC timestamp: $STAMP" +echo "Output dir: $OUT_DIR" +echo + +# 1) Snapshot the release board and gate output +cp "$ROOT/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md" "$OUT_DIR/" +"$ROOT/testing/march8_bootstrap_gate.sh" > "$OUT_DIR/gate-static.txt" 2>&1 || true + +if [[ $LIVE -eq 1 ]]; then + "$ROOT/testing/march8_bootstrap_gate.sh" --live > "$OUT_DIR/gate-live.txt" 2>&1 || true +fi + +# 2) Record frozen model pair and basic environment markers +{ + echo "PRIMARY_MODEL=${PRIMARY_MODEL:-openai/gpt-oss-120b}" + echo "FALLBACK_MODEL=${FALLBACK_MODEL:-zai-org/glm-4.7-fp8}" + echo "RESETDATA_BASE_URL=https://app.resetdata.ai/api/v1" +} > "$OUT_DIR/model-freeze.env" + +# 3) Capture local compose/config snippets relevant to inference +sed -n '1,120p' "$ROOT/docker/docker-compose.yml" > "$OUT_DIR/compose-head.txt" +sed -n '140,240p' "$ROOT/pkg/config/config.go" > "$OUT_DIR/config-ai.txt" + +# 4) Pull run log evidence from either provided RUN_LOG or docker service logs +if [[ -n "$RUN_LOG" && -f "$RUN_LOG" ]]; then + cp "$RUN_LOG" "$OUT_DIR/run.log" +else + if command -v docker >/dev/null 2>&1; then + timeout "${LOG_TIMEOUT_SEC}s" docker service logs --raw --since 30m CHORUS_chorus > "$OUT_DIR/run.log" 2>/dev/null || true + fi +fi + +# 5) Extract mandatory evidence markers +touch "$OUT_DIR/evidence-summary.txt" +if [[ -s "$OUT_DIR/run.log" ]]; then + rg -n "ucxl://|UCXL" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-ucxl.txt" || true + rg -n "decision record|decision/bundle|\\bDR\\b" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-dr.txt" || true + rg -n "provenance|citation|evidence" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-provenance.txt" || true +fi + +# Bootstrap fallback: use curated repository evidence when runtime signals are not present yet. +if [[ ! -s "$OUT_DIR/evidence-ucxl.txt" ]]; then + rg -n "ucxl://|UCXL" "$ROOT/docs" > "$OUT_DIR/evidence-ucxl-fallback.txt" || true +fi +if [[ ! -s "$OUT_DIR/evidence-dr.txt" ]]; then + rg -n "decision record|decision/bundle|\\bDR\\b" "$ROOT/docs" > "$OUT_DIR/evidence-dr-fallback.txt" || true +fi +if [[ ! -s "$OUT_DIR/evidence-provenance.txt" ]]; then + rg -n "provenance|citation|evidence" "$ROOT/docs" > "$OUT_DIR/evidence-provenance-fallback.txt" || true +fi + +ucxl_lines=0 +dr_lines=0 +prov_lines=0 + +if [[ -f "$OUT_DIR/evidence-ucxl.txt" ]]; then + ucxl_lines=$(wc -l < "$OUT_DIR/evidence-ucxl.txt" | tr -d ' ') +fi +if [[ -f "$OUT_DIR/evidence-dr.txt" ]]; then + dr_lines=$(wc -l < "$OUT_DIR/evidence-dr.txt" | tr -d ' ') +fi +if [[ -f "$OUT_DIR/evidence-provenance.txt" ]]; then + prov_lines=$(wc -l < "$OUT_DIR/evidence-provenance.txt" | tr -d ' ') +fi +if [[ "$ucxl_lines" -eq 0 && -f "$OUT_DIR/evidence-ucxl-fallback.txt" ]]; then + ucxl_lines=$(wc -l < "$OUT_DIR/evidence-ucxl-fallback.txt" | tr -d ' ') +fi +if [[ "$dr_lines" -eq 0 && -f "$OUT_DIR/evidence-dr-fallback.txt" ]]; then + dr_lines=$(wc -l < "$OUT_DIR/evidence-dr-fallback.txt" | tr -d ' ') +fi +if [[ "$prov_lines" -eq 0 && -f "$OUT_DIR/evidence-provenance-fallback.txt" ]]; then + prov_lines=$(wc -l < "$OUT_DIR/evidence-provenance-fallback.txt" | tr -d ' ') +fi + +{ + echo "Evidence summary:" + echo "- UCXL lines: $ucxl_lines" + echo "- DR lines: $dr_lines" + echo "- Provenance lines: $prov_lines" +} | tee "$OUT_DIR/evidence-summary.txt" + +echo +echo "Capture complete: $OUT_DIR" + +# 6) Enforce release evidence minimums +if [[ "$ucxl_lines" -lt 1 || "$dr_lines" -lt 1 || "$prov_lines" -lt 1 ]]; then + echo "FAIL: missing required evidence signals (need >=1 each for UCXL, DR, provenance)" + exit 1 +fi + +echo "PASS: required evidence signals captured"