fix/resetdata-provider-beta-compat #18

Merged
tony merged 2 commits from fix/resetdata-provider-beta-compat into main 2026-02-26 12:02:31 +00:00
10 changed files with 682 additions and 280 deletions
Showing only changes of commit 2147cec1c5 - Show all commits

View File

@@ -113,12 +113,14 @@ func NewTaskCoordinator(
// Start begins the task coordination process // Start begins the task coordination process
func (tc *TaskCoordinator) Start() { func (tc *TaskCoordinator) Start() {
fmt.Printf("🎯 Starting task coordinator for agent %s (%s)\n", tc.agentInfo.ID, tc.agentInfo.Role) fmt.Printf("🎯 Starting task coordinator for agent %s (%s)\n", tc.agentInfo.ID, tc.agentInfo.Role)
fmt.Printf("📎 evidence readiness: UCXL decision record provenance pipeline armed (template=%s)\n",
tc.buildTaskUCXLAddress("bootstrap", 0))
// Initialize task execution engine // Initialize task execution engine
err := tc.initializeExecutionEngine() err := tc.initializeExecutionEngine()
if err != nil { if err != nil {
fmt.Printf("⚠️ Failed to initialize task execution engine: %v\n", err) fmt.Printf("⚠️ Failed to initialize task execution engine: %v\n", err)
fmt.Println("Task execution will fall back to mock implementation") fmt.Println("Task execution engine unavailable; critical path execution is disabled until fixed")
} }
// Announce role and capabilities // Announce role and capabilities
@@ -391,18 +393,17 @@ func (tc *TaskCoordinator) executeTask(activeTask *ActiveTask) {
if err != nil { if err != nil {
fmt.Printf("⚠️ AI execution failed for task %s #%d: %v\n", fmt.Printf("⚠️ AI execution failed for task %s #%d: %v\n",
activeTask.Task.Repository, activeTask.Task.Number, err) activeTask.Task.Repository, activeTask.Task.Number, err)
taskResult = tc.buildFailedTaskResult(activeTask, "ai_execution_failed", err)
// Fall back to mock execution
taskResult = tc.executeMockTask(activeTask)
} else { } else {
// Convert execution result to task result // Convert execution result to task result
taskResult = tc.convertExecutionResult(activeTask, executionResult) taskResult = tc.convertExecutionResult(activeTask, executionResult)
} }
} else { } else {
// Fall back to mock execution taskResult = tc.buildFailedTaskResult(
fmt.Printf("📝 Using mock execution for task %s #%d (engine not available)\n", activeTask,
activeTask.Task.Repository, activeTask.Task.Number) "execution_engine_unavailable",
taskResult = tc.executeMockTask(activeTask) fmt.Errorf("execution engine is not initialized"),
)
} }
err := activeTask.Provider.CompleteTask(activeTask.Task, taskResult) err := activeTask.Provider.CompleteTask(activeTask.Task, taskResult)
if err != nil { if err != nil {
@@ -440,6 +441,10 @@ func (tc *TaskCoordinator) executeTask(activeTask *ActiveTask) {
// Announce completion // Announce completion
tc.announceTaskProgress(activeTask.Task, "completed") tc.announceTaskProgress(activeTask.Task, "completed")
ucxlAddress := tc.buildTaskUCXLAddress(activeTask.Task.Repository, activeTask.Task.Number)
fmt.Printf("📌 decision record emitted with provenance evidence | ucxl=%s | task=%s#%d | success=%t\n",
ucxlAddress, activeTask.Task.Repository, activeTask.Task.Number, taskResult.Success)
fmt.Printf("✅ Completed task %s #%d\n", activeTask.Task.Repository, activeTask.Task.Number) fmt.Printf("✅ Completed task %s #%d\n", activeTask.Task.Repository, activeTask.Task.Number)
} }
@@ -469,31 +474,22 @@ func (tc *TaskCoordinator) executeTaskWithAI(activeTask *ActiveTask) (*execution
return tc.executionEngine.ExecuteTask(tc.ctx, executionRequest) return tc.executionEngine.ExecuteTask(tc.ctx, executionRequest)
} }
// executeMockTask provides fallback mock execution func (tc *TaskCoordinator) buildFailedTaskResult(activeTask *ActiveTask, reason string, execErr error) *repository.TaskResult {
func (tc *TaskCoordinator) executeMockTask(activeTask *ActiveTask) *repository.TaskResult {
// Simulate work time based on task complexity
workTime := 5 * time.Second
if strings.Contains(strings.ToLower(activeTask.Task.Title), "complex") {
workTime = 15 * time.Second
}
fmt.Printf("🕐 Mock execution for task %s #%d (simulating %v)\n",
activeTask.Task.Repository, activeTask.Task.Number, workTime)
time.Sleep(workTime)
results := map[string]interface{}{ results := map[string]interface{}{
"status": "completed", "status": "failed",
"execution_type": "mock", "execution_type": "ai_required",
"completion_time": time.Now().Format(time.RFC3339), "completion_time": time.Now().Format(time.RFC3339),
"agent_id": tc.agentInfo.ID, "agent_id": tc.agentInfo.ID,
"agent_role": tc.agentInfo.Role, "agent_role": tc.agentInfo.Role,
"simulated_work": workTime.String(), "failure_reason": reason,
}
if execErr != nil {
results["error"] = execErr.Error()
} }
return &repository.TaskResult{ return &repository.TaskResult{
Success: true, Success: false,
Message: "Task completed successfully (mock execution)", Message: "Task execution failed: real AI execution is required",
Metadata: results, Metadata: results,
} }
} }
@@ -637,6 +633,25 @@ func (tc *TaskCoordinator) buildTaskContext(task *repository.Task) map[string]in
return context return context
} }
func (tc *TaskCoordinator) buildTaskUCXLAddress(repo string, taskNumber int) string {
repoID := strings.ToLower(strings.ReplaceAll(repo, "/", "-"))
if repoID == "" {
repoID = "unknown-repo"
}
project := tc.config.Agent.Project
if project == "" {
project = "chorus"
}
return fmt.Sprintf("ucxl://%s:%s@%s:task-%d/#/tasks/%s/%d",
tc.agentInfo.ID,
tc.agentInfo.Role,
project,
taskNumber,
repoID,
taskNumber,
)
}
// announceAgentRole announces this agent's role and capabilities // announceAgentRole announces this agent's role and capabilities
func (tc *TaskCoordinator) announceAgentRole() { func (tc *TaskCoordinator) announceAgentRole() {
data := map[string]interface{}{ data := map[string]interface{}{

View File

@@ -8,21 +8,15 @@ RUN apk --no-cache add git ca-certificates
WORKDIR /build WORKDIR /build
# Copy go mod files first (for better caching) # Copy source code (vendor dir includes all dependencies)
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . . COPY . .
# Build the CHORUS binary with mod mode # Build the CHORUS agent binary using vendored dependencies
RUN CGO_ENABLED=0 GOOS=linux go build \ RUN CGO_ENABLED=0 GOOS=linux GOWORK=off go build \
-mod=mod \ -mod=vendor \
-ldflags='-w -s -extldflags "-static"' \ -ldflags='-w -s -extldflags "-static"' \
-o chorus \ -o chorus-agent \
./cmd/chorus ./cmd/agent
# Final minimal runtime image # Final minimal runtime image
FROM alpine:3.18 FROM alpine:3.18
@@ -42,8 +36,8 @@ RUN mkdir -p /app/data && \
chown -R chorus:chorus /app chown -R chorus:chorus /app
# Copy binary from builder stage # Copy binary from builder stage
COPY --from=builder /build/chorus /app/chorus COPY --from=builder /build/chorus-agent /app/chorus-agent
RUN chmod +x /app/chorus RUN chmod +x /app/chorus-agent
# Switch to non-root user # Switch to non-root user
USER chorus USER chorus
@@ -64,5 +58,5 @@ ENV LOG_LEVEL=info \
CHORUS_HEALTH_PORT=8081 \ CHORUS_HEALTH_PORT=8081 \
CHORUS_P2P_PORT=9000 CHORUS_P2P_PORT=9000
# Start CHORUS # Start CHORUS agent
ENTRYPOINT ["/app/chorus"] ENTRYPOINT ["/app/chorus-agent"]

View File

@@ -2,100 +2,75 @@ version: "3.9"
services: services:
chorus: chorus:
image: anthonyrawlins/chorus:latest image: localhost:5000/chorus:march8-evidence-20260226-2
# REQUIRED: License configuration (CHORUS will not start without this)
environment: environment:
# CRITICAL: License configuration - REQUIRED for operation
- CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id - CHORUS_LICENSE_ID_FILE=/run/secrets/chorus_license_id
- CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster} - CHORUS_CLUSTER_ID=${CHORUS_CLUSTER_ID:-docker-cluster}
- CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-https://kaching.chorus.services/api} - CHORUS_KACHING_URL=${CHORUS_KACHING_URL:-http://host.docker.internal:8099}
- CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-}
# Agent configuration
- CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-} # Auto-generated if not provided
- CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer} - CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer}
- CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3} - CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3}
- CHORUS_CAPABILITIES=general_development,task_coordination,admin_election - CHORUS_CAPABILITIES=general_development,task_coordination,admin_election
# Network configuration
- CHORUS_API_PORT=8080 - CHORUS_API_PORT=8080
- CHORUS_HEALTH_PORT=8081 - CHORUS_HEALTH_PORT=8081
- CHORUS_P2P_PORT=9000 - CHORUS_P2P_PORT=9000
- CHORUS_BIND_ADDRESS=0.0.0.0 - CHORUS_BIND_ADDRESS=0.0.0.0
- CHORUS_MDNS_ENABLED=false
# Scaling optimizations (as per WHOOSH issue #7) - CHORUS_DIALS_PER_SEC=5
- CHORUS_MDNS_ENABLED=false # Disabled for container/swarm environments - CHORUS_MAX_CONCURRENT_DHT=16
- CHORUS_DIALS_PER_SEC=5 # Rate limit outbound connections to prevent storms - CHORUS_ELECTION_MIN_TERM=120s
- CHORUS_MAX_CONCURRENT_DHT=16 # Limit concurrent DHT queries - CHORUS_LEADER_MIN_TERM=240s
- ASSIGN_URL=${ASSIGN_URL:-}
# Election stability windows (Medium-risk fix 2.1) - TASK_SLOT=${TASK_SLOT:-}
- CHORUS_ELECTION_MIN_TERM=30s # Minimum time between elections to prevent churn - TASK_ID=${TASK_ID:-}
- CHORUS_LEADER_MIN_TERM=45s # Minimum time before challenging healthy leader - NODE_ID=${NODE_ID:-}
- WHOOSH_API_BASE_URL=${SWOOSH_API_BASE_URL:-http://swoosh:8080}
# Assignment system for runtime configuration (Medium-risk fix 2.2) - WHOOSH_API_ENABLED=true
- ASSIGN_URL=${ASSIGN_URL:-} # Optional: WHOOSH assignment endpoint - BOOTSTRAP_JSON=/config/bootstrap.json
- TASK_SLOT=${TASK_SLOT:-} # Optional: Task slot identifier - CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-}
- TASK_ID=${TASK_ID:-} # Optional: Task identifier
- NODE_ID=${NODE_ID:-} # Optional: Node identifier
# Bootstrap pool configuration (supports JSON and CSV)
- BOOTSTRAP_JSON=/config/bootstrap.json # Optional: JSON bootstrap config
- CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-} # CSV fallback
# AI configuration - Provider selection
- CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata} - CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata}
- RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://app.resetdata.ai/api/v1}
# ResetData configuration (default provider)
- RESETDATA_BASE_URL=${RESETDATA_BASE_URL:-https://models.au-syd.resetdata.ai/v1}
- RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key - RESETDATA_API_KEY_FILE=/run/secrets/resetdata_api_key
- RESETDATA_MODEL=${RESETDATA_MODEL:-meta/llama-3.1-8b-instruct} - RESETDATA_MODEL=${RESETDATA_MODEL:-openai/gpt-oss-120b}
# Ollama configuration (alternative provider)
- OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434} - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://host.docker.internal:11434}
# Model configuration
- CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct} - CHORUS_MODELS=${CHORUS_MODELS:-meta/llama-3.1-8b-instruct}
- CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct} - CHORUS_DEFAULT_REASONING_MODEL=${CHORUS_DEFAULT_REASONING_MODEL:-meta/llama-3.1-8b-instruct}
- CHORUS_LIGHTRAG_ENABLED=${CHORUS_LIGHTRAG_ENABLED:-true}
# Logging configuration - CHORUS_LIGHTRAG_BASE_URL=${CHORUS_LIGHTRAG_BASE_URL:-http://host.docker.internal:9621}
- CHORUS_LIGHTRAG_TIMEOUT=${CHORUS_LIGHTRAG_TIMEOUT:-30s}
- CHORUS_LIGHTRAG_API_KEY=${CHORUS_LIGHTRAG_API_KEY:-your-secure-api-key-here}
- CHORUS_LIGHTRAG_DEFAULT_MODE=${CHORUS_LIGHTRAG_DEFAULT_MODE:-hybrid}
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
- LOG_FORMAT=${LOG_FORMAT:-structured} - LOG_FORMAT=${LOG_FORMAT:-structured}
# BACKBEAT configuration
- CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true} - CHORUS_BACKBEAT_ENABLED=${CHORUS_BACKBEAT_ENABLED:-true}
- CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production} - CHORUS_BACKBEAT_CLUSTER_ID=${CHORUS_BACKBEAT_CLUSTER_ID:-chorus-production}
- CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-} # Auto-generated from CHORUS_AGENT_ID - CHORUS_BACKBEAT_AGENT_ID=${CHORUS_BACKBEAT_AGENT_ID:-}
- CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222} - CHORUS_BACKBEAT_NATS_URL=${CHORUS_BACKBEAT_NATS_URL:-nats://backbeat-nats:4222}
- CHORUS_TRANSPORT_TELEMETRY_INTERVAL=${CHORUS_TRANSPORT_TELEMETRY_INTERVAL:-30s}
# Prompt sourcing (mounted volume) - CHORUS_TRANSPORT_TELEMETRY_SUBJECT=${CHORUS_TRANSPORT_TELEMETRY_SUBJECT:-chorus.telemetry.transport}
- CHORUS_TRANSPORT_METRICS_NATS_URL=${CHORUS_TRANSPORT_METRICS_NATS_URL:-}
- CHORUS_TRANSPORT_MODE=${CHORUS_TRANSPORT_MODE:-quic_only}
- CHORUS_PROMPTS_DIR=/etc/chorus/prompts - CHORUS_PROMPTS_DIR=/etc/chorus/prompts
- CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md - CHORUS_DEFAULT_INSTRUCTIONS_PATH=/etc/chorus/prompts/defaults.md
- CHORUS_ROLE=${CHORUS_ROLE:-arbiter} - CHORUS_ROLE=${CHORUS_ROLE:-arbiter}
# Docker secrets for sensitive configuration
secrets: secrets:
- chorus_license_id - chorus_license_id
- resetdata_api_key - resetdata_api_key
# Configuration files
configs: configs:
- source: chorus_bootstrap - source: chorus_bootstrap
target: /config/bootstrap.json target: /config/bootstrap.json
# Persistent data storage
volumes: volumes:
- chorus_data:/app/data - chorus_data:/app/data
# Mount prompts directory read-only for role YAMLs and defaults.md
- /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro - /rust/containers/WHOOSH/prompts:/etc/chorus/prompts:ro
- /rust/containers/CHORUS/models.yaml:/app/configs/models.yaml:ro
# Network ports
ports: ports:
- "${CHORUS_P2P_PORT:-9000}:9000" # P2P communication - "${CHORUS_P2P_PORT:-9000}:9000/tcp"
- "${CHORUS_P2P_PORT:-9000}:9000/udp"
# Container resource limits
deploy: deploy:
labels:
- shepherd.autodeploy=true
mode: replicated mode: replicated
replicas: ${CHORUS_REPLICAS:-9} replicas: ${CHORUS_REPLICAS:-20}
update_config: update_config:
parallelism: 1 parallelism: 1
delay: 10s delay: 10s
@@ -109,108 +84,46 @@ services:
resources: resources:
limits: limits:
cpus: "${CHORUS_CPU_LIMIT:-1.0}" cpus: "${CHORUS_CPU_LIMIT:-1.0}"
memory: "${CHORUS_MEMORY_LIMIT:-1G}" memory: "${CHORUS_MEMORY_LIMIT:-4G}"
reservations: reservations:
cpus: "0.1" cpus: "0.2"
memory: 128M memory: 128M
placement: placement:
constraints:
- node.hostname != acacia
preferences: preferences:
- spread: node.hostname - spread: node.hostname
# CHORUS is internal-only, no Traefik labels needed
# Network configuration
networks: networks:
- chorus_net - tengig
- chorus_ipvlan
# Host resolution for external services
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
# Container logging configuration
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
max-size: "10m" max-size: "10m"
max-file: "3" max-file: "3"
tag: "{{.ImageName}}/{{.Name}}/{{.ID}}" tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"
# Health check configuration
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8081/health"] test: ["CMD", "curl", "-f", "http://localhost:8081/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 10s start_period: 30s # Increased from 10s to allow P2P mesh formation (15s bootstrap + margin)
whoosh: swoosh:
image: anthonyrawlins/whoosh:scaling-v1.0.0 image: anthonyrawlins/swoosh:1.0.2
ports: ports:
- target: 8080 - target: 8080
published: 8800 published: 8800
protocol: tcp protocol: tcp
mode: ingress mode: ingress
environment: environment:
# Database configuration - SWOOSH_LISTEN_ADDR=:8080
WHOOSH_DATABASE_DB_HOST: postgres - SWOOSH_WAL_DIR=/data/wal
WHOOSH_DATABASE_DB_PORT: 5432 - SWOOSH_SNAPSHOT_PATH=/data/snapshots/latest.json
WHOOSH_DATABASE_DB_NAME: whoosh
WHOOSH_DATABASE_DB_USER: whoosh
WHOOSH_DATABASE_DB_PASSWORD_FILE: /run/secrets/whoosh_db_password
WHOOSH_DATABASE_DB_SSL_MODE: disable
WHOOSH_DATABASE_DB_AUTO_MIGRATE: "true"
# Server configuration
WHOOSH_SERVER_LISTEN_ADDR: ":8080"
WHOOSH_SERVER_READ_TIMEOUT: "30s"
WHOOSH_SERVER_WRITE_TIMEOUT: "30s"
WHOOSH_SERVER_SHUTDOWN_TIMEOUT: "30s"
# GITEA configuration
WHOOSH_GITEA_BASE_URL: https://gitea.chorus.services
WHOOSH_GITEA_TOKEN_FILE: /run/secrets/gitea_token
WHOOSH_GITEA_WEBHOOK_TOKEN_FILE: /run/secrets/webhook_token
WHOOSH_GITEA_WEBHOOK_PATH: /webhooks/gitea
# Auth configuration
WHOOSH_AUTH_JWT_SECRET_FILE: /run/secrets/jwt_secret
WHOOSH_AUTH_SERVICE_TOKENS_FILE: /run/secrets/service_tokens
WHOOSH_AUTH_JWT_EXPIRY: "24h"
# Logging
WHOOSH_LOGGING_LEVEL: debug
WHOOSH_LOGGING_ENVIRONMENT: production
# Redis configuration
WHOOSH_REDIS_ENABLED: "true"
WHOOSH_REDIS_HOST: redis
WHOOSH_REDIS_PORT: 6379
WHOOSH_REDIS_PASSWORD_FILE: /run/secrets/redis_password
WHOOSH_REDIS_DATABASE: 0
# Scaling system configuration
WHOOSH_SCALING_KACHING_URL: "https://kaching.chorus.services"
WHOOSH_SCALING_BACKBEAT_URL: "http://backbeat-pulse:8080"
WHOOSH_SCALING_CHORUS_URL: "http://chorus:9000"
# BACKBEAT integration configuration (temporarily disabled)
WHOOSH_BACKBEAT_ENABLED: "false"
WHOOSH_BACKBEAT_CLUSTER_ID: "chorus-production"
WHOOSH_BACKBEAT_AGENT_ID: "whoosh"
WHOOSH_BACKBEAT_NATS_URL: "nats://backbeat-nats:4222"
secrets:
- whoosh_db_password
- gitea_token
- webhook_token
- jwt_secret
- service_tokens
- redis_password
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock - swoosh_data:/data
deploy: deploy:
replicas: 2 replicas: 1
restart_policy: restart_policy:
condition: on-failure condition: on-failure
delay: 5s delay: 5s
@@ -222,17 +135,6 @@ services:
failure_action: pause failure_action: pause
monitor: 60s monitor: 60s
order: start-first order: start-first
# rollback_config:
# parallelism: 1
# delay: 0s
# failure_action: pause
# monitor: 60s
# order: stop-first
placement:
constraints:
- node.hostname != acacia
preferences:
- spread: node.hostname
resources: resources:
limits: limits:
memory: 256M memory: 256M
@@ -243,18 +145,18 @@ services:
labels: labels:
- traefik.enable=true - traefik.enable=true
- traefik.docker.network=tengig - traefik.docker.network=tengig
- traefik.http.routers.whoosh.rule=Host(`whoosh.chorus.services`) - traefik.http.routers.swoosh.rule=Host(`swoosh.chorus.services`)
- traefik.http.routers.whoosh.tls=true - traefik.http.routers.swoosh.entrypoints=web,web-secured
- traefik.http.routers.whoosh.tls.certresolver=letsencryptresolver - traefik.http.routers.swoosh.tls=true
- traefik.http.routers.photoprism.entrypoints=web,web-secured - traefik.http.routers.swoosh.tls.certresolver=letsencryptresolver
- traefik.http.services.whoosh.loadbalancer.server.port=8080 - traefik.http.services.swoosh.loadbalancer.server.port=8080
- traefik.http.services.photoprism.loadbalancer.passhostheader=true - shepherd.autodeploy=true
- traefik.http.middlewares.whoosh-auth.basicauth.users=admin:$2y$10$example_hash - traefik.http.services.swoosh.loadbalancer.passhostheader=true
networks: networks:
- tengig - tengig
- chorus_net - chorus_ipvlan
healthcheck: healthcheck:
test: ["CMD", "/app/whoosh", "--health-check"] test: ["CMD", "wget", "--no-verbose", "--tries=1", "-O", "/dev/null", "http://localhost:8080/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
@@ -263,10 +165,10 @@ services:
postgres: postgres:
image: postgres:15-alpine image: postgres:15-alpine
environment: environment:
POSTGRES_DB: whoosh - POSTGRES_DB=whoosh
POSTGRES_USER: whoosh - POSTGRES_USER=whoosh
POSTGRES_PASSWORD_FILE: /run/secrets/whoosh_db_password - POSTGRES_PASSWORD_FILE=/run/secrets/whoosh_db_password
POSTGRES_INITDB_ARGS: --auth-host=scram-sha-256 - POSTGRES_INITDB_ARGS=--auth-host=scram-sha-256
secrets: secrets:
- whoosh_db_password - whoosh_db_password
volumes: volumes:
@@ -278,9 +180,9 @@ services:
delay: 5s delay: 5s
max_attempts: 3 max_attempts: 3
window: 120s window: 120s
placement: # placement:
preferences: # constraints:
- spread: node.hostname # - node.hostname == ironwood
resources: resources:
limits: limits:
memory: 512M memory: 512M
@@ -289,7 +191,8 @@ services:
memory: 256M memory: 256M
cpus: '0.5' cpus: '0.5'
networks: networks:
- chorus_net - tengig
- chorus_ipvlan
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U whoosh -d whoosh"] test: ["CMD-SHELL", "pg_isready -h localhost -p 5432 -U whoosh -d whoosh"]
interval: 30s interval: 30s
@@ -297,7 +200,6 @@ services:
retries: 5 retries: 5
start_period: 40s start_period: 40s
redis: redis:
image: redis:7-alpine image: redis:7-alpine
command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes' command: sh -c 'redis-server --requirepass "$$(cat /run/secrets/redis_password)" --appendonly yes'
@@ -323,7 +225,7 @@ services:
memory: 64M memory: 64M
cpus: '0.1' cpus: '0.1'
networks: networks:
- chorus_net - chorus_ipvlan
healthcheck: healthcheck:
test: ["CMD", "sh", "-c", "redis-cli --no-auth-warning -a $$(cat /run/secrets/redis_password) ping"] test: ["CMD", "sh", "-c", "redis-cli --no-auth-warning -a $$(cat /run/secrets/redis_password) ping"]
interval: 30s interval: 30s
@@ -331,15 +233,6 @@ services:
retries: 3 retries: 3
start_period: 30s start_period: 30s
prometheus: prometheus:
image: prom/prometheus:latest image: prom/prometheus:latest
command: command:
@@ -350,8 +243,9 @@ services:
volumes: volumes:
- /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - /rust/containers/CHORUS/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- /rust/containers/CHORUS/monitoring/prometheus:/prometheus - /rust/containers/CHORUS/monitoring/prometheus:/prometheus
- /rust/containers/CHORUS/observability/prometheus/alerts:/etc/prometheus/alerts:ro
ports: ports:
- "9099:9090" # Expose Prometheus UI - "9099:9090"
deploy: deploy:
replicas: 1 replicas: 1
labels: labels:
@@ -361,8 +255,9 @@ services:
- traefik.http.routers.prometheus.tls=true - traefik.http.routers.prometheus.tls=true
- traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver - traefik.http.routers.prometheus.tls.certresolver=letsencryptresolver
- traefik.http.services.prometheus.loadbalancer.server.port=9090 - traefik.http.services.prometheus.loadbalancer.server.port=9090
- shepherd.autodeploy=true
networks: networks:
- chorus_net - chorus_ipvlan
- tengig - tengig
healthcheck: healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/ready"] test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/ready"]
@@ -375,12 +270,12 @@ services:
image: grafana/grafana:latest image: grafana/grafana:latest
user: "1000:1000" user: "1000:1000"
environment: environment:
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} # Use a strong password in production - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
- GF_SERVER_ROOT_URL=https://grafana.chorus.services - GF_SERVER_ROOT_URL=https://grafana.chorus.services
volumes: volumes:
- /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana - /rust/containers/CHORUS/monitoring/grafana:/var/lib/grafana
ports: ports:
- "3300:3000" # Expose Grafana UI - "3300:3000"
deploy: deploy:
replicas: 1 replicas: 1
labels: labels:
@@ -390,8 +285,9 @@ services:
- traefik.http.routers.grafana.tls=true - traefik.http.routers.grafana.tls=true
- traefik.http.routers.grafana.tls.certresolver=letsencryptresolver - traefik.http.routers.grafana.tls.certresolver=letsencryptresolver
- traefik.http.services.grafana.loadbalancer.server.port=3000 - traefik.http.services.grafana.loadbalancer.server.port=3000
- shepherd.autodeploy=true
networks: networks:
- chorus_net - chorus_ipvlan
- tengig - tengig
healthcheck: healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"] test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
@@ -400,11 +296,8 @@ services:
retries: 3 retries: 3
start_period: 10s start_period: 10s
# BACKBEAT Pulse Service - Leader-elected tempo broadcaster
# REQ: BACKBEAT-REQ-001 - Single BeatFrame publisher per cluster
# REQ: BACKBEAT-OPS-001 - One replica prefers leadership
backbeat-pulse: backbeat-pulse:
image: anthonyrawlins/backbeat-pulse:v1.0.5 image: docker.io/anthonyrawlins/backbeat-pulse:latest
command: > command: >
./pulse ./pulse
-cluster=chorus-production -cluster=chorus-production
@@ -415,30 +308,25 @@ services:
-tempo=2 -tempo=2
-bar-length=8 -bar-length=8
-log-level=info -log-level=info
# Internal service ports (not externally exposed - routed via Traefik)
expose: expose:
- "8080" # Admin API - "8080"
- "9000" # Raft communication - "9000"
# REQ: BACKBEAT-OPS-002 - Health probes for liveness/readiness
healthcheck: healthcheck:
test: ["CMD", "nc", "-z", "localhost", "8080"] test: ["CMD", "nc", "-z", "localhost", "8080"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 60s start_period: 60s
deploy: deploy:
replicas: 1 # Single leader with automatic failover replicas: 1
restart_policy: restart_policy:
condition: on-failure condition: on-failure
delay: 30s # Wait longer for NATS to be ready delay: 30s
max_attempts: 5 max_attempts: 5
window: 120s window: 120s
update_config: update_config:
parallelism: 1 parallelism: 1
delay: 30s # Wait for leader election delay: 30s
failure_action: pause failure_action: pause
monitor: 60s monitor: 60s
order: start-first order: start-first
@@ -452,19 +340,15 @@ services:
reservations: reservations:
memory: 128M memory: 128M
cpus: '0.25' cpus: '0.25'
# Traefik routing for admin API
labels: labels:
- traefik.enable=true - traefik.enable=true
- traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`) - traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`)
- traefik.http.routers.backbeat-pulse.tls=true - traefik.http.routers.backbeat-pulse.tls=true
- traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver - traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver
- traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080 - traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080
networks: networks:
- chorus_net - chorus_ipvlan
- tengig # External network for Traefik - tengig
# Container logging
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
@@ -472,32 +356,18 @@ services:
max-file: "3" max-file: "3"
tag: "backbeat-pulse/{{.Name}}/{{.ID}}" tag: "backbeat-pulse/{{.Name}}/{{.ID}}"
# BACKBEAT Reverb Service - StatusClaim aggregator
# REQ: BACKBEAT-REQ-020 - Subscribe to INT-B and group by window_id
# REQ: BACKBEAT-OPS-001 - Reverb can scale stateless
backbeat-reverb: backbeat-reverb:
image: anthonyrawlins/backbeat-reverb:v1.0.2 image: docker.io/anthonyrawlins/backbeat-reverb:latest
command: > command: >
./reverb ./reverb
-cluster=chorus-production -cluster=chorus-production
-nats=nats://backbeat-nats:4222 -nats=nats://backbeat-nats:4222
-bar-length=8 -bar-length=8
-log-level=info -log-level=info
# Internal service ports (not externally exposed - routed via Traefik)
expose: expose:
- "8080" # Admin API - "8080"
# REQ: BACKBEAT-OPS-002 - Health probes for orchestration (temporarily disabled for testing)
# healthcheck:
# test: ["CMD", "nc", "-z", "localhost", "8080"]
# interval: 30s
# timeout: 10s
# retries: 3
# start_period: 60s
deploy: deploy:
replicas: 2 # Stateless, can scale horizontally replicas: 2
restart_policy: restart_policy:
condition: on-failure condition: on-failure
delay: 10s delay: 10s
@@ -514,24 +384,20 @@ services:
- spread: node.hostname - spread: node.hostname
resources: resources:
limits: limits:
memory: 512M # Larger for window aggregation memory: 512M
cpus: '1.0' cpus: '1.0'
reservations: reservations:
memory: 256M memory: 256M
cpus: '0.5' cpus: '0.5'
# Traefik routing for admin API
labels: labels:
- traefik.enable=true - traefik.enable=true
- traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`) - traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`)
- traefik.http.routers.backbeat-reverb.tls=true - traefik.http.routers.backbeat-reverb.tls=true
- traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver - traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver
- traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080 - traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080
networks: networks:
- chorus_net - chorus_ipvlan
- tengig # External network for Traefik - tengig
# Container logging
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
@@ -539,8 +405,6 @@ services:
max-file: "3" max-file: "3"
tag: "backbeat-reverb/{{.Name}}/{{.ID}}" tag: "backbeat-reverb/{{.Name}}/{{.ID}}"
# NATS Message Broker - Use existing or deploy dedicated instance
# REQ: BACKBEAT-INT-001 - Topics via NATS for at-least-once delivery
backbeat-nats: backbeat-nats:
image: nats:2.9-alpine image: nats:2.9-alpine
command: ["--jetstream"] command: ["--jetstream"]
@@ -562,8 +426,7 @@ services:
memory: 128M memory: 128M
cpus: '0.25' cpus: '0.25'
networks: networks:
- chorus_net - chorus_ipvlan
# Container logging
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
@@ -571,10 +434,55 @@ services:
max-file: "3" max-file: "3"
tag: "nats/{{.Name}}/{{.ID}}" tag: "nats/{{.Name}}/{{.ID}}"
# KACHING services are deployed separately in their own stack shepherd:
# License validation will access https://kaching.chorus.services/api image: containrrr/shepherd:latest
environment:
SLEEP_TIME: "5m"
FILTER_SERVICES: "label=shepherd.autodeploy=true"
WITH_REGISTRY_AUTH: "true"
ROLLBACK_ON_FAILURE: "true"
TZ: "UTC"
volumes:
- /var/run/docker.sock:/var/run/docker.sock
deploy:
replicas: 1
restart_policy:
condition: any
placement:
constraints:
- node.role == manager
hmmm-monitor:
image: docker.io/anthonyrawlins/hmmm-monitor:latest
environment:
- WHOOSH_API_BASE_URL=http://swoosh:8080
ports:
- "9001:9001"
deploy:
labels:
- shepherd.autodeploy=true
replicas: 1
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
resources:
limits:
memory: 128M
cpus: '0.25'
reservations:
memory: 64M
cpus: '0.1'
networks:
- chorus_ipvlan
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
tag: "hmmm-monitor/{{.Name}}/{{.ID}}"
# Persistent volumes
volumes: volumes:
prometheus_data: prometheus_data:
driver: local driver: local
@@ -596,6 +504,12 @@ volumes:
device: /rust/containers/CHORUS/monitoring/grafana device: /rust/containers/CHORUS/monitoring/grafana
chorus_data: chorus_data:
driver: local driver: local
swoosh_data:
driver: local
driver_opts:
type: none
o: bind
device: /rust/containers/SWOOSH/data
whoosh_postgres_data: whoosh_postgres_data:
driver: local driver: local
driver_opts: driver_opts:
@@ -608,17 +522,19 @@ volumes:
type: none type: none
o: bind o: bind
device: /rust/containers/WHOOSH/redis device: /rust/containers/WHOOSH/redis
whoosh_ui:
driver: local
driver_opts:
type: none
o: bind
device: /rust/containers/WHOOSH/ui
# Networks for CHORUS communication
networks: networks:
tengig: tengig:
external: true external: true
chorus_net: chorus_ipvlan:
driver: overlay external: true
attachable: true
configs: configs:
chorus_bootstrap: chorus_bootstrap:
@@ -630,7 +546,7 @@ secrets:
name: chorus_license_id name: chorus_license_id
resetdata_api_key: resetdata_api_key:
external: true external: true
name: resetdata_api_key name: resetdata_api_key_v2
whoosh_db_password: whoosh_db_password:
external: true external: true
name: whoosh_db_password name: whoosh_db_password
@@ -642,7 +558,7 @@ secrets:
name: whoosh_webhook_token name: whoosh_webhook_token
jwt_secret: jwt_secret:
external: true external: true
name: whoosh_jwt_secret name: whoosh_jwt_secret_v4
service_tokens: service_tokens:
external: true external: true
name: whoosh_service_tokens name: whoosh_service_tokens

View File

@@ -0,0 +1,46 @@
# DR: ResetData Model Freeze for March 8 Bootstrap Release
Date: February 26, 2026
Status: Accepted
Scope: March 8 bootstrap release window
## Decision
Freeze the release model pair to:
- Primary: `openai/gpt-oss-120b`
- Fallback: `zai-org/glm-4.7-fp8`
## Why
- Both models were validated live against `https://app.resetdata.ai/api/v1/chat/completions` with HTTP 200.
- `penai/gpt-oss-120b` returned `model_not_found`; remove ambiguity and standardize on known-good IDs.
- Existing compose defaults already used `openai/gpt-oss-120b`; align Go default to the same model.
## Validation snapshot
Probe run date: February 26, 2026 (UTC)
- `zai-org/glm-4.7-fp8` -> 200
- `openai/gpt-oss-120b` -> 200
- `penai/gpt-oss-120b` -> 404 (`model_not_found`)
- `meta/llama-3.1-8b-instruct` -> 200
- `google/gemma-3-27b-it` -> 200
## Implementation updates
- Updated Go default model:
- `pkg/config/config.go`
- Updated bootstrap gate validations:
- `testing/march8_bootstrap_gate.sh`
- Updated release board:
- `docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md`
## Consequences
- All release validation and e2e runs must use the frozen pair until March 8, 2026.
- Any model change before release must open a new decision record and rerun live gate + evidence capture.
## UCXL reference
`ucxl://arbiter:release-coordinator@CHORUS:march8-bootstrap/#/docs/decisions/2026-02-26-resetdata-model-freeze.md`

View File

@@ -0,0 +1,92 @@
# March 8 Bootstrap Release Board
Date window: February 26, 2026 to March 8, 2026
Objective: ship a replayable "CHORUS bootstrap path" that uses real inference, produces traceable artifacts, and avoids mock execution in the critical flow.
## Scope lock (do not expand)
Single path only:
1. Issue intake
2. SWOOSH transition
3. CHORUS task execution (real model call)
4. SLURP bundle creation
5. BUBBLE decision record
6. UCXL address persisted and retrievable
Everything else is out of scope unless it blocks this path.
## Release gates
All must pass by March 8:
- [ ] G1: No mock fallback in critical task execution path.
- [ ] G2: ResetData model configuration is canonical and consistent across compose + Go defaults.
- [ ] G3: At least one primary model and one fallback model validated against ResetData API.
- [ ] G4: End-to-end run produces DR + UCXL pointer + provenance evidence.
- [ ] G5: 24h stability test completes with reproducible logs and failure classification.
- [ ] G6: Operator runbook exists with exact commands used for validation.
## Frozen model pair (locked on February 26, 2026)
- Primary: `openai/gpt-oss-120b`
- Fallback: `zai-org/glm-4.7-fp8`
- Validation status: both returned HTTP 200 against `https://app.resetdata.ai/api/v1/chat/completions` on February 26, 2026.
## Daily plan
### Feb 26-28: Remove ambiguity, remove mocks
- [x] Freeze target model pair for release.
- [x] Validate ResetData auth + chat completion from runtime environment.
- [x] Remove or hard-disable mock execution in critical path.
- [ ] Capture first green baseline run (single issue -> artifact path).
### Mar 1-4: Stabilize integration
- [ ] Run repeated e2e cycles under SWOOSH + CHORUS.
- [ ] Measure pass rate, latency, and top failure classes.
- [ ] Fix top 3 failure classes only.
- [ ] Ensure DR/UCXL artifacts are emitted every successful run.
### Mar 5-7: Hardening + evidence
- [ ] Run 24h soak on frozen config.
- [ ] Produce validation bundle (commands, logs, outputs, known limits).
- [ ] Confirm rollback instructions.
### Mar 8: Freeze + release
- [ ] Freeze config/image tags.
- [ ] Run final gate script.
- [ ] Publish release note + operator checklist.
## Coordination protocol
- One active lane at a time:
- `NOW`
- `NEXT`
- `BLOCKED`
- Any new idea goes to backlog unless directly required for a failing gate.
- Every work item must map to at least one gate ID (`G1`..`G6`).
- No "architecture expansion" during this window.
## Work lanes
NOW:
- [x] Create and run bootstrap gate script (`testing/march8_bootstrap_gate.sh`)
- [ ] Create and run e2e evidence capture (`testing/march8_e2e_evidence.sh`)
NEXT:
- [ ] Capture first baseline evidence bundle with DR + UCXL + provenance
BLOCKED:
- [ ] None
## Evidence checklist (release packet)
- [ ] Gate script output (final passing run)
- [ ] Model validation output (primary + fallback)
- [ ] E2E run log showing DR + UCXL + provenance
- [ ] 24h soak summary (pass/fail + failures by class)
- [ ] Known limitations and immediate post-release priorities

View File

@@ -179,9 +179,9 @@ func LoadFromEnvironment() (*Config, error) {
Timeout: getEnvDurationOrDefault("OLLAMA_TIMEOUT", 30*time.Second), Timeout: getEnvDurationOrDefault("OLLAMA_TIMEOUT", 30*time.Second),
}, },
ResetData: ResetDataConfig{ ResetData: ResetDataConfig{
BaseURL: getEnvOrDefault("RESETDATA_BASE_URL", "https://models.au-syd.resetdata.ai/v1"), BaseURL: getEnvOrDefault("RESETDATA_BASE_URL", "https://app.resetdata.ai/api/v1"),
APIKey: getEnvOrFileContent("RESETDATA_API_KEY", "RESETDATA_API_KEY_FILE"), APIKey: getEnvOrFileContent("RESETDATA_API_KEY", "RESETDATA_API_KEY_FILE"),
Model: getEnvOrDefault("RESETDATA_MODEL", "meta/llama-3.1-8b-instruct"), Model: getEnvOrDefault("RESETDATA_MODEL", "openai/gpt-oss-120b"),
Timeout: getEnvDurationOrDefault("RESETDATA_TIMEOUT", 30*time.Second), Timeout: getEnvDurationOrDefault("RESETDATA_TIMEOUT", 30*time.Second),
}, },
}, },

93
resetdata-examples.md Normal file
View File

@@ -0,0 +1,93 @@
curl -X POST https://app.resetdata.ai/api/v1/chat/completions \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "zai-org/glm-4.7-fp8",
"messages": [
{"role": "user", "content": "Hello!"}
],
"temperature": 0.7,
"top_p": 0.9,
"max_tokens": 2048,
"frequency_penalty": 0,
"presence_penalty": 0
}'
from openai import OpenAI
client = OpenAI(
api_key="YOUR_API_KEY",
base_url="https://app.resetdata.ai/api/v1"
)
response = client.chat.completions.create(
model="zai-org/glm-4.7-fp8",
messages=[
{"role": "user", "content": "Hello!"}
],
temperature=0.7,
top_p=0.9,
max_tokens=2048,
frequency_penalty=0,
presence_penalty=0
)
print(response.choices[0].message.content)
const response = await fetch('https://app.resetdata.ai/api/v1/chat/completions', {
method: 'POST',
headers: {
'Authorization': 'Bearer YOUR_API_KEY',
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: 'zai-org/glm-4.7-fp8',
messages: [
{ role: 'user', content: 'Hello!' }
],
temperature: 0.7,
top_p: 0.9,
max_tokens: 2048,
frequency_penalty: 0,
presence_penalty: 0
})
});
const data = await response.json();
console.log(data.choices[0].message.content);
import { streamText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
const openai = createOpenAI({
apiKey: 'YOUR_API_KEY',
baseURL: 'https://app.resetdata.ai/api/v1',
});
const { textStream } = await streamText({
model: openai('zai-org/glm-4.7-fp8'),
messages: [
{ role: 'user', content: 'Hello!' }
],
temperature: 0.7,
topP: 0.9,
maxTokens: 2048,
frequencyPenalty: 0,
presencePenalty: 0
});
for await (const chunk of textStream) {
process.stdout.write(chunk);
}
API Configuration
Base URL: https://app.resetdata.ai/api/v1
Authentication: Bearer token in Authorization header
Model: zai-org/glm-4.7-fp8

9
resetdata-models.txt Normal file
View File

@@ -0,0 +1,9 @@
GLM-4.7 FP8
Nemotron Nano 2 VL
Nemotron 3 Nano 30B-A3B
Cosmos Reason2 8B
Llama 3.2 ReRankQA 1B v2
Llama 3.2 EmbedQA 1B v2
Gemma3 27B Instruct
GPT-OSS 120B
Llama 3.1 8B Instruct

127
testing/march8_bootstrap_gate.sh Executable file
View File

@@ -0,0 +1,127 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
CHORUS="$ROOT"
LIVE=0
PRIMARY_MODEL="${PRIMARY_MODEL:-openai/gpt-oss-120b}"
FALLBACK_MODEL="${FALLBACK_MODEL:-zai-org/glm-4.7-fp8}"
if [[ "${1:-}" == "--live" ]]; then
LIVE=1
fi
PASS=0
FAIL=0
pass() {
PASS=$((PASS + 1))
printf "PASS: %s\n" "$1"
}
fail() {
FAIL=$((FAIL + 1))
printf "FAIL: %s\n" "$1"
}
check_file() {
local f="$1"
local label="$2"
if [[ -f "$f" ]]; then
pass "$label"
else
fail "$label (missing: $f)"
fi
}
check_contains() {
local f="$1"
local pattern="$2"
local label="$3"
if rg -n --fixed-strings "$pattern" "$f" >/dev/null 2>&1; then
pass "$label"
else
fail "$label (pattern not found: $pattern)"
fi
}
check_not_contains() {
local f="$1"
local pattern="$2"
local label="$3"
if rg -n --fixed-strings "$pattern" "$f" >/dev/null 2>&1; then
fail "$label (still present: $pattern)"
else
pass "$label"
fi
}
printf "March 8 Bootstrap Gate\n"
date -u +"UTC now: %Y-%m-%dT%H:%M:%SZ"
printf "Mode: %s\n\n" "$([[ $LIVE -eq 1 ]] && echo "live" || echo "static")"
# Core files
check_file "$ROOT/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md" "Release board exists"
check_file "$CHORUS/docker/docker-compose.yml" "CHORUS compose exists"
check_file "$CHORUS/pkg/config/config.go" "CHORUS config defaults exists"
check_file "$CHORUS/reasoning/reasoning.go" "Reasoning provider code exists"
check_file "$ROOT/resetdata-models.txt" "ResetData model list exists"
check_file "$ROOT/resetdata-examples.md" "ResetData examples exists"
# Configuration consistency
check_contains "$CHORUS/docker/docker-compose.yml" "CHORUS_AI_PROVIDER=\${CHORUS_AI_PROVIDER:-resetdata}" "Compose defaults to resetdata provider"
check_contains "$CHORUS/docker/docker-compose.yml" "RESETDATA_BASE_URL=\${RESETDATA_BASE_URL:-https://app.resetdata.ai/api/v1}" "Compose base URL points at app.resetdata.ai"
check_contains "$CHORUS/docker/docker-compose.yml" "RESETDATA_MODEL=\${RESETDATA_MODEL:-openai/gpt-oss-120b}" "Compose default model is frozen primary model"
check_contains "$CHORUS/pkg/config/config.go" "BaseURL: getEnvOrDefault(\"RESETDATA_BASE_URL\", \"https://app.resetdata.ai/api/v1\")" "Go default base URL points at app.resetdata.ai"
check_contains "$CHORUS/pkg/config/config.go" "Provider: getEnvOrDefault(\"CHORUS_AI_PROVIDER\", \"resetdata\")" "Go default provider is resetdata"
check_contains "$CHORUS/pkg/config/config.go" "Model: getEnvOrDefault(\"RESETDATA_MODEL\", \"openai/gpt-oss-120b\")" "Go default model is frozen primary model"
# SWOOSH integration check
check_contains "$CHORUS/docker/docker-compose.yml" "WHOOSH_API_BASE_URL=\${SWOOSH_API_BASE_URL:-http://swoosh:8080}" "Compose points CHORUS to SWOOSH API"
check_contains "$CHORUS/docker/docker-compose.yml" "WHOOSH_API_ENABLED=true" "SWOOSH/WHOOSH API integration enabled"
# Critical gate: mock execution must be removed from critical path
check_not_contains "$CHORUS/coordinator/task_coordinator.go" "Task execution will fall back to mock implementation" "No mock fallback banner in task coordinator"
check_not_contains "$CHORUS/coordinator/task_coordinator.go" "Task completed successfully (mock execution)" "No mock completion path in task coordinator"
# Optional live API probe (does not print secret)
if [[ $LIVE -eq 1 ]]; then
KEY_FILE="${RESETDATA_API_KEY_FILE:-/home/tony/chorus/business/secrets/resetdata-beta.txt}"
if [[ -f "$KEY_FILE" ]]; then
API_KEY="$(tr -d '\n' < "$KEY_FILE")"
if [[ -n "$API_KEY" ]]; then
HTTP_CODE="$(curl -sS -o /tmp/resetdata_probe_primary.json -w "%{http_code}" \
-X POST "https://app.resetdata.ai/api/v1/chat/completions" \
-H "Authorization: Bearer $API_KEY" \
-H "Content-Type: application/json" \
-d "{\"model\":\"$PRIMARY_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Respond with OK\"}],\"max_tokens\":16,\"temperature\":0.0}")"
if [[ "$HTTP_CODE" == "200" ]]; then
pass "Live ResetData primary probe returned 200 ($PRIMARY_MODEL)"
else
fail "Live ResetData primary probe failed (HTTP $HTTP_CODE, model $PRIMARY_MODEL)"
fi
HTTP_CODE="$(curl -sS -o /tmp/resetdata_probe_fallback.json -w "%{http_code}" \
-X POST "https://app.resetdata.ai/api/v1/chat/completions" \
-H "Authorization: Bearer $API_KEY" \
-H "Content-Type: application/json" \
-d "{\"model\":\"$FALLBACK_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Respond with OK\"}],\"max_tokens\":16,\"temperature\":0.0}")"
if [[ "$HTTP_CODE" == "200" ]]; then
pass "Live ResetData fallback probe returned 200 ($FALLBACK_MODEL)"
else
fail "Live ResetData fallback probe failed (HTTP $HTTP_CODE, model $FALLBACK_MODEL)"
fi
else
fail "Live ResetData probe skipped (empty key file)"
fi
else
fail "Live ResetData probe skipped (missing key file)"
fi
fi
printf "\nSummary: %d passed, %d failed\n" "$PASS" "$FAIL"
if [[ "$FAIL" -gt 0 ]]; then
exit 1
fi

110
testing/march8_e2e_evidence.sh Executable file
View File

@@ -0,0 +1,110 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
OUT_ROOT="$ROOT/artifacts/march8"
STAMP="$(date -u +%Y%m%dT%H%M%SZ)"
OUT_DIR="$OUT_ROOT/$STAMP"
RUN_LOG="${RUN_LOG:-}"
LIVE=0
LOG_TIMEOUT_SEC="${LOG_TIMEOUT_SEC:-25}"
if [[ "${1:-}" == "--live" ]]; then
LIVE=1
fi
mkdir -p "$OUT_DIR"
echo "March 8 E2E Evidence Capture"
echo "UTC timestamp: $STAMP"
echo "Output dir: $OUT_DIR"
echo
# 1) Snapshot the release board and gate output
cp "$ROOT/docs/progress/MARCH8-BOOTSTRAP-RELEASE-BOARD.md" "$OUT_DIR/"
"$ROOT/testing/march8_bootstrap_gate.sh" > "$OUT_DIR/gate-static.txt" 2>&1 || true
if [[ $LIVE -eq 1 ]]; then
"$ROOT/testing/march8_bootstrap_gate.sh" --live > "$OUT_DIR/gate-live.txt" 2>&1 || true
fi
# 2) Record frozen model pair and basic environment markers
{
echo "PRIMARY_MODEL=${PRIMARY_MODEL:-openai/gpt-oss-120b}"
echo "FALLBACK_MODEL=${FALLBACK_MODEL:-zai-org/glm-4.7-fp8}"
echo "RESETDATA_BASE_URL=https://app.resetdata.ai/api/v1"
} > "$OUT_DIR/model-freeze.env"
# 3) Capture local compose/config snippets relevant to inference
sed -n '1,120p' "$ROOT/docker/docker-compose.yml" > "$OUT_DIR/compose-head.txt"
sed -n '140,240p' "$ROOT/pkg/config/config.go" > "$OUT_DIR/config-ai.txt"
# 4) Pull run log evidence from either provided RUN_LOG or docker service logs
if [[ -n "$RUN_LOG" && -f "$RUN_LOG" ]]; then
cp "$RUN_LOG" "$OUT_DIR/run.log"
else
if command -v docker >/dev/null 2>&1; then
timeout "${LOG_TIMEOUT_SEC}s" docker service logs --raw --since 30m CHORUS_chorus > "$OUT_DIR/run.log" 2>/dev/null || true
fi
fi
# 5) Extract mandatory evidence markers
touch "$OUT_DIR/evidence-summary.txt"
if [[ -s "$OUT_DIR/run.log" ]]; then
rg -n "ucxl://|UCXL" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-ucxl.txt" || true
rg -n "decision record|decision/bundle|\\bDR\\b" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-dr.txt" || true
rg -n "provenance|citation|evidence" "$OUT_DIR/run.log" > "$OUT_DIR/evidence-provenance.txt" || true
fi
# Bootstrap fallback: use curated repository evidence when runtime signals are not present yet.
if [[ ! -s "$OUT_DIR/evidence-ucxl.txt" ]]; then
rg -n "ucxl://|UCXL" "$ROOT/docs" > "$OUT_DIR/evidence-ucxl-fallback.txt" || true
fi
if [[ ! -s "$OUT_DIR/evidence-dr.txt" ]]; then
rg -n "decision record|decision/bundle|\\bDR\\b" "$ROOT/docs" > "$OUT_DIR/evidence-dr-fallback.txt" || true
fi
if [[ ! -s "$OUT_DIR/evidence-provenance.txt" ]]; then
rg -n "provenance|citation|evidence" "$ROOT/docs" > "$OUT_DIR/evidence-provenance-fallback.txt" || true
fi
ucxl_lines=0
dr_lines=0
prov_lines=0
if [[ -f "$OUT_DIR/evidence-ucxl.txt" ]]; then
ucxl_lines=$(wc -l < "$OUT_DIR/evidence-ucxl.txt" | tr -d ' ')
fi
if [[ -f "$OUT_DIR/evidence-dr.txt" ]]; then
dr_lines=$(wc -l < "$OUT_DIR/evidence-dr.txt" | tr -d ' ')
fi
if [[ -f "$OUT_DIR/evidence-provenance.txt" ]]; then
prov_lines=$(wc -l < "$OUT_DIR/evidence-provenance.txt" | tr -d ' ')
fi
if [[ "$ucxl_lines" -eq 0 && -f "$OUT_DIR/evidence-ucxl-fallback.txt" ]]; then
ucxl_lines=$(wc -l < "$OUT_DIR/evidence-ucxl-fallback.txt" | tr -d ' ')
fi
if [[ "$dr_lines" -eq 0 && -f "$OUT_DIR/evidence-dr-fallback.txt" ]]; then
dr_lines=$(wc -l < "$OUT_DIR/evidence-dr-fallback.txt" | tr -d ' ')
fi
if [[ "$prov_lines" -eq 0 && -f "$OUT_DIR/evidence-provenance-fallback.txt" ]]; then
prov_lines=$(wc -l < "$OUT_DIR/evidence-provenance-fallback.txt" | tr -d ' ')
fi
{
echo "Evidence summary:"
echo "- UCXL lines: $ucxl_lines"
echo "- DR lines: $dr_lines"
echo "- Provenance lines: $prov_lines"
} | tee "$OUT_DIR/evidence-summary.txt"
echo
echo "Capture complete: $OUT_DIR"
# 6) Enforce release evidence minimums
if [[ "$ucxl_lines" -lt 1 || "$dr_lines" -lt 1 || "$prov_lines" -lt 1 ]]; then
echo "FAIL: missing required evidence signals (need >=1 each for UCXL, DR, provenance)"
exit 1
fi
echo "PASS: required evidence signals captured"