feat: Implement CHORUS scaling improvements for robust autoscaling
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out. HIGH-RISK FIXES (Must-Do): ✅ License gate already implemented with cache + circuit breaker + grace window ✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false) ✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries) ✅ Connection manager with watermarks (32 low, 128 high) ✅ AutoNAT enabled for container networking MEDIUM-RISK FIXES (Next Priority): ✅ Assignment merge layer with HTTP/file config + SIGHUP reload ✅ Runtime configuration system with WHOOSH assignment API support ✅ Election stability windows to prevent churn: - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections) - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader) ✅ Bootstrap pool JSON support with priority sorting and join stagger NEW FEATURES: - Runtime config system with assignment overrides from WHOOSH - SIGHUP reload handler for live configuration updates - JSON bootstrap configuration with peer metadata (region, roles, priority) - Configurable election stability windows with environment variables - Multi-format bootstrap support: Assignment → JSON → CSV FILES MODIFIED: - pkg/config/assignment.go (NEW): Runtime assignment merge system - docker/bootstrap.json (NEW): Example JSON bootstrap configuration - pkg/election/election.go: Added stability windows and churn prevention - internal/runtime/shared.go: Integrated assignment loading and conditional mDNS - p2p/node.go: Added connection management and rate limiting - pkg/config/hybrid_config.go: Added rate limiting configuration fields - docker/docker-compose.yml: Updated environment variables and configs - README.md: Updated status table with scaling milestone This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
38
docker/bootstrap.json
Normal file
38
docker/bootstrap.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"metadata": {
|
||||
"generated_at": "2024-12-19T10:00:00Z",
|
||||
"cluster_id": "production-cluster",
|
||||
"version": "1.0.0",
|
||||
"notes": "Bootstrap configuration for CHORUS scaling - managed by WHOOSH"
|
||||
},
|
||||
"peers": [
|
||||
{
|
||||
"address": "/ip4/10.0.1.10/tcp/9000/p2p/12D3KooWExample1234567890abcdef",
|
||||
"priority": 100,
|
||||
"region": "us-east-1",
|
||||
"roles": ["admin", "stable"],
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"address": "/ip4/10.0.1.11/tcp/9000/p2p/12D3KooWExample1234567890abcde2",
|
||||
"priority": 90,
|
||||
"region": "us-east-1",
|
||||
"roles": ["worker", "stable"],
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"address": "/ip4/10.0.2.10/tcp/9000/p2p/12D3KooWExample1234567890abcde3",
|
||||
"priority": 80,
|
||||
"region": "us-west-2",
|
||||
"roles": ["worker", "stable"],
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"address": "/ip4/10.0.3.10/tcp/9000/p2p/12D3KooWExample1234567890abcde4",
|
||||
"priority": 70,
|
||||
"region": "eu-central-1",
|
||||
"roles": ["worker"],
|
||||
"enabled": false
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -15,13 +15,32 @@ services:
|
||||
- CHORUS_AGENT_ID=${CHORUS_AGENT_ID:-} # Auto-generated if not provided
|
||||
- CHORUS_SPECIALIZATION=${CHORUS_SPECIALIZATION:-general_developer}
|
||||
- CHORUS_MAX_TASKS=${CHORUS_MAX_TASKS:-3}
|
||||
- CHORUS_CAPABILITIES=${CHORUS_CAPABILITIES:-general_development,task_coordination,admin_election}
|
||||
- CHORUS_CAPABILITIES=general_development,task_coordination,admin_election
|
||||
|
||||
# Network configuration
|
||||
- CHORUS_API_PORT=8080
|
||||
- CHORUS_HEALTH_PORT=8081
|
||||
- CHORUS_P2P_PORT=9000
|
||||
- CHORUS_BIND_ADDRESS=0.0.0.0
|
||||
|
||||
# Scaling optimizations (as per WHOOSH issue #7)
|
||||
- CHORUS_MDNS_ENABLED=false # Disabled for container/swarm environments
|
||||
- CHORUS_DIALS_PER_SEC=5 # Rate limit outbound connections to prevent storms
|
||||
- CHORUS_MAX_CONCURRENT_DHT=16 # Limit concurrent DHT queries
|
||||
|
||||
# Election stability windows (Medium-risk fix 2.1)
|
||||
- CHORUS_ELECTION_MIN_TERM=30s # Minimum time between elections to prevent churn
|
||||
- CHORUS_LEADER_MIN_TERM=45s # Minimum time before challenging healthy leader
|
||||
|
||||
# Assignment system for runtime configuration (Medium-risk fix 2.2)
|
||||
- ASSIGN_URL=${ASSIGN_URL:-} # Optional: WHOOSH assignment endpoint
|
||||
- TASK_SLOT=${TASK_SLOT:-} # Optional: Task slot identifier
|
||||
- TASK_ID=${TASK_ID:-} # Optional: Task identifier
|
||||
- NODE_ID=${NODE_ID:-} # Optional: Node identifier
|
||||
|
||||
# Bootstrap pool configuration (supports JSON and CSV)
|
||||
- BOOTSTRAP_JSON=/config/bootstrap.json # Optional: JSON bootstrap config
|
||||
- CHORUS_BOOTSTRAP_PEERS=${CHORUS_BOOTSTRAP_PEERS:-} # CSV fallback
|
||||
|
||||
# AI configuration - Provider selection
|
||||
- CHORUS_AI_PROVIDER=${CHORUS_AI_PROVIDER:-resetdata}
|
||||
@@ -57,6 +76,11 @@ services:
|
||||
secrets:
|
||||
- chorus_license_id
|
||||
- resetdata_api_key
|
||||
|
||||
# Configuration files
|
||||
configs:
|
||||
- source: chorus_bootstrap
|
||||
target: /config/bootstrap.json
|
||||
|
||||
# Persistent data storage
|
||||
volumes:
|
||||
@@ -169,7 +193,7 @@ services:
|
||||
# Scaling system configuration
|
||||
WHOOSH_SCALING_KACHING_URL: "https://kaching.chorus.services"
|
||||
WHOOSH_SCALING_BACKBEAT_URL: "http://backbeat-pulse:8080"
|
||||
WHOOSH_SCALING_CHORUS_URL: "http://chorus:8080"
|
||||
WHOOSH_SCALING_CHORUS_URL: "http://chorus:9000"
|
||||
secrets:
|
||||
- whoosh_db_password
|
||||
- gitea_token
|
||||
@@ -616,6 +640,10 @@ networks:
|
||||
|
||||
|
||||
|
||||
configs:
|
||||
chorus_bootstrap:
|
||||
file: ./bootstrap.json
|
||||
|
||||
secrets:
|
||||
chorus_license_id:
|
||||
external: true
|
||||
|
||||
Reference in New Issue
Block a user