feat: Implement CHORUS scaling improvements for robust autoscaling

Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent
license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
- License gate already implemented with cache + circuit breaker + grace window
- mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
- Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
- Connection manager with watermarks (32 low, 128 high)
- AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
- Assignment merge layer with HTTP/file config + SIGHUP reload
- Runtime configuration system with WHOOSH assignment API support
- Election stability windows to prevent churn:
  - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
  - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader)
- Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse,
addressing all scaling concerns from WHOOSH issue #7.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
anthonyrawlins
2025-09-23 17:50:40 +10:00
parent 26e4ef7d8b
commit e523c4b543
8 changed files with 776 additions and 20 deletions

View File

@@ -105,6 +105,7 @@ func (t *SimpleTaskTracker) publishTaskCompletion(taskID string, success bool, s
// SharedRuntime contains all the shared P2P infrastructure components
type SharedRuntime struct {
Config *config.Config
RuntimeConfig *config.RuntimeConfig
Logger *SimpleLogger
Context context.Context
Cancel context.CancelFunc
@@ -149,6 +150,28 @@ func Initialize(appMode string) (*SharedRuntime, error) {
runtime.Config = cfg
runtime.Logger.Info("✅ Configuration loaded successfully")
// Initialize runtime configuration with assignment support
runtime.RuntimeConfig = config.NewRuntimeConfig(cfg)
// Load assignment if ASSIGN_URL is configured
if assignURL := os.Getenv("ASSIGN_URL"); assignURL != "" {
runtime.Logger.Info("📡 Loading assignment from WHOOSH: %s", assignURL)
ctx, cancel := context.WithTimeout(runtime.Context, 10*time.Second)
if err := runtime.RuntimeConfig.LoadAssignment(ctx, assignURL); err != nil {
runtime.Logger.Warn("⚠️ Failed to load assignment (continuing with base config): %v", err)
} else {
runtime.Logger.Info("✅ Assignment loaded successfully")
}
cancel()
// Start reload handler for SIGHUP
runtime.RuntimeConfig.StartReloadHandler(runtime.Context, assignURL)
runtime.Logger.Info("📡 SIGHUP reload handler started for assignment updates")
} else {
runtime.Logger.Info("⚪ No ASSIGN_URL configured, using static configuration")
}
runtime.Logger.Info("🤖 Agent ID: %s", cfg.Agent.ID)
runtime.Logger.Info("🎯 Specialization: %s", cfg.Agent.Specialization)
@@ -225,12 +248,17 @@ func Initialize(appMode string) (*SharedRuntime, error) {
runtime.HypercoreLog = hlog
runtime.Logger.Info("📝 Hypercore logger initialized")
// Initialize mDNS discovery
mdnsDiscovery, err := discovery.NewMDNSDiscovery(ctx, node.Host(), "chorus-peer-discovery")
if err != nil {
return nil, fmt.Errorf("failed to create mDNS discovery: %v", err)
// Initialize mDNS discovery (disabled in container environments for scaling)
if cfg.V2.DHT.MDNSEnabled {
mdnsDiscovery, err := discovery.NewMDNSDiscovery(ctx, node.Host(), "chorus-peer-discovery")
if err != nil {
return nil, fmt.Errorf("failed to create mDNS discovery: %v", err)
}
runtime.MDNSDiscovery = mdnsDiscovery
runtime.Logger.Info("🔍 mDNS discovery enabled for local network")
} else {
runtime.Logger.Info("⚪ mDNS discovery disabled (recommended for container/swarm deployments)")
}
runtime.MDNSDiscovery = mdnsDiscovery
// Initialize PubSub with hypercore logging
ps, err := pubsub.NewPubSubWithLogger(ctx, node.Host(), "chorus/coordination/v1", "hmmm/meta-discussion/v1", hlog)
@@ -283,6 +311,7 @@ func (r *SharedRuntime) Cleanup() {
if r.MDNSDiscovery != nil {
r.MDNSDiscovery.Close()
r.Logger.Info("🔍 mDNS discovery closed")
}
if r.PubSub != nil {
@@ -407,8 +436,20 @@ func (r *SharedRuntime) initializeDHTStorage() error {
}
}
// Connect to bootstrap peers if configured
for _, addrStr := range r.Config.V2.DHT.BootstrapPeers {
// Connect to bootstrap peers (with assignment override support)
bootstrapPeers := r.RuntimeConfig.GetBootstrapPeers()
if len(bootstrapPeers) == 0 {
bootstrapPeers = r.Config.V2.DHT.BootstrapPeers
}
// Apply join stagger if configured
joinStagger := r.RuntimeConfig.GetJoinStagger()
if joinStagger > 0 {
r.Logger.Info("⏱️ Applying join stagger delay: %v", joinStagger)
time.Sleep(joinStagger)
}
for _, addrStr := range bootstrapPeers {
addr, err := multiaddr.NewMultiaddr(addrStr)
if err != nil {
r.Logger.Warn("⚠️ Invalid bootstrap address %s: %v", addrStr, err)