feat: Implement CHORUS scaling improvements for robust autoscaling
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out. HIGH-RISK FIXES (Must-Do): ✅ License gate already implemented with cache + circuit breaker + grace window ✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false) ✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries) ✅ Connection manager with watermarks (32 low, 128 high) ✅ AutoNAT enabled for container networking MEDIUM-RISK FIXES (Next Priority): ✅ Assignment merge layer with HTTP/file config + SIGHUP reload ✅ Runtime configuration system with WHOOSH assignment API support ✅ Election stability windows to prevent churn: - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections) - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader) ✅ Bootstrap pool JSON support with priority sorting and join stagger NEW FEATURES: - Runtime config system with assignment overrides from WHOOSH - SIGHUP reload handler for live configuration updates - JSON bootstrap configuration with peer metadata (region, roles, priority) - Configurable election stability windows with environment variables - Multi-format bootstrap support: Assignment → JSON → CSV FILES MODIFIED: - pkg/config/assignment.go (NEW): Runtime assignment merge system - docker/bootstrap.json (NEW): Example JSON bootstrap configuration - pkg/election/election.go: Added stability windows and churn prevention - internal/runtime/shared.go: Integrated assignment loading and conditional mDNS - p2p/node.go: Added connection management and rate limiting - pkg/config/hybrid_config.go: Added rate limiting configuration fields - docker/docker-compose.yml: Updated environment variables and configs - README.md: Updated status table with scaling milestone This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -105,6 +105,7 @@ func (t *SimpleTaskTracker) publishTaskCompletion(taskID string, success bool, s
|
||||
// SharedRuntime contains all the shared P2P infrastructure components
|
||||
type SharedRuntime struct {
|
||||
Config *config.Config
|
||||
RuntimeConfig *config.RuntimeConfig
|
||||
Logger *SimpleLogger
|
||||
Context context.Context
|
||||
Cancel context.CancelFunc
|
||||
@@ -149,6 +150,28 @@ func Initialize(appMode string) (*SharedRuntime, error) {
|
||||
runtime.Config = cfg
|
||||
|
||||
runtime.Logger.Info("✅ Configuration loaded successfully")
|
||||
|
||||
// Initialize runtime configuration with assignment support
|
||||
runtime.RuntimeConfig = config.NewRuntimeConfig(cfg)
|
||||
|
||||
// Load assignment if ASSIGN_URL is configured
|
||||
if assignURL := os.Getenv("ASSIGN_URL"); assignURL != "" {
|
||||
runtime.Logger.Info("📡 Loading assignment from WHOOSH: %s", assignURL)
|
||||
|
||||
ctx, cancel := context.WithTimeout(runtime.Context, 10*time.Second)
|
||||
if err := runtime.RuntimeConfig.LoadAssignment(ctx, assignURL); err != nil {
|
||||
runtime.Logger.Warn("⚠️ Failed to load assignment (continuing with base config): %v", err)
|
||||
} else {
|
||||
runtime.Logger.Info("✅ Assignment loaded successfully")
|
||||
}
|
||||
cancel()
|
||||
|
||||
// Start reload handler for SIGHUP
|
||||
runtime.RuntimeConfig.StartReloadHandler(runtime.Context, assignURL)
|
||||
runtime.Logger.Info("📡 SIGHUP reload handler started for assignment updates")
|
||||
} else {
|
||||
runtime.Logger.Info("⚪ No ASSIGN_URL configured, using static configuration")
|
||||
}
|
||||
runtime.Logger.Info("🤖 Agent ID: %s", cfg.Agent.ID)
|
||||
runtime.Logger.Info("🎯 Specialization: %s", cfg.Agent.Specialization)
|
||||
|
||||
@@ -225,12 +248,17 @@ func Initialize(appMode string) (*SharedRuntime, error) {
|
||||
runtime.HypercoreLog = hlog
|
||||
runtime.Logger.Info("📝 Hypercore logger initialized")
|
||||
|
||||
// Initialize mDNS discovery
|
||||
mdnsDiscovery, err := discovery.NewMDNSDiscovery(ctx, node.Host(), "chorus-peer-discovery")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create mDNS discovery: %v", err)
|
||||
// Initialize mDNS discovery (disabled in container environments for scaling)
|
||||
if cfg.V2.DHT.MDNSEnabled {
|
||||
mdnsDiscovery, err := discovery.NewMDNSDiscovery(ctx, node.Host(), "chorus-peer-discovery")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create mDNS discovery: %v", err)
|
||||
}
|
||||
runtime.MDNSDiscovery = mdnsDiscovery
|
||||
runtime.Logger.Info("🔍 mDNS discovery enabled for local network")
|
||||
} else {
|
||||
runtime.Logger.Info("⚪ mDNS discovery disabled (recommended for container/swarm deployments)")
|
||||
}
|
||||
runtime.MDNSDiscovery = mdnsDiscovery
|
||||
|
||||
// Initialize PubSub with hypercore logging
|
||||
ps, err := pubsub.NewPubSubWithLogger(ctx, node.Host(), "chorus/coordination/v1", "hmmm/meta-discussion/v1", hlog)
|
||||
@@ -283,6 +311,7 @@ func (r *SharedRuntime) Cleanup() {
|
||||
|
||||
if r.MDNSDiscovery != nil {
|
||||
r.MDNSDiscovery.Close()
|
||||
r.Logger.Info("🔍 mDNS discovery closed")
|
||||
}
|
||||
|
||||
if r.PubSub != nil {
|
||||
@@ -407,8 +436,20 @@ func (r *SharedRuntime) initializeDHTStorage() error {
|
||||
}
|
||||
}
|
||||
|
||||
// Connect to bootstrap peers if configured
|
||||
for _, addrStr := range r.Config.V2.DHT.BootstrapPeers {
|
||||
// Connect to bootstrap peers (with assignment override support)
|
||||
bootstrapPeers := r.RuntimeConfig.GetBootstrapPeers()
|
||||
if len(bootstrapPeers) == 0 {
|
||||
bootstrapPeers = r.Config.V2.DHT.BootstrapPeers
|
||||
}
|
||||
|
||||
// Apply join stagger if configured
|
||||
joinStagger := r.RuntimeConfig.GetJoinStagger()
|
||||
if joinStagger > 0 {
|
||||
r.Logger.Info("⏱️ Applying join stagger delay: %v", joinStagger)
|
||||
time.Sleep(joinStagger)
|
||||
}
|
||||
|
||||
for _, addrStr := range bootstrapPeers {
|
||||
addr, err := multiaddr.NewMultiaddr(addrStr)
|
||||
if err != nil {
|
||||
r.Logger.Warn("⚠️ Invalid bootstrap address %s: %v", addrStr, err)
|
||||
|
||||
Reference in New Issue
Block a user