feat: Implement CHORUS scaling improvements for robust autoscaling
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out. HIGH-RISK FIXES (Must-Do): ✅ License gate already implemented with cache + circuit breaker + grace window ✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false) ✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries) ✅ Connection manager with watermarks (32 low, 128 high) ✅ AutoNAT enabled for container networking MEDIUM-RISK FIXES (Next Priority): ✅ Assignment merge layer with HTTP/file config + SIGHUP reload ✅ Runtime configuration system with WHOOSH assignment API support ✅ Election stability windows to prevent churn: - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections) - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader) ✅ Bootstrap pool JSON support with priority sorting and join stagger NEW FEATURES: - Runtime config system with assignment overrides from WHOOSH - SIGHUP reload handler for live configuration updates - JSON bootstrap configuration with peer metadata (region, roles, priority) - Configurable election stability windows with environment variables - Multi-format bootstrap support: Assignment → JSON → CSV FILES MODIFIED: - pkg/config/assignment.go (NEW): Runtime assignment merge system - docker/bootstrap.json (NEW): Example JSON bootstrap configuration - pkg/election/election.go: Added stability windows and churn prevention - internal/runtime/shared.go: Integrated assignment loading and conditional mDNS - p2p/node.go: Added connection management and rate limiting - pkg/config/hybrid_config.go: Added rate limiting configuration fields - docker/docker-compose.yml: Updated environment variables and configs - README.md: Updated status table with scaling milestone This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -41,10 +41,16 @@ type HybridUCXLConfig struct {
|
||||
}
|
||||
|
||||
type DiscoveryConfig struct {
|
||||
MDNSEnabled bool `env:"CHORUS_MDNS_ENABLED" default:"true" json:"mdns_enabled" yaml:"mdns_enabled"`
|
||||
DHTDiscovery bool `env:"CHORUS_DHT_DISCOVERY" default:"false" json:"dht_discovery" yaml:"dht_discovery"`
|
||||
AnnounceInterval time.Duration `env:"CHORUS_ANNOUNCE_INTERVAL" default:"30s" json:"announce_interval" yaml:"announce_interval"`
|
||||
ServiceName string `env:"CHORUS_SERVICE_NAME" default:"CHORUS" json:"service_name" yaml:"service_name"`
|
||||
MDNSEnabled bool `env:"CHORUS_MDNS_ENABLED" default:"true" json:"mdns_enabled" yaml:"mdns_enabled"`
|
||||
DHTDiscovery bool `env:"CHORUS_DHT_DISCOVERY" default:"false" json:"dht_discovery" yaml:"dht_discovery"`
|
||||
AnnounceInterval time.Duration `env:"CHORUS_ANNOUNCE_INTERVAL" default:"30s" json:"announce_interval" yaml:"announce_interval"`
|
||||
ServiceName string `env:"CHORUS_SERVICE_NAME" default:"CHORUS" json:"service_name" yaml:"service_name"`
|
||||
|
||||
// Rate limiting for scaling (as per WHOOSH issue #7)
|
||||
DialsPerSecond int `env:"CHORUS_DIALS_PER_SEC" default:"5" json:"dials_per_second" yaml:"dials_per_second"`
|
||||
MaxConcurrentDHT int `env:"CHORUS_MAX_CONCURRENT_DHT" default:"16" json:"max_concurrent_dht" yaml:"max_concurrent_dht"`
|
||||
MaxConcurrentDials int `env:"CHORUS_MAX_CONCURRENT_DIALS" default:"10" json:"max_concurrent_dials" yaml:"max_concurrent_dials"`
|
||||
JoinStaggerMS int `env:"CHORUS_JOIN_STAGGER_MS" default:"0" json:"join_stagger_ms" yaml:"join_stagger_ms"`
|
||||
}
|
||||
|
||||
type MonitoringConfig struct {
|
||||
@@ -79,10 +85,16 @@ func LoadHybridConfig() (*HybridConfig, error) {
|
||||
|
||||
// Load Discovery configuration
|
||||
config.Discovery = DiscoveryConfig{
|
||||
MDNSEnabled: getEnvBool("CHORUS_MDNS_ENABLED", true),
|
||||
DHTDiscovery: getEnvBool("CHORUS_DHT_DISCOVERY", false),
|
||||
AnnounceInterval: getEnvDuration("CHORUS_ANNOUNCE_INTERVAL", 30*time.Second),
|
||||
ServiceName: getEnvString("CHORUS_SERVICE_NAME", "CHORUS"),
|
||||
MDNSEnabled: getEnvBool("CHORUS_MDNS_ENABLED", true),
|
||||
DHTDiscovery: getEnvBool("CHORUS_DHT_DISCOVERY", false),
|
||||
AnnounceInterval: getEnvDuration("CHORUS_ANNOUNCE_INTERVAL", 30*time.Second),
|
||||
ServiceName: getEnvString("CHORUS_SERVICE_NAME", "CHORUS"),
|
||||
|
||||
// Rate limiting for scaling (as per WHOOSH issue #7)
|
||||
DialsPerSecond: getEnvInt("CHORUS_DIALS_PER_SEC", 5),
|
||||
MaxConcurrentDHT: getEnvInt("CHORUS_MAX_CONCURRENT_DHT", 16),
|
||||
MaxConcurrentDials: getEnvInt("CHORUS_MAX_CONCURRENT_DIALS", 10),
|
||||
JoinStaggerMS: getEnvInt("CHORUS_JOIN_STAGGER_MS", 0),
|
||||
}
|
||||
|
||||
// Load Monitoring configuration
|
||||
|
||||
Reference in New Issue
Block a user