feat: Implement CHORUS scaling improvements for robust autoscaling
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
✅ License gate already implemented with cache + circuit breaker + grace window
✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
✅ Connection manager with watermarks (32 low, 128 high)
✅ AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
✅ Assignment merge layer with HTTP/file config + SIGHUP reload
✅ Runtime configuration system with WHOOSH assignment API support
✅ Election stability windows to prevent churn:
  - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
  - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader)
✅ Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7.
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"math/rand"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -102,6 +103,11 @@ type ElectionManager struct {
|
||||
onAdminChanged func(oldAdmin, newAdmin string)
|
||||
onElectionComplete func(winner string)
|
||||
|
||||
// Stability window to prevent election churn (Medium-risk fix 2.1)
|
||||
lastElectionTime time.Time
|
||||
electionStabilityWindow time.Duration
|
||||
leaderStabilityWindow time.Duration
|
||||
|
||||
startTime time.Time
|
||||
}
|
||||
|
||||
@@ -137,6 +143,10 @@ func NewElectionManager(
|
||||
votes: make(map[string]string),
|
||||
electionTrigger: make(chan ElectionTrigger, 10),
|
||||
startTime: time.Now(),
|
||||
|
||||
// Initialize stability windows (as per WHOOSH issue #7)
|
||||
electionStabilityWindow: getElectionStabilityWindow(cfg),
|
||||
leaderStabilityWindow: getLeaderStabilityWindow(cfg),
|
||||
}
|
||||
|
||||
// Initialize heartbeat manager
|
||||
@@ -220,11 +230,13 @@ func (em *ElectionManager) Stop() {
|
||||
}
|
||||
}
|
||||
|
||||
// TriggerElection manually triggers an election
|
||||
// TriggerElection manually triggers an election with stability window checks
|
||||
func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) {
|
||||
// Check if election already in progress
|
||||
em.mu.RLock()
|
||||
currentState := em.state
|
||||
currentAdmin := em.currentAdmin
|
||||
lastElection := em.lastElectionTime
|
||||
em.mu.RUnlock()
|
||||
|
||||
if currentState != StateIdle {
|
||||
@@ -232,6 +244,26 @@ func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) {
|
||||
return
|
||||
}
|
||||
|
||||
// Apply stability window to prevent election churn (WHOOSH issue #7)
|
||||
now := time.Now()
|
||||
if !lastElection.IsZero() {
|
||||
timeSinceElection := now.Sub(lastElection)
|
||||
|
||||
// If we have a current admin, check leader stability window
|
||||
if currentAdmin != "" && timeSinceElection < em.leaderStabilityWindow {
|
||||
log.Printf("⏳ Leader stability window active (%.1fs remaining), ignoring trigger: %s",
|
||||
(em.leaderStabilityWindow - timeSinceElection).Seconds(), trigger)
|
||||
return
|
||||
}
|
||||
|
||||
// General election stability window
|
||||
if timeSinceElection < em.electionStabilityWindow {
|
||||
log.Printf("⏳ Election stability window active (%.1fs remaining), ignoring trigger: %s",
|
||||
(em.electionStabilityWindow - timeSinceElection).Seconds(), trigger)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
select {
|
||||
case em.electionTrigger <- trigger:
|
||||
log.Printf("🗳️ Election triggered: %s", trigger)
|
||||
@@ -442,6 +474,7 @@ func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
|
||||
em.mu.Lock()
|
||||
em.state = StateElecting
|
||||
em.currentTerm++
|
||||
em.lastElectionTime = time.Now() // Record election timestamp for stability window
|
||||
term := em.currentTerm
|
||||
em.candidates = make(map[string]*AdminCandidate)
|
||||
em.votes = make(map[string]string)
|
||||
@@ -1119,3 +1152,43 @@ func (hm *HeartbeatManager) GetHeartbeatStatus() map[string]interface{} {
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// Helper functions for stability window configuration
|
||||
|
||||
// getElectionStabilityWindow gets the minimum time between elections
|
||||
func getElectionStabilityWindow(cfg *config.Config) time.Duration {
|
||||
// Try to get from environment or use default
|
||||
if stability := os.Getenv("CHORUS_ELECTION_MIN_TERM"); stability != "" {
|
||||
if duration, err := time.ParseDuration(stability); err == nil {
|
||||
return duration
|
||||
}
|
||||
}
|
||||
|
||||
// Try to get from config structure if it exists
|
||||
if cfg.Security.ElectionConfig.DiscoveryTimeout > 0 {
|
||||
// Use double the discovery timeout as default stability window
|
||||
return cfg.Security.ElectionConfig.DiscoveryTimeout * 2
|
||||
}
|
||||
|
||||
// Default fallback
|
||||
return 30 * time.Second
|
||||
}
|
||||
|
||||
// getLeaderStabilityWindow gets the minimum time before challenging a healthy leader
|
||||
func getLeaderStabilityWindow(cfg *config.Config) time.Duration {
|
||||
// Try to get from environment or use default
|
||||
if stability := os.Getenv("CHORUS_LEADER_MIN_TERM"); stability != "" {
|
||||
if duration, err := time.ParseDuration(stability); err == nil {
|
||||
return duration
|
||||
}
|
||||
}
|
||||
|
||||
// Try to get from config structure if it exists
|
||||
if cfg.Security.ElectionConfig.HeartbeatTimeout > 0 {
|
||||
// Use 3x heartbeat timeout as default leader stability
|
||||
return cfg.Security.ElectionConfig.HeartbeatTimeout * 3
|
||||
}
|
||||
|
||||
// Default fallback
|
||||
return 45 * time.Second
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user