feat: Implement CHORUS scaling improvements for robust autoscaling

Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent
license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
- License gate already implemented with cache + circuit breaker + grace window
- mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
- Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
- Connection manager with watermarks (32 low, 128 high)
- AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
- Assignment merge layer with HTTP/file config + SIGHUP reload
- Runtime configuration system with WHOOSH assignment API support
- Election stability windows to prevent churn:
  - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
  - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging a healthy leader)
- Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse,
addressing all scaling concerns from WHOOSH issue #7.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
anthonyrawlins
2025-09-23 17:50:40 +10:00
parent 26e4ef7d8b
commit e523c4b543
8 changed files with 776 additions and 20 deletions

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"log"
"math/rand"
"os"
"sync"
"time"
@@ -102,6 +103,11 @@ type ElectionManager struct {
onAdminChanged func(oldAdmin, newAdmin string)
onElectionComplete func(winner string)
// Stability window to prevent election churn (Medium-risk fix 2.1)
lastElectionTime time.Time
electionStabilityWindow time.Duration
leaderStabilityWindow time.Duration
startTime time.Time
}
@@ -137,6 +143,10 @@ func NewElectionManager(
votes: make(map[string]string),
electionTrigger: make(chan ElectionTrigger, 10),
startTime: time.Now(),
// Initialize stability windows (as per WHOOSH issue #7)
electionStabilityWindow: getElectionStabilityWindow(cfg),
leaderStabilityWindow: getLeaderStabilityWindow(cfg),
}
// Initialize heartbeat manager
@@ -220,11 +230,13 @@ func (em *ElectionManager) Stop() {
}
}
// TriggerElection manually triggers an election
// TriggerElection manually triggers an election with stability window checks
func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) {
// Check if election already in progress
em.mu.RLock()
currentState := em.state
currentAdmin := em.currentAdmin
lastElection := em.lastElectionTime
em.mu.RUnlock()
if currentState != StateIdle {
@@ -232,6 +244,26 @@ func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) {
return
}
// Apply stability window to prevent election churn (WHOOSH issue #7)
now := time.Now()
if !lastElection.IsZero() {
timeSinceElection := now.Sub(lastElection)
// If we have a current admin, check leader stability window
if currentAdmin != "" && timeSinceElection < em.leaderStabilityWindow {
log.Printf("⏳ Leader stability window active (%.1fs remaining), ignoring trigger: %s",
(em.leaderStabilityWindow - timeSinceElection).Seconds(), trigger)
return
}
// General election stability window
if timeSinceElection < em.electionStabilityWindow {
log.Printf("⏳ Election stability window active (%.1fs remaining), ignoring trigger: %s",
(em.electionStabilityWindow - timeSinceElection).Seconds(), trigger)
return
}
}
select {
case em.electionTrigger <- trigger:
log.Printf("🗳️ Election triggered: %s", trigger)
@@ -442,6 +474,7 @@ func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
em.mu.Lock()
em.state = StateElecting
em.currentTerm++
em.lastElectionTime = time.Now() // Record election timestamp for stability window
term := em.currentTerm
em.candidates = make(map[string]*AdminCandidate)
em.votes = make(map[string]string)
@@ -1119,3 +1152,43 @@ func (hm *HeartbeatManager) GetHeartbeatStatus() map[string]interface{} {
return status
}
// Helper functions for stability window configuration
// getElectionStabilityWindow returns the minimum time that must elapse between
// two elections.
//
// Resolution order:
//  1. The CHORUS_ELECTION_MIN_TERM environment variable, when it parses as a
//     non-negative time.Duration (for example "30s"). A zero value is accepted
//     and yields a zero-length window. An unparseable or negative value is
//     logged and skipped rather than silently ignored, so a typo such as "30"
//     (missing unit) is visible to the operator.
//  2. Twice cfg.Security.ElectionConfig.DiscoveryTimeout, when cfg is non-nil
//     and that timeout is positive.
//  3. A 30-second default.
func getElectionStabilityWindow(cfg *config.Config) time.Duration {
	const envKey = "CHORUS_ELECTION_MIN_TERM"

	if raw := os.Getenv(envKey); raw != "" {
		window, err := time.ParseDuration(raw)
		switch {
		case err != nil:
			log.Printf("⚠️ Ignoring invalid %s=%q: %v", envKey, raw, err)
		case window < 0:
			log.Printf("⚠️ Ignoring negative %s=%q", envKey, raw)
		default:
			return window
		}
	}

	// Derive the window from the configured discovery timeout when available.
	// The nil check keeps a missing config from panicking here.
	if cfg != nil && cfg.Security.ElectionConfig.DiscoveryTimeout > 0 {
		return cfg.Security.ElectionConfig.DiscoveryTimeout * 2
	}

	// Default fallback.
	return 30 * time.Second
}
// getLeaderStabilityWindow returns the minimum time that must elapse after an
// election before a current leader may be challenged.
//
// Resolution order:
//  1. The CHORUS_LEADER_MIN_TERM environment variable, when it parses as a
//     non-negative time.Duration (for example "45s"). A zero value is accepted
//     and yields a zero-length window. An unparseable or negative value is
//     logged and skipped rather than silently ignored, so a typo such as "45"
//     (missing unit) is visible to the operator.
//  2. Three times cfg.Security.ElectionConfig.HeartbeatTimeout, when cfg is
//     non-nil and that timeout is positive.
//  3. A 45-second default.
func getLeaderStabilityWindow(cfg *config.Config) time.Duration {
	const envKey = "CHORUS_LEADER_MIN_TERM"

	if raw := os.Getenv(envKey); raw != "" {
		window, err := time.ParseDuration(raw)
		switch {
		case err != nil:
			log.Printf("⚠️ Ignoring invalid %s=%q: %v", envKey, raw, err)
		case window < 0:
			log.Printf("⚠️ Ignoring negative %s=%q", envKey, raw)
		default:
			return window
		}
	}

	// Derive the window from the configured heartbeat timeout when available.
	// The nil check keeps a missing config from panicking here.
	if cfg != nil && cfg.Security.ElectionConfig.HeartbeatTimeout > 0 {
		return cfg.Security.ElectionConfig.HeartbeatTimeout * 3
	}

	// Default fallback.
	return 45 * time.Second
}