 e523c4b543
			
		
	
	e523c4b543
	
	
	
		
			
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
✅ License gate already implemented with cache + circuit breaker + grace window
✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
✅ Connection manager with watermarks (32 low, 128 high)
✅ AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
✅ Assignment merge layer with HTTP/file config + SIGHUP reload
✅ Runtime configuration system with WHOOSH assignment API support
✅ Election stability windows to prevent churn:
   - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
   - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader)
✅ Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			266 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			266 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package config
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| )
 | |
| 
 | |
| // HybridConfig manages feature flags and configuration for Phase 2 hybrid mode
 | |
| type HybridConfig struct {
 | |
| 	// DHT Configuration
 | |
| 	DHT HybridDHTConfig `json:"dht" yaml:"dht"`
 | |
| 	
 | |
| 	// UCXL Configuration  
 | |
| 	UCXL HybridUCXLConfig `json:"ucxl" yaml:"ucxl"`
 | |
| 	
 | |
| 	// Discovery Configuration
 | |
| 	Discovery DiscoveryConfig `json:"discovery" yaml:"discovery"`
 | |
| 	
 | |
| 	// Monitoring Configuration
 | |
| 	Monitoring MonitoringConfig `json:"monitoring" yaml:"monitoring"`
 | |
| }
 | |
| 
 | |
// HybridDHTConfig holds the DHT settings of the hybrid configuration. The
// `env` and `default` tags name the environment variable and fallback value
// associated with each field.
type HybridDHTConfig struct {
	// Backend names the DHT backend; Validate accepts "mock", "real", or
	// "hybrid".
	Backend string `env:"CHORUS_DHT_BACKEND" default:"mock" json:"backend" yaml:"backend"`
	// BootstrapNodes lists the bootstrap nodes (no default).
	BootstrapNodes []string `env:"CHORUS_DHT_BOOTSTRAP_NODES" json:"bootstrap_nodes" yaml:"bootstrap_nodes"`
	// FallbackOnError is consulted by IsFallbackEnabled.
	FallbackOnError bool `env:"CHORUS_FALLBACK_ON_ERROR" default:"true" json:"fallback_on_error" yaml:"fallback_on_error"`
	// HealthCheckInterval must be at least one second to pass Validate.
	HealthCheckInterval time.Duration `env:"CHORUS_HEALTH_CHECK_INTERVAL" default:"30s" json:"health_check_interval" yaml:"health_check_interval"`
	// MaxRetries is the configured retry count (default 3).
	MaxRetries int `env:"CHORUS_DHT_MAX_RETRIES" default:"3" json:"max_retries" yaml:"max_retries"`
	// RetryBackoff is the configured backoff between retries (default 1s).
	RetryBackoff time.Duration `env:"CHORUS_DHT_RETRY_BACKOFF" default:"1s" json:"retry_backoff" yaml:"retry_backoff"`
	// OperationTimeout must be at least 100ms to pass Validate.
	OperationTimeout time.Duration `env:"CHORUS_DHT_OPERATION_TIMEOUT" default:"10s" json:"operation_timeout" yaml:"operation_timeout"`
}
 | |
| 
 | |
// HybridUCXLConfig holds the UCXL cache settings of the hybrid configuration.
// The `env` and `default` tags name the environment variable and fallback
// value associated with each field.
type HybridUCXLConfig struct {
	// CacheEnabled is the cache on/off flag (default true).
	CacheEnabled bool `env:"CHORUS_UCXL_CACHE_ENABLED" default:"true" json:"cache_enabled" yaml:"cache_enabled"`
	// CacheTTL is the configured cache time-to-live (default 5m).
	CacheTTL time.Duration `env:"CHORUS_UCXL_CACHE_TTL" default:"5m" json:"cache_ttl" yaml:"cache_ttl"`
	// UseDistributed is the distributed-mode flag (default false).
	UseDistributed bool `env:"CHORUS_UCXL_USE_DISTRIBUTED" default:"false" json:"use_distributed" yaml:"use_distributed"`
	// MaxCacheSize must be non-negative to pass Validate (default 10000).
	MaxCacheSize int `env:"CHORUS_UCXL_MAX_CACHE_SIZE" default:"10000" json:"max_cache_size" yaml:"max_cache_size"`
}
 | |
| 
 | |
// DiscoveryConfig holds the peer discovery settings together with the rate
// limiting knobs added for scaling (WHOOSH issue #7). The `env` and `default`
// tags name the environment variable and fallback value of each field.
type DiscoveryConfig struct {
	// MDNSEnabled is the mDNS discovery flag (default true).
	MDNSEnabled bool `env:"CHORUS_MDNS_ENABLED" default:"true" json:"mdns_enabled" yaml:"mdns_enabled"`
	// DHTDiscovery is the DHT discovery flag (default false).
	DHTDiscovery bool `env:"CHORUS_DHT_DISCOVERY" default:"false" json:"dht_discovery" yaml:"dht_discovery"`
	// AnnounceInterval is the configured announce interval (default 30s).
	AnnounceInterval time.Duration `env:"CHORUS_ANNOUNCE_INTERVAL" default:"30s" json:"announce_interval" yaml:"announce_interval"`
	// ServiceName is the configured service name (default "CHORUS").
	ServiceName string `env:"CHORUS_SERVICE_NAME" default:"CHORUS" json:"service_name" yaml:"service_name"`

	// Rate limiting for scaling (as per WHOOSH issue #7).

	// DialsPerSecond is the configured dial rate (default 5).
	DialsPerSecond int `env:"CHORUS_DIALS_PER_SEC" default:"5" json:"dials_per_second" yaml:"dials_per_second"`
	// MaxConcurrentDHT is the configured DHT concurrency limit (default 16).
	MaxConcurrentDHT int `env:"CHORUS_MAX_CONCURRENT_DHT" default:"16" json:"max_concurrent_dht" yaml:"max_concurrent_dht"`
	// MaxConcurrentDials is the configured dial concurrency limit (default 10).
	MaxConcurrentDials int `env:"CHORUS_MAX_CONCURRENT_DIALS" default:"10" json:"max_concurrent_dials" yaml:"max_concurrent_dials"`
	// JoinStaggerMS is the configured join stagger, presumably in
	// milliseconds per its name (default 0).
	JoinStaggerMS int `env:"CHORUS_JOIN_STAGGER_MS" default:"0" json:"join_stagger_ms" yaml:"join_stagger_ms"`
}
 | |
| 
 | |
// MonitoringConfig holds the monitoring settings of the hybrid configuration.
// The `env` and `default` tags name the environment variable and fallback
// value associated with each field.
type MonitoringConfig struct {
	// Enabled is the monitoring on/off flag (default true).
	Enabled bool `env:"CHORUS_MONITORING_ENABLED" default:"true" json:"enabled" yaml:"enabled"`
	// MetricsInterval is the configured metrics interval (default 15s).
	MetricsInterval time.Duration `env:"CHORUS_METRICS_INTERVAL" default:"15s" json:"metrics_interval" yaml:"metrics_interval"`
	// HealthEndpoint is the configured health path (default "/health").
	HealthEndpoint string `env:"CHORUS_HEALTH_ENDPOINT" default:"/health" json:"health_endpoint" yaml:"health_endpoint"`
	// MetricsEndpoint is the configured metrics path (default "/metrics").
	MetricsEndpoint string `env:"CHORUS_METRICS_ENDPOINT" default:"/metrics" json:"metrics_endpoint" yaml:"metrics_endpoint"`
}
 | |
| 
 | |
| // LoadHybridConfig loads configuration from environment variables with defaults
 | |
| func LoadHybridConfig() (*HybridConfig, error) {
 | |
| 	config := &HybridConfig{}
 | |
| 	
 | |
| 	// Load DHT configuration
 | |
| 	config.DHT = HybridDHTConfig{
 | |
| 		Backend:             getEnvString("CHORUS_DHT_BACKEND", "mock"),
 | |
| 		BootstrapNodes:      getEnvStringSlice("CHORUS_DHT_BOOTSTRAP_NODES", []string{}),
 | |
| 		FallbackOnError:     getEnvBool("CHORUS_FALLBACK_ON_ERROR", true),
 | |
| 		HealthCheckInterval: getEnvDuration("CHORUS_HEALTH_CHECK_INTERVAL", 30*time.Second),
 | |
| 		MaxRetries:          getEnvInt("CHORUS_DHT_MAX_RETRIES", 3),
 | |
| 		RetryBackoff:        getEnvDuration("CHORUS_DHT_RETRY_BACKOFF", 1*time.Second),
 | |
| 		OperationTimeout:    getEnvDuration("CHORUS_DHT_OPERATION_TIMEOUT", 10*time.Second),
 | |
| 	}
 | |
| 	
 | |
| 	// Load UCXL configuration
 | |
| 	config.UCXL = HybridUCXLConfig{
 | |
| 		CacheEnabled:    getEnvBool("CHORUS_UCXL_CACHE_ENABLED", true),
 | |
| 		CacheTTL:        getEnvDuration("CHORUS_UCXL_CACHE_TTL", 5*time.Minute),
 | |
| 		UseDistributed:  getEnvBool("CHORUS_UCXL_USE_DISTRIBUTED", false),
 | |
| 		MaxCacheSize:    getEnvInt("CHORUS_UCXL_MAX_CACHE_SIZE", 10000),
 | |
| 	}
 | |
| 	
 | |
| 	// Load Discovery configuration
 | |
| 	config.Discovery = DiscoveryConfig{
 | |
| 		MDNSEnabled:        getEnvBool("CHORUS_MDNS_ENABLED", true),
 | |
| 		DHTDiscovery:       getEnvBool("CHORUS_DHT_DISCOVERY", false),
 | |
| 		AnnounceInterval:   getEnvDuration("CHORUS_ANNOUNCE_INTERVAL", 30*time.Second),
 | |
| 		ServiceName:        getEnvString("CHORUS_SERVICE_NAME", "CHORUS"),
 | |
| 
 | |
| 		// Rate limiting for scaling (as per WHOOSH issue #7)
 | |
| 		DialsPerSecond:     getEnvInt("CHORUS_DIALS_PER_SEC", 5),
 | |
| 		MaxConcurrentDHT:   getEnvInt("CHORUS_MAX_CONCURRENT_DHT", 16),
 | |
| 		MaxConcurrentDials: getEnvInt("CHORUS_MAX_CONCURRENT_DIALS", 10),
 | |
| 		JoinStaggerMS:      getEnvInt("CHORUS_JOIN_STAGGER_MS", 0),
 | |
| 	}
 | |
| 	
 | |
| 	// Load Monitoring configuration
 | |
| 	config.Monitoring = MonitoringConfig{
 | |
| 		Enabled:         getEnvBool("CHORUS_MONITORING_ENABLED", true),
 | |
| 		MetricsInterval: getEnvDuration("CHORUS_METRICS_INTERVAL", 15*time.Second),
 | |
| 		HealthEndpoint:  getEnvString("CHORUS_HEALTH_ENDPOINT", "/health"),
 | |
| 		MetricsEndpoint: getEnvString("CHORUS_METRICS_ENDPOINT", "/metrics"),
 | |
| 	}
 | |
| 	
 | |
| 	// Validate configuration
 | |
| 	if err := config.Validate(); err != nil {
 | |
| 		return nil, fmt.Errorf("invalid configuration: %w", err)
 | |
| 	}
 | |
| 	
 | |
| 	return config, nil
 | |
| }
 | |
| 
 | |
| // Validate checks configuration values for correctness
 | |
| func (c *HybridConfig) Validate() error {
 | |
| 	// Validate DHT backend
 | |
| 	validBackends := []string{"mock", "real", "hybrid"}
 | |
| 	if !hybridContains(validBackends, c.DHT.Backend) {
 | |
| 		return fmt.Errorf("invalid DHT backend '%s', must be one of: %v", c.DHT.Backend, validBackends)
 | |
| 	}
 | |
| 	
 | |
| 	// Validate timeouts
 | |
| 	if c.DHT.HealthCheckInterval < time.Second {
 | |
| 		return fmt.Errorf("health check interval too short: %v", c.DHT.HealthCheckInterval)
 | |
| 	}
 | |
| 	
 | |
| 	if c.DHT.OperationTimeout < 100*time.Millisecond {
 | |
| 		return fmt.Errorf("operation timeout too short: %v", c.DHT.OperationTimeout)
 | |
| 	}
 | |
| 	
 | |
| 	// Validate cache settings
 | |
| 	if c.UCXL.MaxCacheSize < 0 {
 | |
| 		return fmt.Errorf("max cache size must be non-negative: %d", c.UCXL.MaxCacheSize)
 | |
| 	}
 | |
| 	
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // IsRealDHTEnabled returns true if real DHT should be used
 | |
| func (c *HybridConfig) IsRealDHTEnabled() bool {
 | |
| 	return c.DHT.Backend == "real" || c.DHT.Backend == "hybrid"
 | |
| }
 | |
| 
 | |
| // IsMockDHTEnabled returns true if mock DHT should be used
 | |
| func (c *HybridConfig) IsMockDHTEnabled() bool {
 | |
| 	return c.DHT.Backend == "mock" || c.DHT.Backend == "hybrid"
 | |
| }
 | |
| 
 | |
| // IsFallbackEnabled returns true if fallback to mock is enabled
 | |
| func (c *HybridConfig) IsFallbackEnabled() bool {
 | |
| 	return c.DHT.FallbackOnError && c.IsMockDHTEnabled()
 | |
| }
 | |
| 
 | |
| // GetDHTBootstrapNodes returns the list of bootstrap nodes for real DHT
 | |
| func (c *HybridConfig) GetDHTBootstrapNodes() []string {
 | |
| 	return c.DHT.BootstrapNodes
 | |
| }
 | |
| 
 | |
| // Helper functions for environment variable parsing
 | |
| 
 | |
// getEnvString returns the value of the environment variable key, or
// defaultValue when the variable is unset or empty.
func getEnvString(key, defaultValue string) string {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	return raw
}
 | |
| 
 | |
// getEnvBool returns the boolean value of the environment variable key.
//
// Surrounding whitespace is stripped before parsing, so a value such as
// " false" (easy to produce from env files) is honored instead of silently
// turning into the default. The remaining text is parsed with
// strconv.ParseBool. defaultValue is returned when the variable is unset,
// empty, whitespace-only, or not a valid boolean.
func getEnvBool(key string, defaultValue bool) bool {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.ParseBool(raw)
	if err != nil {
		// Deliberate best effort: the signature cannot report an error, so an
		// unparseable value falls back to the default.
		return defaultValue
	}
	return parsed
}
 | |
| 
 | |
// getEnvInt returns the integer value of the environment variable key.
//
// Surrounding whitespace is stripped before parsing, so a value such as
// "16 " is honored instead of silently turning into the default. The
// remaining text is parsed with strconv.Atoi. defaultValue is returned when
// the variable is unset, empty, whitespace-only, or not a valid integer.
func getEnvInt(key string, defaultValue int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		// Deliberate best effort: the signature cannot report an error, so an
		// unparseable value falls back to the default.
		return defaultValue
	}
	return parsed
}
 | |
| 
 | |
// getEnvDuration returns the duration value of the environment variable key.
//
// Surrounding whitespace is stripped before parsing, so a value such as
// " 30s" is honored instead of silently turning into the default. The
// remaining text is parsed with time.ParseDuration, which requires a unit
// suffix (for example "30s" or "5m"). defaultValue is returned when the
// variable is unset, empty, whitespace-only, or not a valid duration.
func getEnvDuration(key string, defaultValue time.Duration) time.Duration {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil {
		// Deliberate best effort: the signature cannot report an error, so an
		// unparseable value falls back to the default.
		return defaultValue
	}
	return parsed
}
 | |
| 
 | |
// getEnvStringSlice returns the comma-separated entries of the environment
// variable key.
//
// Each entry is stripped of surrounding whitespace and empty entries are
// dropped, so "a, b," yields ["a", "b"] rather than ["a", " b", ""]. This
// keeps padded or blank entries from reaching consumers of the list.
// defaultValue is returned when the variable is unset, empty, or contains no
// non-empty entry.
func getEnvStringSlice(key string, defaultValue []string) []string {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}

	parts := strings.Split(raw, ",")
	items := make([]string, 0, len(parts))
	for _, part := range parts {
		if item := strings.TrimSpace(part); item != "" {
			items = append(items, item)
		}
	}

	// A value made only of separators and whitespace carries no entries, so
	// treat it like an unset variable.
	if len(items) == 0 {
		return defaultValue
	}
	return items
}
 | |
| 
 | |
// hybridContains reports whether item is equal to any element of slice. A nil
// or empty slice contains nothing.
func hybridContains(slice []string, item string) bool {
	for idx := range slice {
		if slice[idx] != item {
			continue
		}
		return true
	}
	return false
}
 | |
| 
 | |
// ConfigurationChangeEvent describes a single configuration update: which
// component changed, its value before and after, and when the change was
// recorded.
type ConfigurationChangeEvent struct {
	// Component identifies the changed setting, e.g. "dht.backend".
	Component string
	// Old is the value before the change.
	Old interface{}
	// New is the value after the change.
	New interface{}
	// Timestamp is the time the event was created.
	Timestamp time.Time
}
 | |
| 
 | |
| // ConfigWatcher provides real-time configuration updates
 | |
| type ConfigWatcher struct {
 | |
| 	events chan ConfigurationChangeEvent
 | |
| 	config *HybridConfig
 | |
| }
 | |
| 
 | |
| // NewConfigWatcher creates a new configuration watcher
 | |
| func NewConfigWatcher(config *HybridConfig) *ConfigWatcher {
 | |
| 	return &ConfigWatcher{
 | |
| 		events: make(chan ConfigurationChangeEvent, 100),
 | |
| 		config: config,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Events returns the configuration change events channel
 | |
| func (w *ConfigWatcher) Events() <-chan ConfigurationChangeEvent {
 | |
| 	return w.events
 | |
| }
 | |
| 
 | |
| // UpdateDHTBackend changes the DHT backend at runtime
 | |
| func (w *ConfigWatcher) UpdateDHTBackend(backend string) error {
 | |
| 	validBackends := []string{"mock", "real", "hybrid"}
 | |
| 	if !hybridContains(validBackends, backend) {
 | |
| 		return fmt.Errorf("invalid DHT backend '%s'", backend)
 | |
| 	}
 | |
| 	
 | |
| 	old := w.config.DHT.Backend
 | |
| 	w.config.DHT.Backend = backend
 | |
| 	
 | |
| 	w.events <- ConfigurationChangeEvent{
 | |
| 		Component: "dht.backend",
 | |
| 		Old:       old,
 | |
| 		New:       backend,
 | |
| 		Timestamp: time.Now(),
 | |
| 	}
 | |
| 	
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Close closes the configuration watcher
 | |
| func (w *ConfigWatcher) Close() {
 | |
| 	close(w.events)
 | |
| } |