Files
CHORUS/pkg/config/hybrid_config.go
anthonyrawlins e523c4b543 feat: Implement CHORUS scaling improvements for robust autoscaling
Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent
license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
- License gate already implemented with cache + circuit breaker + grace window
- mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
- Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
- Connection manager with watermarks (32 low, 128 high)
- AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
- Assignment merge layer with HTTP/file config + SIGHUP reload
- Runtime configuration system with WHOOSH assignment API support
- Election stability windows to prevent churn:
  - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
  - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader)
- Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse,
addressing all scaling concerns from WHOOSH issue #7.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 17:50:40 +10:00

266 lines
9.5 KiB
Go

package config
import (
"fmt"
"os"
"strconv"
"strings"
"time"
)
// HybridConfig manages feature flags and configuration for Phase 2 hybrid mode.
//
// It groups four sections: DHT, UCXL, discovery and monitoring.
// LoadHybridConfig fills every section from environment variables, and
// Validate checks a subset of the DHT and UCXL values.
type HybridConfig struct {
	// DHT holds the DHT backend selection and its retry/timeout settings.
	DHT HybridDHTConfig `json:"dht" yaml:"dht"`
	// UCXL holds the UCXL cache settings.
	UCXL HybridUCXLConfig `json:"ucxl" yaml:"ucxl"`
	// Discovery holds peer discovery switches and the connection rate limits.
	Discovery DiscoveryConfig `json:"discovery" yaml:"discovery"`
	// Monitoring holds the monitoring switch, interval and endpoint paths.
	Monitoring MonitoringConfig `json:"monitoring" yaml:"monitoring"`
}
// HybridDHTConfig describes which DHT backend is used and how DHT operations
// are retried and timed out.
//
// The env and default tags repeat the variable names and fallback values that
// LoadHybridConfig passes to the getEnv helpers; LoadHybridConfig itself does
// not read the tags.
type HybridDHTConfig struct {
	// Backend selects the DHT implementation. Validate accepts only "mock",
	// "real" or "hybrid".
	Backend string `env:"CHORUS_DHT_BACKEND" default:"mock" json:"backend" yaml:"backend"`
	// BootstrapNodes is loaded from a comma-separated environment value and is
	// returned by GetDHTBootstrapNodes.
	BootstrapNodes []string `env:"CHORUS_DHT_BOOTSTRAP_NODES" json:"bootstrap_nodes" yaml:"bootstrap_nodes"`
	// FallbackOnError is consulted by IsFallbackEnabled together with the
	// backend selection.
	FallbackOnError bool `env:"CHORUS_FALLBACK_ON_ERROR" default:"true" json:"fallback_on_error" yaml:"fallback_on_error"`
	// HealthCheckInterval must be at least one second to pass Validate.
	HealthCheckInterval time.Duration `env:"CHORUS_HEALTH_CHECK_INTERVAL" default:"30s" json:"health_check_interval" yaml:"health_check_interval"`
	// MaxRetries is not validated in this file.
	// NOTE(review): presumably the retry count for DHT operations — confirm against the consumer.
	MaxRetries int `env:"CHORUS_DHT_MAX_RETRIES" default:"3" json:"max_retries" yaml:"max_retries"`
	// RetryBackoff is not validated in this file.
	// NOTE(review): presumably the delay between retries — confirm against the consumer.
	RetryBackoff time.Duration `env:"CHORUS_DHT_RETRY_BACKOFF" default:"1s" json:"retry_backoff" yaml:"retry_backoff"`
	// OperationTimeout must be at least 100ms to pass Validate.
	OperationTimeout time.Duration `env:"CHORUS_DHT_OPERATION_TIMEOUT" default:"10s" json:"operation_timeout" yaml:"operation_timeout"`
}
// HybridUCXLConfig holds the UCXL cache and distribution switches.
//
// LoadHybridConfig populates it from the CHORUS_UCXL_* environment variables
// named in the env tags, using the values in the default tags as fallbacks.
type HybridUCXLConfig struct {
	// CacheEnabled defaults to true.
	CacheEnabled bool `env:"CHORUS_UCXL_CACHE_ENABLED" default:"true" json:"cache_enabled" yaml:"cache_enabled"`
	// CacheTTL defaults to five minutes.
	CacheTTL time.Duration `env:"CHORUS_UCXL_CACHE_TTL" default:"5m" json:"cache_ttl" yaml:"cache_ttl"`
	// UseDistributed defaults to false.
	UseDistributed bool `env:"CHORUS_UCXL_USE_DISTRIBUTED" default:"false" json:"use_distributed" yaml:"use_distributed"`
	// MaxCacheSize must be non-negative to pass Validate.
	// NOTE(review): the unit (entries vs. bytes) is not visible here — confirm against the cache.
	MaxCacheSize int `env:"CHORUS_UCXL_MAX_CACHE_SIZE" default:"10000" json:"max_cache_size" yaml:"max_cache_size"`
}
// DiscoveryConfig holds the peer discovery switches and the connection rate
// limits introduced for scaling.
//
// LoadHybridConfig populates it from the environment variables named in the
// env tags. Validate does not check any of these fields.
type DiscoveryConfig struct {
	// MDNSEnabled defaults to true; set CHORUS_MDNS_ENABLED=false to turn it off.
	MDNSEnabled bool `env:"CHORUS_MDNS_ENABLED" default:"true" json:"mdns_enabled" yaml:"mdns_enabled"`
	// DHTDiscovery defaults to false.
	DHTDiscovery bool `env:"CHORUS_DHT_DISCOVERY" default:"false" json:"dht_discovery" yaml:"dht_discovery"`
	// AnnounceInterval defaults to 30 seconds.
	AnnounceInterval time.Duration `env:"CHORUS_ANNOUNCE_INTERVAL" default:"30s" json:"announce_interval" yaml:"announce_interval"`
	// ServiceName defaults to "CHORUS".
	ServiceName string `env:"CHORUS_SERVICE_NAME" default:"CHORUS" json:"service_name" yaml:"service_name"`
	// Rate limiting for scaling (as per WHOOSH issue #7)
	// NOTE(review): how zero or negative limits are interpreted is decided by
	// the consumer of these fields, which is not part of this file — confirm.
	DialsPerSecond int `env:"CHORUS_DIALS_PER_SEC" default:"5" json:"dials_per_second" yaml:"dials_per_second"`
	MaxConcurrentDHT int `env:"CHORUS_MAX_CONCURRENT_DHT" default:"16" json:"max_concurrent_dht" yaml:"max_concurrent_dht"`
	MaxConcurrentDials int `env:"CHORUS_MAX_CONCURRENT_DIALS" default:"10" json:"max_concurrent_dials" yaml:"max_concurrent_dials"`
	// JoinStaggerMS defaults to 0; going by the name, the value is in milliseconds.
	JoinStaggerMS int `env:"CHORUS_JOIN_STAGGER_MS" default:"0" json:"join_stagger_ms" yaml:"join_stagger_ms"`
}
// MonitoringConfig holds the monitoring switch, the metrics interval and the
// health/metrics endpoint paths.
//
// LoadHybridConfig populates it from the CHORUS_* environment variables named
// in the env tags. Validate does not check any of these fields.
type MonitoringConfig struct {
	// Enabled defaults to true.
	Enabled bool `env:"CHORUS_MONITORING_ENABLED" default:"true" json:"enabled" yaml:"enabled"`
	// MetricsInterval defaults to 15 seconds.
	MetricsInterval time.Duration `env:"CHORUS_METRICS_INTERVAL" default:"15s" json:"metrics_interval" yaml:"metrics_interval"`
	// HealthEndpoint defaults to "/health".
	HealthEndpoint string `env:"CHORUS_HEALTH_ENDPOINT" default:"/health" json:"health_endpoint" yaml:"health_endpoint"`
	// MetricsEndpoint defaults to "/metrics".
	MetricsEndpoint string `env:"CHORUS_METRICS_ENDPOINT" default:"/metrics" json:"metrics_endpoint" yaml:"metrics_endpoint"`
}
// LoadHybridConfig builds a HybridConfig from CHORUS_* environment variables.
//
// Each setting falls back to its built-in default when the variable is unset
// or empty, and bool/int/duration settings also fall back when the value
// cannot be parsed. The assembled configuration is validated before it is
// returned; when validation fails the result is nil and the returned error
// wraps the validation error.
func LoadHybridConfig() (*HybridConfig, error) {
	// DHT backend selection, retries and timeouts.
	dht := HybridDHTConfig{
		Backend:             getEnvString("CHORUS_DHT_BACKEND", "mock"),
		BootstrapNodes:      getEnvStringSlice("CHORUS_DHT_BOOTSTRAP_NODES", []string{}),
		FallbackOnError:     getEnvBool("CHORUS_FALLBACK_ON_ERROR", true),
		HealthCheckInterval: getEnvDuration("CHORUS_HEALTH_CHECK_INTERVAL", 30*time.Second),
		MaxRetries:          getEnvInt("CHORUS_DHT_MAX_RETRIES", 3),
		RetryBackoff:        getEnvDuration("CHORUS_DHT_RETRY_BACKOFF", 1*time.Second),
		OperationTimeout:    getEnvDuration("CHORUS_DHT_OPERATION_TIMEOUT", 10*time.Second),
	}

	// UCXL cache settings.
	ucxl := HybridUCXLConfig{
		CacheEnabled:   getEnvBool("CHORUS_UCXL_CACHE_ENABLED", true),
		CacheTTL:       getEnvDuration("CHORUS_UCXL_CACHE_TTL", 5*time.Minute),
		UseDistributed: getEnvBool("CHORUS_UCXL_USE_DISTRIBUTED", false),
		MaxCacheSize:   getEnvInt("CHORUS_UCXL_MAX_CACHE_SIZE", 10000),
	}

	// Discovery switches plus the rate limits added for WHOOSH issue #7.
	discovery := DiscoveryConfig{
		MDNSEnabled:        getEnvBool("CHORUS_MDNS_ENABLED", true),
		DHTDiscovery:       getEnvBool("CHORUS_DHT_DISCOVERY", false),
		AnnounceInterval:   getEnvDuration("CHORUS_ANNOUNCE_INTERVAL", 30*time.Second),
		ServiceName:        getEnvString("CHORUS_SERVICE_NAME", "CHORUS"),
		DialsPerSecond:     getEnvInt("CHORUS_DIALS_PER_SEC", 5),
		MaxConcurrentDHT:   getEnvInt("CHORUS_MAX_CONCURRENT_DHT", 16),
		MaxConcurrentDials: getEnvInt("CHORUS_MAX_CONCURRENT_DIALS", 10),
		JoinStaggerMS:      getEnvInt("CHORUS_JOIN_STAGGER_MS", 0),
	}

	// Monitoring switch, interval and endpoint paths.
	monitoring := MonitoringConfig{
		Enabled:         getEnvBool("CHORUS_MONITORING_ENABLED", true),
		MetricsInterval: getEnvDuration("CHORUS_METRICS_INTERVAL", 15*time.Second),
		HealthEndpoint:  getEnvString("CHORUS_HEALTH_ENDPOINT", "/health"),
		MetricsEndpoint: getEnvString("CHORUS_METRICS_ENDPOINT", "/metrics"),
	}

	cfg := &HybridConfig{
		DHT:        dht,
		UCXL:       ucxl,
		Discovery:  discovery,
		Monitoring: monitoring,
	}

	// Reject the configuration as a whole rather than returning a partly valid one.
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("invalid configuration: %w", err)
	}
	return cfg, nil
}
// Validate reports the first invalid setting it finds, or nil when all checked
// settings are acceptable.
//
// Checks run in this order: the DHT backend must be "mock", "real" or
// "hybrid"; the health check interval must be at least one second; the
// operation timeout must be at least 100ms; the UCXL max cache size must not
// be negative. No other fields are checked.
func (c *HybridConfig) Validate() error {
	validBackends := []string{"mock", "real", "hybrid"}

	// Cases are evaluated top to bottom, so only the first failure is reported.
	switch {
	case !hybridContains(validBackends, c.DHT.Backend):
		return fmt.Errorf("invalid DHT backend '%s', must be one of: %v", c.DHT.Backend, validBackends)
	case c.DHT.HealthCheckInterval < time.Second:
		return fmt.Errorf("health check interval too short: %v", c.DHT.HealthCheckInterval)
	case c.DHT.OperationTimeout < 100*time.Millisecond:
		return fmt.Errorf("operation timeout too short: %v", c.DHT.OperationTimeout)
	case c.UCXL.MaxCacheSize < 0:
		return fmt.Errorf("max cache size must be non-negative: %d", c.UCXL.MaxCacheSize)
	}
	return nil
}
// IsRealDHTEnabled reports whether the real DHT should be used, which is the
// case when the backend is "real" or "hybrid".
func (c *HybridConfig) IsRealDHTEnabled() bool {
	switch c.DHT.Backend {
	case "real", "hybrid":
		return true
	default:
		return false
	}
}
// IsMockDHTEnabled reports whether the mock DHT should be used, which is the
// case when the backend is "mock" or "hybrid".
func (c *HybridConfig) IsMockDHTEnabled() bool {
	switch c.DHT.Backend {
	case "mock", "hybrid":
		return true
	default:
		return false
	}
}
// IsFallbackEnabled reports whether falling back to the mock DHT is enabled.
//
// It is true only when FallbackOnError is set and the mock DHT is enabled, so
// it is always false for the "real" backend.
func (c *HybridConfig) IsFallbackEnabled() bool {
	if !c.DHT.FallbackOnError {
		return false
	}
	return c.IsMockDHTEnabled()
}
// GetDHTBootstrapNodes returns the list of bootstrap nodes for the real DHT.
//
// The configured slice itself is returned, not a copy, so callers share its
// backing array with the configuration.
func (c *HybridConfig) GetDHTBootstrapNodes() []string {
	nodes := c.DHT.BootstrapNodes
	return nodes
}
// Helper functions for environment variable parsing
// getEnvString returns the value of the environment variable key, or
// defaultValue when the variable is unset or empty.
func getEnvString(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
// getEnvBool returns the environment variable key parsed with
// strconv.ParseBool.
//
// defaultValue is returned when the variable is unset or empty, and also when
// the value does not parse; the parse error is not reported.
func getEnvBool(key string, defaultValue bool) bool {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.ParseBool(raw)
	if err != nil {
		// Unparseable input is treated the same as an unset variable.
		return defaultValue
	}
	return parsed
}
// getEnvInt returns the environment variable key parsed with strconv.Atoi.
//
// defaultValue is returned when the variable is unset or empty, and also when
// the value does not parse; the parse error is not reported.
func getEnvInt(key string, defaultValue int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		// Unparseable input is treated the same as an unset variable.
		return defaultValue
	}
	return parsed
}
// getEnvDuration returns the environment variable key parsed with
// time.ParseDuration (for example "30s" or "5m").
//
// defaultValue is returned when the variable is unset or empty, and also when
// the value does not parse; the parse error is not reported.
func getEnvDuration(key string, defaultValue time.Duration) time.Duration {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil {
		// Unparseable input (including a bare number without a unit) is
		// treated the same as an unset variable.
		return defaultValue
	}
	return parsed
}
// getEnvStringSlice reads a comma-separated list from the environment
// variable key.
//
// Each element is trimmed of surrounding whitespace and empty elements are
// dropped, so a value such as "a, b,,c," yields ["a" "b" "c"] instead of
// entries with stray spaces or empty strings. defaultValue is returned when
// the variable is unset or empty, and also when no non-empty element remains
// after trimming.
func getEnvStringSlice(key string, defaultValue []string) []string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}

	parts := strings.Split(value, ",")
	items := make([]string, 0, len(parts))
	for _, part := range parts {
		if item := strings.TrimSpace(part); item != "" {
			items = append(items, item)
		}
	}

	// A value made up only of separators and whitespace carries no entries, so
	// it is treated like an unset variable.
	if len(items) == 0 {
		return defaultValue
	}
	return items
}
// hybridContains reports whether item is equal to any element of slice.
// The comparison is exact (case-sensitive); an empty or nil slice contains
// nothing.
func hybridContains(slice []string, item string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == item {
			return true
		}
	}
	return false
}
// ConfigurationChangeEvent represents a configuration update.
//
// ConfigWatcher.UpdateDHTBackend emits one of these on the watcher's events
// channel for every accepted backend change.
type ConfigurationChangeEvent struct {
	// Component names the setting that changed; UpdateDHTBackend uses "dht.backend".
	Component string
	// Old is the value before the change.
	Old interface{}
	// New is the value after the change.
	New interface{}
	// Timestamp is taken with time.Now when the event is created.
	Timestamp time.Time
}
// ConfigWatcher provides real-time configuration updates.
//
// It applies changes to a HybridConfig it holds by pointer and publishes a
// ConfigurationChangeEvent for each change. The struct has no lock of its own.
// NOTE(review): concurrent use of the update methods therefore looks unsafe —
// confirm how callers serialize access.
type ConfigWatcher struct {
	// events carries change notifications; exposed receive-only through Events.
	events chan ConfigurationChangeEvent
	// config is the configuration mutated in place by the update methods.
	config *HybridConfig
}
// NewConfigWatcher creates a watcher that mutates and reports changes to
// config.
//
// The events channel is buffered for 100 events. The config pointer is stored
// as given, so the caller and the watcher share the same HybridConfig.
func NewConfigWatcher(config *HybridConfig) *ConfigWatcher {
	watcher := new(ConfigWatcher)
	watcher.config = config
	watcher.events = make(chan ConfigurationChangeEvent, 100)
	return watcher
}
// Events returns the receive-only view of the configuration change events
// channel. Every call returns the same underlying channel.
func (w *ConfigWatcher) Events() <-chan ConfigurationChangeEvent {
	var receiveOnly <-chan ConfigurationChangeEvent = w.events
	return receiveOnly
}
// UpdateDHTBackend changes the DHT backend at runtime.
//
// backend must be "mock", "real" or "hybrid"; any other value returns an
// error and leaves the configuration untouched. On success the watched
// configuration is updated in place and a "dht.backend" event carrying the
// old and new values is sent on the events channel.
//
// The send is a plain channel send: it blocks while the channel has no free
// buffer space and no receiver is ready, and it panics if the channel has
// already been closed.
func (w *ConfigWatcher) UpdateDHTBackend(backend string) error {
	if allowed := []string{"mock", "real", "hybrid"}; !hybridContains(allowed, backend) {
		return fmt.Errorf("invalid DHT backend '%s'", backend)
	}

	// Capture the previous value before overwriting it so the event can report both.
	previous := w.config.DHT.Backend
	w.config.DHT.Backend = backend

	event := ConfigurationChangeEvent{
		Component: "dht.backend",
		Old:       previous,
		New:       backend,
		Timestamp: time.Now(),
	}
	w.events <- event
	return nil
}
// Close closes the configuration watcher by closing its events channel, which
// ends any range loop over Events.
//
// Closing an already closed channel panics, so Close must be called at most
// once, and no update may be sent after it.
func (w *ConfigWatcher) Close() {
	events := w.events
	close(events)
}