Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out.

HIGH-RISK FIXES (Must-Do):
✅ License gate already implemented with cache + circuit breaker + grace window
✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false)
✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries)
✅ Connection manager with watermarks (32 low, 128 high)
✅ AutoNAT enabled for container networking

MEDIUM-RISK FIXES (Next Priority):
✅ Assignment merge layer with HTTP/file config + SIGHUP reload
✅ Runtime configuration system with WHOOSH assignment API support
✅ Election stability windows to prevent churn:
  - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections)
  - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader)
✅ Bootstrap pool JSON support with priority sorting and join stagger

NEW FEATURES:
- Runtime config system with assignment overrides from WHOOSH
- SIGHUP reload handler for live configuration updates
- JSON bootstrap configuration with peer metadata (region, roles, priority)
- Configurable election stability windows with environment variables
- Multi-format bootstrap support: Assignment → JSON → CSV

FILES MODIFIED:
- pkg/config/assignment.go (NEW): Runtime assignment merge system
- docker/bootstrap.json (NEW): Example JSON bootstrap configuration
- pkg/election/election.go: Added stability windows and churn prevention
- internal/runtime/shared.go: Integrated assignment loading and conditional mDNS
- p2p/node.go: Added connection management and rate limiting
- pkg/config/hybrid_config.go: Added rate limiting configuration fields
- docker/docker-compose.yml: Updated environment variables and configs
- README.md: Updated status table with scaling milestone

This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
517 lines
14 KiB
Go
517 lines
14 KiB
Go
package config
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"os/signal"
	"sort"
	"strings"
	"sync"
	"syscall"
	"time"
)
|
|
|
|
// RuntimeConfig manages runtime configuration with assignment overrides
|
|
type RuntimeConfig struct {
|
|
Base *Config `json:"base"`
|
|
Override *AssignmentConfig `json:"override"`
|
|
mu sync.RWMutex
|
|
reloadCh chan struct{}
|
|
}
|
|
|
|
// AssignmentConfig represents runtime assignment from WHOOSH
|
|
type AssignmentConfig struct {
|
|
// Assignment metadata
|
|
AssignmentID string `json:"assignment_id"`
|
|
TaskSlot string `json:"task_slot"`
|
|
TaskID string `json:"task_id"`
|
|
ClusterID string `json:"cluster_id"`
|
|
AssignedAt time.Time `json:"assigned_at"`
|
|
ExpiresAt time.Time `json:"expires_at,omitempty"`
|
|
|
|
// Agent configuration overrides
|
|
Agent *AgentConfig `json:"agent,omitempty"`
|
|
Network *NetworkConfig `json:"network,omitempty"`
|
|
AI *AIConfig `json:"ai,omitempty"`
|
|
Logging *LoggingConfig `json:"logging,omitempty"`
|
|
|
|
// Bootstrap configuration for scaling
|
|
BootstrapPeers []string `json:"bootstrap_peers,omitempty"`
|
|
JoinStagger int `json:"join_stagger_ms,omitempty"`
|
|
|
|
// Runtime capabilities
|
|
RuntimeCapabilities []string `json:"runtime_capabilities,omitempty"`
|
|
|
|
// Key derivation for encryption
|
|
RoleKey string `json:"role_key,omitempty"`
|
|
ClusterSecret string `json:"cluster_secret,omitempty"`
|
|
|
|
// Custom fields
|
|
Custom map[string]interface{} `json:"custom,omitempty"`
|
|
}
|
|
|
|
// AssignmentRequest describes the agent that is asking WHOOSH for an
// assignment. LoadAssignment fills it from the base configuration and the
// TASK_SLOT, TASK_ID and NODE_ID environment variables.
type AssignmentRequest struct {
	// ClusterID identifies the cluster the agent belongs to.
	ClusterID string `json:"cluster_id"`

	// TaskSlot and TaskID are optional and omitted from JSON when empty.
	TaskSlot string `json:"task_slot,omitempty"`
	TaskID   string `json:"task_id,omitempty"`

	// AgentID and NodeID identify the requesting agent and its host node.
	AgentID string `json:"agent_id"`
	NodeID  string `json:"node_id"`

	// Timestamp records when the request was built.
	Timestamp time.Time `json:"timestamp"`
}
|
|
|
|
// NewRuntimeConfig creates a new runtime configuration manager
|
|
func NewRuntimeConfig(baseConfig *Config) *RuntimeConfig {
|
|
return &RuntimeConfig{
|
|
Base: baseConfig,
|
|
Override: nil,
|
|
reloadCh: make(chan struct{}, 1),
|
|
}
|
|
}
|
|
|
|
// Get returns the effective configuration value, with override taking precedence
|
|
func (rc *RuntimeConfig) Get(field string) interface{} {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
// Try override first
|
|
if rc.Override != nil {
|
|
if value := rc.getFromAssignment(field); value != nil {
|
|
return value
|
|
}
|
|
}
|
|
|
|
// Fall back to base configuration
|
|
return rc.getFromBase(field)
|
|
}
|
|
|
|
// GetConfig returns a merged configuration with overrides applied
|
|
func (rc *RuntimeConfig) GetConfig() *Config {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
if rc.Override == nil {
|
|
return rc.Base
|
|
}
|
|
|
|
// Create a copy of base config
|
|
merged := *rc.Base
|
|
|
|
// Apply overrides
|
|
if rc.Override.Agent != nil {
|
|
rc.mergeAgentConfig(&merged.Agent, rc.Override.Agent)
|
|
}
|
|
if rc.Override.Network != nil {
|
|
rc.mergeNetworkConfig(&merged.Network, rc.Override.Network)
|
|
}
|
|
if rc.Override.AI != nil {
|
|
rc.mergeAIConfig(&merged.AI, rc.Override.AI)
|
|
}
|
|
if rc.Override.Logging != nil {
|
|
rc.mergeLoggingConfig(&merged.Logging, rc.Override.Logging)
|
|
}
|
|
|
|
return &merged
|
|
}
|
|
|
|
// LoadAssignment fetches assignment from WHOOSH and applies it
|
|
func (rc *RuntimeConfig) LoadAssignment(ctx context.Context, assignURL string) error {
|
|
if assignURL == "" {
|
|
return nil // No assignment URL configured
|
|
}
|
|
|
|
// Build assignment request
|
|
agentID := rc.Base.Agent.ID
|
|
if agentID == "" {
|
|
agentID = "unknown"
|
|
}
|
|
|
|
req := AssignmentRequest{
|
|
ClusterID: rc.Base.License.ClusterID,
|
|
TaskSlot: os.Getenv("TASK_SLOT"),
|
|
TaskID: os.Getenv("TASK_ID"),
|
|
AgentID: agentID,
|
|
NodeID: os.Getenv("NODE_ID"),
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
// Make HTTP request to WHOOSH
|
|
assignment, err := rc.fetchAssignment(ctx, assignURL, req)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to fetch assignment: %w", err)
|
|
}
|
|
|
|
// Apply assignment
|
|
rc.mu.Lock()
|
|
rc.Override = assignment
|
|
rc.mu.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// StartReloadHandler starts a signal handler for SIGHUP configuration reloads
|
|
func (rc *RuntimeConfig) StartReloadHandler(ctx context.Context, assignURL string) {
|
|
sigCh := make(chan os.Signal, 1)
|
|
signal.Notify(sigCh, syscall.SIGHUP)
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sigCh:
|
|
fmt.Println("📡 Received SIGHUP, reloading assignment configuration...")
|
|
if err := rc.LoadAssignment(ctx, assignURL); err != nil {
|
|
fmt.Printf("❌ Failed to reload assignment: %v\n", err)
|
|
} else {
|
|
fmt.Println("✅ Assignment configuration reloaded successfully")
|
|
}
|
|
case <-rc.reloadCh:
|
|
// Manual reload trigger
|
|
if err := rc.LoadAssignment(ctx, assignURL); err != nil {
|
|
fmt.Printf("❌ Failed to reload assignment: %v\n", err)
|
|
} else {
|
|
fmt.Println("✅ Assignment configuration reloaded successfully")
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// Reload triggers a manual configuration reload
|
|
func (rc *RuntimeConfig) Reload() {
|
|
select {
|
|
case rc.reloadCh <- struct{}{}:
|
|
default:
|
|
// Channel full, reload already pending
|
|
}
|
|
}
|
|
|
|
// fetchAssignment makes HTTP request to WHOOSH assignment API
|
|
func (rc *RuntimeConfig) fetchAssignment(ctx context.Context, assignURL string, req AssignmentRequest) (*AssignmentConfig, error) {
|
|
// Build query parameters
|
|
queryParams := fmt.Sprintf("?cluster_id=%s&agent_id=%s&node_id=%s",
|
|
req.ClusterID, req.AgentID, req.NodeID)
|
|
|
|
if req.TaskSlot != "" {
|
|
queryParams += "&task_slot=" + req.TaskSlot
|
|
}
|
|
if req.TaskID != "" {
|
|
queryParams += "&task_id=" + req.TaskID
|
|
}
|
|
|
|
// Create HTTP request
|
|
httpReq, err := http.NewRequestWithContext(ctx, "GET", assignURL+queryParams, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create assignment request: %w", err)
|
|
}
|
|
|
|
httpReq.Header.Set("Accept", "application/json")
|
|
httpReq.Header.Set("User-Agent", "CHORUS-Agent/0.1.0")
|
|
|
|
// Make request with timeout
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(httpReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("assignment request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == http.StatusNotFound {
|
|
// No assignment available
|
|
return nil, nil
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("assignment request failed with status %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
// Parse assignment response
|
|
var assignment AssignmentConfig
|
|
if err := json.NewDecoder(resp.Body).Decode(&assignment); err != nil {
|
|
return nil, fmt.Errorf("failed to decode assignment response: %w", err)
|
|
}
|
|
|
|
return &assignment, nil
|
|
}
|
|
|
|
// Helper methods for getting values from different sources
|
|
func (rc *RuntimeConfig) getFromAssignment(field string) interface{} {
|
|
if rc.Override == nil {
|
|
return nil
|
|
}
|
|
|
|
// Simple field mapping - in a real implementation, you'd use reflection
|
|
// or a more sophisticated field mapping system
|
|
switch field {
|
|
case "agent.id":
|
|
if rc.Override.Agent != nil && rc.Override.Agent.ID != "" {
|
|
return rc.Override.Agent.ID
|
|
}
|
|
case "agent.role":
|
|
if rc.Override.Agent != nil && rc.Override.Agent.Role != "" {
|
|
return rc.Override.Agent.Role
|
|
}
|
|
case "agent.capabilities":
|
|
if len(rc.Override.RuntimeCapabilities) > 0 {
|
|
return rc.Override.RuntimeCapabilities
|
|
}
|
|
case "bootstrap_peers":
|
|
if len(rc.Override.BootstrapPeers) > 0 {
|
|
return rc.Override.BootstrapPeers
|
|
}
|
|
case "join_stagger":
|
|
if rc.Override.JoinStagger > 0 {
|
|
return rc.Override.JoinStagger
|
|
}
|
|
}
|
|
|
|
// Check custom fields
|
|
if rc.Override.Custom != nil {
|
|
if val, exists := rc.Override.Custom[field]; exists {
|
|
return val
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (rc *RuntimeConfig) getFromBase(field string) interface{} {
|
|
// Simple field mapping for base config
|
|
switch field {
|
|
case "agent.id":
|
|
return rc.Base.Agent.ID
|
|
case "agent.role":
|
|
return rc.Base.Agent.Role
|
|
case "agent.capabilities":
|
|
return rc.Base.Agent.Capabilities
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Helper methods for merging configuration sections
|
|
func (rc *RuntimeConfig) mergeAgentConfig(base *AgentConfig, override *AgentConfig) {
|
|
if override.ID != "" {
|
|
base.ID = override.ID
|
|
}
|
|
if override.Specialization != "" {
|
|
base.Specialization = override.Specialization
|
|
}
|
|
if override.MaxTasks > 0 {
|
|
base.MaxTasks = override.MaxTasks
|
|
}
|
|
if len(override.Capabilities) > 0 {
|
|
base.Capabilities = override.Capabilities
|
|
}
|
|
if len(override.Models) > 0 {
|
|
base.Models = override.Models
|
|
}
|
|
if override.Role != "" {
|
|
base.Role = override.Role
|
|
}
|
|
if override.Project != "" {
|
|
base.Project = override.Project
|
|
}
|
|
if len(override.Expertise) > 0 {
|
|
base.Expertise = override.Expertise
|
|
}
|
|
if override.ReportsTo != "" {
|
|
base.ReportsTo = override.ReportsTo
|
|
}
|
|
if len(override.Deliverables) > 0 {
|
|
base.Deliverables = override.Deliverables
|
|
}
|
|
if override.ModelSelectionWebhook != "" {
|
|
base.ModelSelectionWebhook = override.ModelSelectionWebhook
|
|
}
|
|
if override.DefaultReasoningModel != "" {
|
|
base.DefaultReasoningModel = override.DefaultReasoningModel
|
|
}
|
|
}
|
|
|
|
func (rc *RuntimeConfig) mergeNetworkConfig(base *NetworkConfig, override *NetworkConfig) {
|
|
if override.P2PPort > 0 {
|
|
base.P2PPort = override.P2PPort
|
|
}
|
|
if override.APIPort > 0 {
|
|
base.APIPort = override.APIPort
|
|
}
|
|
if override.HealthPort > 0 {
|
|
base.HealthPort = override.HealthPort
|
|
}
|
|
if override.BindAddr != "" {
|
|
base.BindAddr = override.BindAddr
|
|
}
|
|
}
|
|
|
|
func (rc *RuntimeConfig) mergeAIConfig(base *AIConfig, override *AIConfig) {
|
|
if override.Provider != "" {
|
|
base.Provider = override.Provider
|
|
}
|
|
// Merge Ollama config if present
|
|
if override.Ollama.Endpoint != "" {
|
|
base.Ollama.Endpoint = override.Ollama.Endpoint
|
|
}
|
|
if override.Ollama.Timeout > 0 {
|
|
base.Ollama.Timeout = override.Ollama.Timeout
|
|
}
|
|
// Merge ResetData config if present
|
|
if override.ResetData.BaseURL != "" {
|
|
base.ResetData.BaseURL = override.ResetData.BaseURL
|
|
}
|
|
}
|
|
|
|
func (rc *RuntimeConfig) mergeLoggingConfig(base *LoggingConfig, override *LoggingConfig) {
|
|
if override.Level != "" {
|
|
base.Level = override.Level
|
|
}
|
|
if override.Format != "" {
|
|
base.Format = override.Format
|
|
}
|
|
}
|
|
|
|
// BootstrapConfig represents JSON bootstrap configuration
|
|
type BootstrapConfig struct {
|
|
Peers []BootstrapPeer `json:"peers"`
|
|
Metadata BootstrapMeta `json:"metadata,omitempty"`
|
|
}
|
|
|
|
// BootstrapPeer is one entry of the JSON bootstrap file.
type BootstrapPeer struct {
	// Address is the peer address; entries with an empty address are skipped.
	Address string `json:"address"`

	// Priority orders the peers: higher values are tried first.
	Priority int `json:"priority,omitempty"`

	// Region and Roles are descriptive metadata.
	Region string   `json:"region,omitempty"`
	Roles  []string `json:"roles,omitempty"`

	// Enabled must be true for the peer to be used; an entry that omits
	// the key decodes to false and is therefore ignored.
	Enabled bool `json:"enabled"`
}
|
|
|
|
// BootstrapMeta carries descriptive information about a bootstrap file.
type BootstrapMeta struct {
	// GeneratedAt is when the file was produced. NOTE: encoding/json never
	// treats a struct value as empty, so omitempty does not drop a zero
	// timestamp when marshalling.
	GeneratedAt time.Time `json:"generated_at,omitempty"`

	// ClusterID names the cluster the file was generated for.
	ClusterID string `json:"cluster_id,omitempty"`

	// Version and Notes are free-form annotations.
	Version string `json:"version,omitempty"`
	Notes   string `json:"notes,omitempty"`
}
|
|
|
|
// GetBootstrapPeers returns bootstrap peers with assignment override support and JSON config
|
|
func (rc *RuntimeConfig) GetBootstrapPeers() []string {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
// First priority: Assignment override from WHOOSH
|
|
if rc.Override != nil && len(rc.Override.BootstrapPeers) > 0 {
|
|
return rc.Override.BootstrapPeers
|
|
}
|
|
|
|
// Second priority: JSON bootstrap configuration
|
|
if jsonPeers := rc.loadBootstrapJSON(); len(jsonPeers) > 0 {
|
|
return jsonPeers
|
|
}
|
|
|
|
// Third priority: Environment variable (CSV format)
|
|
if bootstrapEnv := os.Getenv("CHORUS_BOOTSTRAP_PEERS"); bootstrapEnv != "" {
|
|
peers := strings.Split(bootstrapEnv, ",")
|
|
// Trim whitespace from each peer
|
|
for i, peer := range peers {
|
|
peers[i] = strings.TrimSpace(peer)
|
|
}
|
|
return peers
|
|
}
|
|
|
|
return []string{}
|
|
}
|
|
|
|
// loadBootstrapJSON loads bootstrap peers from JSON file
|
|
func (rc *RuntimeConfig) loadBootstrapJSON() []string {
|
|
jsonPath := os.Getenv("BOOTSTRAP_JSON")
|
|
if jsonPath == "" {
|
|
return nil
|
|
}
|
|
|
|
// Check if file exists
|
|
if _, err := os.Stat(jsonPath); os.IsNotExist(err) {
|
|
return nil
|
|
}
|
|
|
|
// Read and parse JSON file
|
|
data, err := os.ReadFile(jsonPath)
|
|
if err != nil {
|
|
fmt.Printf("⚠️ Failed to read bootstrap JSON file %s: %v\n", jsonPath, err)
|
|
return nil
|
|
}
|
|
|
|
var config BootstrapConfig
|
|
if err := json.Unmarshal(data, &config); err != nil {
|
|
fmt.Printf("⚠️ Failed to parse bootstrap JSON file %s: %v\n", jsonPath, err)
|
|
return nil
|
|
}
|
|
|
|
// Extract enabled peer addresses, sorted by priority
|
|
var peers []string
|
|
enabledPeers := make([]BootstrapPeer, 0, len(config.Peers))
|
|
|
|
// Filter enabled peers
|
|
for _, peer := range config.Peers {
|
|
if peer.Enabled && peer.Address != "" {
|
|
enabledPeers = append(enabledPeers, peer)
|
|
}
|
|
}
|
|
|
|
// Sort by priority (higher priority first)
|
|
for i := 0; i < len(enabledPeers)-1; i++ {
|
|
for j := i + 1; j < len(enabledPeers); j++ {
|
|
if enabledPeers[j].Priority > enabledPeers[i].Priority {
|
|
enabledPeers[i], enabledPeers[j] = enabledPeers[j], enabledPeers[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract addresses
|
|
for _, peer := range enabledPeers {
|
|
peers = append(peers, peer.Address)
|
|
}
|
|
|
|
if len(peers) > 0 {
|
|
fmt.Printf("📋 Loaded %d bootstrap peers from JSON: %s\n", len(peers), jsonPath)
|
|
}
|
|
|
|
return peers
|
|
}
|
|
|
|
// GetJoinStagger returns join stagger delay with assignment override support
|
|
func (rc *RuntimeConfig) GetJoinStagger() time.Duration {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
if rc.Override != nil && rc.Override.JoinStagger > 0 {
|
|
return time.Duration(rc.Override.JoinStagger) * time.Millisecond
|
|
}
|
|
|
|
// Fall back to environment variable
|
|
if staggerEnv := os.Getenv("CHORUS_JOIN_STAGGER_MS"); staggerEnv != "" {
|
|
if ms, err := time.ParseDuration(staggerEnv + "ms"); err == nil {
|
|
return ms
|
|
}
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
// GetAssignmentInfo returns current assignment metadata
|
|
func (rc *RuntimeConfig) GetAssignmentInfo() *AssignmentConfig {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
if rc.Override == nil {
|
|
return nil
|
|
}
|
|
|
|
// Return a copy to prevent external modification
|
|
assignment := *rc.Override
|
|
return &assignment
|
|
} |