This commit preserves substantial development work including: ## Core Infrastructure: - **Bootstrap Pool Manager** (pkg/bootstrap/pool_manager.go): Advanced peer discovery and connection management for distributed CHORUS clusters - **Runtime Configuration System** (pkg/config/runtime_config.go): Dynamic configuration updates and assignment-based role management - **Cryptographic Key Derivation** (pkg/crypto/key_derivation.go): Secure key management for P2P networking and DHT operations ## Enhanced Monitoring & Operations: - **Comprehensive Monitoring Stack**: Added Prometheus and Grafana services with full metrics collection, alerting, and dashboard visualization - **License Gate System** (internal/licensing/license_gate.go): Advanced license validation with circuit breaker patterns - **Enhanced P2P Configuration**: Improved networking configuration for better peer discovery and connection reliability ## Health & Reliability: - **DHT Health Check Fix**: Temporarily disabled problematic DHT health checks to prevent container shutdown issues - **Enhanced License Validation**: Improved error handling and retry logic for license server communication ## Docker & Deployment: - **Optimized Container Configuration**: Updated Dockerfile and compose configurations for better resource management and networking - **Static Binary Support**: Proper compilation flags for Alpine containers This work addresses the P2P networking issues that were preventing proper leader election in CHORUS clusters and establishes the foundation for reliable distributed operation. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
354 lines
9.7 KiB
Go
354 lines
9.7 KiB
Go
package config
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"os/signal"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
)
|
|
|
|
// RuntimeConfig provides dynamic configuration with assignment override support
|
|
type RuntimeConfig struct {
|
|
mu sync.RWMutex
|
|
base *Config // Base configuration from environment
|
|
over *Config // Override configuration from assignment
|
|
}
|
|
|
|
// AssignmentConfig represents configuration received from WHOOSH assignment
|
|
type AssignmentConfig struct {
|
|
Role string `json:"role,omitempty"`
|
|
Model string `json:"model,omitempty"`
|
|
PromptUCXL string `json:"prompt_ucxl,omitempty"`
|
|
Specialization string `json:"specialization,omitempty"`
|
|
Capabilities []string `json:"capabilities,omitempty"`
|
|
Environment map[string]string `json:"environment,omitempty"`
|
|
BootstrapPeers []string `json:"bootstrap_peers,omitempty"`
|
|
JoinStaggerMS int `json:"join_stagger_ms,omitempty"`
|
|
DialsPerSecond int `json:"dials_per_second,omitempty"`
|
|
MaxConcurrentDHT int `json:"max_concurrent_dht,omitempty"`
|
|
AssignmentID string `json:"assignment_id,omitempty"`
|
|
ConfigEpoch int64 `json:"config_epoch,omitempty"`
|
|
}
|
|
|
|
// NewRuntimeConfig creates a new runtime configuration manager
|
|
func NewRuntimeConfig(baseConfig *Config) *RuntimeConfig {
|
|
return &RuntimeConfig{
|
|
base: baseConfig,
|
|
over: &Config{}, // Empty override initially
|
|
}
|
|
}
|
|
|
|
// Get retrieves a configuration value with override precedence
|
|
func (rc *RuntimeConfig) Get(key string) interface{} {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
// Check override first, then base
|
|
if value := rc.getFromConfig(rc.over, key); value != nil {
|
|
return value
|
|
}
|
|
return rc.getFromConfig(rc.base, key)
|
|
}
|
|
|
|
// getFromConfig extracts a value from a config struct by key
|
|
func (rc *RuntimeConfig) getFromConfig(cfg *Config, key string) interface{} {
|
|
if cfg == nil {
|
|
return nil
|
|
}
|
|
|
|
switch key {
|
|
case "agent.role":
|
|
if cfg.Agent.Role != "" {
|
|
return cfg.Agent.Role
|
|
}
|
|
case "agent.specialization":
|
|
if cfg.Agent.Specialization != "" {
|
|
return cfg.Agent.Specialization
|
|
}
|
|
case "agent.capabilities":
|
|
if len(cfg.Agent.Capabilities) > 0 {
|
|
return cfg.Agent.Capabilities
|
|
}
|
|
case "agent.models":
|
|
if len(cfg.Agent.Models) > 0 {
|
|
return cfg.Agent.Models
|
|
}
|
|
case "agent.default_reasoning_model":
|
|
if cfg.Agent.DefaultReasoningModel != "" {
|
|
return cfg.Agent.DefaultReasoningModel
|
|
}
|
|
case "v2.dht.bootstrap_peers":
|
|
if len(cfg.V2.DHT.BootstrapPeers) > 0 {
|
|
return cfg.V2.DHT.BootstrapPeers
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetString retrieves a string configuration value
|
|
func (rc *RuntimeConfig) GetString(key string) string {
|
|
if value := rc.Get(key); value != nil {
|
|
if str, ok := value.(string); ok {
|
|
return str
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// GetStringSlice retrieves a string slice configuration value
|
|
func (rc *RuntimeConfig) GetStringSlice(key string) []string {
|
|
if value := rc.Get(key); value != nil {
|
|
if slice, ok := value.([]string); ok {
|
|
return slice
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetInt retrieves an integer configuration value
|
|
func (rc *RuntimeConfig) GetInt(key string) int {
|
|
if value := rc.Get(key); value != nil {
|
|
if i, ok := value.(int); ok {
|
|
return i
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// LoadAssignment loads configuration from WHOOSH assignment endpoint
|
|
func (rc *RuntimeConfig) LoadAssignment(ctx context.Context) error {
|
|
assignURL := os.Getenv("ASSIGN_URL")
|
|
if assignURL == "" {
|
|
return nil // No assignment URL configured
|
|
}
|
|
|
|
// Build assignment request URL with task identity
|
|
params := url.Values{}
|
|
if taskSlot := os.Getenv("TASK_SLOT"); taskSlot != "" {
|
|
params.Set("slot", taskSlot)
|
|
}
|
|
if taskID := os.Getenv("TASK_ID"); taskID != "" {
|
|
params.Set("task", taskID)
|
|
}
|
|
if clusterID := os.Getenv("CHORUS_CLUSTER_ID"); clusterID != "" {
|
|
params.Set("cluster", clusterID)
|
|
}
|
|
|
|
fullURL := assignURL
|
|
if len(params) > 0 {
|
|
fullURL += "?" + params.Encode()
|
|
}
|
|
|
|
// Fetch assignment with timeout
|
|
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create assignment request: %w", err)
|
|
}
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("assignment request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return fmt.Errorf("assignment request failed with status %d", resp.StatusCode)
|
|
}
|
|
|
|
// Parse assignment response
|
|
var assignment AssignmentConfig
|
|
if err := json.NewDecoder(resp.Body).Decode(&assignment); err != nil {
|
|
return fmt.Errorf("failed to decode assignment response: %w", err)
|
|
}
|
|
|
|
// Apply assignment to override config
|
|
if err := rc.applyAssignment(&assignment); err != nil {
|
|
return fmt.Errorf("failed to apply assignment: %w", err)
|
|
}
|
|
|
|
fmt.Printf("📥 Loaded assignment: role=%s, model=%s, epoch=%d\n",
|
|
assignment.Role, assignment.Model, assignment.ConfigEpoch)
|
|
|
|
return nil
|
|
}
|
|
|
|
// LoadAssignmentFromFile loads configuration from a file (for config objects)
|
|
func (rc *RuntimeConfig) LoadAssignmentFromFile(filePath string) error {
|
|
if filePath == "" {
|
|
return nil // No file configured
|
|
}
|
|
|
|
data, err := ioutil.ReadFile(filePath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read assignment file %s: %w", filePath, err)
|
|
}
|
|
|
|
var assignment AssignmentConfig
|
|
if err := json.Unmarshal(data, &assignment); err != nil {
|
|
return fmt.Errorf("failed to parse assignment file: %w", err)
|
|
}
|
|
|
|
if err := rc.applyAssignment(&assignment); err != nil {
|
|
return fmt.Errorf("failed to apply file assignment: %w", err)
|
|
}
|
|
|
|
fmt.Printf("📁 Loaded assignment from file: role=%s, model=%s\n",
|
|
assignment.Role, assignment.Model)
|
|
|
|
return nil
|
|
}
|
|
|
|
// applyAssignment applies an assignment to the override configuration
|
|
func (rc *RuntimeConfig) applyAssignment(assignment *AssignmentConfig) error {
|
|
rc.mu.Lock()
|
|
defer rc.mu.Unlock()
|
|
|
|
// Create new override config
|
|
override := &Config{
|
|
Agent: AgentConfig{
|
|
Role: assignment.Role,
|
|
Specialization: assignment.Specialization,
|
|
Capabilities: assignment.Capabilities,
|
|
DefaultReasoningModel: assignment.Model,
|
|
},
|
|
V2: V2Config{
|
|
DHT: DHTConfig{
|
|
BootstrapPeers: assignment.BootstrapPeers,
|
|
},
|
|
},
|
|
}
|
|
|
|
// Handle models array
|
|
if assignment.Model != "" {
|
|
override.Agent.Models = []string{assignment.Model}
|
|
}
|
|
|
|
// Apply environment variables from assignment
|
|
for key, value := range assignment.Environment {
|
|
os.Setenv(key, value)
|
|
}
|
|
|
|
rc.over = override
|
|
|
|
return nil
|
|
}
|
|
|
|
// StartReloadHandler starts a signal handler for configuration reload (SIGHUP)
|
|
func (rc *RuntimeConfig) StartReloadHandler(ctx context.Context) {
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGHUP)
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sigChan:
|
|
fmt.Println("🔄 Received SIGHUP, reloading configuration...")
|
|
if err := rc.LoadAssignment(ctx); err != nil {
|
|
fmt.Printf("⚠️ Failed to reload assignment: %v\n", err)
|
|
} else {
|
|
fmt.Println("✅ Configuration reloaded successfully")
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// GetBaseConfig returns the base configuration (from environment)
|
|
func (rc *RuntimeConfig) GetBaseConfig() *Config {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
return rc.base
|
|
}
|
|
|
|
// GetEffectiveConfig returns the effective merged configuration
|
|
func (rc *RuntimeConfig) GetEffectiveConfig() *Config {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
// Start with base config
|
|
effective := *rc.base
|
|
|
|
// Apply overrides
|
|
if rc.over.Agent.Role != "" {
|
|
effective.Agent.Role = rc.over.Agent.Role
|
|
}
|
|
if rc.over.Agent.Specialization != "" {
|
|
effective.Agent.Specialization = rc.over.Agent.Specialization
|
|
}
|
|
if len(rc.over.Agent.Capabilities) > 0 {
|
|
effective.Agent.Capabilities = rc.over.Agent.Capabilities
|
|
}
|
|
if len(rc.over.Agent.Models) > 0 {
|
|
effective.Agent.Models = rc.over.Agent.Models
|
|
}
|
|
if rc.over.Agent.DefaultReasoningModel != "" {
|
|
effective.Agent.DefaultReasoningModel = rc.over.Agent.DefaultReasoningModel
|
|
}
|
|
if len(rc.over.V2.DHT.BootstrapPeers) > 0 {
|
|
effective.V2.DHT.BootstrapPeers = rc.over.V2.DHT.BootstrapPeers
|
|
}
|
|
|
|
return &effective
|
|
}
|
|
|
|
// GetAssignmentStats returns assignment statistics for monitoring
|
|
func (rc *RuntimeConfig) GetAssignmentStats() map[string]interface{} {
|
|
rc.mu.RLock()
|
|
defer rc.mu.RUnlock()
|
|
|
|
hasOverride := rc.over.Agent.Role != "" ||
|
|
rc.over.Agent.Specialization != "" ||
|
|
len(rc.over.Agent.Capabilities) > 0 ||
|
|
len(rc.over.V2.DHT.BootstrapPeers) > 0
|
|
|
|
stats := map[string]interface{}{
|
|
"has_assignment": hasOverride,
|
|
"assign_url": os.Getenv("ASSIGN_URL"),
|
|
"task_slot": os.Getenv("TASK_SLOT"),
|
|
"task_id": os.Getenv("TASK_ID"),
|
|
}
|
|
|
|
if hasOverride {
|
|
stats["assigned_role"] = rc.over.Agent.Role
|
|
stats["assigned_specialization"] = rc.over.Agent.Specialization
|
|
stats["assigned_capabilities"] = rc.over.Agent.Capabilities
|
|
stats["assigned_models"] = rc.over.Agent.Models
|
|
stats["bootstrap_peers_count"] = len(rc.over.V2.DHT.BootstrapPeers)
|
|
}
|
|
|
|
return stats
|
|
}
|
|
|
|
// InitializeAssignmentFromEnv initializes assignment from environment variables
|
|
func (rc *RuntimeConfig) InitializeAssignmentFromEnv(ctx context.Context) error {
|
|
// Try loading from assignment URL first
|
|
if err := rc.LoadAssignment(ctx); err != nil {
|
|
fmt.Printf("⚠️ Failed to load assignment from URL: %v\n", err)
|
|
}
|
|
|
|
// Try loading from file (for config objects)
|
|
if assignFile := os.Getenv("ASSIGNMENT_FILE"); assignFile != "" {
|
|
if err := rc.LoadAssignmentFromFile(assignFile); err != nil {
|
|
fmt.Printf("⚠️ Failed to load assignment from file: %v\n", err)
|
|
}
|
|
}
|
|
|
|
// Start reload handler for SIGHUP
|
|
rc.StartReloadHandler(ctx)
|
|
|
|
return nil
|
|
} |