feat: Preserve comprehensive CHORUS enhancements and P2P improvements
This commit preserves substantial development work including: ## Core Infrastructure: - **Bootstrap Pool Manager** (pkg/bootstrap/pool_manager.go): Advanced peer discovery and connection management for distributed CHORUS clusters - **Runtime Configuration System** (pkg/config/runtime_config.go): Dynamic configuration updates and assignment-based role management - **Cryptographic Key Derivation** (pkg/crypto/key_derivation.go): Secure key management for P2P networking and DHT operations ## Enhanced Monitoring & Operations: - **Comprehensive Monitoring Stack**: Added Prometheus and Grafana services with full metrics collection, alerting, and dashboard visualization - **License Gate System** (internal/licensing/license_gate.go): Advanced license validation with circuit breaker patterns - **Enhanced P2P Configuration**: Improved networking configuration for better peer discovery and connection reliability ## Health & Reliability: - **DHT Health Check Fix**: Temporarily disabled problematic DHT health checks to prevent container shutdown issues - **Enhanced License Validation**: Improved error handling and retry logic for license server communication ## Docker & Deployment: - **Optimized Container Configuration**: Updated Dockerfile and compose configurations for better resource management and networking - **Static Binary Support**: Proper compilation flags for Alpine containers This work addresses the P2P networking issues that were preventing proper leader election in CHORUS clusters and establishes the foundation for reliable distributed operation. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
354
pkg/config/runtime_config.go
Normal file
354
pkg/config/runtime_config.go
Normal file
@@ -0,0 +1,354 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// RuntimeConfig provides dynamic configuration with assignment override support
|
||||
type RuntimeConfig struct {
|
||||
mu sync.RWMutex
|
||||
base *Config // Base configuration from environment
|
||||
over *Config // Override configuration from assignment
|
||||
}
|
||||
|
||||
// AssignmentConfig represents configuration received from WHOOSH assignment
|
||||
type AssignmentConfig struct {
|
||||
Role string `json:"role,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
PromptUCXL string `json:"prompt_ucxl,omitempty"`
|
||||
Specialization string `json:"specialization,omitempty"`
|
||||
Capabilities []string `json:"capabilities,omitempty"`
|
||||
Environment map[string]string `json:"environment,omitempty"`
|
||||
BootstrapPeers []string `json:"bootstrap_peers,omitempty"`
|
||||
JoinStaggerMS int `json:"join_stagger_ms,omitempty"`
|
||||
DialsPerSecond int `json:"dials_per_second,omitempty"`
|
||||
MaxConcurrentDHT int `json:"max_concurrent_dht,omitempty"`
|
||||
AssignmentID string `json:"assignment_id,omitempty"`
|
||||
ConfigEpoch int64 `json:"config_epoch,omitempty"`
|
||||
}
|
||||
|
||||
// NewRuntimeConfig creates a new runtime configuration manager
|
||||
func NewRuntimeConfig(baseConfig *Config) *RuntimeConfig {
|
||||
return &RuntimeConfig{
|
||||
base: baseConfig,
|
||||
over: &Config{}, // Empty override initially
|
||||
}
|
||||
}
|
||||
|
||||
// Get retrieves a configuration value with override precedence
|
||||
func (rc *RuntimeConfig) Get(key string) interface{} {
|
||||
rc.mu.RLock()
|
||||
defer rc.mu.RUnlock()
|
||||
|
||||
// Check override first, then base
|
||||
if value := rc.getFromConfig(rc.over, key); value != nil {
|
||||
return value
|
||||
}
|
||||
return rc.getFromConfig(rc.base, key)
|
||||
}
|
||||
|
||||
// getFromConfig extracts a value from a config struct by key
|
||||
func (rc *RuntimeConfig) getFromConfig(cfg *Config, key string) interface{} {
|
||||
if cfg == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
switch key {
|
||||
case "agent.role":
|
||||
if cfg.Agent.Role != "" {
|
||||
return cfg.Agent.Role
|
||||
}
|
||||
case "agent.specialization":
|
||||
if cfg.Agent.Specialization != "" {
|
||||
return cfg.Agent.Specialization
|
||||
}
|
||||
case "agent.capabilities":
|
||||
if len(cfg.Agent.Capabilities) > 0 {
|
||||
return cfg.Agent.Capabilities
|
||||
}
|
||||
case "agent.models":
|
||||
if len(cfg.Agent.Models) > 0 {
|
||||
return cfg.Agent.Models
|
||||
}
|
||||
case "agent.default_reasoning_model":
|
||||
if cfg.Agent.DefaultReasoningModel != "" {
|
||||
return cfg.Agent.DefaultReasoningModel
|
||||
}
|
||||
case "v2.dht.bootstrap_peers":
|
||||
if len(cfg.V2.DHT.BootstrapPeers) > 0 {
|
||||
return cfg.V2.DHT.BootstrapPeers
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetString retrieves a string configuration value
|
||||
func (rc *RuntimeConfig) GetString(key string) string {
|
||||
if value := rc.Get(key); value != nil {
|
||||
if str, ok := value.(string); ok {
|
||||
return str
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// GetStringSlice retrieves a string slice configuration value
|
||||
func (rc *RuntimeConfig) GetStringSlice(key string) []string {
|
||||
if value := rc.Get(key); value != nil {
|
||||
if slice, ok := value.([]string); ok {
|
||||
return slice
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetInt retrieves an integer configuration value
|
||||
func (rc *RuntimeConfig) GetInt(key string) int {
|
||||
if value := rc.Get(key); value != nil {
|
||||
if i, ok := value.(int); ok {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// LoadAssignment loads configuration from WHOOSH assignment endpoint
|
||||
func (rc *RuntimeConfig) LoadAssignment(ctx context.Context) error {
|
||||
assignURL := os.Getenv("ASSIGN_URL")
|
||||
if assignURL == "" {
|
||||
return nil // No assignment URL configured
|
||||
}
|
||||
|
||||
// Build assignment request URL with task identity
|
||||
params := url.Values{}
|
||||
if taskSlot := os.Getenv("TASK_SLOT"); taskSlot != "" {
|
||||
params.Set("slot", taskSlot)
|
||||
}
|
||||
if taskID := os.Getenv("TASK_ID"); taskID != "" {
|
||||
params.Set("task", taskID)
|
||||
}
|
||||
if clusterID := os.Getenv("CHORUS_CLUSTER_ID"); clusterID != "" {
|
||||
params.Set("cluster", clusterID)
|
||||
}
|
||||
|
||||
fullURL := assignURL
|
||||
if len(params) > 0 {
|
||||
fullURL += "?" + params.Encode()
|
||||
}
|
||||
|
||||
// Fetch assignment with timeout
|
||||
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create assignment request: %w", err)
|
||||
}
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("assignment request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("assignment request failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Parse assignment response
|
||||
var assignment AssignmentConfig
|
||||
if err := json.NewDecoder(resp.Body).Decode(&assignment); err != nil {
|
||||
return fmt.Errorf("failed to decode assignment response: %w", err)
|
||||
}
|
||||
|
||||
// Apply assignment to override config
|
||||
if err := rc.applyAssignment(&assignment); err != nil {
|
||||
return fmt.Errorf("failed to apply assignment: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("📥 Loaded assignment: role=%s, model=%s, epoch=%d\n",
|
||||
assignment.Role, assignment.Model, assignment.ConfigEpoch)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadAssignmentFromFile loads configuration from a file (for config objects)
|
||||
func (rc *RuntimeConfig) LoadAssignmentFromFile(filePath string) error {
|
||||
if filePath == "" {
|
||||
return nil // No file configured
|
||||
}
|
||||
|
||||
data, err := ioutil.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read assignment file %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
var assignment AssignmentConfig
|
||||
if err := json.Unmarshal(data, &assignment); err != nil {
|
||||
return fmt.Errorf("failed to parse assignment file: %w", err)
|
||||
}
|
||||
|
||||
if err := rc.applyAssignment(&assignment); err != nil {
|
||||
return fmt.Errorf("failed to apply file assignment: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("📁 Loaded assignment from file: role=%s, model=%s\n",
|
||||
assignment.Role, assignment.Model)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// applyAssignment applies an assignment to the override configuration
|
||||
func (rc *RuntimeConfig) applyAssignment(assignment *AssignmentConfig) error {
|
||||
rc.mu.Lock()
|
||||
defer rc.mu.Unlock()
|
||||
|
||||
// Create new override config
|
||||
override := &Config{
|
||||
Agent: AgentConfig{
|
||||
Role: assignment.Role,
|
||||
Specialization: assignment.Specialization,
|
||||
Capabilities: assignment.Capabilities,
|
||||
DefaultReasoningModel: assignment.Model,
|
||||
},
|
||||
V2: V2Config{
|
||||
DHT: DHTConfig{
|
||||
BootstrapPeers: assignment.BootstrapPeers,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Handle models array
|
||||
if assignment.Model != "" {
|
||||
override.Agent.Models = []string{assignment.Model}
|
||||
}
|
||||
|
||||
// Apply environment variables from assignment
|
||||
for key, value := range assignment.Environment {
|
||||
os.Setenv(key, value)
|
||||
}
|
||||
|
||||
rc.over = override
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartReloadHandler starts a signal handler for configuration reload (SIGHUP)
|
||||
func (rc *RuntimeConfig) StartReloadHandler(ctx context.Context) {
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGHUP)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-sigChan:
|
||||
fmt.Println("🔄 Received SIGHUP, reloading configuration...")
|
||||
if err := rc.LoadAssignment(ctx); err != nil {
|
||||
fmt.Printf("⚠️ Failed to reload assignment: %v\n", err)
|
||||
} else {
|
||||
fmt.Println("✅ Configuration reloaded successfully")
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// GetBaseConfig returns the base configuration (from environment)
|
||||
func (rc *RuntimeConfig) GetBaseConfig() *Config {
|
||||
rc.mu.RLock()
|
||||
defer rc.mu.RUnlock()
|
||||
return rc.base
|
||||
}
|
||||
|
||||
// GetEffectiveConfig returns the effective merged configuration
|
||||
func (rc *RuntimeConfig) GetEffectiveConfig() *Config {
|
||||
rc.mu.RLock()
|
||||
defer rc.mu.RUnlock()
|
||||
|
||||
// Start with base config
|
||||
effective := *rc.base
|
||||
|
||||
// Apply overrides
|
||||
if rc.over.Agent.Role != "" {
|
||||
effective.Agent.Role = rc.over.Agent.Role
|
||||
}
|
||||
if rc.over.Agent.Specialization != "" {
|
||||
effective.Agent.Specialization = rc.over.Agent.Specialization
|
||||
}
|
||||
if len(rc.over.Agent.Capabilities) > 0 {
|
||||
effective.Agent.Capabilities = rc.over.Agent.Capabilities
|
||||
}
|
||||
if len(rc.over.Agent.Models) > 0 {
|
||||
effective.Agent.Models = rc.over.Agent.Models
|
||||
}
|
||||
if rc.over.Agent.DefaultReasoningModel != "" {
|
||||
effective.Agent.DefaultReasoningModel = rc.over.Agent.DefaultReasoningModel
|
||||
}
|
||||
if len(rc.over.V2.DHT.BootstrapPeers) > 0 {
|
||||
effective.V2.DHT.BootstrapPeers = rc.over.V2.DHT.BootstrapPeers
|
||||
}
|
||||
|
||||
return &effective
|
||||
}
|
||||
|
||||
// GetAssignmentStats returns assignment statistics for monitoring
|
||||
func (rc *RuntimeConfig) GetAssignmentStats() map[string]interface{} {
|
||||
rc.mu.RLock()
|
||||
defer rc.mu.RUnlock()
|
||||
|
||||
hasOverride := rc.over.Agent.Role != "" ||
|
||||
rc.over.Agent.Specialization != "" ||
|
||||
len(rc.over.Agent.Capabilities) > 0 ||
|
||||
len(rc.over.V2.DHT.BootstrapPeers) > 0
|
||||
|
||||
stats := map[string]interface{}{
|
||||
"has_assignment": hasOverride,
|
||||
"assign_url": os.Getenv("ASSIGN_URL"),
|
||||
"task_slot": os.Getenv("TASK_SLOT"),
|
||||
"task_id": os.Getenv("TASK_ID"),
|
||||
}
|
||||
|
||||
if hasOverride {
|
||||
stats["assigned_role"] = rc.over.Agent.Role
|
||||
stats["assigned_specialization"] = rc.over.Agent.Specialization
|
||||
stats["assigned_capabilities"] = rc.over.Agent.Capabilities
|
||||
stats["assigned_models"] = rc.over.Agent.Models
|
||||
stats["bootstrap_peers_count"] = len(rc.over.V2.DHT.BootstrapPeers)
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// InitializeAssignmentFromEnv initializes assignment from environment variables
|
||||
func (rc *RuntimeConfig) InitializeAssignmentFromEnv(ctx context.Context) error {
|
||||
// Try loading from assignment URL first
|
||||
if err := rc.LoadAssignment(ctx); err != nil {
|
||||
fmt.Printf("⚠️ Failed to load assignment from URL: %v\n", err)
|
||||
}
|
||||
|
||||
// Try loading from file (for config objects)
|
||||
if assignFile := os.Getenv("ASSIGNMENT_FILE"); assignFile != "" {
|
||||
if err := rc.LoadAssignmentFromFile(assignFile); err != nil {
|
||||
fmt.Printf("⚠️ Failed to load assignment from file: %v\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Start reload handler for SIGHUP
|
||||
rc.StartReloadHandler(ctx)
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user