324 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			324 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package runtime
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"fmt"
 | |
| 	"time"
 | |
| 
 | |
| 	"chorus/internal/logging"
 | |
| 	"chorus/pkg/dht"
 | |
| 	"chorus/pkg/health"
 | |
| 	"chorus/pkg/shutdown"
 | |
| 	"chorus/pubsub"
 | |
| )
 | |
| 
 | |
| // simpleLogger implements basic logging for shutdown and health systems
 | |
| type simpleLogger struct {
 | |
| 	logger logging.Logger
 | |
| }
 | |
| 
 | |
| func (l *simpleLogger) Info(msg string, args ...interface{}) {
 | |
| 	l.logger.Info(msg, args...)
 | |
| }
 | |
| 
 | |
| func (l *simpleLogger) Warn(msg string, args ...interface{}) {
 | |
| 	l.logger.Warn(msg, args...)
 | |
| }
 | |
| 
 | |
| func (l *simpleLogger) Error(msg string, args ...interface{}) {
 | |
| 	l.logger.Error(msg, args...)
 | |
| }
 | |
| 
 | |
| // StartAgentMode runs the autonomous agent with all standard behaviors
 | |
| func (r *SharedRuntime) StartAgentMode() error {
 | |
| 	// Announce capabilities and role
 | |
| 	go r.announceAvailability()
 | |
| 	go r.announceCapabilitiesOnChange()
 | |
| 	go r.announceRoleOnStartup()
 | |
| 
 | |
| 	// Start status reporting
 | |
| 	go r.statusReporter()
 | |
| 
 | |
| 	r.Logger.Info("🔍 Listening for peers on container network...")
 | |
| 	r.Logger.Info("📡 Ready for task coordination and meta-discussion")
 | |
| 	r.Logger.Info("🎯 HMMM collaborative reasoning enabled")
 | |
| 
 | |
| 	// === Comprehensive Health Monitoring & Graceful Shutdown ===
 | |
| 	shutdownManager := shutdown.NewManager(30*time.Second, &simpleLogger{logger: r.Logger})
 | |
| 
 | |
| 	healthManager := health.NewManager(r.Node.ID().ShortString(), AppVersion, &simpleLogger{logger: r.Logger})
 | |
| 	healthManager.SetShutdownManager(shutdownManager)
 | |
| 
 | |
| 	// Register health checks
 | |
| 	r.setupHealthChecks(healthManager)
 | |
| 
 | |
| 	// Register components for graceful shutdown
 | |
| 	r.setupGracefulShutdown(shutdownManager, healthManager)
 | |
| 
 | |
| 	// Start health monitoring
 | |
| 	if err := healthManager.Start(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	r.HealthManager = healthManager
 | |
| 	r.Logger.Info("❤️ Health monitoring started")
 | |
| 
 | |
| 	// Start health HTTP server
 | |
| 	if err := healthManager.StartHTTPServer(r.Config.Network.HealthPort); err != nil {
 | |
| 		r.Logger.Error("❌ Failed to start health HTTP server: %v", err)
 | |
| 	} else {
 | |
| 		r.Logger.Info("🏥 Health endpoints available at http://localhost:%d/health", r.Config.Network.HealthPort)
 | |
| 	}
 | |
| 
 | |
| 	// Start shutdown manager
 | |
| 	shutdownManager.Start()
 | |
| 	r.ShutdownManager = shutdownManager
 | |
| 	r.Logger.Info("🛡️ Graceful shutdown manager started")
 | |
| 
 | |
| 	r.Logger.Info("✅ CHORUS agent system fully operational with health monitoring")
 | |
| 
 | |
| 	// Wait for graceful shutdown
 | |
| 	shutdownManager.Wait()
 | |
| 	r.Logger.Info("✅ CHORUS agent system shutdown completed")
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // announceAvailability broadcasts current working status for task assignment
 | |
| func (r *SharedRuntime) announceAvailability() {
 | |
| 	ticker := time.NewTicker(30 * time.Second)
 | |
| 	defer ticker.Stop()
 | |
| 
 | |
| 	for ; ; <-ticker.C {
 | |
| 		currentTasks := r.TaskTracker.GetActiveTasks()
 | |
| 		maxTasks := r.TaskTracker.GetMaxTasks()
 | |
| 		isAvailable := len(currentTasks) < maxTasks
 | |
| 
 | |
| 		status := "ready"
 | |
| 		if len(currentTasks) >= maxTasks {
 | |
| 			status = "busy"
 | |
| 		} else if len(currentTasks) > 0 {
 | |
| 			status = "working"
 | |
| 		}
 | |
| 
 | |
| 		availability := map[string]interface{}{
 | |
| 			"node_id":            r.Node.ID().ShortString(),
 | |
| 			"available_for_work": isAvailable,
 | |
| 			"current_tasks":      len(currentTasks),
 | |
| 			"max_tasks":          maxTasks,
 | |
| 			"last_activity":      time.Now().Unix(),
 | |
| 			"status":             status,
 | |
| 			"timestamp":          time.Now().Unix(),
 | |
| 		}
 | |
| 		if err := r.PubSub.PublishBzzzMessage(pubsub.AvailabilityBcast, availability); err != nil {
 | |
| 			r.Logger.Error("❌ Failed to announce availability: %v", err)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // statusReporter provides periodic status updates
 | |
| func (r *SharedRuntime) statusReporter() {
 | |
| 	ticker := time.NewTicker(60 * time.Second)
 | |
| 	defer ticker.Stop()
 | |
| 
 | |
| 	for ; ; <-ticker.C {
 | |
| 		peers := r.Node.ConnectedPeers()
 | |
| 		r.Logger.Info("📊 Status: %d connected peers", peers)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // announceCapabilitiesOnChange announces capabilities when they change
 | |
| func (r *SharedRuntime) announceCapabilitiesOnChange() {
 | |
| 	if r.PubSub == nil {
 | |
| 		r.Logger.Warn("⚠️ Capability broadcast skipped: PubSub not initialized")
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	r.Logger.Info("📢 Broadcasting agent capabilities to network")
 | |
| 
 | |
| 	activeTaskCount := 0
 | |
| 	if r.TaskTracker != nil {
 | |
| 		activeTaskCount = len(r.TaskTracker.GetActiveTasks())
 | |
| 	}
 | |
| 
 | |
| 	announcement := map[string]interface{}{
 | |
| 		"agent_id":       r.Config.Agent.ID,
 | |
| 		"node_id":        r.Node.ID().ShortString(),
 | |
| 		"version":        AppVersion,
 | |
| 		"capabilities":   r.Config.Agent.Capabilities,
 | |
| 		"expertise":      r.Config.Agent.Expertise,
 | |
| 		"models":         r.Config.Agent.Models,
 | |
| 		"specialization": r.Config.Agent.Specialization,
 | |
| 		"max_tasks":      r.Config.Agent.MaxTasks,
 | |
| 		"current_tasks":  activeTaskCount,
 | |
| 		"timestamp":      time.Now().Unix(),
 | |
| 		"availability":   "ready",
 | |
| 	}
 | |
| 
 | |
| 	if err := r.PubSub.PublishBzzzMessage(pubsub.CapabilityBcast, announcement); err != nil {
 | |
| 		r.Logger.Error("❌ Failed to broadcast capabilities: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	r.Logger.Info("✅ Capabilities broadcast published")
 | |
| 
 | |
| 	// TODO: Watch for live capability changes (role updates, model changes) and re-broadcast
 | |
| }
 | |
| 
 | |
| // announceRoleOnStartup announces role when the agent starts
 | |
| func (r *SharedRuntime) announceRoleOnStartup() {
 | |
| 	role := r.Config.Agent.Role
 | |
| 	if role == "" {
 | |
| 		r.Logger.Info("🎭 No agent role configured; skipping role announcement")
 | |
| 		return
 | |
| 	}
 | |
| 	if r.PubSub == nil {
 | |
| 		r.Logger.Warn("⚠️ Role announcement skipped: PubSub not initialized")
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	r.Logger.Info("🎭 Announcing agent role to collaboration mesh")
 | |
| 
 | |
| 	announcement := map[string]interface{}{
 | |
| 		"agent_id":       r.Config.Agent.ID,
 | |
| 		"node_id":        r.Node.ID().ShortString(),
 | |
| 		"role":           role,
 | |
| 		"expertise":      r.Config.Agent.Expertise,
 | |
| 		"capabilities":   r.Config.Agent.Capabilities,
 | |
| 		"reports_to":     r.Config.Agent.ReportsTo,
 | |
| 		"specialization": r.Config.Agent.Specialization,
 | |
| 		"timestamp":      time.Now().Unix(),
 | |
| 	}
 | |
| 
 | |
| 	opts := pubsub.MessageOptions{
 | |
| 		FromRole: role,
 | |
| 		Priority: "medium",
 | |
| 		ThreadID: fmt.Sprintf("role:%s", role),
 | |
| 	}
 | |
| 
 | |
| 	if err := r.PubSub.PublishRoleBasedMessage(pubsub.RoleAnnouncement, announcement, opts); err != nil {
 | |
| 		r.Logger.Error("❌ Failed to announce role: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	r.Logger.Info("✅ Role announcement published")
 | |
| }
 | |
| 
 | |
| func (r *SharedRuntime) setupHealthChecks(healthManager *health.Manager) {
 | |
| 	// Add BACKBEAT health check
 | |
| 	if r.BackbeatIntegration != nil {
 | |
| 		backbeatCheck := &health.HealthCheck{
 | |
| 			Name:        "backbeat",
 | |
| 			Description: "BACKBEAT timing integration health",
 | |
| 			Interval:    30 * time.Second,
 | |
| 			Timeout:     10 * time.Second,
 | |
| 			Enabled:     true,
 | |
| 			Critical:    false,
 | |
| 			Checker: func(ctx context.Context) health.CheckResult {
 | |
| 				healthInfo := r.BackbeatIntegration.GetHealth()
 | |
| 				connected, _ := healthInfo["connected"].(bool)
 | |
| 
 | |
| 				result := health.CheckResult{
 | |
| 					Healthy:   connected,
 | |
| 					Details:   healthInfo,
 | |
| 					Timestamp: time.Now(),
 | |
| 				}
 | |
| 
 | |
| 				if connected {
 | |
| 					result.Message = "BACKBEAT integration healthy and connected"
 | |
| 				} else {
 | |
| 					result.Message = "BACKBEAT integration not connected"
 | |
| 				}
 | |
| 
 | |
| 				return result
 | |
| 			},
 | |
| 		}
 | |
| 		healthManager.RegisterCheck(backbeatCheck)
 | |
| 	}
 | |
| 
 | |
| 	// Register enhanced health instrumentation when core subsystems are available
 | |
| 	if r.PubSub == nil {
 | |
| 		r.Logger.Warn("⚠️ Skipping enhanced health checks: PubSub not initialized")
 | |
| 		return
 | |
| 	}
 | |
| 	if r.ElectionManager == nil {
 | |
| 		r.Logger.Warn("⚠️ Skipping enhanced health checks: election manager not ready")
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	var replication *dht.ReplicationManager
 | |
| 	if r.DHTNode != nil {
 | |
| 		replication = r.DHTNode.ReplicationManager()
 | |
| 	}
 | |
| 
 | |
| 	enhanced := health.NewEnhancedHealthChecks(
 | |
| 		healthManager,
 | |
| 		r.ElectionManager,
 | |
| 		r.DHTNode,
 | |
| 		r.PubSub,
 | |
| 		replication,
 | |
| 		&simpleLogger{logger: r.Logger},
 | |
| 	)
 | |
| 
 | |
| 	r.EnhancedHealth = enhanced
 | |
| 	r.Logger.Info("🩺 Enhanced health checks registered")
 | |
| }
 | |
| 
 | |
| func (r *SharedRuntime) setupGracefulShutdown(shutdownManager *shutdown.Manager, healthManager *health.Manager) {
 | |
| 	if shutdownManager == nil {
 | |
| 		r.Logger.Warn("⚠️ Shutdown manager not initialized; graceful teardown skipped")
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	if r.HTTPServer != nil {
 | |
| 		httpComponent := shutdown.NewGenericComponent("http-api-server", 10, true).
 | |
| 			SetShutdownFunc(func(ctx context.Context) error {
 | |
| 				return r.HTTPServer.Stop()
 | |
| 			})
 | |
| 		shutdownManager.Register(httpComponent)
 | |
| 	}
 | |
| 
 | |
| 	if healthManager != nil {
 | |
| 		healthComponent := shutdown.NewGenericComponent("health-manager", 15, true).
 | |
| 			SetShutdownFunc(func(ctx context.Context) error {
 | |
| 				return healthManager.Stop()
 | |
| 			})
 | |
| 		shutdownManager.Register(healthComponent)
 | |
| 	}
 | |
| 
 | |
| 	if r.UCXIServer != nil {
 | |
| 		ucxiComponent := shutdown.NewGenericComponent("ucxi-server", 20, true).
 | |
| 			SetShutdownFunc(func(ctx context.Context) error {
 | |
| 				return r.UCXIServer.Stop()
 | |
| 			})
 | |
| 		shutdownManager.Register(ucxiComponent)
 | |
| 	}
 | |
| 
 | |
| 	if r.PubSub != nil {
 | |
| 		shutdownManager.Register(shutdown.NewPubSubComponent("pubsub", r.PubSub.Close, 30))
 | |
| 	}
 | |
| 
 | |
| 	if r.DHTNode != nil {
 | |
| 		dhtComponent := shutdown.NewGenericComponent("dht-node", 35, true).
 | |
| 			SetCloser(r.DHTNode.Close)
 | |
| 		shutdownManager.Register(dhtComponent)
 | |
| 	}
 | |
| 
 | |
| 	if r.Node != nil {
 | |
| 		shutdownManager.Register(shutdown.NewP2PNodeComponent("p2p-node", r.Node.Close, 40))
 | |
| 	}
 | |
| 
 | |
| 	if r.ElectionManager != nil {
 | |
| 		shutdownManager.Register(shutdown.NewElectionManagerComponent("election-manager", r.ElectionManager.Stop, 45))
 | |
| 	}
 | |
| 
 | |
| 	if r.BackbeatIntegration != nil {
 | |
| 		backbeatComponent := shutdown.NewGenericComponent("backbeat-integration", 50, true).
 | |
| 			SetShutdownFunc(func(ctx context.Context) error {
 | |
| 				return r.BackbeatIntegration.Stop()
 | |
| 			})
 | |
| 		shutdownManager.Register(backbeatComponent)
 | |
| 	}
 | |
| 
 | |
| 	r.Logger.Info("🛡️ Graceful shutdown components registered")
 | |
| }
 | 
