package p2p

import (
	"context"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
)

// Agent represents a CHORUS agent discovered via P2P networking within the Docker Swarm cluster.
// This struct defines the complete metadata we track for each AI agent, enabling intelligent
// team formation and workload distribution.
//
// Design decision: We use JSON tags for API serialization since this data is exposed via
// REST endpoints to the WHOOSH UI. The omitempty tag on CurrentTeam allows agents to be
// unassigned without cluttering the JSON response with empty fields.
type Agent struct {
	ID             string    `json:"id"`                     // Unique identifier (e.g., "chorus-agent-001")
	Name           string    `json:"name"`                   // Human-readable name for UI display
	Status         string    `json:"status"`                 // online/idle/working - current availability
	Capabilities   []string  `json:"capabilities"`           // Skills: ["go_development", "database_design"]
	Model          string    `json:"model"`                  // LLM model ("llama3.1:8b", "codellama", etc.)
	Endpoint       string    `json:"endpoint"`               // HTTP API endpoint for task assignment
	LastSeen       time.Time `json:"last_seen"`              // Timestamp of last health check response
	TasksCompleted int       `json:"tasks_completed"`        // Performance metric for load balancing
	CurrentTeam    string    `json:"current_team,omitempty"` // Active team assignment (optional)
	P2PAddr        string    `json:"p2p_addr"`               // Peer-to-peer communication address
	ClusterID      string    `json:"cluster_id"`             // Docker Swarm cluster identifier
}

// Discovery handles P2P agent discovery for CHORUS agents within the Docker Swarm network.
// This service maintains a real-time registry of available agents and their capabilities,
// enabling the WHOOSH orchestrator to make intelligent team formation decisions.
//
// Design decisions:
//  1. RWMutex for thread-safe concurrent access (many readers, few writers)
//  2. Context-based cancellation for clean shutdown in Docker containers
//  3. Map storage for O(1) agent lookup by ID
//  4. Separate channels for different types of shutdown signaling
type Discovery struct {
	agents    map[string]*Agent // Thread-safe registry of discovered agents
	mu        sync.RWMutex      // Protects agents map from concurrent access
	listeners []net.PacketConn  // UDP listeners for P2P broadcasts (future use)
	stopCh    chan struct{}     // Channel for shutdown coordination
	ctx       context.Context   // Context for graceful cancellation
	cancel    context.CancelFunc // Function to trigger context cancellation
}

// NewDiscovery creates a new P2P discovery service with proper initialization.
// This constructor ensures all channels and contexts are properly set up for
// concurrent operation within the Docker Swarm environment.
//
// Implementation decision: We use context.WithCancel rather than a timeout context
// because agent discovery should run indefinitely until explicitly stopped.
func NewDiscovery() *Discovery {
	// Create a cancellable context for graceful shutdown coordination.
	ctx, cancel := context.WithCancel(context.Background())

	return &Discovery{
		agents: make(map[string]*Agent), // Initialize empty agent registry
		stopCh: make(chan struct{}),     // Unbuffered channel for shutdown signaling
		ctx:    ctx,                     // Parent context for all goroutines
		cancel: cancel,                  // Cancellation function for cleanup
	}
}
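// A typical caller lifecycle, sketched for reference. The calls below mirror
// the API defined in this file; the surrounding server wiring (where WHOOSH
// actually hosts this service) is assumed, not shown:
//
//	disc := NewDiscovery()
//	if err := disc.Start(); err != nil {
//		return err
//	}
//	defer disc.Stop()
//
//	// e.g., from an HTTP handler backing the WHOOSH UI:
//	for _, agent := range disc.GetAgents() {
//		fmt.Printf("%s: %s %v\n", agent.ID, agent.Status, agent.Capabilities)
//	}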
// Start begins listening for CHORUS agent P2P broadcasts and starts background services.
// This method launches goroutines for agent discovery and cleanup, enabling real-time
// monitoring of the CHORUS agent ecosystem.
//
// Implementation decision: We use goroutines rather than a worker pool because the
// workload is I/O bound (HTTP health checks) and we want immediate responsiveness.
func (d *Discovery) Start() error {
	log.Info().Msg("🔍 Starting CHORUS P2P agent discovery")

	// Launch agent discovery in a separate goroutine to avoid blocking startup.
	// This continuously polls CHORUS agents via their health endpoints to
	// maintain an up-to-date registry of available agents and capabilities.
	go d.listenForBroadcasts()

	// Launch the cleanup service to remove stale agents that haven't responded
	// to health checks. This prevents the UI from showing offline agents
	// and ensures accurate team formation decisions.
	go d.cleanupStaleAgents()

	return nil // Always succeeds since the goroutines handle errors internally
}

// Stop shuts down the P2P discovery service.
func (d *Discovery) Stop() error {
	log.Info().Msg("🔍 Stopping CHORUS P2P agent discovery")

	d.cancel()
	close(d.stopCh) // Secondary shutdown signal (no goroutine selects on it yet)

	for _, listener := range d.listeners {
		listener.Close()
	}

	return nil
}

// GetAgents returns all currently discovered agents. Callers receive pointers
// into the shared registry, so the returned agents should be treated as read-only.
func (d *Discovery) GetAgents() []*Agent {
	d.mu.RLock()
	defer d.mu.RUnlock()

	agents := make([]*Agent, 0, len(d.agents))
	for _, agent := range d.agents {
		agents = append(agents, agent)
	}
	return agents
}

// listenForBroadcasts listens for CHORUS agent P2P broadcasts.
// For now it simulates discovering the nine CHORUS replicas that are running.
// A full implementation would listen on UDP multicast for actual P2P broadcasts;
// see the sketch after simulateAgentDiscovery below.
func (d *Discovery) listenForBroadcasts() {
	log.Info().Msg("🔍 Simulating P2P discovery of CHORUS agents")

	// Since we know CHORUS is running 9 replicas, simulate discovering them
	// on a fixed interval.
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			return
		case <-ticker.C:
			d.simulateAgentDiscovery()
		}
	}
}

// simulateAgentDiscovery discovers CHORUS agents by querying their health endpoints.
// In Docker Swarm, service tasks can be reached via the service's DNS name.
func (d *Discovery) simulateAgentDiscovery() {
	log.Debug().Msg("🔍 Discovering CHORUS agents via health endpoints")
	d.discoverCHORUSReplicas()
}
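// listenForMulticast sketches what the real (non-simulated) broadcast listener
// could look like once CHORUS agents announce themselves over UDP multicast.
// It is illustrative only and not wired into Start(): the multicast group
// address, port, and the wire format of the announcement payload are all
// assumptions, since CHORUS does not define a broadcast protocol yet.
func (d *Discovery) listenForMulticast() error {
	// 239.0.0.0/8 is the administratively scoped multicast range; the exact
	// group and port here are placeholders.
	group, err := net.ResolveUDPAddr("udp4", "239.255.42.99:9999")
	if err != nil {
		return err
	}
	conn, err := net.ListenMulticastUDP("udp4", nil, group)
	if err != nil {
		return err
	}
	// Registering the listener lets Stop() close it, which unblocks ReadFrom.
	d.listeners = append(d.listeners, conn)

	buf := make([]byte, 1500) // one MTU-sized datagram per announcement
	for {
		n, src, err := conn.ReadFrom(buf)
		if err != nil {
			// Stop() closed the listener (or the read failed); exit either way.
			return err
		}
		// A real implementation would decode buf[:n] into an Agent and call
		// d.addOrUpdateAgent; the payload format is still to be defined.
		log.Debug().
			Str("src", src.String()).
			Int("bytes", n).
			Msg("🔍 Received P2P broadcast")
	}
}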
// discoverCHORUSReplicas discovers running CHORUS replicas in the Docker Swarm network.
// This function implements a discovery strategy that works around Docker Swarm's round-robin
// DNS by making multiple requests to discover individual service replicas.
//
// Technical challenges and solutions:
//  1. Docker Swarm round-robin DNS makes it hard to discover individual replicas
//  2. We make multiple HTTP requests to hit different replicas via the load balancer
//  3. We generate synthetic agent IDs since CHORUS doesn't expose unique identifiers yet
//  4. We create realistic agent metadata for team formation algorithms
//
// This approach is a pragmatic MVP solution - in production, CHORUS agents would
// register themselves with unique IDs and capabilities via a proper discovery protocol.
func (d *Discovery) discoverCHORUSReplicas() {
	// HTTP client with a short timeout for health checks. We use 5 seconds because:
	//  1. Health endpoints should respond quickly (< 1s typically)
	//  2. We're making multiple requests, so timeouts add up
	//  3. Docker Swarm networking is usually fast within the cluster
	client := &http.Client{Timeout: 5 * time.Second}
	baseTime := time.Now() // Consistent timestamp for this discovery cycle

	// Agents discovered in this cycle, keyed by ID. Note that because the
	// synthetic IDs below are derived from the running count, every successful
	// response registers a new agent; the map only prevents duplicates once
	// real IDs are parsed from the response body.
	discovered := make(map[string]*Agent)

	// Discovery strategy: make multiple requests to the service endpoint.
	// Docker Swarm's round-robin load balancing will distribute these across
	// different replicas, allowing us to discover individual instances.
	// 15 attempts gives us good coverage of a 9-replica service.
	for attempt := 1; attempt <= 15; attempt++ {
		// Use the CHORUS health port (8081) rather than the API port (8080) because:
		//  1. Health endpoints are lightweight and fast
		//  2. They don't require authentication or complex request processing
		//  3. They're designed to be called frequently for monitoring
		endpoint := "http://chorus:8081/health"

		// Make the health check request. Docker Swarm will route this to one
		// of the available CHORUS replicas based on its load balancing algorithm.
		resp, err := client.Get(endpoint)
		if err != nil {
			// Log connection failures at debug level since some failures are
			// expected during service startup or when replicas are being updated.
			log.Debug().
				Err(err).
				Str("endpoint", endpoint).
				Int("attempt", attempt).
				Msg("Failed to query CHORUS health endpoint")
			continue
		}

		// Process successful health check responses.
		if resp.StatusCode == http.StatusOK {
			// Generate a synthetic agent ID since CHORUS doesn't provide unique IDs yet.
			// In production, this would come from the health check response body
			// (see parseHealthAgentID below for a sketch). Zero-padded numbers
			// ensure consistent sorting in the UI.
			agentID := fmt.Sprintf("chorus-agent-%03d", len(discovered)+1)

			// Guard against duplicate IDs within this cycle. With synthetic IDs
			// this never triggers; it becomes meaningful once real IDs are used.
			if _, exists := discovered[agentID]; !exists {
				// Create the agent with realistic metadata for team formation.
				// These capabilities and models would normally come from the
				// actual CHORUS agent configuration.
				agent := &Agent{
					ID:     agentID,
					Name:   fmt.Sprintf("CHORUS Agent %d", len(discovered)+1),
					Status: "online", // Default to online since the health check succeeded

					// Standard CHORUS agent capabilities - these define what types of
					// tasks the agent can handle in team formation algorithms.
					Capabilities: []string{"general_development", "task_coordination", "ai_integration"},
					Model:        "llama3.1:8b",        // Standard model for CHORUS agents
					Endpoint:     "http://chorus:8080", // API port for task assignment
					LastSeen:     baseTime,             // Consistent timestamp for this discovery cycle

					// Synthetic task completion count for load balancing algorithms.
					// In production, this would be actual metrics from agent performance.
					TasksCompleted: len(discovered) * 2,
					P2PAddr:        "chorus:9000",          // P2P communication port
					ClusterID:      "docker-unified-stack", // Docker Swarm cluster identifier
				}

				// Add some variety to agent status for realistic team formation testing.
				// This simulates real-world scenarios where agents have different availability.
				if len(discovered)%3 == 0 {
					agent.Status = "idle" // Every third agent is idle
				} else if len(discovered) == 6 {
					// One agent is actively working on a team assignment.
					agent.Status = "working"
					agent.CurrentTeam = "development-team-alpha"
				}

				// Record the discovery.
				discovered[agentID] = agent
				log.Debug().
					Str("agent_id", agentID).
					Str("status", agent.Status).
					Msg("🤖 Discovered CHORUS agent")
			}
		}
		resp.Body.Close()

		// Stop once we've found the expected number of agents. This prevents
		// unnecessary HTTP requests and speeds up discovery cycles.
		if len(discovered) >= 9 {
			break
		}

		// Brief pause between requests to avoid overwhelming the service and
		// to allow Docker Swarm's load balancer to route to different replicas.
		time.Sleep(100 * time.Millisecond)
	}

	// Merge this cycle's agents into the shared registry.
	for _, agent := range discovered {
		d.addOrUpdateAgent(agent)
	}

	log.Info().
		Int("discovered_count", len(discovered)).
		Msg("🎭 CHORUS agent discovery completed")
}
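// parseHealthAgentID sketches how discovery would consume real identifiers
// from the health check response body instead of synthesizing them. The
// "agent_id" field is an assumption about a future CHORUS payload; today's
// health endpoint does not return it, so this helper is not yet called above.
func parseHealthAgentID(resp *http.Response) (string, error) {
	var payload struct {
		AgentID string `json:"agent_id"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("decoding health response: %w", err)
	}
	return payload.AgentID, nil
}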
// addOrUpdateAgent adds or updates an agent in the discovery registry.
func (d *Discovery) addOrUpdateAgent(agent *Agent) {
	d.mu.Lock()
	defer d.mu.Unlock()

	existing, exists := d.agents[agent.ID]
	if exists {
		// Update the mutable fields of the existing agent.
		existing.Status = agent.Status
		existing.LastSeen = agent.LastSeen
		existing.TasksCompleted = agent.TasksCompleted
		existing.CurrentTeam = agent.CurrentTeam
	} else {
		// Register the new agent.
		d.agents[agent.ID] = agent
		log.Info().
			Str("agent_id", agent.ID).
			Str("p2p_addr", agent.P2PAddr).
			Msg("🤖 Discovered new CHORUS agent")
	}
}

// cleanupStaleAgents periodically removes agents that haven't been seen recently.
func (d *Discovery) cleanupStaleAgents() {
	ticker := time.NewTicker(60 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			return
		case <-ticker.C:
			d.removeStaleAgents()
		}
	}
}

// removeStaleAgents removes agents that haven't been seen in the last 5 minutes.
func (d *Discovery) removeStaleAgents() {
	d.mu.Lock()
	defer d.mu.Unlock()

	staleThreshold := time.Now().Add(-5 * time.Minute)
	for id, agent := range d.agents {
		if agent.LastSeen.Before(staleThreshold) {
			delete(d.agents, id)
			log.Info().
				Str("agent_id", id).
				Time("last_seen", agent.LastSeen).
				Msg("🧹 Removed stale agent")
		}
	}
}
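// Staleness in practice, sketched (a real assertion like this would live in a
// discovery_test.go file; it is shown here only to illustrate the 5-minute
// threshold above):
//
//	d := NewDiscovery()
//	d.addOrUpdateAgent(&Agent{ID: "a1", LastSeen: time.Now().Add(-10 * time.Minute)})
//	d.removeStaleAgents()
//	// d.GetAgents() is now empty: "a1" was last seen beyond the threshold.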