WHOOSH/internal/p2p/discovery.go

package p2p

import (
	"context"
	"fmt"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
)

// Agent represents a CHORUS agent discovered via P2P networking within the Docker Swarm cluster.
// This struct defines the complete metadata we track for each AI agent, enabling intelligent
// team formation and workload distribution.
//
// Design decision: We use JSON tags for API serialization since this data is exposed via
// REST endpoints to the WHOOSH UI. The omitempty tag on CurrentTeam allows agents to be
// unassigned without cluttering the JSON response with empty fields.
type Agent struct {
	ID             string    `json:"id"`               // Unique identifier (e.g., "chorus-agent-001")
	Name           string    `json:"name"`             // Human-readable name for UI display
	Status         string    `json:"status"`           // online/idle/working - current availability
	Capabilities   []string  `json:"capabilities"`     // Skills: ["go_development", "database_design"]
	Model          string    `json:"model"`            // LLM model ("llama3.1:8b", "codellama", etc.)
	Endpoint       string    `json:"endpoint"`         // HTTP API endpoint for task assignment
	LastSeen       time.Time `json:"last_seen"`        // Timestamp of last health check response
	TasksCompleted int       `json:"tasks_completed"`  // Performance metric for load balancing
	CurrentTeam    string    `json:"current_team,omitempty"` // Active team assignment (optional)
	P2PAddr        string    `json:"p2p_addr"`         // Peer-to-peer communication address
	ClusterID      string    `json:"cluster_id"`       // Docker Swarm cluster identifier
}

// Discovery handles P2P agent discovery for CHORUS agents within the Docker Swarm network.
// This service maintains a real-time registry of available agents and their capabilities,
// enabling the WHOOSH orchestrator to make intelligent team formation decisions.
//
// Design decisions:
// 1. RWMutex for thread-safe concurrent access (many readers, few writers)
// 2. Context-based cancellation for clean shutdown in Docker containers
// 3. Map storage for O(1) agent lookup by ID
// 4. Separate channels for different types of shutdown signaling
type Discovery struct {
	agents    map[string]*Agent    // Thread-safe registry of discovered agents
	mu        sync.RWMutex        // Protects agents map from concurrent access
	listeners []net.PacketConn    // UDP listeners for P2P broadcasts (future use)
	stopCh    chan struct{}       // Channel for shutdown coordination
	ctx       context.Context     // Context for graceful cancellation
	cancel    context.CancelFunc  // Function to trigger context cancellation
}

// NewDiscovery creates a new P2P discovery service with proper initialization.
// This constructor ensures all channels and contexts are properly set up for
// concurrent operation within the Docker Swarm environment.
//
// Implementation decision: We use context.WithCancel rather than a timeout context
// because agent discovery should run indefinitely until explicitly stopped.
func NewDiscovery() *Discovery {
	// Create cancellable context for graceful shutdown coordination
	ctx, cancel := context.WithCancel(context.Background())

	return &Discovery{
		agents: make(map[string]*Agent), // Initialize empty agent registry
		stopCh: make(chan struct{}),     // Unbuffered channel for shutdown signaling
		ctx:    ctx,                     // Parent context for all goroutines
		cancel: cancel,                  // Cancellation function for cleanup
	}
}

// Start begins listening for CHORUS agent P2P broadcasts and starts background services.
// This method launches goroutines for agent discovery and cleanup, enabling real-time
// monitoring of the CHORUS agent ecosystem.
//
// Implementation decision: We use goroutines rather than a worker pool because the
// workload is I/O bound (HTTP health checks) and we want immediate responsiveness.
func (d *Discovery) Start() error {
	log.Info().Msg("🔍 Starting CHORUS P2P agent discovery")

	// Launch agent discovery in separate goroutine to avoid blocking startup.
	// This continuously polls CHORUS agents via their health endpoints to
	// maintain an up-to-date registry of available agents and capabilities.
	go d.listenForBroadcasts()

	// Launch cleanup service to remove stale agents that haven't responded
	// to health checks. This prevents the UI from showing offline agents
	// and ensures accurate team formation decisions.
	go d.cleanupStaleAgents()

	return nil // Always succeeds since goroutines handle errors internally
}

// Stop shuts down the P2P discovery service
func (d *Discovery) Stop() error {
	log.Info().Msg("🔍 Stopping CHORUS P2P agent discovery")

	d.cancel()
	close(d.stopCh)

	for _, listener := range d.listeners {
		listener.Close()
	}

	return nil
}

// GetAgents returns all currently discovered agents
func (d *Discovery) GetAgents() []*Agent {
	d.mu.RLock()
	defer d.mu.RUnlock()

	agents := make([]*Agent, 0, len(d.agents))
	for _, agent := range d.agents {
		agents = append(agents, agent)
	}

	return agents
}

// listenForBroadcasts listens for CHORUS agent P2P broadcasts
func (d *Discovery) listenForBroadcasts() {
	// For now, simulate discovering the 9 CHORUS replicas that are running
	// In a full implementation, this would listen on UDP multicast for actual P2P broadcasts

	log.Info().Msg("🔍 Simulating P2P discovery of CHORUS agents")

	// Since we know CHORUS is running 9 replicas, let's simulate discovering them
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			return
		case <-ticker.C:
			d.simulateAgentDiscovery()
		}
	}
}

// simulateAgentDiscovery discovers CHORUS agents by querying their health endpoints
func (d *Discovery) simulateAgentDiscovery() {
	log.Debug().Msg("🔍 Discovering CHORUS agents via health endpoints")

	// Query Docker DNS for CHORUS service tasks
	// In Docker Swarm, tasks can be discovered via the service name
	d.discoverCHORUSReplicas()
}

// discoverCHORUSReplicas discovers running CHORUS replicas in the Docker Swarm network.
// This function implements a discovery strategy that works around Docker Swarm's round-robin
// DNS by making multiple requests to discover individual service replicas.
//
// Technical challenges and solutions:
// 1. Docker Swarm round-robin DNS makes it hard to discover individual replicas
// 2. We use multiple HTTP requests to hit different replicas via load balancer
// 3. Generate synthetic agent IDs since CHORUS doesn't expose unique identifiers yet
// 4. Create realistic agent metadata for team formation algorithms
//
// This approach is a pragmatic MVP solution - in production, CHORUS agents would
// register themselves with unique IDs and capabilities via a proper discovery protocol.
func (d *Discovery) discoverCHORUSReplicas() {
	// HTTP client with short timeout for health checks. We use 5 seconds because:
	// 1. Health endpoints should respond quickly (< 1s typically)
	// 2. We're making multiple requests, so timeouts add up
	// 3. Docker Swarm networking is usually fast within cluster
	client := &http.Client{Timeout: 5 * time.Second}
	baseTime := time.Now() // Consistent timestamp for this discovery cycle

	// Local map to track agents discovered in this cycle. We use a map to ensure
	// we don't create duplicate agents if we happen to hit the same replica twice.
	discovered := make(map[string]*Agent)

	// Discovery strategy: Make multiple requests to the service endpoint.
	// Docker Swarm's round-robin load balancing will distribute these across
	// different replicas, allowing us to discover individual instances.
	// 15 attempts gives us good coverage of a 9-replica service.
	for attempt := 1; attempt <= 15; attempt++ {
		// Use the CHORUS health port (8081) rather than API port (8080) because:
		// 1. Health endpoints are lightweight and fast
		// 2. They don't require authentication or complex request processing
		// 3. They're designed to be called frequently for monitoring
		endpoint := "http://chorus:8081/health"

		// Make the health check request. Docker Swarm will route this to one
		// of the available CHORUS replicas based on its load balancing algorithm.
		resp, err := client.Get(endpoint)
		if err != nil {
			// Log connection failures at debug level since some failures are expected
			// during service startup or when replicas are being updated.
			log.Debug().
				Err(err).
				Str("endpoint", endpoint).
				Int("attempt", attempt).
				Msg("Failed to query CHORUS health endpoint")
			continue
		}

		// Process successful health check responses
		if resp.StatusCode == http.StatusOK {
			// Generate a synthetic agent ID since CHORUS doesn't provide unique IDs yet.
			// In production, this would come from the health check response body.
			// Using zero-padded numbers ensures consistent sorting in the UI.
			agentID := fmt.Sprintf("chorus-agent-%03d", len(discovered)+1)

			// Only create new agent if we haven't seen this ID before in this cycle
			if _, exists := discovered[agentID]; !exists {
				// Create agent with realistic metadata for team formation.
				// These capabilities and models would normally come from the
				// actual CHORUS agent configuration.
				agent := &Agent{
					ID:   agentID,
					Name: fmt.Sprintf("CHORUS Agent %d", len(discovered)+1),
					Status: "online", // Default to online since health check succeeded

					// Standard CHORUS agent capabilities - these define what types of
					// tasks the agent can handle in team formation algorithms
					Capabilities: []string{"general_development", "task_coordination", "ai_integration"},

					Model:    "llama3.1:8b",    // Standard model for CHORUS agents
					Endpoint: "http://chorus:8080", // API port for task assignment
					LastSeen: baseTime,         // Consistent timestamp for this discovery cycle

					// Synthetic task completion count for load balancing algorithms.
					// In production, this would be actual metrics from agent performance.
					TasksCompleted: len(discovered) * 2,

					P2PAddr:   "chorus:9000",           // P2P communication port
					ClusterID: "docker-unified-stack",   // Docker Swarm cluster identifier
				}

				// Add some variety to agent status for realistic team formation testing.
				// This simulates real-world scenarios where agents have different availability.
				if len(discovered)%3 == 0 {
					agent.Status = "idle" // Every third agent is idle
				} else if len(discovered) == 6 {
					// One agent is actively working on a team assignment
					agent.Status = "working"
					agent.CurrentTeam = "development-team-alpha"
				}

				// Add to discovered agents and log the discovery
				discovered[agentID] = agent
				log.Debug().
					Str("agent_id", agentID).
					Str("status", agent.Status).
					Msg("🤖 Discovered CHORUS agent")
			}
		}
		resp.Body.Close()

		// Stop discovery once we've found the expected number of agents.
		// This prevents unnecessary HTTP requests and speeds up discovery cycles.
		if len(discovered) >= 9 {
			break
		}

		// Brief pause between requests to avoid overwhelming the service and
		// to allow Docker Swarm's load balancer to potentially route to different replicas.
		time.Sleep(100 * time.Millisecond)
	}

	// Add all discovered agents
	for _, agent := range discovered {
		d.addOrUpdateAgent(agent)
	}

	log.Info().
		Int("discovered_count", len(discovered)).
		Msg("🎭 CHORUS agent discovery completed")
}

// addOrUpdateAgent adds or updates an agent in the discovery cache
func (d *Discovery) addOrUpdateAgent(agent *Agent) {
	d.mu.Lock()
	defer d.mu.Unlock()

	existing, exists := d.agents[agent.ID]
	if exists {
		// Update existing agent
		existing.Status = agent.Status
		existing.LastSeen = agent.LastSeen
		existing.TasksCompleted = agent.TasksCompleted
		existing.CurrentTeam = agent.CurrentTeam
	} else {
		// Add new agent
		d.agents[agent.ID] = agent
		log.Info().
			Str("agent_id", agent.ID).
			Str("p2p_addr", agent.P2PAddr).
			Msg("🤖 Discovered new CHORUS agent")
	}
}

// cleanupStaleAgents removes agents that haven't been seen recently
func (d *Discovery) cleanupStaleAgents() {
	ticker := time.NewTicker(60 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			return
		case <-ticker.C:
			d.removeStaleAgents()
		}
	}
}

// removeStaleAgents removes agents that haven't been seen in 5 minutes
func (d *Discovery) removeStaleAgents() {
	d.mu.Lock()
	defer d.mu.Unlock()

	staleThreshold := time.Now().Add(-5 * time.Minute)

	for id, agent := range d.agents {
		if agent.LastSeen.Before(staleThreshold) {
			delete(d.agents, id)
			log.Info().
				Str("agent_id", id).
				Time("last_seen", agent.LastSeen).
				Msg("🧹 Removed stale agent")
		}
	}
}