Files
WHOOSH/internal/p2p/discovery.go
Claude Code 56ea52b743 Implement initial scan logic and council formation for WHOOSH project kickoffs
- Replace incremental sync with full scan for new repositories
- Add initial_scan status to bypass Since parameter filtering
- Implement council formation detection for Design Brief issues
- Add version display to WHOOSH UI header for debugging
- Fix Docker token authentication with trailing newline removal
- Add comprehensive council orchestration with Docker Swarm integration
- Include BACKBEAT prototype integration for distributed timing
- Support council-specific agent roles and deployment strategies
- Transition repositories to active status after content discovery

Key architectural improvements:
- Full scan approach for new project detection vs incremental sync
- Council formation triggered by chorus-entrypoint labeled Design Briefs
- Proper token handling and authentication for Gitea API calls
- Support for both initial discovery and ongoing task monitoring

This enables autonomous project kickoff workflows where Design Brief issues
automatically trigger formation of specialized agent councils for new projects.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-12 09:49:36 +10:00

257 lines
8.7 KiB
Go

package p2p
import (
"context"
"net"
"net/http"
"sync"
"time"
"github.com/rs/zerolog/log"
)
// Agent represents a CHORUS agent tracked by the discovery registry.
// It carries the identity, availability, capability, and addressing metadata
// recorded for each agent.
//
// The JSON tags define the serialized field names (presumably for the REST API
// consumed by the WHOOSH UI — the handlers are not in this file, confirm).
// CurrentTeam is tagged omitempty, so an unassigned agent omits that key from
// its JSON output instead of emitting an empty string.
type Agent struct {
ID string `json:"id"` // Unique identifier (e.g., "chorus-agent-001"); used as the registry map key
Name string `json:"name"` // Human-readable name for display
Status string `json:"status"` // Availability state; this file only sets "online" (idle/working presumably set elsewhere — confirm)
Capabilities []string `json:"capabilities"` // Skill tags, e.g. ["general_development", "code_analysis"]
Model string `json:"model"` // LLM model name (e.g. "llama3.1:8b")
Endpoint string `json:"endpoint"` // Base HTTP endpoint of the agent (e.g. "http://chorus:8080")
LastSeen time.Time `json:"last_seen"` // Time the agent was last observed; drives stale-entry removal
TasksCompleted int `json:"tasks_completed"` // Completed-task counter (discovery in this file always reports 0)
CurrentTeam string `json:"current_team,omitempty"` // Active team assignment; omitted from JSON when empty
P2PAddr string `json:"p2p_addr"` // Peer-to-peer address (e.g. "chorus:9000")
ClusterID string `json:"cluster_id"` // Cluster identifier (e.g. "docker-unified-stack")
}
// Discovery maintains an in-memory registry of CHORUS agents and owns the
// background polling and stale-entry cleanup loops that keep it current.
//
// Design notes:
// 1. mu (an RWMutex) guards the agents map: readers take the read lock,
//    while add/update and stale removal take the write lock.
// 2. ctx/cancel stop the background goroutines launched by Start.
// 3. agents is keyed by Agent.ID for direct lookup.
// 4. stopCh is closed by Stop; nothing in this file receives from it.
//    NOTE(review): confirm whether another file waits on it — otherwise it
//    duplicates what ctx already provides.
type Discovery struct {
agents map[string]*Agent // Registry of discovered agents keyed by Agent.ID; guarded by mu
mu sync.RWMutex // Protects the agents map from concurrent access
listeners []net.PacketConn // Packet listeners closed by Stop; never populated in this file (future use)
stopCh chan struct{} // Closed by Stop to signal shutdown
ctx context.Context // Cancelled by Stop; watched by the background loops
cancel context.CancelFunc // Cancels ctx
}
// NewDiscovery returns a ready-to-start Discovery with an empty agent
// registry, an open (unbuffered) stop channel, and a cancellable context
// derived from context.Background.
//
// The context has no deadline: discovery is meant to run until Stop is called.
func NewDiscovery() *Discovery {
	d := &Discovery{
		agents: make(map[string]*Agent),
		stopCh: make(chan struct{}),
	}
	// Wire up the lifetime context last; cancel is invoked by Stop.
	d.ctx, d.cancel = context.WithCancel(context.Background())
	return d
}
// Start launches the discovery service's two background loops and returns
// immediately without waiting for either of them.
//
// The first loop (listenForBroadcasts) periodically probes the CHORUS health
// endpoint to refresh the registry; the second (cleanupStaleAgents) evicts
// agents that have not been seen recently. Both exit once the Discovery's
// context is cancelled.
//
// The returned error is always nil.
func (d *Discovery) Start() error {
	log.Info().Msg("🔍 Starting CHORUS P2P agent discovery")

	// Each loop gets its own goroutine so startup is never blocked on
	// network I/O or on a ticker interval.
	background := []func(){
		d.listenForBroadcasts, // registry refresh via health polling
		d.cleanupStaleAgents,  // eviction of agents that stopped responding
	}
	for _, run := range background {
		go run()
	}

	return nil
}
// Stop shuts down the P2P discovery service.
//
// It cancels the service context (which ends the background loops started by
// Start), closes stopCh, and closes every registered listener.
//
// Stop is safe to call more than once and from multiple goroutines: only the
// first call performs the shutdown. Later calls return nil immediately
// instead of panicking on a second close of stopCh.
//
// Listener close failures are logged rather than returned, so Stop always
// returns nil.
func (d *Discovery) Stop() error {
	// Serialize shutdown so two concurrent callers cannot both observe stopCh
	// as still open and then both try to close it.
	d.mu.Lock()
	defer d.mu.Unlock()

	select {
	case <-d.stopCh:
		// Already stopped by an earlier call.
		return nil
	default:
	}

	log.Info().Msg("🔍 Stopping CHORUS P2P agent discovery")

	d.cancel()
	close(d.stopCh)

	for _, listener := range d.listeners {
		if err := listener.Close(); err != nil {
			log.Warn().
				Err(err).
				Msg("Failed to close P2P discovery listener")
		}
	}

	return nil
}
// GetAgents returns a snapshot of all currently discovered agents.
//
// Each element is a copy of the registry entry (including its Capabilities
// slice), not a pointer into the registry itself. The registry entries are
// mutated in place under d.mu whenever an agent is refreshed, so handing out
// the live pointers would let callers read them concurrently with those
// writes — a data race. Callers may freely read or modify the returned
// values; doing so does not affect the registry.
//
// The order of the returned slice is unspecified (map iteration order).
func (d *Discovery) GetAgents() []*Agent {
	d.mu.RLock()
	defer d.mu.RUnlock()

	agents := make([]*Agent, 0, len(d.agents))
	for _, agent := range d.agents {
		snapshot := *agent
		// Detach the slice as well; preserve nil vs. empty so the JSON
		// encoding (null vs. []) is unchanged.
		if agent.Capabilities != nil {
			snapshot.Capabilities = make([]string, len(agent.Capabilities))
			copy(snapshot.Capabilities, agent.Capabilities)
		}
		agents = append(agents, &snapshot)
	}
	return agents
}
// listenForBroadcasts drives periodic agent discovery until the Discovery's
// context is cancelled.
//
// Despite the name, it does not listen on a socket: it runs one discovery
// pass immediately and then another on every tick of a 30-second ticker.
func (d *Discovery) listenForBroadcasts() {
	log.Info().Msg("🔍 Starting real CHORUS agent discovery")

	// 30s between passes keeps the polling load on the CHORUS service low.
	poll := time.NewTicker(30 * time.Second)
	defer poll.Stop()

	for {
		// First iteration is the immediate initial pass; every later
		// iteration is reached only after a tick.
		d.discoverRealCHORUSAgents()

		select {
		case <-poll.C:
		case <-d.ctx.Done():
			return
		}
	}
}
// discoverRealCHORUSAgents performs a single discovery pass: it emits a debug
// log line and then delegates to queryActualCHORUSService, which probes the
// CHORUS health endpoint and updates the registry.
func (d *Discovery) discoverRealCHORUSAgents() {
	log.Debug().Msg("🔍 Discovering real CHORUS agents via health endpoints")

	// All of the actual work lives in the service query.
	d.queryActualCHORUSService()
}
// queryActualCHORUSService probes the CHORUS health endpoint and, when it
// answers 200 OK, records a single hard-coded agent ("chorus-agent-001") in
// the registry with LastSeen set to the current time.
//
// The request is bound to the Discovery's context, so a call to Stop aborts
// an in-flight probe instead of letting it hold up shutdown for the full
// client timeout (10s).
//
// Any failure (request construction, transport error, non-200 status) is
// logged at debug level and leaves the registry untouched; an agent that
// stops answering is therefore aged out by the stale-agent cleanup rather
// than removed here.
func (d *Discovery) queryActualCHORUSService() {
	const endpoint = "http://chorus:8081/health"

	client := &http.Client{Timeout: 10 * time.Second}

	req, err := http.NewRequestWithContext(d.ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		log.Debug().
			Err(err).
			Str("endpoint", endpoint).
			Msg("Failed to build CHORUS health request")
		return
	}

	resp, err := client.Do(req)
	if err != nil {
		log.Debug().
			Err(err).
			Str("endpoint", endpoint).
			Msg("Failed to reach CHORUS health endpoint")
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Debug().
			Int("status_code", resp.StatusCode).
			Str("endpoint", endpoint).
			Msg("CHORUS health endpoint returned non-200 status")
		return
	}

	// CHORUS is responding, so create a single agent entry for the actual
	// instance. The health response body is not inspected; all metadata
	// below is static.
	agentID := "chorus-agent-001"
	agent := &Agent{
		ID:     agentID,
		Name:   "CHORUS Agent",
		Status: "online",
		Capabilities: []string{
			"general_development",
			"task_coordination",
			"ai_integration",
			"code_analysis",
			"autonomous_development",
		},
		Model:          "llama3.1:8b",
		Endpoint:       "http://chorus:8080",
		LastSeen:       time.Now(),
		TasksCompleted: 0, // Will be updated by actual task completion tracking
		P2PAddr:        "chorus:9000",
		ClusterID:      "docker-unified-stack",
	}

	// For now only the single discovered instance is registered; a richer
	// CHORUS API could supply per-agent details later.
	d.addOrUpdateAgent(agent)

	log.Info().
		Str("agent_id", agentID).
		Str("endpoint", endpoint).
		Msg("🤖 Discovered real CHORUS agent")
}
// addOrUpdateAgent merges agent into the registry under the write lock.
//
// If no entry exists for agent.ID, the given pointer is stored as-is and the
// discovery is logged. If an entry already exists, only its volatile fields
// (Status, LastSeen, TasksCompleted, CurrentTeam) are overwritten from agent;
// the remaining fields of the stored entry are left unchanged.
func (d *Discovery) addOrUpdateAgent(agent *Agent) {
	d.mu.Lock()
	defer d.mu.Unlock()

	known, found := d.agents[agent.ID]
	if !found {
		// First sighting: register the agent and announce it.
		d.agents[agent.ID] = agent
		log.Info().
			Str("agent_id", agent.ID).
			Str("p2p_addr", agent.P2PAddr).
			Msg("🤖 Discovered new CHORUS agent")
		return
	}

	// Already registered: refresh the fields that change between polls.
	known.Status = agent.Status
	known.LastSeen = agent.LastSeen
	known.TasksCompleted = agent.TasksCompleted
	known.CurrentTeam = agent.CurrentTeam
}
// cleanupStaleAgents runs the stale-agent sweep once per minute until the
// Discovery's context is cancelled. The first sweep happens after the first
// tick, not immediately.
func (d *Discovery) cleanupStaleAgents() {
	sweep := time.NewTicker(60 * time.Second)
	defer sweep.Stop()

	done := d.ctx.Done()
	for {
		select {
		case <-sweep.C:
			d.removeStaleAgents()
		case <-done:
			return
		}
	}
}
// removeStaleAgents evicts every agent whose LastSeen is more than five
// minutes in the past, logging each removal. It holds the write lock for the
// duration of the sweep.
func (d *Discovery) removeStaleAgents() {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Anything last seen strictly before this instant is considered stale.
	cutoff := time.Now().Add(-5 * time.Minute)

	for agentID, entry := range d.agents {
		if !entry.LastSeen.Before(cutoff) {
			continue
		}

		// Deleting the current key while ranging over a map is safe in Go.
		delete(d.agents, agentID)
		log.Info().
			Str("agent_id", agentID).
			Time("last_seen", entry.LastSeen).
			Msg("🧹 Removed stale agent")
	}
}