package election import ( "context" "encoding/json" "fmt" "log" "math/rand" "sync" "time" "github.com/anthonyrawlins/bzzz/pkg/config" "github.com/anthonyrawlins/bzzz/pubsub" libp2p "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" ) // ElectionTrigger represents why an election was triggered type ElectionTrigger string const ( TriggerHeartbeatTimeout ElectionTrigger = "admin_heartbeat_timeout" TriggerDiscoveryFailure ElectionTrigger = "no_admin_discovered" TriggerSplitBrain ElectionTrigger = "split_brain_detected" TriggerQuorumRestored ElectionTrigger = "quorum_restored" TriggerManual ElectionTrigger = "manual_trigger" ) // ElectionState represents the current election state type ElectionState string const ( StateIdle ElectionState = "idle" StateDiscovering ElectionState = "discovering" StateElecting ElectionState = "electing" StateReconstructing ElectionState = "reconstructing_keys" StateComplete ElectionState = "complete" ) // AdminCandidate represents a node candidate for admin role type AdminCandidate struct { NodeID string `json:"node_id"` PeerID peer.ID `json:"peer_id"` Capabilities []string `json:"capabilities"` Uptime time.Duration `json:"uptime"` Resources ResourceMetrics `json:"resources"` Experience time.Duration `json:"experience"` Score float64 `json:"score"` Metadata map[string]interface{} `json:"metadata,omitempty"` } // ResourceMetrics holds node resource information for election scoring type ResourceMetrics struct { CPUUsage float64 `json:"cpu_usage"` MemoryUsage float64 `json:"memory_usage"` DiskUsage float64 `json:"disk_usage"` NetworkQuality float64 `json:"network_quality"` } // ElectionMessage represents election-related messages type ElectionMessage struct { Type string `json:"type"` NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` Term int `json:"term"` Data interface{} `json:"data,omitempty"` } // ElectionManager handles admin election coordination type ElectionManager struct { ctx context.Context cancel context.CancelFunc config *config.Config host libp2p.Host pubsub *pubsub.PubSub nodeID string // Election state mu sync.RWMutex state ElectionState currentTerm int lastHeartbeat time.Time currentAdmin string candidates map[string]*AdminCandidate votes map[string]string // voter -> candidate // Timers and channels heartbeatTimer *time.Timer discoveryTimer *time.Timer electionTimer *time.Timer electionTrigger chan ElectionTrigger // Callbacks onAdminChanged func(oldAdmin, newAdmin string) onElectionComplete func(winner string) startTime time.Time } // NewElectionManager creates a new election manager func NewElectionManager( ctx context.Context, cfg *config.Config, host libp2p.Host, ps *pubsub.PubSub, nodeID string, ) *ElectionManager { electionCtx, cancel := context.WithCancel(ctx) em := &ElectionManager{ ctx: electionCtx, cancel: cancel, config: cfg, host: host, pubsub: ps, nodeID: nodeID, state: StateIdle, candidates: make(map[string]*AdminCandidate), votes: make(map[string]string), electionTrigger: make(chan ElectionTrigger, 10), startTime: time.Now(), } return em } // Start begins the election management system func (em *ElectionManager) Start() error { log.Printf("πŸ—³οΈ Starting election manager for node %s", em.nodeID) // Subscribe to election-related messages if err := em.pubsub.Subscribe("bzzz/election/v1", em.handleElectionMessage); err != nil { return fmt.Errorf("failed to subscribe to election messages: %w", err) } if err := em.pubsub.Subscribe("bzzz/admin/heartbeat/v1", em.handleAdminHeartbeat); err != nil { return fmt.Errorf("failed to subscribe to admin heartbeat: %w", err) } // Start discovery process go em.startDiscoveryLoop() // Start election coordinator go em.electionCoordinator() log.Printf("βœ… Election manager started") return nil } // Stop shuts down the election manager func (em *ElectionManager) Stop() { log.Printf("πŸ›‘ Stopping election manager") em.cancel() em.mu.Lock() defer em.mu.Unlock() if em.heartbeatTimer != nil { em.heartbeatTimer.Stop() } if em.discoveryTimer != nil { em.discoveryTimer.Stop() } if em.electionTimer != nil { em.electionTimer.Stop() } } // TriggerElection manually triggers an election func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) { select { case em.electionTrigger <- trigger: log.Printf("πŸ—³οΈ Election triggered: %s", trigger) default: log.Printf("⚠️ Election trigger buffer full, ignoring: %s", trigger) } } // GetCurrentAdmin returns the current admin node ID func (em *ElectionManager) GetCurrentAdmin() string { em.mu.RLock() defer em.mu.RUnlock() return em.currentAdmin } // IsCurrentAdmin checks if this node is the current admin func (em *ElectionManager) IsCurrentAdmin() bool { return em.GetCurrentAdmin() == em.nodeID } // GetElectionState returns the current election state func (em *ElectionManager) GetElectionState() ElectionState { em.mu.RLock() defer em.mu.RUnlock() return em.state } // SetCallbacks sets election event callbacks func (em *ElectionManager) SetCallbacks( onAdminChanged func(oldAdmin, newAdmin string), onElectionComplete func(winner string), ) { em.onAdminChanged = onAdminChanged em.onElectionComplete = onElectionComplete } // startDiscoveryLoop starts the admin discovery loop func (em *ElectionManager) startDiscoveryLoop() { log.Printf("πŸ” Starting admin discovery loop") for { select { case <-em.ctx.Done(): return case <-time.After(em.config.Security.ElectionConfig.DiscoveryTimeout): em.performAdminDiscovery() } } } // performAdminDiscovery attempts to discover existing admin func (em *ElectionManager) performAdminDiscovery() { em.mu.Lock() currentState := em.state lastHeartbeat := em.lastHeartbeat em.mu.Unlock() // Only discover if we're idle or the heartbeat is stale if currentState != StateIdle { return } // Check if admin heartbeat has timed out if !lastHeartbeat.IsZero() && time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.HeartbeatTimeout { log.Printf("⚰️ Admin heartbeat timeout detected (last: %v)", lastHeartbeat) em.TriggerElection(TriggerHeartbeatTimeout) return } // If we haven't heard from an admin recently, try to discover one if lastHeartbeat.IsZero() || time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.DiscoveryTimeout/2 { em.sendDiscoveryRequest() } } // sendDiscoveryRequest broadcasts admin discovery request func (em *ElectionManager) sendDiscoveryRequest() { discoveryMsg := ElectionMessage{ Type: "admin_discovery_request", NodeID: em.nodeID, Timestamp: time.Now(), } if err := em.publishElectionMessage(discoveryMsg); err != nil { log.Printf("❌ Failed to send admin discovery request: %v", err) } } // electionCoordinator handles the main election logic func (em *ElectionManager) electionCoordinator() { log.Printf("🎯 Election coordinator started") for { select { case <-em.ctx.Done(): return case trigger := <-em.electionTrigger: em.handleElectionTrigger(trigger) } } } // handleElectionTrigger processes election triggers func (em *ElectionManager) handleElectionTrigger(trigger ElectionTrigger) { log.Printf("πŸ”₯ Processing election trigger: %s", trigger) em.mu.Lock() currentState := em.state em.mu.Unlock() // Ignore triggers if we're already in an election if currentState != StateIdle { log.Printf("⏸️ Ignoring election trigger, current state: %s", currentState) return } // Begin election process em.beginElection(trigger) } // beginElection starts a new election func (em *ElectionManager) beginElection(trigger ElectionTrigger) { log.Printf("πŸ—³οΈ Beginning election due to: %s", trigger) em.mu.Lock() em.state = StateElecting em.currentTerm++ term := em.currentTerm em.candidates = make(map[string]*AdminCandidate) em.votes = make(map[string]string) em.mu.Unlock() // Announce candidacy if this node can be admin if em.canBeAdmin() { em.announceCandidacy(term) } // Send election announcement electionMsg := ElectionMessage{ Type: "election_started", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: map[string]interface{}{ "trigger": string(trigger), }, } if err := em.publishElectionMessage(electionMsg); err != nil { log.Printf("❌ Failed to announce election start: %v", err) } // Start election timeout em.startElectionTimeout(term) } // canBeAdmin checks if this node can become admin func (em *ElectionManager) canBeAdmin() bool { // Check if node has admin capabilities for _, cap := range em.config.Agent.Capabilities { if cap == "admin_election" || cap == "context_curation" || cap == "project_manager" { return true } } return false } // announceCandidacy announces this node as an election candidate func (em *ElectionManager) announceCandidacy(term int) { uptime := time.Since(em.startTime) candidate := &AdminCandidate{ NodeID: em.nodeID, PeerID: em.host.ID(), Capabilities: em.config.Agent.Capabilities, Uptime: uptime, Resources: em.getResourceMetrics(), Experience: uptime, // For now, use uptime as experience Metadata: map[string]interface{}{ "specialization": em.config.Agent.Specialization, "models": em.config.Agent.Models, }, } // Calculate candidate score candidate.Score = em.calculateCandidateScore(candidate) candidacyMsg := ElectionMessage{ Type: "candidacy_announcement", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: candidate, } log.Printf("πŸ“’ Announcing candidacy (score: %.2f)", candidate.Score) if err := em.publishElectionMessage(candidacyMsg); err != nil { log.Printf("❌ Failed to announce candidacy: %v", err) } } // getResourceMetrics collects current node resource metrics func (em *ElectionManager) getResourceMetrics() ResourceMetrics { // TODO: Implement actual resource collection // For now, return simulated values return ResourceMetrics{ CPUUsage: rand.Float64() * 0.5, // 0-50% CPU MemoryUsage: rand.Float64() * 0.7, // 0-70% Memory DiskUsage: rand.Float64() * 0.6, // 0-60% Disk NetworkQuality: 0.8 + rand.Float64()*0.2, // 80-100% Network Quality } } // calculateCandidateScore calculates election score for a candidate func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) float64 { scoring := em.config.Security.ElectionConfig.LeadershipScoring // Normalize metrics to 0-1 range uptimeScore := min(1.0, candidate.Uptime.Hours()/24.0) // Up to 24 hours gets full score // Capability score - higher for admin/coordination capabilities capabilityScore := 0.0 adminCapabilities := []string{"admin_election", "context_curation", "key_reconstruction", "semantic_analysis", "project_manager"} for _, cap := range candidate.Capabilities { for _, adminCap := range adminCapabilities { if cap == adminCap { weight := 0.25 // Default weight // Project manager capabilities get higher weight if adminCap == "project_manager" || adminCap == "context_curation" { weight = 0.35 } capabilityScore += weight } } } capabilityScore = min(1.0, capabilityScore) // Resource score - lower usage is better resourceScore := (1.0 - candidate.Resources.CPUUsage) * 0.3 + (1.0 - candidate.Resources.MemoryUsage) * 0.3 + (1.0 - candidate.Resources.DiskUsage) * 0.2 + candidate.Resources.NetworkQuality * 0.2 experienceScore := min(1.0, candidate.Experience.Hours()/168.0) // Up to 1 week gets full score // Weighted final score finalScore := uptimeScore*scoring.UptimeWeight + capabilityScore*scoring.CapabilityWeight + resourceScore*scoring.ResourceWeight + candidate.Resources.NetworkQuality*scoring.NetworkWeight + experienceScore*scoring.ExperienceWeight return finalScore } // startElectionTimeout starts the election timeout timer func (em *ElectionManager) startElectionTimeout(term int) { em.mu.Lock() defer em.mu.Unlock() if em.electionTimer != nil { em.electionTimer.Stop() } em.electionTimer = time.AfterFunc(em.config.Security.ElectionConfig.ElectionTimeout, func() { em.completeElection(term) }) } // completeElection completes the election and announces winner func (em *ElectionManager) completeElection(term int) { em.mu.Lock() defer em.mu.Unlock() // Verify this is still the current term if term != em.currentTerm { log.Printf("⏰ Election timeout for old term %d, ignoring", term) return } log.Printf("⏰ Election timeout reached, tallying votes") // Find the winning candidate winner := em.findElectionWinner() if winner == nil { log.Printf("❌ No winner found in election") em.state = StateIdle // Trigger another election after a delay go func() { time.Sleep(em.config.Security.ElectionConfig.DiscoveryBackoff) em.TriggerElection(TriggerDiscoveryFailure) }() return } log.Printf("πŸ† Election winner: %s (score: %.2f)", winner.NodeID, winner.Score) // Update admin oldAdmin := em.currentAdmin em.currentAdmin = winner.NodeID em.state = StateComplete // Announce the winner winnerMsg := ElectionMessage{ Type: "election_winner", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: winner, } em.mu.Unlock() // Unlock before publishing if err := em.publishElectionMessage(winnerMsg); err != nil { log.Printf("❌ Failed to announce election winner: %v", err) } // Trigger callbacks if em.onAdminChanged != nil { em.onAdminChanged(oldAdmin, winner.NodeID) } if em.onElectionComplete != nil { em.onElectionComplete(winner.NodeID) } em.mu.Lock() em.state = StateIdle // Reset state for next election } // findElectionWinner determines the election winner based on votes and scores func (em *ElectionManager) findElectionWinner() *AdminCandidate { if len(em.candidates) == 0 { return nil } // Count votes for each candidate voteCounts := make(map[string]int) totalVotes := 0 // Initialize vote counts for all candidates for candidateID := range em.candidates { voteCounts[candidateID] = 0 } // Tally actual votes for _, candidateID := range em.votes { if _, exists := em.candidates[candidateID]; exists { voteCounts[candidateID]++ totalVotes++ } } // If no votes cast, fall back to highest scoring candidate if totalVotes == 0 { var winner *AdminCandidate highestScore := -1.0 for _, candidate := range em.candidates { if candidate.Score > highestScore { highestScore = candidate.Score winner = candidate } } return winner } // Find candidate with most votes var winner *AdminCandidate maxVotes := -1 highestScore := -1.0 for candidateID, voteCount := range voteCounts { candidate := em.candidates[candidateID] if voteCount > maxVotes || (voteCount == maxVotes && candidate.Score > highestScore) { maxVotes = voteCount highestScore = candidate.Score winner = candidate } } log.Printf("πŸ—³οΈ Election results: %d total votes, winner: %s with %d votes (score: %.2f)", totalVotes, winner.NodeID, maxVotes, winner.Score) return winner } // handleElectionMessage processes incoming election messages func (em *ElectionManager) handleElectionMessage(data []byte) { var msg ElectionMessage if err := json.Unmarshal(data, &msg); err != nil { log.Printf("❌ Failed to unmarshal election message: %v", err) return } // Ignore messages from ourselves if msg.NodeID == em.nodeID { return } switch msg.Type { case "admin_discovery_request": em.handleAdminDiscoveryRequest(msg) case "admin_discovery_response": em.handleAdminDiscoveryResponse(msg) case "election_started": em.handleElectionStarted(msg) case "candidacy_announcement": em.handleCandidacyAnnouncement(msg) case "election_vote": em.handleElectionVote(msg) case "election_winner": em.handleElectionWinner(msg) } } // handleAdminDiscoveryRequest responds to admin discovery requests func (em *ElectionManager) handleAdminDiscoveryRequest(msg ElectionMessage) { em.mu.RLock() currentAdmin := em.currentAdmin state := em.state em.mu.RUnlock() // Only respond if we know who the current admin is and we're idle if currentAdmin != "" && state == StateIdle { responseMsg := ElectionMessage{ Type: "admin_discovery_response", NodeID: em.nodeID, Timestamp: time.Now(), Data: map[string]interface{}{ "current_admin": currentAdmin, }, } if err := em.publishElectionMessage(responseMsg); err != nil { log.Printf("❌ Failed to send admin discovery response: %v", err) } } } // handleAdminDiscoveryResponse processes admin discovery responses func (em *ElectionManager) handleAdminDiscoveryResponse(msg ElectionMessage) { if data, ok := msg.Data.(map[string]interface{}); ok { if admin, ok := data["current_admin"].(string); ok && admin != "" { em.mu.Lock() if em.currentAdmin == "" { log.Printf("πŸ“‘ Discovered admin: %s", admin) em.currentAdmin = admin } em.mu.Unlock() } } } // handleElectionStarted processes election start announcements func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // If we receive an election start with a higher term, join the election if msg.Term > em.currentTerm { log.Printf("πŸ”„ Joining election with term %d", msg.Term) em.currentTerm = msg.Term em.state = StateElecting em.candidates = make(map[string]*AdminCandidate) em.votes = make(map[string]string) // Announce candidacy if eligible if em.canBeAdmin() { go em.announceCandidacy(msg.Term) } } } // handleCandidacyAnnouncement processes candidacy announcements func (em *ElectionManager) handleCandidacyAnnouncement(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // Only process if it's for the current term if msg.Term != em.currentTerm { return } // Convert data to candidate struct candidateData, err := json.Marshal(msg.Data) if err != nil { log.Printf("❌ Failed to marshal candidate data: %v", err) return } var candidate AdminCandidate if err := json.Unmarshal(candidateData, &candidate); err != nil { log.Printf("❌ Failed to unmarshal candidate: %v", err) return } log.Printf("πŸ“ Received candidacy from %s (score: %.2f)", candidate.NodeID, candidate.Score) em.candidates[candidate.NodeID] = &candidate } // handleElectionVote processes election votes func (em *ElectionManager) handleElectionVote(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // Extract vote data voteData, ok := msg.Data.(map[string]interface{}) if !ok { log.Printf("❌ Invalid vote data format from %s", msg.NodeID) return } candidateID, ok := voteData["candidate"].(string) if !ok { log.Printf("❌ Invalid candidate ID in vote from %s", msg.NodeID) return } // Validate candidate exists if _, exists := em.candidates[candidateID]; !exists { log.Printf("❌ Vote for unknown candidate %s from %s", candidateID, msg.NodeID) return } // Prevent duplicate voting if existingVote, exists := em.votes[msg.NodeID]; exists { log.Printf("⚠️ Node %s already voted for %s, updating to %s", msg.NodeID, existingVote, candidateID) } // Record the vote em.votes[msg.NodeID] = candidateID log.Printf("πŸ—³οΈ Recorded vote from %s for candidate %s", msg.NodeID, candidateID) } // handleElectionWinner processes election winner announcements func (em *ElectionManager) handleElectionWinner(msg ElectionMessage) { candidateData, err := json.Marshal(msg.Data) if err != nil { log.Printf("❌ Failed to marshal winner data: %v", err) return } var winner AdminCandidate if err := json.Unmarshal(candidateData, &winner); err != nil { log.Printf("❌ Failed to unmarshal winner: %v", err) return } em.mu.Lock() oldAdmin := em.currentAdmin em.currentAdmin = winner.NodeID em.state = StateIdle em.mu.Unlock() log.Printf("πŸ‘‘ New admin elected: %s", winner.NodeID) // Trigger callback if em.onAdminChanged != nil { em.onAdminChanged(oldAdmin, winner.NodeID) } } // handleAdminHeartbeat processes admin heartbeat messages func (em *ElectionManager) handleAdminHeartbeat(data []byte) { var heartbeat struct { NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` } if err := json.Unmarshal(data, &heartbeat); err != nil { log.Printf("❌ Failed to unmarshal heartbeat: %v", err) return } em.mu.Lock() defer em.mu.Unlock() // Update admin and heartbeat timestamp if em.currentAdmin == "" || em.currentAdmin == heartbeat.NodeID { em.currentAdmin = heartbeat.NodeID em.lastHeartbeat = heartbeat.Timestamp } } // publishElectionMessage publishes an election message func (em *ElectionManager) publishElectionMessage(msg ElectionMessage) error { data, err := json.Marshal(msg) if err != nil { return fmt.Errorf("failed to marshal election message: %w", err) } return em.pubsub.Publish("bzzz/election/v1", data) } // SendAdminHeartbeat sends admin heartbeat (only if this node is admin) func (em *ElectionManager) SendAdminHeartbeat() error { if !em.IsCurrentAdmin() { return fmt.Errorf("not current admin") } heartbeat := struct { NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` }{ NodeID: em.nodeID, Timestamp: time.Now(), } data, err := json.Marshal(heartbeat) if err != nil { return fmt.Errorf("failed to marshal heartbeat: %w", err) } return em.pubsub.Publish("bzzz/admin/heartbeat/v1", data) } // min returns the minimum of two float64 values func min(a, b float64) float64 { if a < b { return a } return b }