package election import ( "context" "encoding/json" "fmt" "log" "math/rand" "os" "sync" "time" "chorus/pkg/config" "chorus/pubsub" libp2p "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" ) // ElectionTrigger represents why an election was triggered type ElectionTrigger string const ( TriggerHeartbeatTimeout ElectionTrigger = "admin_heartbeat_timeout" TriggerDiscoveryFailure ElectionTrigger = "no_admin_discovered" TriggerSplitBrain ElectionTrigger = "split_brain_detected" TriggerQuorumRestored ElectionTrigger = "quorum_restored" TriggerManual ElectionTrigger = "manual_trigger" ) // ElectionState represents the current election state type ElectionState string const ( electionTopic = "CHORUS/election/v1" adminHeartbeatTopic = "CHORUS/admin/heartbeat/v1" ) const ( StateIdle ElectionState = "idle" StateDiscovering ElectionState = "discovering" StateElecting ElectionState = "electing" StateReconstructing ElectionState = "reconstructing_keys" StateComplete ElectionState = "complete" ) // AdminCandidate represents a node candidate for admin role type AdminCandidate struct { NodeID string `json:"node_id"` PeerID peer.ID `json:"peer_id"` Capabilities []string `json:"capabilities"` Uptime time.Duration `json:"uptime"` Resources ResourceMetrics `json:"resources"` Experience time.Duration `json:"experience"` Score float64 `json:"score"` Metadata map[string]interface{} `json:"metadata,omitempty"` } // ResourceMetrics holds node resource information for election scoring type ResourceMetrics struct { CPUUsage float64 `json:"cpu_usage"` MemoryUsage float64 `json:"memory_usage"` DiskUsage float64 `json:"disk_usage"` NetworkQuality float64 `json:"network_quality"` } // ElectionMessage represents election-related messages type ElectionMessage struct { Type string `json:"type"` NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` Term int `json:"term"` Data interface{} `json:"data,omitempty"` } // ElectionManager handles admin election coordination type ElectionManager struct { ctx context.Context cancel context.CancelFunc config *config.Config host libp2p.Host pubsub *pubsub.PubSub nodeID string // Election state mu sync.RWMutex state ElectionState currentTerm int lastHeartbeat time.Time currentAdmin string candidates map[string]*AdminCandidate votes map[string]string // voter -> candidate // Timers and channels heartbeatTimer *time.Timer discoveryTimer *time.Timer electionTimer *time.Timer electionTrigger chan ElectionTrigger // Heartbeat management heartbeatManager *HeartbeatManager // Callbacks onAdminChanged func(oldAdmin, newAdmin string) onElectionComplete func(winner string) // Stability window to prevent election churn (Medium-risk fix 2.1) lastElectionTime time.Time electionStabilityWindow time.Duration leaderStabilityWindow time.Duration startTime time.Time } // HeartbeatManager manages admin heartbeat lifecycle type HeartbeatManager struct { mu sync.Mutex isRunning bool stopCh chan struct{} ticker *time.Ticker electionMgr *ElectionManager logger func(msg string, args ...interface{}) } // NewElectionManager creates a new election manager func NewElectionManager( ctx context.Context, cfg *config.Config, host libp2p.Host, ps *pubsub.PubSub, nodeID string, ) *ElectionManager { electionCtx, cancel := context.WithCancel(ctx) em := &ElectionManager{ ctx: electionCtx, cancel: cancel, config: cfg, host: host, pubsub: ps, nodeID: nodeID, state: StateIdle, candidates: make(map[string]*AdminCandidate), votes: make(map[string]string), electionTrigger: make(chan ElectionTrigger, 10), startTime: time.Now(), // Initialize stability windows (as per WHOOSH issue #7) electionStabilityWindow: getElectionStabilityWindow(cfg), leaderStabilityWindow: getLeaderStabilityWindow(cfg), } // Initialize heartbeat manager em.heartbeatManager = &HeartbeatManager{ electionMgr: em, logger: func(msg string, args ...interface{}) { log.Printf("[HEARTBEAT] "+msg, args...) }, } return em } // Start begins the election management system func (em *ElectionManager) Start() error { log.Printf("πŸ—³οΈ Starting election manager for node %s", em.nodeID) if err := em.pubsub.SubscribeRawTopic(electionTopic, func(data []byte, _ peer.ID) { em.handleElectionMessage(data) }); err != nil { return fmt.Errorf("failed to subscribe to election messages: %w", err) } if err := em.pubsub.SubscribeRawTopic(adminHeartbeatTopic, func(data []byte, _ peer.ID) { em.handleAdminHeartbeat(data) }); err != nil { return fmt.Errorf("failed to subscribe to admin heartbeat: %w", err) } // Start discovery process log.Printf("πŸ” About to start discovery loop goroutine...") go func() { log.Printf("πŸ” Discovery loop goroutine started successfully") em.startDiscoveryLoop() }() // Start election coordinator log.Printf("πŸ—³οΈ About to start election coordinator goroutine...") go func() { log.Printf("πŸ—³οΈ Election coordinator goroutine started successfully") em.electionCoordinator() }() // Start heartbeat if this node is already admin at startup if em.IsCurrentAdmin() { go func() { // Slight delay to ensure everything is initialized time.Sleep(2 * time.Second) if err := em.heartbeatManager.StartHeartbeat(); err != nil { log.Printf("⚠️ Failed to start initial heartbeat: %v", err) } }() } log.Printf("βœ… Election manager started") return nil } // Stop shuts down the election manager func (em *ElectionManager) Stop() { log.Printf("πŸ›‘ Stopping election manager") // Stop heartbeat first if em.heartbeatManager != nil { em.heartbeatManager.StopHeartbeat() } em.cancel() em.mu.Lock() defer em.mu.Unlock() if em.heartbeatTimer != nil { em.heartbeatTimer.Stop() } if em.discoveryTimer != nil { em.discoveryTimer.Stop() } if em.electionTimer != nil { em.electionTimer.Stop() } } // TriggerElection manually triggers an election with stability window checks func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) { // Check if election already in progress em.mu.RLock() currentState := em.state currentAdmin := em.currentAdmin lastElection := em.lastElectionTime em.mu.RUnlock() if currentState != StateIdle { log.Printf("πŸ—³οΈ Election already in progress (state: %s), ignoring trigger: %s", currentState, trigger) return } // Apply stability window to prevent election churn (WHOOSH issue #7) now := time.Now() if !lastElection.IsZero() { timeSinceElection := now.Sub(lastElection) // If we have a current admin, check leader stability window if currentAdmin != "" && timeSinceElection < em.leaderStabilityWindow { log.Printf("⏳ Leader stability window active (%.1fs remaining), ignoring trigger: %s", (em.leaderStabilityWindow - timeSinceElection).Seconds(), trigger) return } // General election stability window if timeSinceElection < em.electionStabilityWindow { log.Printf("⏳ Election stability window active (%.1fs remaining), ignoring trigger: %s", (em.electionStabilityWindow - timeSinceElection).Seconds(), trigger) return } } select { case em.electionTrigger <- trigger: log.Printf("πŸ—³οΈ Election triggered: %s", trigger) default: log.Printf("⚠️ Election trigger buffer full, ignoring: %s", trigger) } } // GetCurrentAdmin returns the current admin node ID func (em *ElectionManager) GetCurrentAdmin() string { em.mu.RLock() defer em.mu.RUnlock() return em.currentAdmin } // IsCurrentAdmin checks if this node is the current admin func (em *ElectionManager) IsCurrentAdmin() bool { return em.GetCurrentAdmin() == em.nodeID } // GetElectionState returns the current election state func (em *ElectionManager) GetElectionState() ElectionState { em.mu.RLock() defer em.mu.RUnlock() return em.state } // SetCallbacks sets election event callbacks func (em *ElectionManager) SetCallbacks( onAdminChanged func(oldAdmin, newAdmin string), onElectionComplete func(winner string), ) { em.onAdminChanged = onAdminChanged em.onElectionComplete = onElectionComplete } // GetHeartbeatStatus returns the current heartbeat status func (em *ElectionManager) GetHeartbeatStatus() map[string]interface{} { if em.heartbeatManager == nil { return map[string]interface{}{ "error": "heartbeat manager not initialized", } } return em.heartbeatManager.GetHeartbeatStatus() } // startDiscoveryLoop starts the admin discovery loop func (em *ElectionManager) startDiscoveryLoop() { defer func() { if r := recover(); r != nil { log.Printf("πŸ” PANIC in discovery loop: %v", r) } log.Printf("πŸ” Discovery loop goroutine exiting") }() log.Printf("πŸ” ENHANCED-DEBUG: Starting admin discovery loop with timeout: %v", em.config.Security.ElectionConfig.DiscoveryTimeout) log.Printf("πŸ” ENHANCED-DEBUG: Context status: err=%v", em.ctx.Err()) log.Printf("πŸ” ENHANCED-DEBUG: Node ID: %s, Can be admin: %v", em.nodeID, em.canBeAdmin()) for { log.Printf("πŸ” Discovery loop iteration starting, waiting for timeout...") log.Printf("πŸ” Context status before select: err=%v", em.ctx.Err()) select { case <-em.ctx.Done(): log.Printf("πŸ” Discovery loop cancelled via context: %v", em.ctx.Err()) return case <-time.After(em.config.Security.ElectionConfig.DiscoveryTimeout): log.Printf("πŸ” Discovery timeout triggered! Calling performAdminDiscovery()...") em.performAdminDiscovery() } } } // performAdminDiscovery attempts to discover existing admin func (em *ElectionManager) performAdminDiscovery() { em.mu.Lock() currentState := em.state lastHeartbeat := em.lastHeartbeat em.mu.Unlock() log.Printf("πŸ” Discovery check: state=%s, lastHeartbeat=%v, canAdmin=%v", currentState, lastHeartbeat, em.canBeAdmin()) // Only discover if we're idle or the heartbeat is stale if currentState != StateIdle { log.Printf("πŸ” Skipping discovery - not in idle state (current: %s)", currentState) return } // Check if admin heartbeat has timed out if !lastHeartbeat.IsZero() && time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.HeartbeatTimeout { log.Printf("⚰️ Admin heartbeat timeout detected (last: %v)", lastHeartbeat) em.TriggerElection(TriggerHeartbeatTimeout) return } // If we haven't heard from an admin recently, try to discover one timeSinceHeartbeat := time.Since(lastHeartbeat) discoveryThreshold := em.config.Security.ElectionConfig.DiscoveryTimeout / 2 log.Printf("πŸ” Heartbeat check: isZero=%v, timeSince=%v, threshold=%v", lastHeartbeat.IsZero(), timeSinceHeartbeat, discoveryThreshold) if lastHeartbeat.IsZero() || timeSinceHeartbeat > discoveryThreshold { log.Printf("πŸ” Sending discovery request...") em.sendDiscoveryRequest() // 🚨 CRITICAL FIX: If we have no admin and can become admin, trigger election after discovery timeout em.mu.Lock() currentAdmin := em.currentAdmin em.mu.Unlock() if currentAdmin == "" && em.canBeAdmin() { log.Printf("πŸ—³οΈ No admin discovered and we can be admin - scheduling election check") go func() { // Add randomization to prevent simultaneous elections from all nodes baseDelay := em.config.Security.ElectionConfig.DiscoveryTimeout * 2 randomDelay := time.Duration(rand.Intn(int(em.config.Security.ElectionConfig.DiscoveryTimeout))) totalDelay := baseDelay + randomDelay log.Printf("πŸ—³οΈ Waiting %v before checking if election needed", totalDelay) time.Sleep(totalDelay) // Check again if still no admin and no one else started election em.mu.RLock() stillNoAdmin := em.currentAdmin == "" stillIdle := em.state == StateIdle em.mu.RUnlock() if stillNoAdmin && stillIdle && em.canBeAdmin() { log.Printf("πŸ—³οΈ Election grace period expired with no admin - triggering election") em.TriggerElection(TriggerDiscoveryFailure) } else { log.Printf("πŸ—³οΈ Election check: admin=%s, state=%s - skipping election", em.currentAdmin, em.state) } }() } } else { log.Printf("πŸ” Discovery threshold not met - waiting") } } // sendDiscoveryRequest broadcasts admin discovery request func (em *ElectionManager) sendDiscoveryRequest() { em.mu.RLock() currentAdmin := em.currentAdmin em.mu.RUnlock() // WHOAMI debug message if currentAdmin == "" { log.Printf("πŸ€– WHOAMI: I'm %s and I have no leader", em.nodeID) } else { log.Printf("πŸ€– WHOAMI: I'm %s and my leader is %s", em.nodeID, currentAdmin) } log.Printf("πŸ“‘ Sending admin discovery request from node %s", em.nodeID) discoveryMsg := ElectionMessage{ Type: "admin_discovery_request", NodeID: em.nodeID, Timestamp: time.Now(), } if err := em.publishElectionMessage(discoveryMsg); err != nil { log.Printf("❌ Failed to send admin discovery request: %v", err) } else { log.Printf("βœ… Admin discovery request sent successfully") } } // electionCoordinator handles the main election logic func (em *ElectionManager) electionCoordinator() { log.Printf("🎯 Election coordinator started") for { select { case <-em.ctx.Done(): return case trigger := <-em.electionTrigger: em.handleElectionTrigger(trigger) } } } // handleElectionTrigger processes election triggers func (em *ElectionManager) handleElectionTrigger(trigger ElectionTrigger) { log.Printf("πŸ”₯ Processing election trigger: %s", trigger) em.mu.Lock() currentState := em.state em.mu.Unlock() // Ignore triggers if we're already in an election if currentState != StateIdle { log.Printf("⏸️ Ignoring election trigger, current state: %s", currentState) return } // Begin election process em.beginElection(trigger) } // beginElection starts a new election func (em *ElectionManager) beginElection(trigger ElectionTrigger) { log.Printf("πŸ—³οΈ Beginning election due to: %s", trigger) em.mu.Lock() em.state = StateElecting em.currentTerm++ em.lastElectionTime = time.Now() // Record election timestamp for stability window term := em.currentTerm em.candidates = make(map[string]*AdminCandidate) em.votes = make(map[string]string) em.mu.Unlock() // Announce candidacy if this node can be admin if em.canBeAdmin() { em.announceCandidacy(term) } // Send election announcement electionMsg := ElectionMessage{ Type: "election_started", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: map[string]interface{}{ "trigger": string(trigger), }, } if err := em.publishElectionMessage(electionMsg); err != nil { log.Printf("❌ Failed to announce election start: %v", err) } // Start election timeout em.startElectionTimeout(term) } // canBeAdmin checks if this node can become admin func (em *ElectionManager) canBeAdmin() bool { // Check if node has admin capabilities for _, cap := range em.config.Agent.Capabilities { if cap == "admin_election" || cap == "context_curation" || cap == "project_manager" { return true } } return false } // announceCandidacy announces this node as an election candidate func (em *ElectionManager) announceCandidacy(term int) { uptime := time.Since(em.startTime) candidate := &AdminCandidate{ NodeID: em.nodeID, PeerID: em.host.ID(), Capabilities: em.config.Agent.Capabilities, Uptime: uptime, Resources: em.getResourceMetrics(), Experience: uptime, // For now, use uptime as experience Metadata: map[string]interface{}{ "specialization": em.config.Agent.Specialization, "models": em.config.Agent.Models, }, } // Calculate candidate score candidate.Score = em.calculateCandidateScore(candidate) candidacyMsg := ElectionMessage{ Type: "candidacy_announcement", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: candidate, } log.Printf("πŸ“’ Announcing candidacy (score: %.2f)", candidate.Score) if err := em.publishElectionMessage(candidacyMsg); err != nil { log.Printf("❌ Failed to announce candidacy: %v", err) } } // getResourceMetrics collects current node resource metrics func (em *ElectionManager) getResourceMetrics() ResourceMetrics { // TODO: Implement actual resource collection // For now, return simulated values return ResourceMetrics{ CPUUsage: rand.Float64() * 0.5, // 0-50% CPU MemoryUsage: rand.Float64() * 0.7, // 0-70% Memory DiskUsage: rand.Float64() * 0.6, // 0-60% Disk NetworkQuality: 0.8 + rand.Float64()*0.2, // 80-100% Network Quality } } // calculateCandidateScore calculates election score for a candidate func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) float64 { // TODO: Add LeadershipScoring to config.ElectionConfig // scoring := em.config.Security.ElectionConfig.LeadershipScoring // Default scoring weights handled inline // Normalize metrics to 0-1 range uptimeScore := min(1.0, candidate.Uptime.Hours()/24.0) // Up to 24 hours gets full score // Capability score - higher for admin/coordination capabilities capabilityScore := 0.0 adminCapabilities := []string{"admin_election", "context_curation", "key_reconstruction", "semantic_analysis", "project_manager"} for _, cap := range candidate.Capabilities { for _, adminCap := range adminCapabilities { if cap == adminCap { weight := 0.25 // Default weight // Project manager capabilities get higher weight if adminCap == "project_manager" || adminCap == "context_curation" { weight = 0.35 } capabilityScore += weight } } } capabilityScore = min(1.0, capabilityScore) // Resource score - lower usage is better resourceScore := (1.0-candidate.Resources.CPUUsage)*0.3 + (1.0-candidate.Resources.MemoryUsage)*0.3 + (1.0-candidate.Resources.DiskUsage)*0.2 + candidate.Resources.NetworkQuality*0.2 experienceScore := min(1.0, candidate.Experience.Hours()/168.0) // Up to 1 week gets full score // Weighted final score (using default weights) finalScore := uptimeScore*0.3 + capabilityScore*0.2 + resourceScore*0.2 + candidate.Resources.NetworkQuality*0.15 + experienceScore*0.15 return finalScore } // startElectionTimeout starts the election timeout timer func (em *ElectionManager) startElectionTimeout(term int) { em.mu.Lock() defer em.mu.Unlock() if em.electionTimer != nil { em.electionTimer.Stop() } em.electionTimer = time.AfterFunc(em.config.Security.ElectionConfig.ElectionTimeout, func() { em.completeElection(term) }) } // completeElection completes the election and announces winner func (em *ElectionManager) completeElection(term int) { em.mu.Lock() defer em.mu.Unlock() // Verify this is still the current term if term != em.currentTerm { log.Printf("⏰ Election timeout for old term %d, ignoring", term) return } log.Printf("⏰ Election timeout reached, tallying votes") // Find the winning candidate winner := em.findElectionWinner() if winner == nil { log.Printf("❌ No winner found in election") em.state = StateIdle // Trigger another election after a delay go func() { time.Sleep(em.config.Security.ElectionConfig.DiscoveryBackoff) em.TriggerElection(TriggerDiscoveryFailure) }() return } log.Printf("πŸ† Election winner: %s (score: %.2f)", winner.NodeID, winner.Score) // Update admin oldAdmin := em.currentAdmin em.currentAdmin = winner.NodeID em.state = StateComplete // Announce the winner winnerMsg := ElectionMessage{ Type: "election_winner", NodeID: em.nodeID, Timestamp: time.Now(), Term: term, Data: winner, } em.mu.Unlock() // Unlock before publishing if err := em.publishElectionMessage(winnerMsg); err != nil { log.Printf("❌ Failed to announce election winner: %v", err) } // Handle heartbeat lifecycle based on admin change em.handleHeartbeatTransition(oldAdmin, winner.NodeID) // Trigger callbacks if em.onAdminChanged != nil { em.onAdminChanged(oldAdmin, winner.NodeID) } if em.onElectionComplete != nil { em.onElectionComplete(winner.NodeID) } em.mu.Lock() em.state = StateIdle // Reset state for next election } // findElectionWinner determines the election winner based on votes and scores func (em *ElectionManager) findElectionWinner() *AdminCandidate { if len(em.candidates) == 0 { return nil } // Count votes for each candidate voteCounts := make(map[string]int) totalVotes := 0 // Initialize vote counts for all candidates for candidateID := range em.candidates { voteCounts[candidateID] = 0 } // Tally actual votes for _, candidateID := range em.votes { if _, exists := em.candidates[candidateID]; exists { voteCounts[candidateID]++ totalVotes++ } } // If no votes cast, fall back to highest scoring candidate if totalVotes == 0 { var winner *AdminCandidate highestScore := -1.0 for _, candidate := range em.candidates { if candidate.Score > highestScore { highestScore = candidate.Score winner = candidate } } return winner } // Find candidate with most votes var winner *AdminCandidate maxVotes := -1 highestScore := -1.0 for candidateID, voteCount := range voteCounts { candidate := em.candidates[candidateID] if voteCount > maxVotes || (voteCount == maxVotes && candidate.Score > highestScore) { maxVotes = voteCount highestScore = candidate.Score winner = candidate } } log.Printf("πŸ—³οΈ Election results: %d total votes, winner: %s with %d votes (score: %.2f)", totalVotes, winner.NodeID, maxVotes, winner.Score) return winner } // handleElectionMessage processes incoming election messages func (em *ElectionManager) handleElectionMessage(data []byte) { var msg ElectionMessage if err := json.Unmarshal(data, &msg); err != nil { log.Printf("❌ Failed to unmarshal election message: %v", err) return } // Ignore messages from ourselves if msg.NodeID == em.nodeID { return } switch msg.Type { case "admin_discovery_request": em.handleAdminDiscoveryRequest(msg) case "admin_discovery_response": em.handleAdminDiscoveryResponse(msg) case "election_started": em.handleElectionStarted(msg) case "candidacy_announcement": em.handleCandidacyAnnouncement(msg) case "election_vote": em.handleElectionVote(msg) case "election_winner": em.handleElectionWinner(msg) } } // handleAdminDiscoveryRequest responds to admin discovery requests func (em *ElectionManager) handleAdminDiscoveryRequest(msg ElectionMessage) { em.mu.RLock() currentAdmin := em.currentAdmin state := em.state em.mu.RUnlock() log.Printf("πŸ“© Received admin discovery request from %s (my leader: %s, state: %s)", msg.NodeID, currentAdmin, state) // Only respond if we know who the current admin is and we're idle if currentAdmin != "" && state == StateIdle { responseMsg := ElectionMessage{ Type: "admin_discovery_response", NodeID: em.nodeID, Timestamp: time.Now(), Data: map[string]interface{}{ "current_admin": currentAdmin, }, } log.Printf("πŸ“€ Responding to discovery with admin: %s", currentAdmin) if err := em.publishElectionMessage(responseMsg); err != nil { log.Printf("❌ Failed to send admin discovery response: %v", err) } else { log.Printf("βœ… Admin discovery response sent successfully") } } else { log.Printf("πŸ”‡ Not responding to discovery (admin=%s, state=%s)", currentAdmin, state) } } // handleAdminDiscoveryResponse processes admin discovery responses func (em *ElectionManager) handleAdminDiscoveryResponse(msg ElectionMessage) { log.Printf("πŸ“₯ Received admin discovery response from %s", msg.NodeID) if data, ok := msg.Data.(map[string]interface{}); ok { if admin, ok := data["current_admin"].(string); ok && admin != "" { em.mu.Lock() oldAdmin := em.currentAdmin if em.currentAdmin == "" { log.Printf("πŸ“‘ Discovered admin: %s (reported by %s)", admin, msg.NodeID) em.currentAdmin = admin em.lastHeartbeat = time.Now() // Set initial heartbeat } else if em.currentAdmin != admin { log.Printf("⚠️ Admin conflict: I know %s, but %s reports %s", em.currentAdmin, msg.NodeID, admin) } else { log.Printf("πŸ“‘ Admin confirmed: %s (reported by %s)", admin, msg.NodeID) } em.mu.Unlock() // Trigger callback if admin changed if oldAdmin != admin && em.onAdminChanged != nil { em.onAdminChanged(oldAdmin, admin) } } } else { log.Printf("❌ Invalid admin discovery response from %s", msg.NodeID) } } // handleElectionStarted processes election start announcements func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // If we receive an election start with a higher term, join the election if msg.Term > em.currentTerm { log.Printf("πŸ”„ Joining election with term %d", msg.Term) em.currentTerm = msg.Term em.state = StateElecting em.candidates = make(map[string]*AdminCandidate) em.votes = make(map[string]string) // Announce candidacy if eligible if em.canBeAdmin() { go em.announceCandidacy(msg.Term) } } } // handleCandidacyAnnouncement processes candidacy announcements func (em *ElectionManager) handleCandidacyAnnouncement(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // Only process if it's for the current term if msg.Term != em.currentTerm { return } // Convert data to candidate struct candidateData, err := json.Marshal(msg.Data) if err != nil { log.Printf("❌ Failed to marshal candidate data: %v", err) return } var candidate AdminCandidate if err := json.Unmarshal(candidateData, &candidate); err != nil { log.Printf("❌ Failed to unmarshal candidate: %v", err) return } log.Printf("πŸ“ Received candidacy from %s (score: %.2f)", candidate.NodeID, candidate.Score) em.candidates[candidate.NodeID] = &candidate } // handleElectionVote processes election votes func (em *ElectionManager) handleElectionVote(msg ElectionMessage) { em.mu.Lock() defer em.mu.Unlock() // Extract vote data voteData, ok := msg.Data.(map[string]interface{}) if !ok { log.Printf("❌ Invalid vote data format from %s", msg.NodeID) return } candidateID, ok := voteData["candidate"].(string) if !ok { log.Printf("❌ Invalid candidate ID in vote from %s", msg.NodeID) return } // Validate candidate exists if _, exists := em.candidates[candidateID]; !exists { log.Printf("❌ Vote for unknown candidate %s from %s", candidateID, msg.NodeID) return } // Prevent duplicate voting if existingVote, exists := em.votes[msg.NodeID]; exists { log.Printf("⚠️ Node %s already voted for %s, updating to %s", msg.NodeID, existingVote, candidateID) } // Record the vote em.votes[msg.NodeID] = candidateID log.Printf("πŸ—³οΈ Recorded vote from %s for candidate %s", msg.NodeID, candidateID) } // handleElectionWinner processes election winner announcements func (em *ElectionManager) handleElectionWinner(msg ElectionMessage) { candidateData, err := json.Marshal(msg.Data) if err != nil { log.Printf("❌ Failed to marshal winner data: %v", err) return } var winner AdminCandidate if err := json.Unmarshal(candidateData, &winner); err != nil { log.Printf("❌ Failed to unmarshal winner: %v", err) return } em.mu.Lock() oldAdmin := em.currentAdmin em.currentAdmin = winner.NodeID em.state = StateIdle em.mu.Unlock() log.Printf("πŸ‘‘ New admin elected: %s", winner.NodeID) // Handle heartbeat lifecycle based on admin change em.handleHeartbeatTransition(oldAdmin, winner.NodeID) // Trigger callback if em.onAdminChanged != nil { em.onAdminChanged(oldAdmin, winner.NodeID) } } // handleHeartbeatTransition manages heartbeat start/stop on admin transitions func (em *ElectionManager) handleHeartbeatTransition(oldAdmin, newAdmin string) { // If we lost admin role, stop heartbeat if oldAdmin == em.nodeID && newAdmin != em.nodeID { log.Printf("πŸ”„ Lost admin role, stopping heartbeat") if err := em.heartbeatManager.StopHeartbeat(); err != nil { log.Printf("⚠️ Error stopping heartbeat: %v", err) } } // If we gained admin role, start heartbeat if newAdmin == em.nodeID && oldAdmin != em.nodeID { log.Printf("πŸ”„ Gained admin role, starting heartbeat") // Start with slight delay to ensure election is fully settled go func() { time.Sleep(1 * time.Second) if err := em.heartbeatManager.StartHeartbeat(); err != nil { log.Printf("⚠️ Error starting heartbeat: %v", err) } }() } } // handleAdminHeartbeat processes admin heartbeat messages func (em *ElectionManager) handleAdminHeartbeat(data []byte) { var heartbeat struct { NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` } if err := json.Unmarshal(data, &heartbeat); err != nil { log.Printf("❌ Failed to unmarshal heartbeat: %v", err) return } em.mu.Lock() defer em.mu.Unlock() // Update admin and heartbeat timestamp if em.currentAdmin == "" || em.currentAdmin == heartbeat.NodeID { em.currentAdmin = heartbeat.NodeID em.lastHeartbeat = heartbeat.Timestamp } } // publishElectionMessage publishes an election message func (em *ElectionManager) publishElectionMessage(msg ElectionMessage) error { data, err := json.Marshal(msg) if err != nil { return fmt.Errorf("failed to marshal election message: %w", err) } return em.pubsub.PublishRaw(electionTopic, data) } // SendAdminHeartbeat sends admin heartbeat (only if this node is admin) func (em *ElectionManager) SendAdminHeartbeat() error { if !em.IsCurrentAdmin() { return fmt.Errorf("not current admin") } heartbeat := struct { NodeID string `json:"node_id"` Timestamp time.Time `json:"timestamp"` }{ NodeID: em.nodeID, Timestamp: time.Now(), } data, err := json.Marshal(heartbeat) if err != nil { return fmt.Errorf("failed to marshal heartbeat: %w", err) } return em.pubsub.PublishRaw(adminHeartbeatTopic, data) } // min returns the minimum of two float64 values func min(a, b float64) float64 { if a < b { return a } return b } // HeartbeatManager methods // NewHeartbeatManager creates a new heartbeat manager func NewHeartbeatManager(electionMgr *ElectionManager) *HeartbeatManager { return &HeartbeatManager{ electionMgr: electionMgr, logger: func(msg string, args ...interface{}) { log.Printf("[HEARTBEAT] "+msg, args...) }, } } // StartHeartbeat begins heartbeat transmission func (hm *HeartbeatManager) StartHeartbeat() error { hm.mu.Lock() defer hm.mu.Unlock() if hm.isRunning { hm.logger("Heartbeat already running") return nil } if !hm.electionMgr.IsCurrentAdmin() { return fmt.Errorf("not admin, cannot start heartbeat") } hm.logger("Starting admin heartbeat transmission") hm.stopCh = make(chan struct{}) interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2 hm.ticker = time.NewTicker(interval) hm.isRunning = true // Start heartbeat goroutine go hm.heartbeatLoop() hm.logger("Admin heartbeat started (interval: %v)", interval) return nil } // StopHeartbeat stops heartbeat transmission func (hm *HeartbeatManager) StopHeartbeat() error { hm.mu.Lock() defer hm.mu.Unlock() if !hm.isRunning { return nil } hm.logger("Stopping admin heartbeat transmission") // Signal stop close(hm.stopCh) // Stop ticker if hm.ticker != nil { hm.ticker.Stop() hm.ticker = nil } hm.isRunning = false hm.logger("Admin heartbeat stopped") return nil } // IsRunning returns whether heartbeat is currently active func (hm *HeartbeatManager) IsRunning() bool { hm.mu.Lock() defer hm.mu.Unlock() return hm.isRunning } // heartbeatLoop runs the heartbeat transmission loop func (hm *HeartbeatManager) heartbeatLoop() { defer func() { hm.mu.Lock() hm.isRunning = false hm.mu.Unlock() hm.logger("Heartbeat loop terminated") }() for { select { case <-hm.ticker.C: // Only send heartbeat if still admin if hm.electionMgr.IsCurrentAdmin() { if err := hm.electionMgr.SendAdminHeartbeat(); err != nil { hm.logger("Failed to send heartbeat: %v", err) } } else { hm.logger("No longer admin, stopping heartbeat") return } case <-hm.stopCh: hm.logger("Heartbeat stop signal received") return case <-hm.electionMgr.ctx.Done(): hm.logger("Election manager context cancelled") return } } } // GetHeartbeatStatus returns current heartbeat status func (hm *HeartbeatManager) GetHeartbeatStatus() map[string]interface{} { hm.mu.Lock() defer hm.mu.Unlock() status := map[string]interface{}{ "running": hm.isRunning, "is_admin": hm.electionMgr.IsCurrentAdmin(), "last_sent": time.Now(), // TODO: Track actual last sent time } if hm.isRunning && hm.ticker != nil { // Calculate next heartbeat time (approximate) interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2 status["interval"] = interval.String() status["next_heartbeat"] = time.Now().Add(interval) } return status } // Helper functions for stability window configuration // getElectionStabilityWindow gets the minimum time between elections func getElectionStabilityWindow(cfg *config.Config) time.Duration { // Try to get from environment or use default if stability := os.Getenv("CHORUS_ELECTION_MIN_TERM"); stability != "" { if duration, err := time.ParseDuration(stability); err == nil { return duration } } // Try to get from config structure if it exists if cfg.Security.ElectionConfig.DiscoveryTimeout > 0 { // Use double the discovery timeout as default stability window return cfg.Security.ElectionConfig.DiscoveryTimeout * 2 } // Default fallback return 30 * time.Second } // getLeaderStabilityWindow gets the minimum time before challenging a healthy leader func getLeaderStabilityWindow(cfg *config.Config) time.Duration { // Try to get from environment or use default if stability := os.Getenv("CHORUS_LEADER_MIN_TERM"); stability != "" { if duration, err := time.ParseDuration(stability); err == nil { return duration } } // Try to get from config structure if it exists if cfg.Security.ElectionConfig.HeartbeatTimeout > 0 { // Use 3x heartbeat timeout as default leader stability return cfg.Security.ElectionConfig.HeartbeatTimeout * 3 } // Default fallback return 45 * time.Second }