 e523c4b543
			
		
	
	e523c4b543
	
	
	
		
			
			Address WHOOSH issue #7 with comprehensive scaling optimizations to prevent license server, bootstrap peer, and control plane collapse during fast scale-out. HIGH-RISK FIXES (Must-Do): ✅ License gate already implemented with cache + circuit breaker + grace window ✅ mDNS disabled in container environments (CHORUS_MDNS_ENABLED=false) ✅ Connection rate limiting (5 dials/sec, 16 concurrent DHT queries) ✅ Connection manager with watermarks (32 low, 128 high) ✅ AutoNAT enabled for container networking MEDIUM-RISK FIXES (Next Priority): ✅ Assignment merge layer with HTTP/file config + SIGHUP reload ✅ Runtime configuration system with WHOOSH assignment API support ✅ Election stability windows to prevent churn: - CHORUS_ELECTION_MIN_TERM=30s (minimum time between elections) - CHORUS_LEADER_MIN_TERM=45s (minimum time before challenging healthy leader) ✅ Bootstrap pool JSON support with priority sorting and join stagger NEW FEATURES: - Runtime config system with assignment overrides from WHOOSH - SIGHUP reload handler for live configuration updates - JSON bootstrap configuration with peer metadata (region, roles, priority) - Configurable election stability windows with environment variables - Multi-format bootstrap support: Assignment → JSON → CSV FILES MODIFIED: - pkg/config/assignment.go (NEW): Runtime assignment merge system - docker/bootstrap.json (NEW): Example JSON bootstrap configuration - pkg/election/election.go: Added stability windows and churn prevention - internal/runtime/shared.go: Integrated assignment loading and conditional mDNS - p2p/node.go: Added connection management and rate limiting - pkg/config/hybrid_config.go: Added rate limiting configuration fields - docker/docker-compose.yml: Updated environment variables and configs - README.md: Updated status table with scaling milestone This implementation enables wave-based autoscaling without system collapse, addressing all scaling concerns from WHOOSH issue #7. 
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			1195 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			1195 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package election
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"log"
 | |
| 	"math/rand"
 | |
| 	"os"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| 
 | |
| 	"chorus/pkg/config"
 | |
| 	"chorus/pubsub"
 | |
| 	libp2p "github.com/libp2p/go-libp2p/core/host"
 | |
| 	"github.com/libp2p/go-libp2p/core/peer"
 | |
| )
 | |
| 
 | |
// ElectionTrigger names the reason an election was requested. The string
// value is what gets logged and what is carried in the "trigger" field of
// the election_started announcement.
type ElectionTrigger string

// Recognised election triggers.
const (
	TriggerHeartbeatTimeout = ElectionTrigger("admin_heartbeat_timeout")
	TriggerDiscoveryFailure = ElectionTrigger("no_admin_discovered")
	TriggerSplitBrain       = ElectionTrigger("split_brain_detected")
	TriggerQuorumRestored   = ElectionTrigger("quorum_restored")
	TriggerManual           = ElectionTrigger("manual_trigger")
)
 | |
| 
 | |
// ElectionState is the phase the local election state machine is in.
type ElectionState string

// Pubsub topic names: one carries election protocol messages, the other
// carries admin heartbeats.
const (
	electionTopic       = "CHORUS/election/v1"
	adminHeartbeatTopic = "CHORUS/admin/heartbeat/v1"
)

// Election states. StateIdle is the state a new manager starts in and the
// only state from which a trigger is accepted.
const (
	StateIdle           = ElectionState("idle")
	StateDiscovering    = ElectionState("discovering")
	StateElecting       = ElectionState("electing")
	StateReconstructing = ElectionState("reconstructing_keys")
	StateComplete       = ElectionState("complete")
)
 | |
| 
 | |
| // AdminCandidate represents a node candidate for admin role
 | |
| type AdminCandidate struct {
 | |
| 	NodeID       string                 `json:"node_id"`
 | |
| 	PeerID       peer.ID                `json:"peer_id"`
 | |
| 	Capabilities []string               `json:"capabilities"`
 | |
| 	Uptime       time.Duration          `json:"uptime"`
 | |
| 	Resources    ResourceMetrics        `json:"resources"`
 | |
| 	Experience   time.Duration          `json:"experience"`
 | |
| 	Score        float64                `json:"score"`
 | |
| 	Metadata     map[string]interface{} `json:"metadata,omitempty"`
 | |
| }
 | |
| 
 | |
// ResourceMetrics is the resource snapshot a candidate reports for election
// scoring. The scoring code treats the three usage figures as "lower is
// better" (it uses 1 - value) and NetworkQuality as "higher is better".
type ResourceMetrics struct {
	// CPUUsage is the reported CPU utilisation.
	CPUUsage float64 `json:"cpu_usage"`
	// MemoryUsage is the reported memory utilisation.
	MemoryUsage float64 `json:"memory_usage"`
	// DiskUsage is the reported disk utilisation.
	DiskUsage float64 `json:"disk_usage"`
	// NetworkQuality is the reported network quality.
	NetworkQuality float64 `json:"network_quality"`
}
 | |
| 
 | |
// ElectionMessage is the envelope for every message exchanged on the
// election topic.
type ElectionMessage struct {
	// Type selects the handler, e.g. "election_started" or
	// "candidacy_announcement".
	Type string `json:"type"`
	// NodeID identifies the sender; receivers drop messages whose NodeID
	// equals their own.
	NodeID string `json:"node_id"`
	// Timestamp is when the sender built the message.
	Timestamp time.Time `json:"timestamp"`
	// Term is the election term the message belongs to. Discovery
	// messages leave it at zero.
	Term int `json:"term"`
	// Data is the type-specific payload and is omitted when nil.
	Data interface{} `json:"data,omitempty"`
}
 | |
| 
 | |
| // ElectionManager handles admin election coordination
 | |
| type ElectionManager struct {
 | |
| 	ctx    context.Context
 | |
| 	cancel context.CancelFunc
 | |
| 	config *config.Config
 | |
| 	host   libp2p.Host
 | |
| 	pubsub *pubsub.PubSub
 | |
| 	nodeID string
 | |
| 
 | |
| 	// Election state
 | |
| 	mu            sync.RWMutex
 | |
| 	state         ElectionState
 | |
| 	currentTerm   int
 | |
| 	lastHeartbeat time.Time
 | |
| 	currentAdmin  string
 | |
| 	candidates    map[string]*AdminCandidate
 | |
| 	votes         map[string]string // voter -> candidate
 | |
| 
 | |
| 	// Timers and channels
 | |
| 	heartbeatTimer  *time.Timer
 | |
| 	discoveryTimer  *time.Timer
 | |
| 	electionTimer   *time.Timer
 | |
| 	electionTrigger chan ElectionTrigger
 | |
| 
 | |
| 	// Heartbeat management
 | |
| 	heartbeatManager *HeartbeatManager
 | |
| 
 | |
| 	// Callbacks
 | |
| 	onAdminChanged     func(oldAdmin, newAdmin string)
 | |
| 	onElectionComplete func(winner string)
 | |
| 
 | |
| 	// Stability window to prevent election churn (Medium-risk fix 2.1)
 | |
| 	lastElectionTime    time.Time
 | |
| 	electionStabilityWindow time.Duration
 | |
| 	leaderStabilityWindow   time.Duration
 | |
| 
 | |
| 	startTime time.Time
 | |
| }
 | |
| 
 | |
| // HeartbeatManager manages admin heartbeat lifecycle
 | |
| type HeartbeatManager struct {
 | |
| 	mu          sync.Mutex
 | |
| 	isRunning   bool
 | |
| 	stopCh      chan struct{}
 | |
| 	ticker      *time.Ticker
 | |
| 	electionMgr *ElectionManager
 | |
| 	logger      func(msg string, args ...interface{})
 | |
| }
 | |
| 
 | |
| // NewElectionManager creates a new election manager
 | |
| func NewElectionManager(
 | |
| 	ctx context.Context,
 | |
| 	cfg *config.Config,
 | |
| 	host libp2p.Host,
 | |
| 	ps *pubsub.PubSub,
 | |
| 	nodeID string,
 | |
| ) *ElectionManager {
 | |
| 	electionCtx, cancel := context.WithCancel(ctx)
 | |
| 
 | |
| 	em := &ElectionManager{
 | |
| 		ctx:             electionCtx,
 | |
| 		cancel:          cancel,
 | |
| 		config:          cfg,
 | |
| 		host:            host,
 | |
| 		pubsub:          ps,
 | |
| 		nodeID:          nodeID,
 | |
| 		state:           StateIdle,
 | |
| 		candidates:      make(map[string]*AdminCandidate),
 | |
| 		votes:           make(map[string]string),
 | |
| 		electionTrigger: make(chan ElectionTrigger, 10),
 | |
| 		startTime:       time.Now(),
 | |
| 
 | |
| 		// Initialize stability windows (as per WHOOSH issue #7)
 | |
| 		electionStabilityWindow: getElectionStabilityWindow(cfg),
 | |
| 		leaderStabilityWindow:   getLeaderStabilityWindow(cfg),
 | |
| 	}
 | |
| 
 | |
| 	// Initialize heartbeat manager
 | |
| 	em.heartbeatManager = &HeartbeatManager{
 | |
| 		electionMgr: em,
 | |
| 		logger: func(msg string, args ...interface{}) {
 | |
| 			log.Printf("[HEARTBEAT] "+msg, args...)
 | |
| 		},
 | |
| 	}
 | |
| 
 | |
| 	return em
 | |
| }
 | |
| 
 | |
| // Start begins the election management system
 | |
| func (em *ElectionManager) Start() error {
 | |
| 	log.Printf("🗳️ Starting election manager for node %s", em.nodeID)
 | |
| 
 | |
| 	if err := em.pubsub.SubscribeRawTopic(electionTopic, func(data []byte, _ peer.ID) {
 | |
| 		em.handleElectionMessage(data)
 | |
| 	}); err != nil {
 | |
| 		return fmt.Errorf("failed to subscribe to election messages: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	if err := em.pubsub.SubscribeRawTopic(adminHeartbeatTopic, func(data []byte, _ peer.ID) {
 | |
| 		em.handleAdminHeartbeat(data)
 | |
| 	}); err != nil {
 | |
| 		return fmt.Errorf("failed to subscribe to admin heartbeat: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	// Start discovery process
 | |
| 	log.Printf("🔍 About to start discovery loop goroutine...")
 | |
| 	go func() {
 | |
| 		log.Printf("🔍 Discovery loop goroutine started successfully")
 | |
| 		em.startDiscoveryLoop()
 | |
| 	}()
 | |
| 
 | |
| 	// Start election coordinator
 | |
| 	log.Printf("🗳️ About to start election coordinator goroutine...")
 | |
| 	go func() {
 | |
| 		log.Printf("🗳️ Election coordinator goroutine started successfully")
 | |
| 		em.electionCoordinator()
 | |
| 	}()
 | |
| 
 | |
| 	// Start heartbeat if this node is already admin at startup
 | |
| 	if em.IsCurrentAdmin() {
 | |
| 		go func() {
 | |
| 			// Slight delay to ensure everything is initialized
 | |
| 			time.Sleep(2 * time.Second)
 | |
| 			if err := em.heartbeatManager.StartHeartbeat(); err != nil {
 | |
| 				log.Printf("⚠️ Failed to start initial heartbeat: %v", err)
 | |
| 			}
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("✅ Election manager started")
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Stop shuts down the election manager
 | |
| func (em *ElectionManager) Stop() {
 | |
| 	log.Printf("🛑 Stopping election manager")
 | |
| 
 | |
| 	// Stop heartbeat first
 | |
| 	if em.heartbeatManager != nil {
 | |
| 		em.heartbeatManager.StopHeartbeat()
 | |
| 	}
 | |
| 
 | |
| 	em.cancel()
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	if em.heartbeatTimer != nil {
 | |
| 		em.heartbeatTimer.Stop()
 | |
| 	}
 | |
| 	if em.discoveryTimer != nil {
 | |
| 		em.discoveryTimer.Stop()
 | |
| 	}
 | |
| 	if em.electionTimer != nil {
 | |
| 		em.electionTimer.Stop()
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // TriggerElection manually triggers an election with stability window checks
 | |
| func (em *ElectionManager) TriggerElection(trigger ElectionTrigger) {
 | |
| 	// Check if election already in progress
 | |
| 	em.mu.RLock()
 | |
| 	currentState := em.state
 | |
| 	currentAdmin := em.currentAdmin
 | |
| 	lastElection := em.lastElectionTime
 | |
| 	em.mu.RUnlock()
 | |
| 
 | |
| 	if currentState != StateIdle {
 | |
| 		log.Printf("🗳️ Election already in progress (state: %s), ignoring trigger: %s", currentState, trigger)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Apply stability window to prevent election churn (WHOOSH issue #7)
 | |
| 	now := time.Now()
 | |
| 	if !lastElection.IsZero() {
 | |
| 		timeSinceElection := now.Sub(lastElection)
 | |
| 
 | |
| 		// If we have a current admin, check leader stability window
 | |
| 		if currentAdmin != "" && timeSinceElection < em.leaderStabilityWindow {
 | |
| 			log.Printf("⏳ Leader stability window active (%.1fs remaining), ignoring trigger: %s",
 | |
| 				(em.leaderStabilityWindow - timeSinceElection).Seconds(), trigger)
 | |
| 			return
 | |
| 		}
 | |
| 
 | |
| 		// General election stability window
 | |
| 		if timeSinceElection < em.electionStabilityWindow {
 | |
| 			log.Printf("⏳ Election stability window active (%.1fs remaining), ignoring trigger: %s",
 | |
| 				(em.electionStabilityWindow - timeSinceElection).Seconds(), trigger)
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	select {
 | |
| 	case em.electionTrigger <- trigger:
 | |
| 		log.Printf("🗳️ Election triggered: %s", trigger)
 | |
| 	default:
 | |
| 		log.Printf("⚠️ Election trigger buffer full, ignoring: %s", trigger)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // GetCurrentAdmin returns the current admin node ID
 | |
| func (em *ElectionManager) GetCurrentAdmin() string {
 | |
| 	em.mu.RLock()
 | |
| 	defer em.mu.RUnlock()
 | |
| 	return em.currentAdmin
 | |
| }
 | |
| 
 | |
| // IsCurrentAdmin checks if this node is the current admin
 | |
| func (em *ElectionManager) IsCurrentAdmin() bool {
 | |
| 	return em.GetCurrentAdmin() == em.nodeID
 | |
| }
 | |
| 
 | |
| // GetElectionState returns the current election state
 | |
| func (em *ElectionManager) GetElectionState() ElectionState {
 | |
| 	em.mu.RLock()
 | |
| 	defer em.mu.RUnlock()
 | |
| 	return em.state
 | |
| }
 | |
| 
 | |
| // SetCallbacks sets election event callbacks
 | |
| func (em *ElectionManager) SetCallbacks(
 | |
| 	onAdminChanged func(oldAdmin, newAdmin string),
 | |
| 	onElectionComplete func(winner string),
 | |
| ) {
 | |
| 	em.onAdminChanged = onAdminChanged
 | |
| 	em.onElectionComplete = onElectionComplete
 | |
| }
 | |
| 
 | |
| // GetHeartbeatStatus returns the current heartbeat status
 | |
| func (em *ElectionManager) GetHeartbeatStatus() map[string]interface{} {
 | |
| 	if em.heartbeatManager == nil {
 | |
| 		return map[string]interface{}{
 | |
| 			"error": "heartbeat manager not initialized",
 | |
| 		}
 | |
| 	}
 | |
| 	return em.heartbeatManager.GetHeartbeatStatus()
 | |
| }
 | |
| 
 | |
| // startDiscoveryLoop starts the admin discovery loop
 | |
| func (em *ElectionManager) startDiscoveryLoop() {
 | |
| 	defer func() {
 | |
| 		if r := recover(); r != nil {
 | |
| 			log.Printf("🔍 PANIC in discovery loop: %v", r)
 | |
| 		}
 | |
| 		log.Printf("🔍 Discovery loop goroutine exiting")
 | |
| 	}()
 | |
| 
 | |
| 	log.Printf("🔍 ENHANCED-DEBUG: Starting admin discovery loop with timeout: %v", em.config.Security.ElectionConfig.DiscoveryTimeout)
 | |
| 	log.Printf("🔍 ENHANCED-DEBUG: Context status: err=%v", em.ctx.Err())
 | |
| 	log.Printf("🔍 ENHANCED-DEBUG: Node ID: %s, Can be admin: %v", em.nodeID, em.canBeAdmin())
 | |
| 
 | |
| 	for {
 | |
| 		log.Printf("🔍 Discovery loop iteration starting, waiting for timeout...")
 | |
| 		log.Printf("🔍 Context status before select: err=%v", em.ctx.Err())
 | |
| 
 | |
| 		select {
 | |
| 		case <-em.ctx.Done():
 | |
| 			log.Printf("🔍 Discovery loop cancelled via context: %v", em.ctx.Err())
 | |
| 			return
 | |
| 		case <-time.After(em.config.Security.ElectionConfig.DiscoveryTimeout):
 | |
| 			log.Printf("🔍 Discovery timeout triggered! Calling performAdminDiscovery()...")
 | |
| 			em.performAdminDiscovery()
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // performAdminDiscovery attempts to discover existing admin
 | |
| func (em *ElectionManager) performAdminDiscovery() {
 | |
| 	em.mu.Lock()
 | |
| 	currentState := em.state
 | |
| 	lastHeartbeat := em.lastHeartbeat
 | |
| 	em.mu.Unlock()
 | |
| 
 | |
| 	log.Printf("🔍 Discovery check: state=%s, lastHeartbeat=%v, canAdmin=%v",
 | |
| 		currentState, lastHeartbeat, em.canBeAdmin())
 | |
| 
 | |
| 	// Only discover if we're idle or the heartbeat is stale
 | |
| 	if currentState != StateIdle {
 | |
| 		log.Printf("🔍 Skipping discovery - not in idle state (current: %s)", currentState)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Check if admin heartbeat has timed out
 | |
| 	if !lastHeartbeat.IsZero() && time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.HeartbeatTimeout {
 | |
| 		log.Printf("⚰️ Admin heartbeat timeout detected (last: %v)", lastHeartbeat)
 | |
| 		em.TriggerElection(TriggerHeartbeatTimeout)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// If we haven't heard from an admin recently, try to discover one
 | |
| 	timeSinceHeartbeat := time.Since(lastHeartbeat)
 | |
| 	discoveryThreshold := em.config.Security.ElectionConfig.DiscoveryTimeout / 2
 | |
| 
 | |
| 	log.Printf("🔍 Heartbeat check: isZero=%v, timeSince=%v, threshold=%v",
 | |
| 		lastHeartbeat.IsZero(), timeSinceHeartbeat, discoveryThreshold)
 | |
| 
 | |
| 	if lastHeartbeat.IsZero() || timeSinceHeartbeat > discoveryThreshold {
 | |
| 		log.Printf("🔍 Sending discovery request...")
 | |
| 		em.sendDiscoveryRequest()
 | |
| 
 | |
| 		// 🚨 CRITICAL FIX: If we have no admin and can become admin, trigger election after discovery timeout
 | |
| 		em.mu.Lock()
 | |
| 		currentAdmin := em.currentAdmin
 | |
| 		em.mu.Unlock()
 | |
| 
 | |
| 		if currentAdmin == "" && em.canBeAdmin() {
 | |
| 			log.Printf("🗳️ No admin discovered and we can be admin - scheduling election check")
 | |
| 			go func() {
 | |
| 				// Add randomization to prevent simultaneous elections from all nodes
 | |
| 				baseDelay := em.config.Security.ElectionConfig.DiscoveryTimeout * 2
 | |
| 				randomDelay := time.Duration(rand.Intn(int(em.config.Security.ElectionConfig.DiscoveryTimeout)))
 | |
| 				totalDelay := baseDelay + randomDelay
 | |
| 
 | |
| 				log.Printf("🗳️ Waiting %v before checking if election needed", totalDelay)
 | |
| 				time.Sleep(totalDelay)
 | |
| 
 | |
| 				// Check again if still no admin and no one else started election
 | |
| 				em.mu.RLock()
 | |
| 				stillNoAdmin := em.currentAdmin == ""
 | |
| 				stillIdle := em.state == StateIdle
 | |
| 				em.mu.RUnlock()
 | |
| 
 | |
| 				if stillNoAdmin && stillIdle && em.canBeAdmin() {
 | |
| 					log.Printf("🗳️ Election grace period expired with no admin - triggering election")
 | |
| 					em.TriggerElection(TriggerDiscoveryFailure)
 | |
| 				} else {
 | |
| 					log.Printf("🗳️ Election check: admin=%s, state=%s - skipping election", em.currentAdmin, em.state)
 | |
| 				}
 | |
| 			}()
 | |
| 		}
 | |
| 	} else {
 | |
| 		log.Printf("🔍 Discovery threshold not met - waiting")
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // sendDiscoveryRequest broadcasts admin discovery request
 | |
| func (em *ElectionManager) sendDiscoveryRequest() {
 | |
| 	em.mu.RLock()
 | |
| 	currentAdmin := em.currentAdmin
 | |
| 	em.mu.RUnlock()
 | |
| 
 | |
| 	// WHOAMI debug message
 | |
| 	if currentAdmin == "" {
 | |
| 		log.Printf("🤖 WHOAMI: I'm %s and I have no leader", em.nodeID)
 | |
| 	} else {
 | |
| 		log.Printf("🤖 WHOAMI: I'm %s and my leader is %s", em.nodeID, currentAdmin)
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("📡 Sending admin discovery request from node %s", em.nodeID)
 | |
| 
 | |
| 	discoveryMsg := ElectionMessage{
 | |
| 		Type:      "admin_discovery_request",
 | |
| 		NodeID:    em.nodeID,
 | |
| 		Timestamp: time.Now(),
 | |
| 	}
 | |
| 
 | |
| 	if err := em.publishElectionMessage(discoveryMsg); err != nil {
 | |
| 		log.Printf("❌ Failed to send admin discovery request: %v", err)
 | |
| 	} else {
 | |
| 		log.Printf("✅ Admin discovery request sent successfully")
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // electionCoordinator handles the main election logic
 | |
| func (em *ElectionManager) electionCoordinator() {
 | |
| 	log.Printf("🎯 Election coordinator started")
 | |
| 
 | |
| 	for {
 | |
| 		select {
 | |
| 		case <-em.ctx.Done():
 | |
| 			return
 | |
| 		case trigger := <-em.electionTrigger:
 | |
| 			em.handleElectionTrigger(trigger)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleElectionTrigger processes election triggers
 | |
| func (em *ElectionManager) handleElectionTrigger(trigger ElectionTrigger) {
 | |
| 	log.Printf("🔥 Processing election trigger: %s", trigger)
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	currentState := em.state
 | |
| 	em.mu.Unlock()
 | |
| 
 | |
| 	// Ignore triggers if we're already in an election
 | |
| 	if currentState != StateIdle {
 | |
| 		log.Printf("⏸️ Ignoring election trigger, current state: %s", currentState)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Begin election process
 | |
| 	em.beginElection(trigger)
 | |
| }
 | |
| 
 | |
| // beginElection starts a new election
 | |
| func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
 | |
| 	log.Printf("🗳️ Beginning election due to: %s", trigger)
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	em.state = StateElecting
 | |
| 	em.currentTerm++
 | |
| 	em.lastElectionTime = time.Now() // Record election timestamp for stability window
 | |
| 	term := em.currentTerm
 | |
| 	em.candidates = make(map[string]*AdminCandidate)
 | |
| 	em.votes = make(map[string]string)
 | |
| 	em.mu.Unlock()
 | |
| 
 | |
| 	// Announce candidacy if this node can be admin
 | |
| 	if em.canBeAdmin() {
 | |
| 		em.announceCandidacy(term)
 | |
| 	}
 | |
| 
 | |
| 	// Send election announcement
 | |
| 	electionMsg := ElectionMessage{
 | |
| 		Type:      "election_started",
 | |
| 		NodeID:    em.nodeID,
 | |
| 		Timestamp: time.Now(),
 | |
| 		Term:      term,
 | |
| 		Data: map[string]interface{}{
 | |
| 			"trigger": string(trigger),
 | |
| 		},
 | |
| 	}
 | |
| 
 | |
| 	if err := em.publishElectionMessage(electionMsg); err != nil {
 | |
| 		log.Printf("❌ Failed to announce election start: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// Start election timeout
 | |
| 	em.startElectionTimeout(term)
 | |
| }
 | |
| 
 | |
| // canBeAdmin checks if this node can become admin
 | |
| func (em *ElectionManager) canBeAdmin() bool {
 | |
| 	// Check if node has admin capabilities
 | |
| 	for _, cap := range em.config.Agent.Capabilities {
 | |
| 		if cap == "admin_election" || cap == "context_curation" || cap == "project_manager" {
 | |
| 			return true
 | |
| 		}
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| // announceCandidacy announces this node as an election candidate
 | |
| func (em *ElectionManager) announceCandidacy(term int) {
 | |
| 	uptime := time.Since(em.startTime)
 | |
| 
 | |
| 	candidate := &AdminCandidate{
 | |
| 		NodeID:       em.nodeID,
 | |
| 		PeerID:       em.host.ID(),
 | |
| 		Capabilities: em.config.Agent.Capabilities,
 | |
| 		Uptime:       uptime,
 | |
| 		Resources:    em.getResourceMetrics(),
 | |
| 		Experience:   uptime, // For now, use uptime as experience
 | |
| 		Metadata: map[string]interface{}{
 | |
| 			"specialization": em.config.Agent.Specialization,
 | |
| 			"models":         em.config.Agent.Models,
 | |
| 		},
 | |
| 	}
 | |
| 
 | |
| 	// Calculate candidate score
 | |
| 	candidate.Score = em.calculateCandidateScore(candidate)
 | |
| 
 | |
| 	candidacyMsg := ElectionMessage{
 | |
| 		Type:      "candidacy_announcement",
 | |
| 		NodeID:    em.nodeID,
 | |
| 		Timestamp: time.Now(),
 | |
| 		Term:      term,
 | |
| 		Data:      candidate,
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("📢 Announcing candidacy (score: %.2f)", candidate.Score)
 | |
| 
 | |
| 	if err := em.publishElectionMessage(candidacyMsg); err != nil {
 | |
| 		log.Printf("❌ Failed to announce candidacy: %v", err)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // getResourceMetrics collects current node resource metrics
 | |
| func (em *ElectionManager) getResourceMetrics() ResourceMetrics {
 | |
| 	// TODO: Implement actual resource collection
 | |
| 	// For now, return simulated values
 | |
| 	return ResourceMetrics{
 | |
| 		CPUUsage:       rand.Float64() * 0.5,     // 0-50% CPU
 | |
| 		MemoryUsage:    rand.Float64() * 0.7,     // 0-70% Memory
 | |
| 		DiskUsage:      rand.Float64() * 0.6,     // 0-60% Disk
 | |
| 		NetworkQuality: 0.8 + rand.Float64()*0.2, // 80-100% Network Quality
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // calculateCandidateScore calculates election score for a candidate
 | |
| func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) float64 {
 | |
| 	// TODO: Add LeadershipScoring to config.ElectionConfig
 | |
| 	// scoring := em.config.Security.ElectionConfig.LeadershipScoring
 | |
| 	// Default scoring weights handled inline
 | |
| 
 | |
| 	// Normalize metrics to 0-1 range
 | |
| 	uptimeScore := min(1.0, candidate.Uptime.Hours()/24.0) // Up to 24 hours gets full score
 | |
| 
 | |
| 	// Capability score - higher for admin/coordination capabilities
 | |
| 	capabilityScore := 0.0
 | |
| 	adminCapabilities := []string{"admin_election", "context_curation", "key_reconstruction", "semantic_analysis", "project_manager"}
 | |
| 	for _, cap := range candidate.Capabilities {
 | |
| 		for _, adminCap := range adminCapabilities {
 | |
| 			if cap == adminCap {
 | |
| 				weight := 0.25 // Default weight
 | |
| 				// Project manager capabilities get higher weight
 | |
| 				if adminCap == "project_manager" || adminCap == "context_curation" {
 | |
| 					weight = 0.35
 | |
| 				}
 | |
| 				capabilityScore += weight
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	capabilityScore = min(1.0, capabilityScore)
 | |
| 
 | |
| 	// Resource score - lower usage is better
 | |
| 	resourceScore := (1.0-candidate.Resources.CPUUsage)*0.3 +
 | |
| 		(1.0-candidate.Resources.MemoryUsage)*0.3 +
 | |
| 		(1.0-candidate.Resources.DiskUsage)*0.2 +
 | |
| 		candidate.Resources.NetworkQuality*0.2
 | |
| 
 | |
| 	experienceScore := min(1.0, candidate.Experience.Hours()/168.0) // Up to 1 week gets full score
 | |
| 
 | |
| 	// Weighted final score (using default weights)
 | |
| 	finalScore := uptimeScore*0.3 +
 | |
| 		capabilityScore*0.2 +
 | |
| 		resourceScore*0.2 +
 | |
| 		candidate.Resources.NetworkQuality*0.15 +
 | |
| 		experienceScore*0.15
 | |
| 
 | |
| 	return finalScore
 | |
| }
 | |
| 
 | |
| // startElectionTimeout starts the election timeout timer
 | |
| func (em *ElectionManager) startElectionTimeout(term int) {
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	if em.electionTimer != nil {
 | |
| 		em.electionTimer.Stop()
 | |
| 	}
 | |
| 
 | |
| 	em.electionTimer = time.AfterFunc(em.config.Security.ElectionConfig.ElectionTimeout, func() {
 | |
| 		em.completeElection(term)
 | |
| 	})
 | |
| }
 | |
| 
 | |
| // completeElection completes the election and announces winner
 | |
| func (em *ElectionManager) completeElection(term int) {
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	// Verify this is still the current term
 | |
| 	if term != em.currentTerm {
 | |
| 		log.Printf("⏰ Election timeout for old term %d, ignoring", term)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("⏰ Election timeout reached, tallying votes")
 | |
| 
 | |
| 	// Find the winning candidate
 | |
| 	winner := em.findElectionWinner()
 | |
| 	if winner == nil {
 | |
| 		log.Printf("❌ No winner found in election")
 | |
| 		em.state = StateIdle
 | |
| 		// Trigger another election after a delay
 | |
| 		go func() {
 | |
| 			time.Sleep(em.config.Security.ElectionConfig.DiscoveryBackoff)
 | |
| 			em.TriggerElection(TriggerDiscoveryFailure)
 | |
| 		}()
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("🏆 Election winner: %s (score: %.2f)", winner.NodeID, winner.Score)
 | |
| 
 | |
| 	// Update admin
 | |
| 	oldAdmin := em.currentAdmin
 | |
| 	em.currentAdmin = winner.NodeID
 | |
| 	em.state = StateComplete
 | |
| 
 | |
| 	// Announce the winner
 | |
| 	winnerMsg := ElectionMessage{
 | |
| 		Type:      "election_winner",
 | |
| 		NodeID:    em.nodeID,
 | |
| 		Timestamp: time.Now(),
 | |
| 		Term:      term,
 | |
| 		Data:      winner,
 | |
| 	}
 | |
| 
 | |
| 	em.mu.Unlock() // Unlock before publishing
 | |
| 
 | |
| 	if err := em.publishElectionMessage(winnerMsg); err != nil {
 | |
| 		log.Printf("❌ Failed to announce election winner: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// Handle heartbeat lifecycle based on admin change
 | |
| 	em.handleHeartbeatTransition(oldAdmin, winner.NodeID)
 | |
| 
 | |
| 	// Trigger callbacks
 | |
| 	if em.onAdminChanged != nil {
 | |
| 		em.onAdminChanged(oldAdmin, winner.NodeID)
 | |
| 	}
 | |
| 	if em.onElectionComplete != nil {
 | |
| 		em.onElectionComplete(winner.NodeID)
 | |
| 	}
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	em.state = StateIdle // Reset state for next election
 | |
| }
 | |
| 
 | |
| // findElectionWinner determines the election winner based on votes and scores
 | |
| func (em *ElectionManager) findElectionWinner() *AdminCandidate {
 | |
| 	if len(em.candidates) == 0 {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// Count votes for each candidate
 | |
| 	voteCounts := make(map[string]int)
 | |
| 	totalVotes := 0
 | |
| 
 | |
| 	// Initialize vote counts for all candidates
 | |
| 	for candidateID := range em.candidates {
 | |
| 		voteCounts[candidateID] = 0
 | |
| 	}
 | |
| 
 | |
| 	// Tally actual votes
 | |
| 	for _, candidateID := range em.votes {
 | |
| 		if _, exists := em.candidates[candidateID]; exists {
 | |
| 			voteCounts[candidateID]++
 | |
| 			totalVotes++
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// If no votes cast, fall back to highest scoring candidate
 | |
| 	if totalVotes == 0 {
 | |
| 		var winner *AdminCandidate
 | |
| 		highestScore := -1.0
 | |
| 
 | |
| 		for _, candidate := range em.candidates {
 | |
| 			if candidate.Score > highestScore {
 | |
| 				highestScore = candidate.Score
 | |
| 				winner = candidate
 | |
| 			}
 | |
| 		}
 | |
| 		return winner
 | |
| 	}
 | |
| 
 | |
| 	// Find candidate with most votes
 | |
| 	var winner *AdminCandidate
 | |
| 	maxVotes := -1
 | |
| 	highestScore := -1.0
 | |
| 
 | |
| 	for candidateID, voteCount := range voteCounts {
 | |
| 		candidate := em.candidates[candidateID]
 | |
| 		if voteCount > maxVotes || (voteCount == maxVotes && candidate.Score > highestScore) {
 | |
| 			maxVotes = voteCount
 | |
| 			highestScore = candidate.Score
 | |
| 			winner = candidate
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("🗳️ Election results: %d total votes, winner: %s with %d votes (score: %.2f)",
 | |
| 		totalVotes, winner.NodeID, maxVotes, winner.Score)
 | |
| 
 | |
| 	return winner
 | |
| }
 | |
| 
 | |
| // handleElectionMessage processes incoming election messages
 | |
| func (em *ElectionManager) handleElectionMessage(data []byte) {
 | |
| 	var msg ElectionMessage
 | |
| 	if err := json.Unmarshal(data, &msg); err != nil {
 | |
| 		log.Printf("❌ Failed to unmarshal election message: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Ignore messages from ourselves
 | |
| 	if msg.NodeID == em.nodeID {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	switch msg.Type {
 | |
| 	case "admin_discovery_request":
 | |
| 		em.handleAdminDiscoveryRequest(msg)
 | |
| 	case "admin_discovery_response":
 | |
| 		em.handleAdminDiscoveryResponse(msg)
 | |
| 	case "election_started":
 | |
| 		em.handleElectionStarted(msg)
 | |
| 	case "candidacy_announcement":
 | |
| 		em.handleCandidacyAnnouncement(msg)
 | |
| 	case "election_vote":
 | |
| 		em.handleElectionVote(msg)
 | |
| 	case "election_winner":
 | |
| 		em.handleElectionWinner(msg)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleAdminDiscoveryRequest responds to admin discovery requests
 | |
| func (em *ElectionManager) handleAdminDiscoveryRequest(msg ElectionMessage) {
 | |
| 	em.mu.RLock()
 | |
| 	currentAdmin := em.currentAdmin
 | |
| 	state := em.state
 | |
| 	em.mu.RUnlock()
 | |
| 
 | |
| 	log.Printf("📩 Received admin discovery request from %s (my leader: %s, state: %s)",
 | |
| 		msg.NodeID, currentAdmin, state)
 | |
| 
 | |
| 	// Only respond if we know who the current admin is and we're idle
 | |
| 	if currentAdmin != "" && state == StateIdle {
 | |
| 		responseMsg := ElectionMessage{
 | |
| 			Type:      "admin_discovery_response",
 | |
| 			NodeID:    em.nodeID,
 | |
| 			Timestamp: time.Now(),
 | |
| 			Data: map[string]interface{}{
 | |
| 				"current_admin": currentAdmin,
 | |
| 			},
 | |
| 		}
 | |
| 
 | |
| 		log.Printf("📤 Responding to discovery with admin: %s", currentAdmin)
 | |
| 		if err := em.publishElectionMessage(responseMsg); err != nil {
 | |
| 			log.Printf("❌ Failed to send admin discovery response: %v", err)
 | |
| 		} else {
 | |
| 			log.Printf("✅ Admin discovery response sent successfully")
 | |
| 		}
 | |
| 	} else {
 | |
| 		log.Printf("🔇 Not responding to discovery (admin=%s, state=%s)", currentAdmin, state)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleAdminDiscoveryResponse processes admin discovery responses
 | |
| func (em *ElectionManager) handleAdminDiscoveryResponse(msg ElectionMessage) {
 | |
| 	log.Printf("📥 Received admin discovery response from %s", msg.NodeID)
 | |
| 
 | |
| 	if data, ok := msg.Data.(map[string]interface{}); ok {
 | |
| 		if admin, ok := data["current_admin"].(string); ok && admin != "" {
 | |
| 			em.mu.Lock()
 | |
| 			oldAdmin := em.currentAdmin
 | |
| 			if em.currentAdmin == "" {
 | |
| 				log.Printf("📡 Discovered admin: %s (reported by %s)", admin, msg.NodeID)
 | |
| 				em.currentAdmin = admin
 | |
| 				em.lastHeartbeat = time.Now() // Set initial heartbeat
 | |
| 			} else if em.currentAdmin != admin {
 | |
| 				log.Printf("⚠️ Admin conflict: I know %s, but %s reports %s", em.currentAdmin, msg.NodeID, admin)
 | |
| 			} else {
 | |
| 				log.Printf("📡 Admin confirmed: %s (reported by %s)", admin, msg.NodeID)
 | |
| 			}
 | |
| 			em.mu.Unlock()
 | |
| 
 | |
| 			// Trigger callback if admin changed
 | |
| 			if oldAdmin != admin && em.onAdminChanged != nil {
 | |
| 				em.onAdminChanged(oldAdmin, admin)
 | |
| 			}
 | |
| 		}
 | |
| 	} else {
 | |
| 		log.Printf("❌ Invalid admin discovery response from %s", msg.NodeID)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleElectionStarted processes election start announcements
 | |
| func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) {
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	// If we receive an election start with a higher term, join the election
 | |
| 	if msg.Term > em.currentTerm {
 | |
| 		log.Printf("🔄 Joining election with term %d", msg.Term)
 | |
| 		em.currentTerm = msg.Term
 | |
| 		em.state = StateElecting
 | |
| 		em.candidates = make(map[string]*AdminCandidate)
 | |
| 		em.votes = make(map[string]string)
 | |
| 
 | |
| 		// Announce candidacy if eligible
 | |
| 		if em.canBeAdmin() {
 | |
| 			go em.announceCandidacy(msg.Term)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleCandidacyAnnouncement processes candidacy announcements
 | |
| func (em *ElectionManager) handleCandidacyAnnouncement(msg ElectionMessage) {
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	// Only process if it's for the current term
 | |
| 	if msg.Term != em.currentTerm {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Convert data to candidate struct
 | |
| 	candidateData, err := json.Marshal(msg.Data)
 | |
| 	if err != nil {
 | |
| 		log.Printf("❌ Failed to marshal candidate data: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	var candidate AdminCandidate
 | |
| 	if err := json.Unmarshal(candidateData, &candidate); err != nil {
 | |
| 		log.Printf("❌ Failed to unmarshal candidate: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	log.Printf("📝 Received candidacy from %s (score: %.2f)", candidate.NodeID, candidate.Score)
 | |
| 	em.candidates[candidate.NodeID] = &candidate
 | |
| }
 | |
| 
 | |
| // handleElectionVote processes election votes
 | |
| func (em *ElectionManager) handleElectionVote(msg ElectionMessage) {
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	// Extract vote data
 | |
| 	voteData, ok := msg.Data.(map[string]interface{})
 | |
| 	if !ok {
 | |
| 		log.Printf("❌ Invalid vote data format from %s", msg.NodeID)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	candidateID, ok := voteData["candidate"].(string)
 | |
| 	if !ok {
 | |
| 		log.Printf("❌ Invalid candidate ID in vote from %s", msg.NodeID)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Validate candidate exists
 | |
| 	if _, exists := em.candidates[candidateID]; !exists {
 | |
| 		log.Printf("❌ Vote for unknown candidate %s from %s", candidateID, msg.NodeID)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// Prevent duplicate voting
 | |
| 	if existingVote, exists := em.votes[msg.NodeID]; exists {
 | |
| 		log.Printf("⚠️ Node %s already voted for %s, updating to %s", msg.NodeID, existingVote, candidateID)
 | |
| 	}
 | |
| 
 | |
| 	// Record the vote
 | |
| 	em.votes[msg.NodeID] = candidateID
 | |
| 	log.Printf("🗳️ Recorded vote from %s for candidate %s", msg.NodeID, candidateID)
 | |
| }
 | |
| 
 | |
| // handleElectionWinner processes election winner announcements
 | |
| func (em *ElectionManager) handleElectionWinner(msg ElectionMessage) {
 | |
| 	candidateData, err := json.Marshal(msg.Data)
 | |
| 	if err != nil {
 | |
| 		log.Printf("❌ Failed to marshal winner data: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	var winner AdminCandidate
 | |
| 	if err := json.Unmarshal(candidateData, &winner); err != nil {
 | |
| 		log.Printf("❌ Failed to unmarshal winner: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	oldAdmin := em.currentAdmin
 | |
| 	em.currentAdmin = winner.NodeID
 | |
| 	em.state = StateIdle
 | |
| 	em.mu.Unlock()
 | |
| 
 | |
| 	log.Printf("👑 New admin elected: %s", winner.NodeID)
 | |
| 
 | |
| 	// Handle heartbeat lifecycle based on admin change
 | |
| 	em.handleHeartbeatTransition(oldAdmin, winner.NodeID)
 | |
| 
 | |
| 	// Trigger callback
 | |
| 	if em.onAdminChanged != nil {
 | |
| 		em.onAdminChanged(oldAdmin, winner.NodeID)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleHeartbeatTransition manages heartbeat start/stop on admin transitions
 | |
| func (em *ElectionManager) handleHeartbeatTransition(oldAdmin, newAdmin string) {
 | |
| 	// If we lost admin role, stop heartbeat
 | |
| 	if oldAdmin == em.nodeID && newAdmin != em.nodeID {
 | |
| 		log.Printf("🔄 Lost admin role, stopping heartbeat")
 | |
| 		if err := em.heartbeatManager.StopHeartbeat(); err != nil {
 | |
| 			log.Printf("⚠️ Error stopping heartbeat: %v", err)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// If we gained admin role, start heartbeat
 | |
| 	if newAdmin == em.nodeID && oldAdmin != em.nodeID {
 | |
| 		log.Printf("🔄 Gained admin role, starting heartbeat")
 | |
| 		// Start with slight delay to ensure election is fully settled
 | |
| 		go func() {
 | |
| 			time.Sleep(1 * time.Second)
 | |
| 			if err := em.heartbeatManager.StartHeartbeat(); err != nil {
 | |
| 				log.Printf("⚠️ Error starting heartbeat: %v", err)
 | |
| 			}
 | |
| 		}()
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // handleAdminHeartbeat processes admin heartbeat messages
 | |
| func (em *ElectionManager) handleAdminHeartbeat(data []byte) {
 | |
| 	var heartbeat struct {
 | |
| 		NodeID    string    `json:"node_id"`
 | |
| 		Timestamp time.Time `json:"timestamp"`
 | |
| 	}
 | |
| 
 | |
| 	if err := json.Unmarshal(data, &heartbeat); err != nil {
 | |
| 		log.Printf("❌ Failed to unmarshal heartbeat: %v", err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	em.mu.Lock()
 | |
| 	defer em.mu.Unlock()
 | |
| 
 | |
| 	// Update admin and heartbeat timestamp
 | |
| 	if em.currentAdmin == "" || em.currentAdmin == heartbeat.NodeID {
 | |
| 		em.currentAdmin = heartbeat.NodeID
 | |
| 		em.lastHeartbeat = heartbeat.Timestamp
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // publishElectionMessage publishes an election message
 | |
| func (em *ElectionManager) publishElectionMessage(msg ElectionMessage) error {
 | |
| 	data, err := json.Marshal(msg)
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("failed to marshal election message: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	return em.pubsub.PublishRaw(electionTopic, data)
 | |
| }
 | |
| 
 | |
| // SendAdminHeartbeat sends admin heartbeat (only if this node is admin)
 | |
| func (em *ElectionManager) SendAdminHeartbeat() error {
 | |
| 	if !em.IsCurrentAdmin() {
 | |
| 		return fmt.Errorf("not current admin")
 | |
| 	}
 | |
| 
 | |
| 	heartbeat := struct {
 | |
| 		NodeID    string    `json:"node_id"`
 | |
| 		Timestamp time.Time `json:"timestamp"`
 | |
| 	}{
 | |
| 		NodeID:    em.nodeID,
 | |
| 		Timestamp: time.Now(),
 | |
| 	}
 | |
| 
 | |
| 	data, err := json.Marshal(heartbeat)
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("failed to marshal heartbeat: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	return em.pubsub.PublishRaw(adminHeartbeatTopic, data)
 | |
| }
 | |
| 
 | |
// min returns the smaller of a and b. When neither compares less than the
// other (equal values, or either operand is NaN) b is returned.
func min(a, b float64) float64 {
	if !(a < b) {
		return b
	}
	return a
}
 | |
| 
 | |
| // HeartbeatManager methods
 | |
| 
 | |
| // NewHeartbeatManager creates a new heartbeat manager
 | |
| func NewHeartbeatManager(electionMgr *ElectionManager) *HeartbeatManager {
 | |
| 	return &HeartbeatManager{
 | |
| 		electionMgr: electionMgr,
 | |
| 		logger: func(msg string, args ...interface{}) {
 | |
| 			log.Printf("[HEARTBEAT] "+msg, args...)
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // StartHeartbeat begins heartbeat transmission
 | |
| func (hm *HeartbeatManager) StartHeartbeat() error {
 | |
| 	hm.mu.Lock()
 | |
| 	defer hm.mu.Unlock()
 | |
| 
 | |
| 	if hm.isRunning {
 | |
| 		hm.logger("Heartbeat already running")
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	if !hm.electionMgr.IsCurrentAdmin() {
 | |
| 		return fmt.Errorf("not admin, cannot start heartbeat")
 | |
| 	}
 | |
| 
 | |
| 	hm.logger("Starting admin heartbeat transmission")
 | |
| 
 | |
| 	hm.stopCh = make(chan struct{})
 | |
| 	interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2
 | |
| 	hm.ticker = time.NewTicker(interval)
 | |
| 	hm.isRunning = true
 | |
| 
 | |
| 	// Start heartbeat goroutine
 | |
| 	go hm.heartbeatLoop()
 | |
| 
 | |
| 	hm.logger("Admin heartbeat started (interval: %v)", interval)
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // StopHeartbeat stops heartbeat transmission
 | |
| func (hm *HeartbeatManager) StopHeartbeat() error {
 | |
| 	hm.mu.Lock()
 | |
| 	defer hm.mu.Unlock()
 | |
| 
 | |
| 	if !hm.isRunning {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	hm.logger("Stopping admin heartbeat transmission")
 | |
| 
 | |
| 	// Signal stop
 | |
| 	close(hm.stopCh)
 | |
| 
 | |
| 	// Stop ticker
 | |
| 	if hm.ticker != nil {
 | |
| 		hm.ticker.Stop()
 | |
| 		hm.ticker = nil
 | |
| 	}
 | |
| 
 | |
| 	hm.isRunning = false
 | |
| 	hm.logger("Admin heartbeat stopped")
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // IsRunning returns whether heartbeat is currently active
 | |
| func (hm *HeartbeatManager) IsRunning() bool {
 | |
| 	hm.mu.Lock()
 | |
| 	defer hm.mu.Unlock()
 | |
| 	return hm.isRunning
 | |
| }
 | |
| 
 | |
| // heartbeatLoop runs the heartbeat transmission loop
 | |
| func (hm *HeartbeatManager) heartbeatLoop() {
 | |
| 	defer func() {
 | |
| 		hm.mu.Lock()
 | |
| 		hm.isRunning = false
 | |
| 		hm.mu.Unlock()
 | |
| 		hm.logger("Heartbeat loop terminated")
 | |
| 	}()
 | |
| 
 | |
| 	for {
 | |
| 		select {
 | |
| 		case <-hm.ticker.C:
 | |
| 			// Only send heartbeat if still admin
 | |
| 			if hm.electionMgr.IsCurrentAdmin() {
 | |
| 				if err := hm.electionMgr.SendAdminHeartbeat(); err != nil {
 | |
| 					hm.logger("Failed to send heartbeat: %v", err)
 | |
| 				}
 | |
| 			} else {
 | |
| 				hm.logger("No longer admin, stopping heartbeat")
 | |
| 				return
 | |
| 			}
 | |
| 
 | |
| 		case <-hm.stopCh:
 | |
| 			hm.logger("Heartbeat stop signal received")
 | |
| 			return
 | |
| 
 | |
| 		case <-hm.electionMgr.ctx.Done():
 | |
| 			hm.logger("Election manager context cancelled")
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // GetHeartbeatStatus returns current heartbeat status
 | |
| func (hm *HeartbeatManager) GetHeartbeatStatus() map[string]interface{} {
 | |
| 	hm.mu.Lock()
 | |
| 	defer hm.mu.Unlock()
 | |
| 
 | |
| 	status := map[string]interface{}{
 | |
| 		"running":   hm.isRunning,
 | |
| 		"is_admin":  hm.electionMgr.IsCurrentAdmin(),
 | |
| 		"last_sent": time.Now(), // TODO: Track actual last sent time
 | |
| 	}
 | |
| 
 | |
| 	if hm.isRunning && hm.ticker != nil {
 | |
| 		// Calculate next heartbeat time (approximate)
 | |
| 		interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2
 | |
| 		status["interval"] = interval.String()
 | |
| 		status["next_heartbeat"] = time.Now().Add(interval)
 | |
| 	}
 | |
| 
 | |
| 	return status
 | |
| }
 | |
| 
 | |
| // Helper functions for stability window configuration
 | |
| 
 | |
| // getElectionStabilityWindow gets the minimum time between elections
 | |
| func getElectionStabilityWindow(cfg *config.Config) time.Duration {
 | |
| 	// Try to get from environment or use default
 | |
| 	if stability := os.Getenv("CHORUS_ELECTION_MIN_TERM"); stability != "" {
 | |
| 		if duration, err := time.ParseDuration(stability); err == nil {
 | |
| 			return duration
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Try to get from config structure if it exists
 | |
| 	if cfg.Security.ElectionConfig.DiscoveryTimeout > 0 {
 | |
| 		// Use double the discovery timeout as default stability window
 | |
| 		return cfg.Security.ElectionConfig.DiscoveryTimeout * 2
 | |
| 	}
 | |
| 
 | |
| 	// Default fallback
 | |
| 	return 30 * time.Second
 | |
| }
 | |
| 
 | |
| // getLeaderStabilityWindow gets the minimum time before challenging a healthy leader
 | |
| func getLeaderStabilityWindow(cfg *config.Config) time.Duration {
 | |
| 	// Try to get from environment or use default
 | |
| 	if stability := os.Getenv("CHORUS_LEADER_MIN_TERM"); stability != "" {
 | |
| 		if duration, err := time.ParseDuration(stability); err == nil {
 | |
| 			return duration
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Try to get from config structure if it exists
 | |
| 	if cfg.Security.ElectionConfig.HeartbeatTimeout > 0 {
 | |
| 		// Use 3x heartbeat timeout as default leader stability
 | |
| 		return cfg.Security.ElectionConfig.HeartbeatTimeout * 3
 | |
| 	}
 | |
| 
 | |
| 	// Default fallback
 | |
| 	return 45 * time.Second
 | |
| }
 |