Harden CHORUS security and messaging stack

This commit is contained in:
anthonyrawlins
2025-09-20 23:21:35 +10:00
parent 57751f277a
commit 1bb736c09a
25 changed files with 2793 additions and 2474 deletions

View File

@@ -19,8 +19,8 @@ import (
type ElectionTrigger string
const (
TriggerHeartbeatTimeout ElectionTrigger = "admin_heartbeat_timeout"
TriggerDiscoveryFailure ElectionTrigger = "no_admin_discovered"
TriggerHeartbeatTimeout ElectionTrigger = "admin_heartbeat_timeout"
TriggerDiscoveryFailure ElectionTrigger = "no_admin_discovered"
TriggerSplitBrain ElectionTrigger = "split_brain_detected"
TriggerQuorumRestored ElectionTrigger = "quorum_restored"
TriggerManual ElectionTrigger = "manual_trigger"
@@ -30,30 +30,35 @@ const (
type ElectionState string
const (
StateIdle ElectionState = "idle"
StateDiscovering ElectionState = "discovering"
StateElecting ElectionState = "electing"
electionTopic = "CHORUS/election/v1"
adminHeartbeatTopic = "CHORUS/admin/heartbeat/v1"
)
const (
StateIdle ElectionState = "idle"
StateDiscovering ElectionState = "discovering"
StateElecting ElectionState = "electing"
StateReconstructing ElectionState = "reconstructing_keys"
StateComplete ElectionState = "complete"
StateComplete ElectionState = "complete"
)
// AdminCandidate represents a node candidate for admin role
type AdminCandidate struct {
NodeID string `json:"node_id"`
PeerID peer.ID `json:"peer_id"`
Capabilities []string `json:"capabilities"`
Uptime time.Duration `json:"uptime"`
Resources ResourceMetrics `json:"resources"`
Experience time.Duration `json:"experience"`
Score float64 `json:"score"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
NodeID string `json:"node_id"`
PeerID peer.ID `json:"peer_id"`
Capabilities []string `json:"capabilities"`
Uptime time.Duration `json:"uptime"`
Resources ResourceMetrics `json:"resources"`
Experience time.Duration `json:"experience"`
Score float64 `json:"score"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
// ResourceMetrics holds node resource information for election scoring
type ResourceMetrics struct {
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage float64 `json:"memory_usage"`
DiskUsage float64 `json:"disk_usage"`
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage float64 `json:"memory_usage"`
DiskUsage float64 `json:"disk_usage"`
NetworkQuality float64 `json:"network_quality"`
}
@@ -68,46 +73,46 @@ type ElectionMessage struct {
// ElectionManager handles admin election coordination
type ElectionManager struct {
ctx context.Context
cancel context.CancelFunc
config *config.Config
host libp2p.Host
pubsub *pubsub.PubSub
nodeID string
ctx context.Context
cancel context.CancelFunc
config *config.Config
host libp2p.Host
pubsub *pubsub.PubSub
nodeID string
// Election state
mu sync.RWMutex
state ElectionState
currentTerm int
lastHeartbeat time.Time
currentAdmin string
candidates map[string]*AdminCandidate
votes map[string]string // voter -> candidate
mu sync.RWMutex
state ElectionState
currentTerm int
lastHeartbeat time.Time
currentAdmin string
candidates map[string]*AdminCandidate
votes map[string]string // voter -> candidate
// Timers and channels
heartbeatTimer *time.Timer
discoveryTimer *time.Timer
electionTimer *time.Timer
electionTrigger chan ElectionTrigger
heartbeatTimer *time.Timer
discoveryTimer *time.Timer
electionTimer *time.Timer
electionTrigger chan ElectionTrigger
// Heartbeat management
heartbeatManager *HeartbeatManager
heartbeatManager *HeartbeatManager
// Callbacks
onAdminChanged func(oldAdmin, newAdmin string)
onAdminChanged func(oldAdmin, newAdmin string)
onElectionComplete func(winner string)
startTime time.Time
}
// HeartbeatManager manages admin heartbeat lifecycle
type HeartbeatManager struct {
mu sync.Mutex
isRunning bool
stopCh chan struct{}
ticker *time.Ticker
electionMgr *ElectionManager
logger func(msg string, args ...interface{})
mu sync.Mutex
isRunning bool
stopCh chan struct{}
ticker *time.Ticker
electionMgr *ElectionManager
logger func(msg string, args ...interface{})
}
// NewElectionManager creates a new election manager
@@ -119,7 +124,7 @@ func NewElectionManager(
nodeID string,
) *ElectionManager {
electionCtx, cancel := context.WithCancel(ctx)
em := &ElectionManager{
ctx: electionCtx,
cancel: cancel,
@@ -133,7 +138,7 @@ func NewElectionManager(
electionTrigger: make(chan ElectionTrigger, 10),
startTime: time.Now(),
}
// Initialize heartbeat manager
em.heartbeatManager = &HeartbeatManager{
electionMgr: em,
@@ -141,29 +146,32 @@ func NewElectionManager(
log.Printf("[HEARTBEAT] "+msg, args...)
},
}
return em
}
// Start begins the election management system
func (em *ElectionManager) Start() error {
log.Printf("🗳️ Starting election manager for node %s", em.nodeID)
// TODO: Subscribe to election-related messages - pubsub interface needs update
// if err := em.pubsub.Subscribe("CHORUS/election/v1", em.handleElectionMessage); err != nil {
// return fmt.Errorf("failed to subscribe to election messages: %w", err)
// }
//
// if err := em.pubsub.Subscribe("CHORUS/admin/heartbeat/v1", em.handleAdminHeartbeat); err != nil {
// return fmt.Errorf("failed to subscribe to admin heartbeat: %w", err)
// }
if err := em.pubsub.SubscribeRawTopic(electionTopic, func(data []byte, _ peer.ID) {
em.handleElectionMessage(data)
}); err != nil {
return fmt.Errorf("failed to subscribe to election messages: %w", err)
}
if err := em.pubsub.SubscribeRawTopic(adminHeartbeatTopic, func(data []byte, _ peer.ID) {
em.handleAdminHeartbeat(data)
}); err != nil {
return fmt.Errorf("failed to subscribe to admin heartbeat: %w", err)
}
// Start discovery process
go em.startDiscoveryLoop()
// Start election coordinator
go em.electionCoordinator()
// Start heartbeat if this node is already admin at startup
if em.IsCurrentAdmin() {
go func() {
@@ -174,7 +182,7 @@ func (em *ElectionManager) Start() error {
}
}()
}
log.Printf("✅ Election manager started")
return nil
}
@@ -182,17 +190,17 @@ func (em *ElectionManager) Start() error {
// Stop shuts down the election manager
func (em *ElectionManager) Stop() {
log.Printf("🛑 Stopping election manager")
// Stop heartbeat first
if em.heartbeatManager != nil {
em.heartbeatManager.StopHeartbeat()
}
em.cancel()
em.mu.Lock()
defer em.mu.Unlock()
if em.heartbeatTimer != nil {
em.heartbeatTimer.Stop()
}
@@ -255,7 +263,7 @@ func (em *ElectionManager) GetHeartbeatStatus() map[string]interface{} {
// startDiscoveryLoop starts the admin discovery loop
func (em *ElectionManager) startDiscoveryLoop() {
log.Printf("🔍 Starting admin discovery loop")
for {
select {
case <-em.ctx.Done():
@@ -272,19 +280,19 @@ func (em *ElectionManager) performAdminDiscovery() {
currentState := em.state
lastHeartbeat := em.lastHeartbeat
em.mu.Unlock()
// Only discover if we're idle or the heartbeat is stale
if currentState != StateIdle {
return
}
// Check if admin heartbeat has timed out
if !lastHeartbeat.IsZero() && time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.HeartbeatTimeout {
log.Printf("⚰️ Admin heartbeat timeout detected (last: %v)", lastHeartbeat)
em.TriggerElection(TriggerHeartbeatTimeout)
return
}
// If we haven't heard from an admin recently, try to discover one
if lastHeartbeat.IsZero() || time.Since(lastHeartbeat) > em.config.Security.ElectionConfig.DiscoveryTimeout/2 {
em.sendDiscoveryRequest()
@@ -298,7 +306,7 @@ func (em *ElectionManager) sendDiscoveryRequest() {
NodeID: em.nodeID,
Timestamp: time.Now(),
}
if err := em.publishElectionMessage(discoveryMsg); err != nil {
log.Printf("❌ Failed to send admin discovery request: %v", err)
}
@@ -307,7 +315,7 @@ func (em *ElectionManager) sendDiscoveryRequest() {
// electionCoordinator handles the main election logic
func (em *ElectionManager) electionCoordinator() {
log.Printf("🎯 Election coordinator started")
for {
select {
case <-em.ctx.Done():
@@ -321,17 +329,17 @@ func (em *ElectionManager) electionCoordinator() {
// handleElectionTrigger reacts to a single election trigger. A new election is
// begun only while the manager's state is StateIdle; in any other state the
// trigger is logged and dropped.
func (em *ElectionManager) handleElectionTrigger(trigger ElectionTrigger) {
	log.Printf("🔥 Processing election trigger: %s", trigger)

	// Snapshot the state and release the lock right away: beginElection
	// acquires em.mu itself.
	em.mu.Lock()
	state := em.state
	em.mu.Unlock()

	if state == StateIdle {
		em.beginElection(trigger)
		return
	}

	// The manager is not idle, so this trigger is not acted upon.
	log.Printf("⏸️ Ignoring election trigger, current state: %s", state)
}
@@ -339,7 +347,7 @@ func (em *ElectionManager) handleElectionTrigger(trigger ElectionTrigger) {
// beginElection starts a new election
func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
log.Printf("🗳️ Beginning election due to: %s", trigger)
em.mu.Lock()
em.state = StateElecting
em.currentTerm++
@@ -347,12 +355,12 @@ func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
em.candidates = make(map[string]*AdminCandidate)
em.votes = make(map[string]string)
em.mu.Unlock()
// Announce candidacy if this node can be admin
if em.canBeAdmin() {
em.announceCandidacy(term)
}
// Send election announcement
electionMsg := ElectionMessage{
Type: "election_started",
@@ -363,11 +371,11 @@ func (em *ElectionManager) beginElection(trigger ElectionTrigger) {
"trigger": string(trigger),
},
}
if err := em.publishElectionMessage(electionMsg); err != nil {
log.Printf("❌ Failed to announce election start: %v", err)
}
// Start election timeout
em.startElectionTimeout(term)
}
@@ -386,7 +394,7 @@ func (em *ElectionManager) canBeAdmin() bool {
// announceCandidacy announces this node as an election candidate
func (em *ElectionManager) announceCandidacy(term int) {
uptime := time.Since(em.startTime)
candidate := &AdminCandidate{
NodeID: em.nodeID,
PeerID: em.host.ID(),
@@ -396,13 +404,13 @@ func (em *ElectionManager) announceCandidacy(term int) {
Experience: uptime, // For now, use uptime as experience
Metadata: map[string]interface{}{
"specialization": em.config.Agent.Specialization,
"models": em.config.Agent.Models,
"models": em.config.Agent.Models,
},
}
// Calculate candidate score
candidate.Score = em.calculateCandidateScore(candidate)
candidacyMsg := ElectionMessage{
Type: "candidacy_announcement",
NodeID: em.nodeID,
@@ -410,9 +418,9 @@ func (em *ElectionManager) announceCandidacy(term int) {
Term: term,
Data: candidate,
}
log.Printf("📢 Announcing candidacy (score: %.2f)", candidate.Score)
if err := em.publishElectionMessage(candidacyMsg); err != nil {
log.Printf("❌ Failed to announce candidacy: %v", err)
}
@@ -423,9 +431,9 @@ func (em *ElectionManager) getResourceMetrics() ResourceMetrics {
// TODO: Implement actual resource collection
// For now, return simulated values
return ResourceMetrics{
CPUUsage: rand.Float64() * 0.5, // 0-50% CPU
MemoryUsage: rand.Float64() * 0.7, // 0-70% Memory
DiskUsage: rand.Float64() * 0.6, // 0-60% Disk
CPUUsage: rand.Float64() * 0.5, // 0-50% CPU
MemoryUsage: rand.Float64() * 0.7, // 0-70% Memory
DiskUsage: rand.Float64() * 0.6, // 0-60% Disk
NetworkQuality: 0.8 + rand.Float64()*0.2, // 80-100% Network Quality
}
}
@@ -435,10 +443,10 @@ func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) fl
// TODO: Add LeadershipScoring to config.ElectionConfig
// scoring := em.config.Security.ElectionConfig.LeadershipScoring
// Default scoring weights handled inline
// Normalize metrics to 0-1 range
uptimeScore := min(1.0, candidate.Uptime.Hours()/24.0) // Up to 24 hours gets full score
// Capability score - higher for admin/coordination capabilities
capabilityScore := 0.0
adminCapabilities := []string{"admin_election", "context_curation", "key_reconstruction", "semantic_analysis", "project_manager"}
@@ -455,22 +463,22 @@ func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) fl
}
}
capabilityScore = min(1.0, capabilityScore)
// Resource score - lower usage is better
resourceScore := (1.0 - candidate.Resources.CPUUsage) * 0.3 +
(1.0 - candidate.Resources.MemoryUsage) * 0.3 +
(1.0 - candidate.Resources.DiskUsage) * 0.2 +
candidate.Resources.NetworkQuality * 0.2
resourceScore := (1.0-candidate.Resources.CPUUsage)*0.3 +
(1.0-candidate.Resources.MemoryUsage)*0.3 +
(1.0-candidate.Resources.DiskUsage)*0.2 +
candidate.Resources.NetworkQuality*0.2
experienceScore := min(1.0, candidate.Experience.Hours()/168.0) // Up to 1 week gets full score
// Weighted final score (using default weights)
finalScore := uptimeScore*0.3 +
capabilityScore*0.2 +
resourceScore*0.2 +
candidate.Resources.NetworkQuality*0.15 +
experienceScore*0.15
return finalScore
}
@@ -478,11 +486,11 @@ func (em *ElectionManager) calculateCandidateScore(candidate *AdminCandidate) fl
func (em *ElectionManager) startElectionTimeout(term int) {
em.mu.Lock()
defer em.mu.Unlock()
if em.electionTimer != nil {
em.electionTimer.Stop()
}
em.electionTimer = time.AfterFunc(em.config.Security.ElectionConfig.ElectionTimeout, func() {
em.completeElection(term)
})
@@ -492,15 +500,15 @@ func (em *ElectionManager) startElectionTimeout(term int) {
func (em *ElectionManager) completeElection(term int) {
em.mu.Lock()
defer em.mu.Unlock()
// Verify this is still the current term
if term != em.currentTerm {
log.Printf("⏰ Election timeout for old term %d, ignoring", term)
return
}
log.Printf("⏰ Election timeout reached, tallying votes")
// Find the winning candidate
winner := em.findElectionWinner()
if winner == nil {
@@ -513,14 +521,14 @@ func (em *ElectionManager) completeElection(term int) {
}()
return
}
log.Printf("🏆 Election winner: %s (score: %.2f)", winner.NodeID, winner.Score)
// Update admin
oldAdmin := em.currentAdmin
em.currentAdmin = winner.NodeID
em.state = StateComplete
// Announce the winner
winnerMsg := ElectionMessage{
Type: "election_winner",
@@ -529,16 +537,16 @@ func (em *ElectionManager) completeElection(term int) {
Term: term,
Data: winner,
}
em.mu.Unlock() // Unlock before publishing
if err := em.publishElectionMessage(winnerMsg); err != nil {
log.Printf("❌ Failed to announce election winner: %v", err)
}
// Handle heartbeat lifecycle based on admin change
em.handleHeartbeatTransition(oldAdmin, winner.NodeID)
// Trigger callbacks
if em.onAdminChanged != nil {
em.onAdminChanged(oldAdmin, winner.NodeID)
@@ -546,7 +554,7 @@ func (em *ElectionManager) completeElection(term int) {
if em.onElectionComplete != nil {
em.onElectionComplete(winner.NodeID)
}
em.mu.Lock()
em.state = StateIdle // Reset state for next election
}
@@ -556,16 +564,16 @@ func (em *ElectionManager) findElectionWinner() *AdminCandidate {
if len(em.candidates) == 0 {
return nil
}
// Count votes for each candidate
voteCounts := make(map[string]int)
totalVotes := 0
// Initialize vote counts for all candidates
for candidateID := range em.candidates {
voteCounts[candidateID] = 0
}
// Tally actual votes
for _, candidateID := range em.votes {
if _, exists := em.candidates[candidateID]; exists {
@@ -573,12 +581,12 @@ func (em *ElectionManager) findElectionWinner() *AdminCandidate {
totalVotes++
}
}
// If no votes cast, fall back to highest scoring candidate
if totalVotes == 0 {
var winner *AdminCandidate
highestScore := -1.0
for _, candidate := range em.candidates {
if candidate.Score > highestScore {
highestScore = candidate.Score
@@ -587,12 +595,12 @@ func (em *ElectionManager) findElectionWinner() *AdminCandidate {
}
return winner
}
// Find candidate with most votes
var winner *AdminCandidate
maxVotes := -1
highestScore := -1.0
for candidateID, voteCount := range voteCounts {
candidate := em.candidates[candidateID]
if voteCount > maxVotes || (voteCount == maxVotes && candidate.Score > highestScore) {
@@ -601,10 +609,10 @@ func (em *ElectionManager) findElectionWinner() *AdminCandidate {
winner = candidate
}
}
log.Printf("🗳️ Election results: %d total votes, winner: %s with %d votes (score: %.2f)",
log.Printf("🗳️ Election results: %d total votes, winner: %s with %d votes (score: %.2f)",
totalVotes, winner.NodeID, maxVotes, winner.Score)
return winner
}
@@ -615,12 +623,12 @@ func (em *ElectionManager) handleElectionMessage(data []byte) {
log.Printf("❌ Failed to unmarshal election message: %v", err)
return
}
// Ignore messages from ourselves
if msg.NodeID == em.nodeID {
return
}
switch msg.Type {
case "admin_discovery_request":
em.handleAdminDiscoveryRequest(msg)
@@ -643,7 +651,7 @@ func (em *ElectionManager) handleAdminDiscoveryRequest(msg ElectionMessage) {
currentAdmin := em.currentAdmin
state := em.state
em.mu.RUnlock()
// Only respond if we know who the current admin is and we're idle
if currentAdmin != "" && state == StateIdle {
responseMsg := ElectionMessage{
@@ -654,7 +662,7 @@ func (em *ElectionManager) handleAdminDiscoveryRequest(msg ElectionMessage) {
"current_admin": currentAdmin,
},
}
if err := em.publishElectionMessage(responseMsg); err != nil {
log.Printf("❌ Failed to send admin discovery response: %v", err)
}
@@ -679,7 +687,7 @@ func (em *ElectionManager) handleAdminDiscoveryResponse(msg ElectionMessage) {
func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) {
em.mu.Lock()
defer em.mu.Unlock()
// If we receive an election start with a higher term, join the election
if msg.Term > em.currentTerm {
log.Printf("🔄 Joining election with term %d", msg.Term)
@@ -687,7 +695,7 @@ func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) {
em.state = StateElecting
em.candidates = make(map[string]*AdminCandidate)
em.votes = make(map[string]string)
// Announce candidacy if eligible
if em.canBeAdmin() {
go em.announceCandidacy(msg.Term)
@@ -699,25 +707,25 @@ func (em *ElectionManager) handleElectionStarted(msg ElectionMessage) {
// handleCandidacyAnnouncement records a candidate announced by a peer for the
// current election term.
//
// The announcement is dropped when:
//   - its term differs from the manager's current term,
//   - its payload cannot be decoded into an AdminCandidate, or
//   - the decoded candidate's NodeID is empty or differs from the NodeID of
//     the message that carried it.
//
// The last check keeps a peer from registering, or overwriting, a candidate
// entry on behalf of a different node.
func (em *ElectionManager) handleCandidacyAnnouncement(msg ElectionMessage) {
	em.mu.Lock()
	defer em.mu.Unlock()

	// Only process if it's for the current term
	if msg.Term != em.currentTerm {
		return
	}

	// msg.Data is not statically typed as a candidate, so round-trip it
	// through JSON to obtain a typed AdminCandidate.
	candidateData, err := json.Marshal(msg.Data)
	if err != nil {
		log.Printf("❌ Failed to marshal candidate data: %v", err)
		return
	}

	var candidate AdminCandidate
	if err := json.Unmarshal(candidateData, &candidate); err != nil {
		log.Printf("❌ Failed to unmarshal candidate: %v", err)
		return
	}

	// A node may only announce its own candidacy: the candidate is keyed by
	// its NodeID below, so a mismatched ID would let one peer plant or
	// replace an entry belonging to another node.
	if candidate.NodeID == "" || candidate.NodeID != msg.NodeID {
		log.Printf("❌ Rejecting candidacy from %s: candidate node ID %q does not match sender", msg.NodeID, candidate.NodeID)
		return
	}

	log.Printf("📝 Received candidacy from %s (score: %.2f)", candidate.NodeID, candidate.Score)
	em.candidates[candidate.NodeID] = &candidate
}
@@ -726,31 +734,31 @@ func (em *ElectionManager) handleCandidacyAnnouncement(msg ElectionMessage) {
func (em *ElectionManager) handleElectionVote(msg ElectionMessage) {
em.mu.Lock()
defer em.mu.Unlock()
// Extract vote data
voteData, ok := msg.Data.(map[string]interface{})
if !ok {
log.Printf("❌ Invalid vote data format from %s", msg.NodeID)
return
}
candidateID, ok := voteData["candidate"].(string)
if !ok {
log.Printf("❌ Invalid candidate ID in vote from %s", msg.NodeID)
return
}
// Validate candidate exists
if _, exists := em.candidates[candidateID]; !exists {
log.Printf("❌ Vote for unknown candidate %s from %s", candidateID, msg.NodeID)
return
}
// Prevent duplicate voting
if existingVote, exists := em.votes[msg.NodeID]; exists {
log.Printf("⚠️ Node %s already voted for %s, updating to %s", msg.NodeID, existingVote, candidateID)
}
// Record the vote
em.votes[msg.NodeID] = candidateID
log.Printf("🗳️ Recorded vote from %s for candidate %s", msg.NodeID, candidateID)
@@ -763,24 +771,24 @@ func (em *ElectionManager) handleElectionWinner(msg ElectionMessage) {
log.Printf("❌ Failed to marshal winner data: %v", err)
return
}
var winner AdminCandidate
if err := json.Unmarshal(candidateData, &winner); err != nil {
log.Printf("❌ Failed to unmarshal winner: %v", err)
return
}
em.mu.Lock()
oldAdmin := em.currentAdmin
em.currentAdmin = winner.NodeID
em.state = StateIdle
em.mu.Unlock()
log.Printf("👑 New admin elected: %s", winner.NodeID)
// Handle heartbeat lifecycle based on admin change
em.handleHeartbeatTransition(oldAdmin, winner.NodeID)
// Trigger callback
if em.onAdminChanged != nil {
em.onAdminChanged(oldAdmin, winner.NodeID)
@@ -796,7 +804,7 @@ func (em *ElectionManager) handleHeartbeatTransition(oldAdmin, newAdmin string)
log.Printf("⚠️ Error stopping heartbeat: %v", err)
}
}
// If we gained admin role, start heartbeat
if newAdmin == em.nodeID && oldAdmin != em.nodeID {
log.Printf("🔄 Gained admin role, starting heartbeat")
@@ -816,15 +824,15 @@ func (em *ElectionManager) handleAdminHeartbeat(data []byte) {
NodeID string `json:"node_id"`
Timestamp time.Time `json:"timestamp"`
}
if err := json.Unmarshal(data, &heartbeat); err != nil {
log.Printf("❌ Failed to unmarshal heartbeat: %v", err)
return
}
em.mu.Lock()
defer em.mu.Unlock()
// Update admin and heartbeat timestamp
if em.currentAdmin == "" || em.currentAdmin == heartbeat.NodeID {
em.currentAdmin = heartbeat.NodeID
@@ -838,11 +846,8 @@ func (em *ElectionManager) publishElectionMessage(msg ElectionMessage) error {
if err != nil {
return fmt.Errorf("failed to marshal election message: %w", err)
}
// TODO: Fix pubsub interface
// return em.pubsub.Publish("CHORUS/election/v1", data)
_ = data // Avoid unused variable
return nil
return em.pubsub.PublishRaw(electionTopic, data)
}
// SendAdminHeartbeat sends admin heartbeat (only if this node is admin)
@@ -850,7 +855,7 @@ func (em *ElectionManager) SendAdminHeartbeat() error {
if !em.IsCurrentAdmin() {
return fmt.Errorf("not current admin")
}
heartbeat := struct {
NodeID string `json:"node_id"`
Timestamp time.Time `json:"timestamp"`
@@ -858,16 +863,13 @@ func (em *ElectionManager) SendAdminHeartbeat() error {
NodeID: em.nodeID,
Timestamp: time.Now(),
}
data, err := json.Marshal(heartbeat)
if err != nil {
return fmt.Errorf("failed to marshal heartbeat: %w", err)
}
// TODO: Fix pubsub interface
// return em.pubsub.Publish("CHORUS/admin/heartbeat/v1", data)
_ = data // Avoid unused variable
return nil
return em.pubsub.PublishRaw(adminHeartbeatTopic, data)
}
// min returns the minimum of two float64 values
@@ -894,26 +896,26 @@ func NewHeartbeatManager(electionMgr *ElectionManager) *HeartbeatManager {
// StartHeartbeat begins periodic admin heartbeat transmission.
//
// It returns nil without doing anything when the heartbeat is already
// running. It returns an error when this node is not the current admin, or
// when the configured HeartbeatTimeout yields a non-positive heartbeat
// interval (time.NewTicker panics on a non-positive duration, so the value is
// validated before the ticker is created).
//
// On success a stop channel and ticker are created, the manager is marked as
// running, and heartbeatLoop is started in its own goroutine.
func (hm *HeartbeatManager) StartHeartbeat() error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if hm.isRunning {
		hm.logger("Heartbeat already running")
		return nil
	}

	if !hm.electionMgr.IsCurrentAdmin() {
		return fmt.Errorf("not admin, cannot start heartbeat")
	}

	// Heartbeats are sent at half the timeout. Validate before touching any
	// state so a bad configuration leaves the manager stopped and unchanged.
	interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2
	if interval <= 0 {
		return fmt.Errorf("invalid heartbeat interval %v: heartbeat timeout must be positive", interval)
	}

	hm.logger("Starting admin heartbeat transmission")

	hm.stopCh = make(chan struct{})
	hm.ticker = time.NewTicker(interval)
	hm.isRunning = true

	// Start heartbeat goroutine
	go hm.heartbeatLoop()

	hm.logger("Admin heartbeat started (interval: %v)", interval)
	return nil
}
@@ -922,22 +924,22 @@ func (hm *HeartbeatManager) StartHeartbeat() error {
func (hm *HeartbeatManager) StopHeartbeat() error {
hm.mu.Lock()
defer hm.mu.Unlock()
if !hm.isRunning {
return nil
}
hm.logger("Stopping admin heartbeat transmission")
// Signal stop
close(hm.stopCh)
// Stop ticker
if hm.ticker != nil {
hm.ticker.Stop()
hm.ticker = nil
}
hm.isRunning = false
hm.logger("Admin heartbeat stopped")
return nil
@@ -958,7 +960,7 @@ func (hm *HeartbeatManager) heartbeatLoop() {
hm.mu.Unlock()
hm.logger("Heartbeat loop terminated")
}()
for {
select {
case <-hm.ticker.C:
@@ -971,11 +973,11 @@ func (hm *HeartbeatManager) heartbeatLoop() {
hm.logger("No longer admin, stopping heartbeat")
return
}
case <-hm.stopCh:
hm.logger("Heartbeat stop signal received")
return
case <-hm.electionMgr.ctx.Done():
hm.logger("Election manager context cancelled")
return
@@ -987,19 +989,19 @@ func (hm *HeartbeatManager) heartbeatLoop() {
func (hm *HeartbeatManager) GetHeartbeatStatus() map[string]interface{} {
hm.mu.Lock()
defer hm.mu.Unlock()
status := map[string]interface{}{
"running": hm.isRunning,
"is_admin": hm.electionMgr.IsCurrentAdmin(),
"last_sent": time.Now(), // TODO: Track actual last sent time
"running": hm.isRunning,
"is_admin": hm.electionMgr.IsCurrentAdmin(),
"last_sent": time.Now(), // TODO: Track actual last sent time
}
if hm.isRunning && hm.ticker != nil {
// Calculate next heartbeat time (approximate)
interval := hm.electionMgr.config.Security.ElectionConfig.HeartbeatTimeout / 2
status["interval"] = interval.String()
status["next_heartbeat"] = time.Now().Add(interval)
}
return status
}
}

View File

@@ -2,451 +2,185 @@ package election
import (
"context"
"encoding/json"
"testing"
"time"
"chorus/pkg/config"
pubsubpkg "chorus/pubsub"
libp2p "github.com/libp2p/go-libp2p"
)
func TestElectionManager_NewElectionManager(t *testing.T) {
// newTestElectionManager wires a real libp2p host and PubSub instance so the
// election manager exercises the same code paths used in production.
func newTestElectionManager(t *testing.T) *ElectionManager {
t.Helper()
ctx, cancel := context.WithCancel(context.Background())
host, err := libp2p.New(libp2p.ListenAddrStrings("/ip4/127.0.0.1/tcp/0"))
if err != nil {
cancel()
t.Fatalf("failed to create libp2p host: %v", err)
}
ps, err := pubsubpkg.NewPubSub(ctx, host, "", "")
if err != nil {
host.Close()
cancel()
t.Fatalf("failed to create pubsub: %v", err)
}
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
ID: host.ID().String(),
Role: "context_admin",
Capabilities: []string{"admin_election", "context_curation"},
Models: []string{"meta/llama-3.1-8b-instruct"},
Specialization: "coordination",
},
Security: config.SecurityConfig{},
}
em := NewElectionManager(cfg)
if em == nil {
t.Fatal("Expected NewElectionManager to return non-nil manager")
}
em := NewElectionManager(ctx, cfg, host, ps, host.ID().String())
if em.nodeID != "test-node" {
t.Errorf("Expected nodeID to be 'test-node', got %s", em.nodeID)
}
t.Cleanup(func() {
em.Stop()
ps.Close()
host.Close()
cancel()
})
return em
}
func TestNewElectionManagerInitialState(t *testing.T) {
em := newTestElectionManager(t)
if em.state != StateIdle {
t.Errorf("Expected initial state to be StateIdle, got %v", em.state)
t.Fatalf("expected initial state %q, got %q", StateIdle, em.state)
}
if em.currentTerm != 0 {
t.Fatalf("expected initial term 0, got %d", em.currentTerm)
}
if em.nodeID == "" {
t.Fatal("expected nodeID to be populated")
}
}
func TestElectionManager_StartElection(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
func TestElectionManagerCanBeAdmin(t *testing.T) {
em := newTestElectionManager(t)
if !em.canBeAdmin() {
t.Fatal("expected node to qualify for admin election")
}
em := NewElectionManager(cfg)
// Start election
err := em.StartElection()
if err != nil {
t.Fatalf("Failed to start election: %v", err)
}
// Verify state changed
if em.state != StateCandidate {
t.Errorf("Expected state to be StateCandidate after starting election, got %v", em.state)
}
// Verify we added ourselves as a candidate
em.mu.RLock()
candidate, exists := em.candidates[em.nodeID]
em.mu.RUnlock()
if !exists {
t.Error("Expected to find ourselves as a candidate after starting election")
}
if candidate.NodeID != em.nodeID {
t.Errorf("Expected candidate NodeID to be %s, got %s", em.nodeID, candidate.NodeID)
em.config.Agent.Capabilities = []string{"runtime_support"}
if em.canBeAdmin() {
t.Fatal("expected node without admin capabilities to be ineligible")
}
}
func TestElectionManager_Vote(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Add a candidate first
candidate := &AdminCandidate{
NodeID: "candidate-1",
Term: 1,
Score: 0.8,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
}
em.mu.Lock()
em.candidates["candidate-1"] = candidate
em.mu.Unlock()
// Vote for the candidate
err := em.Vote("candidate-1")
if err != nil {
t.Fatalf("Failed to vote: %v", err)
}
// Verify vote was recorded
em.mu.RLock()
vote, exists := em.votes[em.nodeID]
em.mu.RUnlock()
if !exists {
t.Error("Expected to find our vote after voting")
}
if vote != "candidate-1" {
t.Errorf("Expected vote to be for 'candidate-1', got %s", vote)
}
}
func TestElectionManager_VoteInvalidCandidate(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Try to vote for non-existent candidate
err := em.Vote("non-existent")
if err == nil {
t.Error("Expected error when voting for non-existent candidate")
}
}
func TestElectionManager_AddCandidate(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
candidate := &AdminCandidate{
NodeID: "new-candidate",
Term: 1,
Score: 0.7,
Capabilities: []string{"admin", "leader"},
LastSeen: time.Now(),
}
err := em.AddCandidate(candidate)
if err != nil {
t.Fatalf("Failed to add candidate: %v", err)
}
// Verify candidate was added
em.mu.RLock()
stored, exists := em.candidates["new-candidate"]
em.mu.RUnlock()
if !exists {
t.Error("Expected to find added candidate")
}
if stored.NodeID != "new-candidate" {
t.Errorf("Expected stored candidate NodeID to be 'new-candidate', got %s", stored.NodeID)
}
if stored.Score != 0.7 {
t.Errorf("Expected stored candidate score to be 0.7, got %f", stored.Score)
}
}
func TestElectionManager_FindElectionWinner(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Add candidates with different scores
candidates := []*AdminCandidate{
{
NodeID: "candidate-1",
Term: 1,
Score: 0.6,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
},
{
NodeID: "candidate-2",
Term: 1,
Score: 0.8,
Capabilities: []string{"admin", "leader"},
LastSeen: time.Now(),
},
{
NodeID: "candidate-3",
Term: 1,
Score: 0.7,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
},
}
func TestFindElectionWinnerPrefersVotesThenScore(t *testing.T) {
em := newTestElectionManager(t)
em.mu.Lock()
for _, candidate := range candidates {
em.candidates[candidate.NodeID] = candidate
em.candidates = map[string]*AdminCandidate{
"candidate-1": {
NodeID: "candidate-1",
PeerID: em.host.ID(),
Score: 0.65,
},
"candidate-2": {
NodeID: "candidate-2",
PeerID: em.host.ID(),
Score: 0.80,
},
}
em.votes = map[string]string{
"voter-a": "candidate-1",
"voter-b": "candidate-2",
"voter-c": "candidate-2",
}
// Add some votes
em.votes["voter-1"] = "candidate-2"
em.votes["voter-2"] = "candidate-2"
em.votes["voter-3"] = "candidate-1"
em.mu.Unlock()
// Find winner
winner := em.findElectionWinner()
if winner == nil {
t.Fatal("Expected findElectionWinner to return a winner")
t.Fatal("expected a winner to be selected")
}
// candidate-2 should win with most votes (2 votes)
if winner.NodeID != "candidate-2" {
t.Errorf("Expected winner to be 'candidate-2', got %s", winner.NodeID)
t.Fatalf("expected candidate-2 to win, got %s", winner.NodeID)
}
}
func TestElectionManager_FindElectionWinnerNoVotes(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Add candidates but no votes - should fall back to highest score
candidates := []*AdminCandidate{
{
NodeID: "candidate-1",
Term: 1,
Score: 0.6,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
},
{
NodeID: "candidate-2",
Term: 1,
Score: 0.9, // Highest score
Capabilities: []string{"admin", "leader"},
LastSeen: time.Now(),
},
}
func TestHandleElectionMessageAddsCandidate(t *testing.T) {
em := newTestElectionManager(t)
em.mu.Lock()
for _, candidate := range candidates {
em.candidates[candidate.NodeID] = candidate
}
em.currentTerm = 3
em.state = StateElecting
em.mu.Unlock()
// Find winner without any votes
winner := em.findElectionWinner()
if winner == nil {
t.Fatal("Expected findElectionWinner to return a winner")
}
// candidate-2 should win with highest score
if winner.NodeID != "candidate-2" {
t.Errorf("Expected winner to be 'candidate-2' (highest score), got %s", winner.NodeID)
}
}
func TestElectionManager_HandleElectionVote(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Add a candidate first
candidate := &AdminCandidate{
NodeID: "candidate-1",
Term: 1,
Score: 0.8,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
NodeID: "peer-2",
PeerID: em.host.ID(),
Capabilities: []string{"admin_election"},
Uptime: time.Second,
Score: 0.75,
}
payload, err := json.Marshal(candidate)
if err != nil {
t.Fatalf("failed to marshal candidate: %v", err)
}
var data map[string]interface{}
if err := json.Unmarshal(payload, &data); err != nil {
t.Fatalf("failed to unmarshal candidate payload: %v", err)
}
em.mu.Lock()
em.candidates["candidate-1"] = candidate
em.mu.Unlock()
// Create vote message
msg := ElectionMessage{
Type: MessageTypeVote,
NodeID: "voter-1",
Data: map[string]interface{}{
"candidate": "candidate-1",
},
Type: "candidacy_announcement",
NodeID: "peer-2",
Timestamp: time.Now(),
Term: 3,
Data: data,
}
// Handle the vote
em.handleElectionVote(msg)
serialized, err := json.Marshal(msg)
if err != nil {
t.Fatalf("failed to marshal election message: %v", err)
}
em.handleElectionMessage(serialized)
// Verify vote was recorded
em.mu.RLock()
vote, exists := em.votes["voter-1"]
_, exists := em.candidates["peer-2"]
em.mu.RUnlock()
if !exists {
t.Error("Expected vote to be recorded after handling vote message")
}
if vote != "candidate-1" {
t.Errorf("Expected recorded vote to be for 'candidate-1', got %s", vote)
t.Fatal("expected candidacy announcement to register candidate")
}
}
func TestElectionManager_HandleElectionVoteInvalidData(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
func TestSendAdminHeartbeatRequiresLeadership(t *testing.T) {
em := newTestElectionManager(t)
if err := em.SendAdminHeartbeat(); err == nil {
t.Fatal("expected error when non-admin sends heartbeat")
}
em := NewElectionManager(cfg)
// Create vote message with invalid data
msg := ElectionMessage{
Type: MessageTypeVote,
NodeID: "voter-1",
Data: "invalid-data", // Should be map[string]interface{}
if err := em.Start(); err != nil {
t.Fatalf("failed to start election manager: %v", err)
}
// Handle the vote - should not crash
em.handleElectionVote(msg)
// Verify no vote was recorded
em.mu.RLock()
_, exists := em.votes["voter-1"]
em.mu.RUnlock()
if exists {
t.Error("Expected no vote to be recorded with invalid data")
}
}
func TestElectionManager_CompleteElection(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Set up election state
em.mu.Lock()
em.state = StateCandidate
em.currentTerm = 1
em.currentAdmin = em.nodeID
em.mu.Unlock()
// Add a candidate
candidate := &AdminCandidate{
NodeID: "winner",
Term: 1,
Score: 0.9,
Capabilities: []string{"admin", "leader"},
LastSeen: time.Now(),
}
em.mu.Lock()
em.candidates["winner"] = candidate
em.mu.Unlock()
// Complete election
em.CompleteElection()
// Verify state reset
em.mu.RLock()
state := em.state
em.mu.RUnlock()
if state != StateIdle {
t.Errorf("Expected state to be StateIdle after completing election, got %v", state)
if err := em.SendAdminHeartbeat(); err != nil {
t.Fatalf("expected heartbeat to succeed for current admin, got error: %v", err)
}
}
func TestElectionManager_Concurrency(t *testing.T) {
cfg := &config.Config{
Agent: config.AgentConfig{
ID: "test-node",
},
}
em := NewElectionManager(cfg)
// Test concurrent access to vote and candidate operations
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// Add a candidate
candidate := &AdminCandidate{
NodeID: "candidate-1",
Term: 1,
Score: 0.8,
Capabilities: []string{"admin"},
LastSeen: time.Now(),
}
err := em.AddCandidate(candidate)
if err != nil {
t.Fatalf("Failed to add candidate: %v", err)
}
// Run concurrent operations
done := make(chan bool, 2)
// Concurrent voting
go func() {
defer func() { done <- true }()
for i := 0; i < 10; i++ {
select {
case <-ctx.Done():
return
default:
em.Vote("candidate-1") // Ignore errors in concurrent test
time.Sleep(10 * time.Millisecond)
}
}
}()
// Concurrent state checking
go func() {
defer func() { done <- true }()
for i := 0; i < 10; i++ {
select {
case <-ctx.Done():
return
default:
em.findElectionWinner() // Just check for races
time.Sleep(10 * time.Millisecond)
}
}
}()
// Wait for completion
for i := 0; i < 2; i++ {
select {
case <-done:
case <-ctx.Done():
t.Fatal("Concurrent test timed out")
}
}
}