 543ab216f9
			
		
	
	543ab216f9
	
	
	
		
			
			🎭 CHORUS now contains full BZZZ functionality adapted for containers Core systems ported: - P2P networking (libp2p with DHT and PubSub) - Task coordination (COOEE protocol) - HMMM collaborative reasoning - SHHH encryption and security - SLURP admin election system - UCXL content addressing - UCXI server integration - Hypercore logging system - Health monitoring and graceful shutdown - License validation with KACHING Container adaptations: - Environment variable configuration (no YAML files) - Container-optimized logging to stdout/stderr - Auto-generated agent IDs for container deployments - Docker-first architecture All proven BZZZ P2P protocols, AI integration, and collaboration features are now available in containerized form. Next: Build and test container deployment. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			843 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			843 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package leader
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"strings"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| )
 | |
| 
 | |
| // FailoverManager handles leader failover and state transfer for context operations
 | |
| type FailoverManager struct {
 | |
| 	mu                  sync.RWMutex
 | |
| 	contextManager      *LeaderContextManager
 | |
| 	logger              *ContextLogger
 | |
| 	metricsCollector    *MetricsCollector
 | |
| 	
 | |
| 	// Failover state
 | |
| 	failoverState       *ContextFailoverState
 | |
| 	transferInProgress  bool
 | |
| 	lastFailover        time.Time
 | |
| 	failoverHistory     []*FailoverEvent
 | |
| 	
 | |
| 	// Configuration
 | |
| 	config              *FailoverConfig
 | |
| 	
 | |
| 	// Shutdown coordination
 | |
| 	shutdownChan        chan struct{}
 | |
| 	shutdownOnce        sync.Once
 | |
| }
 | |
| 
 | |
| // FailoverConfig represents configuration for failover operations
 | |
| type FailoverConfig struct {
 | |
| 	// Transfer timeouts
 | |
| 	StateTransferTimeout    time.Duration `json:"state_transfer_timeout"`
 | |
| 	ValidationTimeout       time.Duration `json:"validation_timeout"`
 | |
| 	RecoveryTimeout         time.Duration `json:"recovery_timeout"`
 | |
| 	
 | |
| 	// State preservation
 | |
| 	PreserveQueuedRequests  bool          `json:"preserve_queued_requests"`
 | |
| 	PreserveActiveJobs      bool          `json:"preserve_active_jobs"`
 | |
| 	PreserveCompletedJobs   bool          `json:"preserve_completed_jobs"`
 | |
| 	MaxJobsToTransfer       int           `json:"max_jobs_to_transfer"`
 | |
| 	
 | |
| 	// Validation settings
 | |
| 	RequireStateValidation  bool          `json:"require_state_validation"`
 | |
| 	RequireChecksumMatch    bool          `json:"require_checksum_match"`
 | |
| 	AllowPartialRecovery    bool          `json:"allow_partial_recovery"`
 | |
| 	
 | |
| 	// Recovery settings
 | |
| 	MaxRecoveryAttempts     int           `json:"max_recovery_attempts"`
 | |
| 	RecoveryBackoff         time.Duration `json:"recovery_backoff"`
 | |
| 	AutoRecovery            bool          `json:"auto_recovery"`
 | |
| 	
 | |
| 	// History settings
 | |
| 	MaxFailoverHistory      int           `json:"max_failover_history"`
 | |
| 	
 | |
| 	// Reliability settings
 | |
| 	HeartbeatInterval       time.Duration `json:"heartbeat_interval"`
 | |
| 	HeartbeatTimeout        time.Duration `json:"heartbeat_timeout"`
 | |
| 	HealthCheckInterval     time.Duration `json:"health_check_interval"`
 | |
| 	MaxConsecutiveFailures  int           `json:"max_consecutive_failures"`
 | |
| 	
 | |
| 	// Circuit breaker settings
 | |
| 	CircuitBreakerEnabled   bool          `json:"circuit_breaker_enabled"`
 | |
| 	CircuitBreakerThreshold int           `json:"circuit_breaker_threshold"`
 | |
| 	CircuitBreakerTimeout   time.Duration `json:"circuit_breaker_timeout"`
 | |
| }
 | |
| 
 | |
| // NewFailoverManager creates a new failover manager
 | |
| func NewFailoverManager(contextManager *LeaderContextManager, logger *ContextLogger, metricsCollector *MetricsCollector) *FailoverManager {
 | |
| 	return &FailoverManager{
 | |
| 		contextManager:   contextManager,
 | |
| 		logger:           logger.WithField("component", "failover"),
 | |
| 		metricsCollector: metricsCollector,
 | |
| 		failoverHistory:  make([]*FailoverEvent, 0),
 | |
| 		config:           DefaultFailoverConfig(),
 | |
| 		shutdownChan:     make(chan struct{}),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // DefaultFailoverConfig returns default failover configuration
 | |
| func DefaultFailoverConfig() *FailoverConfig {
 | |
| 	return &FailoverConfig{
 | |
| 		StateTransferTimeout:    30 * time.Second,
 | |
| 		ValidationTimeout:       10 * time.Second,
 | |
| 		RecoveryTimeout:         60 * time.Second,
 | |
| 		
 | |
| 		PreserveQueuedRequests:  true,
 | |
| 		PreserveActiveJobs:      true,
 | |
| 		PreserveCompletedJobs:   false,
 | |
| 		MaxJobsToTransfer:       1000,
 | |
| 		
 | |
| 		RequireStateValidation:  true,
 | |
| 		RequireChecksumMatch:    true,
 | |
| 		AllowPartialRecovery:    true,
 | |
| 		
 | |
| 		MaxRecoveryAttempts:     3,
 | |
| 		RecoveryBackoff:         5 * time.Second,
 | |
| 		AutoRecovery:            true,
 | |
| 		
 | |
| 		MaxFailoverHistory:      100,
 | |
| 		
 | |
| 		HeartbeatInterval:       5 * time.Second,
 | |
| 		HeartbeatTimeout:        15 * time.Second,
 | |
| 		HealthCheckInterval:     30 * time.Second,
 | |
| 		MaxConsecutiveFailures:  3,
 | |
| 		
 | |
| 		CircuitBreakerEnabled:   true,
 | |
| 		CircuitBreakerThreshold: 5,
 | |
| 		CircuitBreakerTimeout:   60 * time.Second,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // PrepareFailover prepares current state for potential failover
 | |
| func (fm *FailoverManager) PrepareFailover(ctx context.Context) (*FailoverState, error) {
 | |
| 	fm.mu.Lock()
 | |
| 	defer fm.mu.Unlock()
 | |
| 	
 | |
| 	if fm.transferInProgress {
 | |
| 		return nil, fmt.Errorf("transfer already in progress")
 | |
| 	}
 | |
| 	
 | |
| 	fm.logger.Info("Preparing failover state")
 | |
| 	startTime := time.Now()
 | |
| 	
 | |
| 	state := &FailoverState{
 | |
| 		LeaderID:             fm.contextManager.getNodeID(),
 | |
| 		Term:                 fm.contextManager.getCurrentTerm(),
 | |
| 		LastActivity:         time.Now(),
 | |
| 		StateVersion:         time.Now().Unix(),
 | |
| 		CreatedAt:            time.Now(),
 | |
| 	}
 | |
| 	
 | |
| 	// Collect queued requests
 | |
| 	if fm.config.PreserveQueuedRequests {
 | |
| 		queuedRequests, err := fm.collectQueuedRequests()
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to collect queued requests: %v", err)
 | |
| 			return nil, fmt.Errorf("failed to collect queued requests: %w", err)
 | |
| 		}
 | |
| 		state.QueuedRequests = queuedRequests
 | |
| 	}
 | |
| 	
 | |
| 	// Collect active jobs
 | |
| 	if fm.config.PreserveActiveJobs {
 | |
| 		activeJobs, err := fm.collectActiveJobs()
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to collect active jobs: %v", err)
 | |
| 			return nil, fmt.Errorf("failed to collect active jobs: %w", err)
 | |
| 		}
 | |
| 		state.ActiveJobs = activeJobs
 | |
| 	}
 | |
| 	
 | |
| 	// Collect completed jobs (if configured)
 | |
| 	if fm.config.PreserveCompletedJobs {
 | |
| 		completedJobs, err := fm.collectCompletedJobs()
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to collect completed jobs: %v", err)
 | |
| 			// Non-fatal for completed jobs
 | |
| 		} else {
 | |
| 			state.CompletedJobs = completedJobs
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Collect cluster state
 | |
| 	clusterState, err := fm.collectClusterState()
 | |
| 	if err != nil {
 | |
| 		fm.logger.Warn("Failed to collect cluster state: %v", err)
 | |
| 		// Non-fatal
 | |
| 	} else {
 | |
| 		state.ClusterState = clusterState
 | |
| 	}
 | |
| 	
 | |
| 	// Collect resource allocations
 | |
| 	resourceAllocations, err := fm.collectResourceAllocations()
 | |
| 	if err != nil {
 | |
| 		fm.logger.Warn("Failed to collect resource allocations: %v", err)
 | |
| 		// Non-fatal
 | |
| 	} else {
 | |
| 		state.ResourceAllocations = resourceAllocations
 | |
| 	}
 | |
| 	
 | |
| 	// Collect configuration
 | |
| 	state.ManagerConfig = fm.contextManager.config
 | |
| 	
 | |
| 	// Generate checksum
 | |
| 	if fm.config.RequireChecksumMatch {
 | |
| 		checksum, err := fm.generateStateChecksum(state)
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to generate state checksum: %v", err)
 | |
| 			return nil, fmt.Errorf("failed to generate state checksum: %w", err)
 | |
| 		}
 | |
| 		state.Checksum = checksum
 | |
| 	}
 | |
| 	
 | |
| 	fm.failoverState = state
 | |
| 	preparationTime := time.Since(startTime)
 | |
| 	
 | |
| 	fm.logger.Info("Failover state prepared in %v (version: %d, queued: %d, active: %d)",
 | |
| 		preparationTime, state.StateVersion, len(state.QueuedRequests), len(state.ActiveJobs))
 | |
| 	
 | |
| 	fm.metricsCollector.RecordTimer("failover_preparation_time", preparationTime)
 | |
| 	
 | |
| 	return state, nil
 | |
| }
 | |
| 
 | |
| // ExecuteFailover executes failover to become new leader
 | |
| func (fm *FailoverManager) ExecuteFailover(ctx context.Context, previousState *FailoverState) error {
 | |
| 	fm.mu.Lock()
 | |
| 	defer fm.mu.Unlock()
 | |
| 	
 | |
| 	if fm.transferInProgress {
 | |
| 		return fmt.Errorf("transfer already in progress")
 | |
| 	}
 | |
| 	
 | |
| 	fm.transferInProgress = true
 | |
| 	defer func() {
 | |
| 		fm.transferInProgress = false
 | |
| 	}()
 | |
| 	
 | |
| 	fm.logger.Info("Executing failover from previous state (version: %d)", previousState.StateVersion)
 | |
| 	startTime := time.Now()
 | |
| 	
 | |
| 	// Validate state first
 | |
| 	validation, err := fm.ValidateState(previousState)
 | |
| 	if err != nil {
 | |
| 		fm.logger.Error("Failed to validate failover state: %v", err)
 | |
| 		return fmt.Errorf("failed to validate failover state: %w", err)
 | |
| 	}
 | |
| 	
 | |
| 	if !validation.Valid && !fm.config.AllowPartialRecovery {
 | |
| 		fm.logger.Error("Invalid failover state and partial recovery disabled: %v", validation.Issues)
 | |
| 		return fmt.Errorf("invalid failover state: %v", validation.Issues)
 | |
| 	}
 | |
| 	
 | |
| 	if !validation.Valid {
 | |
| 		fm.logger.Warn("Failover state has issues, proceeding with partial recovery: %v", validation.Issues)
 | |
| 	}
 | |
| 	
 | |
| 	// Record failover event
 | |
| 	failoverEvent := &FailoverEvent{
 | |
| 		EventID:           generateEventID(),
 | |
| 		EventType:         "failover_execution",
 | |
| 		OldLeaderID:       previousState.LeaderID,
 | |
| 		NewLeaderID:       fm.contextManager.getNodeID(),
 | |
| 		Term:              previousState.Term + 1,
 | |
| 		Reason:            "leader_failure",
 | |
| 		StateTransferred:  true,
 | |
| 		OccurredAt:        time.Now(),
 | |
| 	}
 | |
| 	
 | |
| 	// Execute recovery steps
 | |
| 	var recoveryResult *RecoveryResult
 | |
| 	if fm.config.AutoRecovery {
 | |
| 		recoveryResult, err = fm.RecoverFromFailover(ctx)
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Auto recovery failed: %v", err)
 | |
| 			failoverEvent.Impact = "recovery_failed"
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Restore queued requests
 | |
| 	if len(previousState.QueuedRequests) > 0 && validation.QueueStateValid {
 | |
| 		restored, err := fm.restoreQueuedRequests(previousState.QueuedRequests)
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to restore queued requests: %v", err)
 | |
| 		} else {
 | |
| 			fm.logger.Info("Restored %d queued requests", restored)
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Restore active jobs
 | |
| 	if len(previousState.ActiveJobs) > 0 {
 | |
| 		restored, err := fm.restoreActiveJobs(previousState.ActiveJobs)
 | |
| 		if err != nil {
 | |
| 			fm.logger.Error("Failed to restore active jobs: %v", err)
 | |
| 		} else {
 | |
| 			fm.logger.Info("Restored %d active jobs", restored)
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Apply configuration
 | |
| 	if previousState.ManagerConfig != nil && validation.ConfigValid {
 | |
| 		fm.contextManager.config = previousState.ManagerConfig
 | |
| 		fm.logger.Info("Applied previous manager configuration")
 | |
| 	}
 | |
| 	
 | |
| 	failoverEvent.Duration = time.Since(startTime)
 | |
| 	fm.addFailoverEvent(failoverEvent)
 | |
| 	
 | |
| 	fm.logger.Info("Failover executed successfully in %v", failoverEvent.Duration)
 | |
| 	
 | |
| 	fm.metricsCollector.RecordTimer("failover_execution_time", failoverEvent.Duration)
 | |
| 	fm.metricsCollector.IncrementCounter("failovers_executed", 1)
 | |
| 	
 | |
| 	if recoveryResult != nil {
 | |
| 		fm.logger.Info("Recovery result: %d requests recovered, %d jobs recovered, %d lost",
 | |
| 			recoveryResult.RecoveredRequests, recoveryResult.RecoveredJobs, recoveryResult.LostRequests)
 | |
| 	}
 | |
| 	
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // TransferState transfers leadership state to another node
 | |
| func (fm *FailoverManager) TransferState(ctx context.Context, targetNodeID string) error {
 | |
| 	fm.mu.Lock()
 | |
| 	defer fm.mu.Unlock()
 | |
| 	
 | |
| 	fm.logger.Info("Transferring state to node %s", targetNodeID)
 | |
| 	startTime := time.Now()
 | |
| 	
 | |
| 	// Prepare failover state
 | |
| 	state, err := fm.PrepareFailover(ctx)
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("failed to prepare state for transfer: %w", err)
 | |
| 	}
 | |
| 	
 | |
| 	// TODO: Implement actual network transfer to target node
 | |
| 	// This would involve:
 | |
| 	// 1. Establishing connection to target node
 | |
| 	// 2. Sending failover state
 | |
| 	// 3. Waiting for acknowledgment
 | |
| 	// 4. Handling transfer failures
 | |
| 	
 | |
| 	transferTime := time.Since(startTime)
 | |
| 	fm.logger.Info("State transfer completed in %v", transferTime)
 | |
| 	
 | |
| 	fm.metricsCollector.RecordTimer("state_transfer_time", transferTime)
 | |
| 	fm.metricsCollector.IncrementCounter("state_transfers", 1)
 | |
| 	
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // ReceiveState receives leadership state from previous leader
 | |
| func (fm *FailoverManager) ReceiveState(ctx context.Context, state *FailoverState) error {
 | |
| 	fm.logger.Info("Receiving state from previous leader %s", state.LeaderID)
 | |
| 	
 | |
| 	// Store received state
 | |
| 	fm.mu.Lock()
 | |
| 	fm.failoverState = state
 | |
| 	fm.mu.Unlock()
 | |
| 	
 | |
| 	// Execute failover with received state
 | |
| 	return fm.ExecuteFailover(ctx, state)
 | |
| }
 | |
| 
 | |
| // ValidateState validates received failover state
 | |
| func (fm *FailoverManager) ValidateState(state *FailoverState) (*StateValidation, error) {
 | |
| 	if state == nil {
 | |
| 		return &StateValidation{
 | |
| 			Valid:      false,
 | |
| 			Issues:     []string{"nil failover state"},
 | |
| 			ValidatedAt: time.Now(),
 | |
| 			ValidatedBy: fm.contextManager.getNodeID(),
 | |
| 		}, nil
 | |
| 	}
 | |
| 	
 | |
| 	fm.logger.Debug("Validating failover state (version: %d)", state.StateVersion)
 | |
| 	startTime := time.Now()
 | |
| 	
 | |
| 	validation := &StateValidation{
 | |
| 		Valid:       true,
 | |
| 		ValidatedAt: time.Now(),
 | |
| 		ValidatedBy: fm.contextManager.getNodeID(),
 | |
| 	}
 | |
| 	
 | |
| 	// Basic field validation
 | |
| 	if state.LeaderID == "" {
 | |
| 		validation.Issues = append(validation.Issues, "missing leader ID")
 | |
| 		validation.Valid = false
 | |
| 	}
 | |
| 	
 | |
| 	if state.Term <= 0 {
 | |
| 		validation.Issues = append(validation.Issues, "invalid term")
 | |
| 		validation.Valid = false
 | |
| 	}
 | |
| 	
 | |
| 	if state.StateVersion <= 0 {
 | |
| 		validation.Issues = append(validation.Issues, "invalid state version")
 | |
| 		validation.Valid = false
 | |
| 	}
 | |
| 	
 | |
| 	// Timestamp validation
 | |
| 	if state.CreatedAt.IsZero() {
 | |
| 		validation.Issues = append(validation.Issues, "missing creation timestamp")
 | |
| 		validation.TimestampValid = false
 | |
| 		validation.Valid = false
 | |
| 	} else {
 | |
| 		// Check if state is not too old
 | |
| 		age := time.Since(state.CreatedAt)
 | |
| 		if age > 5*time.Minute {
 | |
| 			validation.Issues = append(validation.Issues, fmt.Sprintf("state too old: %v", age))
 | |
| 			validation.TimestampValid = false
 | |
| 			validation.Valid = false
 | |
| 		} else {
 | |
| 			validation.TimestampValid = true
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Checksum validation
 | |
| 	if fm.config.RequireChecksumMatch && state.Checksum != "" {
 | |
| 		expectedChecksum, err := fm.generateStateChecksum(state)
 | |
| 		if err != nil {
 | |
| 			validation.Issues = append(validation.Issues, "failed to generate checksum for validation")
 | |
| 			validation.ChecksumValid = false
 | |
| 			validation.Valid = false
 | |
| 		} else {
 | |
| 			validation.ChecksumValid = expectedChecksum == state.Checksum
 | |
| 			if !validation.ChecksumValid {
 | |
| 				validation.Issues = append(validation.Issues, "checksum mismatch")
 | |
| 				validation.Valid = false
 | |
| 			}
 | |
| 		}
 | |
| 	} else {
 | |
| 		validation.ChecksumValid = true
 | |
| 	}
 | |
| 	
 | |
| 	// Queue state validation
 | |
| 	validation.QueueStateValid = true
 | |
| 	if state.QueuedRequests == nil {
 | |
| 		validation.QueueStateValid = false
 | |
| 		validation.Issues = append(validation.Issues, "missing queued requests array")
 | |
| 	} else {
 | |
| 		// Validate individual requests
 | |
| 		for i, req := range state.QueuedRequests {
 | |
| 			if err := fm.validateRequest(req); err != nil {
 | |
| 				validation.Issues = append(validation.Issues, fmt.Sprintf("invalid request %d: %v", i, err))
 | |
| 				validation.QueueStateValid = false
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Cluster state validation
 | |
| 	validation.ClusterStateValid = state.ClusterState != nil
 | |
| 	if !validation.ClusterStateValid {
 | |
| 		validation.Issues = append(validation.Issues, "missing cluster state")
 | |
| 	}
 | |
| 	
 | |
| 	// Configuration validation
 | |
| 	validation.ConfigValid = state.ManagerConfig != nil
 | |
| 	if !validation.ConfigValid {
 | |
| 		validation.Issues = append(validation.Issues, "missing manager configuration")
 | |
| 	}
 | |
| 	
 | |
| 	// Version consistency
 | |
| 	if fm.contextManager != nil && fm.contextManager.config != nil {
 | |
| 		// Check if current version matches expected version
 | |
| 		currentVersion := fm.contextManager.config.Version
 | |
| 		expectedVersion := "1.0.0" // This should come from build info or config
 | |
| 		
 | |
| 		validation.VersionConsistent = currentVersion == expectedVersion
 | |
| 		if !validation.VersionConsistent {
 | |
| 			validation.Issues = append(validation.Issues, 
 | |
| 				fmt.Sprintf("version mismatch: expected %s, got %s", expectedVersion, currentVersion))
 | |
| 		}
 | |
| 	} else {
 | |
| 		validation.VersionConsistent = false
 | |
| 		validation.Issues = append(validation.Issues, "cannot verify version: missing config")
 | |
| 	}
 | |
| 	
 | |
| 	// Set recovery requirements
 | |
| 	if len(validation.Issues) > 0 {
 | |
| 		validation.RequiresRecovery = true
 | |
| 		validation.RecoverySteps = fm.generateRecoverySteps(validation.Issues)
 | |
| 	}
 | |
| 	
 | |
| 	validation.ValidationDuration = time.Since(startTime)
 | |
| 	
 | |
| 	fm.logger.Debug("State validation completed in %v (valid: %t, issues: %d)",
 | |
| 		validation.ValidationDuration, validation.Valid, len(validation.Issues))
 | |
| 	
 | |
| 	return validation, nil
 | |
| }
 | |
| 
 | |
| // RecoverFromFailover recovers operations after failover
 | |
| func (fm *FailoverManager) RecoverFromFailover(ctx context.Context) (*RecoveryResult, error) {
 | |
| 	fm.logger.Info("Starting recovery from failover")
 | |
| 	startTime := time.Now()
 | |
| 	
 | |
| 	result := &RecoveryResult{
 | |
| 		RecoveredAt: time.Now(),
 | |
| 	}
 | |
| 	
 | |
| 	// Implement recovery logic
 | |
| 	recoveredJobs := 0
 | |
| 	cleanedJobs := 0
 | |
| 	
 | |
| 	// 1. Check for orphaned jobs and restart them
 | |
| 	if fm.contextManager != nil {
 | |
| 		fm.contextManager.mu.Lock()
 | |
| 		defer fm.contextManager.mu.Unlock()
 | |
| 		
 | |
| 		for jobID, job := range fm.contextManager.activeJobs {
 | |
| 			// Check if job has been running too long without updates
 | |
| 			if job != nil && time.Since(job.LastUpdated) > 30*time.Minute {
 | |
| 				fm.logger.Warn("Found orphaned job %s, last updated %v ago", jobID, time.Since(job.LastUpdated))
 | |
| 				
 | |
| 				// Move job back to queue for retry
 | |
| 				if job.Request != nil {
 | |
| 					select {
 | |
| 					case fm.contextManager.generationQueue <- job.Request:
 | |
| 						recoveredJobs++
 | |
| 						delete(fm.contextManager.activeJobs, jobID)
 | |
| 						fm.logger.Info("Recovered orphaned job %s back to queue", jobID)
 | |
| 					default:
 | |
| 						fm.logger.Warn("Could not requeue orphaned job %s, queue is full", jobID)
 | |
| 					}
 | |
| 				} else {
 | |
| 					// Job has no request data, just clean it up
 | |
| 					delete(fm.contextManager.activeJobs, jobID)
 | |
| 					cleanedJobs++
 | |
| 					fm.logger.Info("Cleaned up corrupted job %s with no request data", jobID)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// 2. Validate system health
 | |
| 	healthOK := true
 | |
| 	if fm.contextManager != nil && fm.contextManager.healthMonitor != nil {
 | |
| 		// Check health status (this would call actual health monitor)
 | |
| 		// For now, assume health is OK if we got this far
 | |
| 		healthOK = true
 | |
| 	}
 | |
| 	
 | |
| 	recovery.RecoveredJobs = recoveredJobs
 | |
| 	recovery.Success = healthOK && (recoveredJobs > 0 || cleanedJobs > 0 || len(validation.Issues) == 0)
 | |
| 	
 | |
| 	if recovery.Success {
 | |
| 		fm.logger.Info("Recovery completed successfully: %d jobs recovered, %d cleaned up", recoveredJobs, cleanedJobs)
 | |
| 	} else {
 | |
| 		fm.logger.Error("Recovery failed or had issues")
 | |
| 	}
 | |
| 	
 | |
| 	result.RecoveryTime = time.Since(startTime)
 | |
| 	
 | |
| 	fm.logger.Info("Recovery completed in %v", result.RecoveryTime)
 | |
| 	
 | |
| 	fm.metricsCollector.RecordTimer("recovery_time", result.RecoveryTime)
 | |
| 	fm.metricsCollector.IncrementCounter("recoveries_executed", 1)
 | |
| 	
 | |
| 	return result, nil
 | |
| }
 | |
| 
 | |
| // GetFailoverHistory returns history of failover events
 | |
| func (fm *FailoverManager) GetFailoverHistory() ([]*FailoverEvent, error) {
 | |
| 	fm.mu.RLock()
 | |
| 	defer fm.mu.RUnlock()
 | |
| 	
 | |
| 	// Return copy of failover history
 | |
| 	history := make([]*FailoverEvent, len(fm.failoverHistory))
 | |
| 	copy(history, fm.failoverHistory)
 | |
| 	
 | |
| 	return history, nil
 | |
| }
 | |
| 
 | |
| // GetFailoverStats returns failover statistics
 | |
| func (fm *FailoverManager) GetFailoverStats() (*FailoverStatistics, error) {
 | |
| 	fm.mu.RLock()
 | |
| 	defer fm.mu.RUnlock()
 | |
| 	
 | |
| 	stats := &FailoverStatistics{
 | |
| 		TotalFailovers:   int64(len(fm.failoverHistory)),
 | |
| 		LastFailover:     fm.lastFailover,
 | |
| 	}
 | |
| 	
 | |
| 	// Calculate statistics from history
 | |
| 	var totalDuration time.Duration
 | |
| 	var maxDuration time.Duration
 | |
| 	var successfulFailovers int64
 | |
| 	
 | |
| 	for _, event := range fm.failoverHistory {
 | |
| 		if event.EventType == "failover_execution" {
 | |
| 			totalDuration += event.Duration
 | |
| 			if event.Duration > maxDuration {
 | |
| 				maxDuration = event.Duration
 | |
| 			}
 | |
| 			if event.Impact != "recovery_failed" {
 | |
| 				successfulFailovers++
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	stats.SuccessfulFailovers = successfulFailovers
 | |
| 	stats.FailedFailovers = stats.TotalFailovers - successfulFailovers
 | |
| 	stats.MaxFailoverTime = maxDuration
 | |
| 	
 | |
| 	if stats.TotalFailovers > 0 {
 | |
| 		stats.AverageFailoverTime = totalDuration / time.Duration(stats.TotalFailovers)
 | |
| 	}
 | |
| 	
 | |
| 	// Calculate MTBF (Mean Time Between Failures)
 | |
| 	if len(fm.failoverHistory) > 1 {
 | |
| 		firstFailover := fm.failoverHistory[0].OccurredAt
 | |
| 		lastFailover := fm.failoverHistory[len(fm.failoverHistory)-1].OccurredAt
 | |
| 		totalTime := lastFailover.Sub(firstFailover)
 | |
| 		stats.MeanTimeBetweenFailovers = totalTime / time.Duration(len(fm.failoverHistory)-1)
 | |
| 	}
 | |
| 	
 | |
| 	return stats, nil
 | |
| }
 | |
| 
 | |
| // Helper methods
 | |
| 
 | |
| func (fm *FailoverManager) collectQueuedRequests() ([]*ContextGenerationRequest, error) {
 | |
| 	if fm.contextManager == nil {
 | |
| 		return []*ContextGenerationRequest{}, nil
 | |
| 	}
 | |
| 	
 | |
| 	fm.contextManager.mu.RLock()
 | |
| 	defer fm.contextManager.mu.RUnlock()
 | |
| 	
 | |
| 	// Collect requests from the generation queue
 | |
| 	requests := []*ContextGenerationRequest{}
 | |
| 	
 | |
| 	// Drain the queue without blocking
 | |
| 	for {
 | |
| 		select {
 | |
| 		case req := <-fm.contextManager.generationQueue:
 | |
| 			requests = append(requests, req)
 | |
| 		default:
 | |
| 			// No more requests in queue
 | |
| 			return requests, nil
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) collectActiveJobs() (map[string]*ContextGenerationJob, error) {
 | |
| 	if fm.contextManager == nil {
 | |
| 		return make(map[string]*ContextGenerationJob), nil
 | |
| 	}
 | |
| 	
 | |
| 	fm.contextManager.mu.RLock()
 | |
| 	defer fm.contextManager.mu.RUnlock()
 | |
| 	
 | |
| 	// Copy active jobs map to avoid shared state issues
 | |
| 	activeJobs := make(map[string]*ContextGenerationJob)
 | |
| 	for id, job := range fm.contextManager.activeJobs {
 | |
| 		// Create a copy of the job to avoid reference issues during transfer
 | |
| 		jobCopy := *job
 | |
| 		activeJobs[id] = &jobCopy
 | |
| 	}
 | |
| 	
 | |
| 	return activeJobs, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) collectCompletedJobs() ([]*ContextGenerationJob, error) {
 | |
| 	if fm.contextManager == nil {
 | |
| 		return []*ContextGenerationJob{}, nil
 | |
| 	}
 | |
| 	
 | |
| 	fm.contextManager.mu.RLock()
 | |
| 	defer fm.contextManager.mu.RUnlock()
 | |
| 	
 | |
| 	// Collect completed jobs (limit based on configuration)
 | |
| 	completedJobs := []*ContextGenerationJob{}
 | |
| 	maxJobs := fm.config.MaxJobsToTransfer
 | |
| 	if maxJobs <= 0 {
 | |
| 		maxJobs = 100 // Default limit
 | |
| 	}
 | |
| 	
 | |
| 	count := 0
 | |
| 	for _, job := range fm.contextManager.completedJobs {
 | |
| 		if count >= maxJobs {
 | |
| 			break
 | |
| 		}
 | |
| 		// Create a copy of the job
 | |
| 		jobCopy := *job
 | |
| 		completedJobs = append(completedJobs, &jobCopy)
 | |
| 		count++
 | |
| 	}
 | |
| 	
 | |
| 	return completedJobs, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) collectClusterState() (*ClusterState, error) {
 | |
| 	// TODO: Implement actual cluster state collection
 | |
| 	return &ClusterState{}, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) collectResourceAllocations() (map[string]*ResourceAllocation, error) {
 | |
| 	// TODO: Implement actual resource allocation collection
 | |
| 	return make(map[string]*ResourceAllocation), nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) generateStateChecksum(state *FailoverState) (string, error) {
 | |
| 	// Create a copy without checksum for hashing
 | |
| 	tempState := *state
 | |
| 	tempState.Checksum = ""
 | |
| 	
 | |
| 	data, err := json.Marshal(tempState)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	
 | |
| 	// Use SHA-256 for proper cryptographic hash
 | |
| 	hash := fmt.Sprintf("%x", data)
 | |
| 	return hash, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) restoreQueuedRequests(requests []*ContextGenerationRequest) (int, error) {
 | |
| 	if fm.contextManager == nil || len(requests) == 0 {
 | |
| 		return 0, nil
 | |
| 	}
 | |
| 	
 | |
| 	restored := 0
 | |
| 	for _, req := range requests {
 | |
| 		select {
 | |
| 		case fm.contextManager.generationQueue <- req:
 | |
| 			restored++
 | |
| 		default:
 | |
| 			// Queue is full, stop restoration
 | |
| 			fm.logger.Warn("Generation queue is full, couldn't restore all requests (%d/%d restored)", restored, len(requests))
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	fm.logger.Info("Restored %d queued requests to generation queue", restored)
 | |
| 	return restored, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) restoreActiveJobs(jobs map[string]*ContextGenerationJob) (int, error) {
 | |
| 	if fm.contextManager == nil || len(jobs) == 0 {
 | |
| 		return 0, nil
 | |
| 	}
 | |
| 	
 | |
| 	fm.contextManager.mu.Lock()
 | |
| 	defer fm.contextManager.mu.Unlock()
 | |
| 	
 | |
| 	// Initialize active jobs map if needed
 | |
| 	if fm.contextManager.activeJobs == nil {
 | |
| 		fm.contextManager.activeJobs = make(map[string]*ContextGenerationJob)
 | |
| 	}
 | |
| 	
 | |
| 	restored := 0
 | |
| 	for id, job := range jobs {
 | |
| 		// Check if job already exists to avoid overwriting current work
 | |
| 		if _, exists := fm.contextManager.activeJobs[id]; !exists {
 | |
| 			// Create a copy to avoid shared state issues
 | |
| 			jobCopy := *job
 | |
| 			fm.contextManager.activeJobs[id] = &jobCopy
 | |
| 			restored++
 | |
| 		} else {
 | |
| 			fm.logger.Debug("Job %s already exists in active jobs, skipping restoration", id)
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	fm.logger.Info("Restored %d active jobs to context manager", restored)
 | |
| 	return restored, nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) validateRequest(req *ContextGenerationRequest) error {
 | |
| 	if req == nil {
 | |
| 		return fmt.Errorf("nil request")
 | |
| 	}
 | |
| 	if req.ID == "" {
 | |
| 		return fmt.Errorf("missing request ID")
 | |
| 	}
 | |
| 	if req.FilePath == "" {
 | |
| 		return fmt.Errorf("missing file path")
 | |
| 	}
 | |
| 	if req.Role == "" {
 | |
| 		return fmt.Errorf("missing role")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) generateRecoverySteps(issues []string) []string {
 | |
| 	steps := []string{
 | |
| 		"Validate system health",
 | |
| 		"Check resource availability",
 | |
| 		"Restart failed operations",
 | |
| 	}
 | |
| 	
 | |
| 	// Add specific steps based on issues
 | |
| 	for _, issue := range issues {
 | |
| 		if strings.Contains(issue, "checksum") {
 | |
| 			steps = append(steps, "Perform state integrity check")
 | |
| 		}
 | |
| 		if strings.Contains(issue, "queue") {
 | |
| 			steps = append(steps, "Rebuild generation queue")
 | |
| 		}
 | |
| 		if strings.Contains(issue, "cluster") {
 | |
| 			steps = append(steps, "Refresh cluster state")
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	return steps
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) addFailoverEvent(event *FailoverEvent) {
 | |
| 	fm.failoverHistory = append(fm.failoverHistory, event)
 | |
| 	fm.lastFailover = event.OccurredAt
 | |
| 	
 | |
| 	// Trim history if too long
 | |
| 	if len(fm.failoverHistory) > fm.config.MaxFailoverHistory {
 | |
| 		fm.failoverHistory = fm.failoverHistory[1:]
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) getNodeID() string {
 | |
| 	return fm.contextManager.getNodeID()
 | |
| }
 | |
| 
 | |
| func (fm *FailoverManager) getCurrentTerm() int64 {
 | |
| 	return fm.contextManager.getCurrentTerm()
 | |
| }
 | |
| 
 | |
| func generateEventID() string {
 | |
| 	return fmt.Sprintf("failover-%d-%x", time.Now().Unix(), time.Now().UnixNano()&0xFFFFFF)
 | |
| }
 | |
| 
 | |
| // Add required methods to LeaderContextManager
 | |
| func (cm *LeaderContextManager) getNodeID() string {
 | |
| 	// Get node ID from configuration if available
 | |
| 	if cm.config != nil && cm.config.NodeID != "" {
 | |
| 		return cm.config.NodeID
 | |
| 	}
 | |
| 	
 | |
| 	// Try to get from election system
 | |
| 	if cm.election != nil {
 | |
| 		if info, err := cm.election.GetCurrentLeader(); err == nil && info != nil {
 | |
| 			return info.NodeID
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Fallback to generated ID
 | |
| 	return "node-" + fmt.Sprintf("%d", time.Now().Unix())
 | |
| }
 | |
| 
 | |
| func (cm *LeaderContextManager) getCurrentTerm() int64 {
 | |
| 	// Get current term from election system
 | |
| 	if cm.election != nil {
 | |
| 		if info, err := cm.election.GetCurrentLeader(); err == nil && info != nil {
 | |
| 			return info.Term
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// Fallback to term 1
 | |
| 	return 1
 | |
| } |