// bzzz/pkg/election/slurp_election.go

package election

import (
	"context"
	"time"

	slurpContext "chorus.services/bzzz/pkg/slurp/context"
)

// SLURPElection builds on the base Election interface and adds the Project
// Manager duties for contextual intelligence: handling the context manager,
// coordinating context generation, monitoring context leadership, and
// performing failover and recovery.
type SLURPElection interface {
	// Election is embedded, so every base election method is part of this
	// interface as well.
	Election

	// ---- Project Manager specific capabilities ----

	// RegisterContextManager registers the SLURP context manager used for
	// leader duties.
	RegisterContextManager(manager ContextManager) error

	// IsContextLeader reports whether this node is the current context
	// generation leader.
	IsContextLeader() bool

	// GetContextManager returns the registered context manager (if leader).
	GetContextManager() (ContextManager, error)

	// TransferContextLeadership initiates a graceful transfer of context
	// leadership; targetNodeID names the intended recipient.
	TransferContextLeadership(ctx context.Context, targetNodeID string) error

	// GetContextLeaderInfo returns information about the current context
	// leader.
	GetContextLeaderInfo() (*LeaderInfo, error)

	// ---- Context generation coordination ----

	// StartContextGeneration begins context generation operations. It is a
	// leader-only operation.
	StartContextGeneration(ctx context.Context) error

	// StopContextGeneration stops context generation operations.
	StopContextGeneration(ctx context.Context) error

	// GetContextGenerationStatus returns the status of context operations.
	GetContextGenerationStatus() (*GenerationStatus, error)

	// RequestContextGeneration queues req as a context generation request.
	RequestContextGeneration(req *ContextGenerationRequest) error

	// ---- Context leadership monitoring ----

	// SetContextLeadershipCallbacks installs the callbacks invoked on context
	// leadership changes.
	SetContextLeadershipCallbacks(callbacks *ContextLeadershipCallbacks) error

	// GetContextClusterHealth returns the health of the context generation
	// cluster.
	GetContextClusterHealth() (*ContextClusterHealth, error)

	// ---- Failover and recovery ----

	// PrepareContextFailover prepares the context state for a leadership
	// failover and returns it.
	PrepareContextFailover(ctx context.Context) (*ContextFailoverState, error)

	// ExecuteContextFailover executes a context leadership failover using
	// the supplied state.
	ExecuteContextFailover(ctx context.Context, state *ContextFailoverState) error

	// ValidateContextState validates a context failover state and returns
	// the validation results.
	ValidateContextState(state *ContextFailoverState) (*ContextStateValidation, error)
}

// Election is the base election interface (extracted from existing code).
// Its methods fall into four groups: basic operations, leadership queries,
// callback management, and admin operations.
type Election interface {
	// ---- Basic election operations ----

	Start() error
	Stop()
	TriggerElection(trigger ElectionTrigger)

	// ---- Leadership queries ----

	GetCurrentAdmin() string
	IsCurrentAdmin() bool
	GetElectionState() ElectionState

	// ---- Callback management ----

	// SetCallbacks takes two callbacks: onAdminChanged receives the old and
	// new admin, and onElectionComplete receives the winner.
	SetCallbacks(
		onAdminChanged func(oldAdmin, newAdmin string),
		onElectionComplete func(winner string),
	)

	// ---- Admin operations ----

	SendAdminHeartbeat() error
}

// ContextLeadershipCallbacks bundles the callbacks that are invoked for
// context leadership events. Each field is a plain function value; the code
// that invokes them is not part of this declaration.
type ContextLeadershipCallbacks struct {
	// OnBecomeContextLeader is called when this node becomes context leader.
	// term carries the leadership term.
	OnBecomeContextLeader func(ctx context.Context, term int64) error

	// OnLoseContextLeadership is called when this node loses context
	// leadership. newLeader identifies the node taking over.
	OnLoseContextLeadership func(ctx context.Context, newLeader string) error

	// OnContextLeaderChanged is called when the context leader changes, on
	// any node.
	OnContextLeaderChanged func(oldLeader, newLeader string, term int64)

	// OnContextGenerationStarted is called when context generation starts.
	OnContextGenerationStarted func(leaderID string)

	// OnContextGenerationStopped is called when context generation stops;
	// reason describes why.
	OnContextGenerationStopped func(leaderID string, reason string)

	// OnContextFailover is called when a context leadership failover occurs.
	OnContextFailover func(oldLeader, newLeader string, duration time.Duration)

	// OnContextError is called when a context operation reports an error.
	// The parameter is named err (not error) so that it does not shadow the
	// predeclared error type. Parameter names are not part of a function
	// type's identity, so existing callback values remain assignable.
	OnContextError func(err error, severity ErrorSeverity)
}

// ContextClusterHealth describes the health of the context generation
// cluster. All fields are serialized to JSON under snake_case keys.
type ContextClusterHealth struct {
	// TotalNodes is the total number of nodes in the cluster.
	TotalNodes int `json:"total_nodes"`
	// HealthyNodes is the number of healthy nodes.
	HealthyNodes int `json:"healthy_nodes"`
	// UnhealthyNodes lists the IDs of unhealthy nodes.
	UnhealthyNodes []string `json:"unhealthy_nodes"`
	// CurrentLeader is the current context leader.
	CurrentLeader string `json:"current_leader"`
	// LeaderHealthy is the leader's health status.
	LeaderHealthy bool `json:"leader_healthy"`
	// GenerationActive is the context generation status.
	GenerationActive bool `json:"generation_active"`
	// QueueHealth is the health of the generation queue.
	QueueHealth *QueueHealthStatus `json:"queue_health"`
	// NodeHealths holds per-node health.
	NodeHealths map[string]*NodeHealthStatus `json:"node_healths"`
	// LastElection is the time of the last election.
	LastElection time.Time `json:"last_election"`
	// NextHealthCheck is the time of the next health check.
	NextHealthCheck time.Time `json:"next_health_check"`
	// OverallHealthScore is the overall health on a 0-1 scale.
	OverallHealthScore float64 `json:"overall_health_score"`
}

// QueueHealthStatus describes the health of the context generation queue.
// Issues is left out of the JSON encoding when empty; every other field is
// always emitted.
type QueueHealthStatus struct {
	// QueueLength is the current queue length.
	QueueLength int `json:"queue_length"`
	// MaxQueueSize is the maximum queue capacity.
	MaxQueueSize int `json:"max_queue_size"`
	// QueueUtilization is the queue utilization on a 0-1 scale.
	QueueUtilization float64 `json:"queue_utilization"`
	// ProcessingRate is expressed in requests per second.
	ProcessingRate float64 `json:"processing_rate"`
	// AverageWaitTime is the average wait time.
	AverageWaitTime time.Duration `json:"average_wait_time"`
	// OldestRequest is the time of the oldest queued request.
	OldestRequest *time.Time `json:"oldest_request"`
	// HealthScore is the queue health score on a 0-1 scale.
	HealthScore float64 `json:"health_score"`
	// Issues lists queue health issues.
	Issues []string `json:"issues,omitempty"`
}

// NodeHealthStatus describes the health of an individual node. Issues is
// left out of the JSON encoding when empty.
type NodeHealthStatus struct {
	// NodeID is the node's ID.
	NodeID string `json:"node_id"`
	// IsLeader tells whether the node is leader.
	IsLeader bool `json:"is_leader"`
	// LastHeartbeat is the time of the last heartbeat.
	LastHeartbeat time.Time `json:"last_heartbeat"`
	// ResponseTime is the node's response time.
	ResponseTime time.Duration `json:"response_time"`
	// LoadAverage is the system load.
	LoadAverage float64 `json:"load_average"`
	// ActiveTasks is the number of active context tasks.
	ActiveTasks int `json:"active_tasks"`
	// CompletedTasks is the number of completed tasks.
	CompletedTasks int64 `json:"completed_tasks"`
	// FailedTasks is the number of failed tasks.
	FailedTasks int64 `json:"failed_tasks"`
	// HealthScore is the health score on a 0-1 scale.
	HealthScore float64 `json:"health_score"`
	// Status is the node status.
	Status NodeStatus `json:"status"`
	// Issues lists health issues.
	Issues []string `json:"issues,omitempty"`
}

// NodeStatus is the string-typed status of a cluster node.
type NodeStatus string

// The NodeStatus values defined by this package.
const (
	// NodeStatusHealthy: the node is healthy.
	NodeStatusHealthy NodeStatus = "healthy"
	// NodeStatusDegraded: the node's performance is degraded.
	NodeStatusDegraded NodeStatus = "degraded"
	// NodeStatusUnhealthy: the node is unhealthy.
	NodeStatusUnhealthy NodeStatus = "unhealthy"
	// NodeStatusUnresponsive: the node is not responding.
	NodeStatusUnresponsive NodeStatus = "unresponsive"
	// NodeStatusOffline: the node is offline.
	NodeStatusOffline NodeStatus = "offline"
)

// ContextFailoverState is the state handed over during a context leadership
// failover. It groups basic failover data, context generation state, cluster
// coordination state, configuration, validation data, and transfer metadata.
type ContextFailoverState struct {
	// ---- Basic failover state ----

	// LeaderID is the previous leader.
	LeaderID string `json:"leader_id"`
	// Term is the leadership term.
	Term int64 `json:"term"`
	// TransferTime is when the transfer occurred.
	TransferTime time.Time `json:"transfer_time"`

	// ---- Context generation state ----

	// QueuedRequests holds the queued requests.
	QueuedRequests []*ContextGenerationRequest `json:"queued_requests"`
	// ActiveJobs holds the active jobs.
	ActiveJobs map[string]*ContextGenerationJob `json:"active_jobs"`
	// CompletedJobs holds recently completed jobs.
	CompletedJobs []*ContextGenerationJob `json:"completed_jobs"`

	// ---- Cluster coordination state ----

	// ClusterState is the current cluster state.
	ClusterState *ClusterState `json:"cluster_state"`
	// ResourceAllocations holds the resource allocations.
	ResourceAllocations map[string]*ResourceAllocation `json:"resource_allocations"`
	// NodeAssignments holds the task assignments per node.
	NodeAssignments map[string][]string `json:"node_assignments"`

	// ---- Configuration state ----

	// ManagerConfig is the manager configuration.
	ManagerConfig *ManagerConfig `json:"manager_config"`
	// GenerationPolicy is the generation policy.
	GenerationPolicy *GenerationPolicy `json:"generation_policy"`
	// QueuePolicy is the queue policy.
	QueuePolicy *QueuePolicy `json:"queue_policy"`

	// ---- State validation ----

	// StateVersion is the state version.
	StateVersion int64 `json:"state_version"`
	// Checksum is the state checksum.
	Checksum string `json:"checksum"`
	// HealthSnapshot is the cluster health at transfer.
	HealthSnapshot *ContextClusterHealth `json:"health_snapshot"`

	// ---- Transfer metadata ----

	// TransferReason is the reason for the transfer.
	TransferReason string `json:"transfer_reason"`
	// TransferSource is who initiated the transfer.
	TransferSource string `json:"transfer_source"`
	// TransferDuration is how long the transfer took.
	TransferDuration time.Duration `json:"transfer_duration"`
	// ValidationResults holds the state validation results.
	ValidationResults *ContextStateValidation `json:"validation_results"`
}

// ContextStateValidation holds the results of validating a failover state.
// The slice fields (Issues, Recommendations, RecoverySteps) are left out of
// the JSON encoding when empty.
type ContextStateValidation struct {
	// Valid is the overall validity.
	Valid bool `json:"valid"`
	// Issues lists validation issues.
	Issues []string `json:"issues,omitempty"`

	// ---- Component validations ----

	// ChecksumValid is the checksum validation result.
	ChecksumValid bool `json:"checksum_valid"`
	// VersionConsistent is the version consistency result.
	VersionConsistent bool `json:"version_consistent"`
	// TimestampValid is the timestamp validity result.
	TimestampValid bool `json:"timestamp_valid"`
	// QueueStateValid is the queue state validity result.
	QueueStateValid bool `json:"queue_state_valid"`
	// ClusterStateValid is the cluster state validity result.
	ClusterStateValid bool `json:"cluster_state_valid"`
	// ConfigValid is the configuration validity result.
	ConfigValid bool `json:"config_valid"`

	// ---- Validation metadata ----

	// ValidatedAt is when validation occurred.
	ValidatedAt time.Time `json:"validated_at"`
	// ValidatedBy is the node that performed validation.
	ValidatedBy string `json:"validated_by"`
	// ValidationDuration is the time taken for validation.
	ValidationDuration time.Duration `json:"validation_duration"`

	// ---- Recommendations ----

	// Recommendations lists recommendations for the issues found.
	Recommendations []string `json:"recommendations,omitempty"`
	// RequiresRecovery tells whether recovery is needed.
	RequiresRecovery bool `json:"requires_recovery"`
	// RecoverySteps lists the recovery steps, if needed.
	RecoverySteps []string `json:"recovery_steps,omitempty"`
}

// ErrorSeverity is the string-typed severity level of a context operation
// error.
type ErrorSeverity string

// The ErrorSeverity levels defined by this package.
const (
	// ErrorSeverityLow is a low severity error.
	ErrorSeverityLow ErrorSeverity = "low"
	// ErrorSeverityMedium is a medium severity error.
	ErrorSeverityMedium ErrorSeverity = "medium"
	// ErrorSeverityHigh is a high severity error.
	ErrorSeverityHigh ErrorSeverity = "high"
	// ErrorSeverityCritical is a critical error requiring immediate attention.
	ErrorSeverityCritical ErrorSeverity = "critical"
)

// SLURPElectionConfig is the configuration for SLURP-enhanced elections. It
// covers context leadership, context generation, failover, health
// monitoring, queue management, and coordination settings.
type SLURPElectionConfig struct {
	// ---- Context leadership configuration ----

	// EnableContextLeadership enables context leadership.
	EnableContextLeadership bool `json:"enable_context_leadership"`
	// ContextLeadershipWeight is the weight for context leadership scoring.
	ContextLeadershipWeight float64 `json:"context_leadership_weight"`
	// RequireContextCapability requires context capability for leadership.
	RequireContextCapability bool `json:"require_context_capability"`

	// ---- Context generation configuration ----

	// AutoStartGeneration auto-starts generation on leadership.
	AutoStartGeneration bool `json:"auto_start_generation"`
	// GenerationStartDelay is the delay before starting generation.
	GenerationStartDelay time.Duration `json:"generation_start_delay"`
	// GenerationStopTimeout is the timeout for stopping generation.
	GenerationStopTimeout time.Duration `json:"generation_stop_timeout"`

	// ---- Failover configuration ----

	// ContextFailoverTimeout is the context failover timeout.
	ContextFailoverTimeout time.Duration `json:"context_failover_timeout"`
	// StateTransferTimeout is the state transfer timeout.
	StateTransferTimeout time.Duration `json:"state_transfer_timeout"`
	// ValidationTimeout is the state validation timeout.
	ValidationTimeout time.Duration `json:"validation_timeout"`
	// RequireStateValidation requires state validation.
	RequireStateValidation bool `json:"require_state_validation"`

	// ---- Health monitoring configuration ----

	// ContextHealthCheckInterval is the context health check interval.
	ContextHealthCheckInterval time.Duration `json:"context_health_check_interval"`
	// ClusterHealthThreshold is the minimum cluster health for operations.
	ClusterHealthThreshold float64 `json:"cluster_health_threshold"`
	// LeaderHealthThreshold is the minimum leader health.
	LeaderHealthThreshold float64 `json:"leader_health_threshold"`

	// ---- Queue management configuration ----

	// MaxQueueTransferSize is the maximum number of requests to transfer.
	MaxQueueTransferSize int `json:"max_queue_transfer_size"`
	// QueueDrainTimeout is the timeout for draining the queue.
	QueueDrainTimeout time.Duration `json:"queue_drain_timeout"`
	// PreserveCompletedJobs preserves completed jobs on transfer.
	PreserveCompletedJobs bool `json:"preserve_completed_jobs"`

	// ---- Coordination configuration ----

	// CoordinationTimeout is the coordination operation timeout.
	CoordinationTimeout time.Duration `json:"coordination_timeout"`
	// MaxCoordinationRetries is the maximum number of coordination retries.
	MaxCoordinationRetries int `json:"max_coordination_retries"`
	// CoordinationBackoff is the backoff between coordination retries.
	CoordinationBackoff time.Duration `json:"coordination_backoff"`
}

// DefaultSLURPElectionConfig returns a newly allocated SLURPElectionConfig
// filled in with the default settings for SLURP elections. Every call
// allocates a fresh value.
func DefaultSLURPElectionConfig() *SLURPElectionConfig {
	cfg := new(SLURPElectionConfig)

	// Context leadership: 30% weight for context capabilities.
	cfg.EnableContextLeadership = true
	cfg.ContextLeadershipWeight = 0.3
	cfg.RequireContextCapability = true

	// Context generation.
	cfg.AutoStartGeneration = true
	cfg.GenerationStartDelay = 5 * time.Second
	cfg.GenerationStopTimeout = 30 * time.Second

	// Failover.
	cfg.ContextFailoverTimeout = 60 * time.Second
	cfg.StateTransferTimeout = 30 * time.Second
	cfg.ValidationTimeout = 10 * time.Second
	cfg.RequireStateValidation = true

	// Health monitoring: 70% minimum cluster health, 80% minimum leader
	// health.
	cfg.ContextHealthCheckInterval = 30 * time.Second
	cfg.ClusterHealthThreshold = 0.7
	cfg.LeaderHealthThreshold = 0.8

	// Queue management.
	cfg.MaxQueueTransferSize = 1000
	cfg.QueueDrainTimeout = 60 * time.Second
	cfg.PreserveCompletedJobs = true

	// Coordination.
	cfg.CoordinationTimeout = 10 * time.Second
	cfg.MaxCoordinationRetries = 3
	cfg.CoordinationBackoff = 2 * time.Second

	return cfg
}