CHORUS/pkg/health/enhanced_health_checks.go

package health

import (
	"context"
	"fmt"
	"math"
	"sync"
	"time"

	"chorus/pkg/dht"
	"chorus/pkg/election"
	"chorus/pubsub"
)

// EnhancedHealthChecks provides comprehensive health monitoring for CHORUS infrastructure
type EnhancedHealthChecks struct {
	mu           sync.RWMutex
	manager      *Manager
	election     *election.ElectionManager
	dht          *dht.LibP2PDHT
	pubsub       *pubsub.PubSub
	replication  *dht.ReplicationManager

	// Metrics storage
	metrics      *HealthMetrics
	checkHistory map[string][]*CheckResult
	maxHistory   int

	// Configuration
	config       *HealthConfig

	logger       Logger
}

// HealthConfig configures health check behavior
type HealthConfig struct {
	// Active probe intervals
	PubSubProbeInterval    time.Duration
	DHTProbeInterval      time.Duration
	ElectionProbeInterval time.Duration

	// Probe timeouts
	PubSubProbeTimeout    time.Duration
	DHTProbeTimeout       time.Duration
	ElectionProbeTimeout  time.Duration

	// Thresholds
	MaxFailedProbes       int
	HealthyThreshold      float64
	DegradedThreshold     float64

	// History retention
	MaxHistoryEntries     int
	HistoryCleanupInterval time.Duration

	// Enable/disable specific checks
	EnablePubSubProbes    bool
	EnableDHTProbes       bool
	EnableElectionProbes  bool
	EnableReplicationProbes bool
}

// HealthMetrics tracks comprehensive health metrics
type HealthMetrics struct {
	mu                    sync.RWMutex

	// Overall system health
	SystemHealthScore     float64
	LastFullHealthCheck   time.Time
	TotalHealthChecks     int64
	FailedHealthChecks    int64

	// PubSub metrics
	PubSubHealthScore     float64
	PubSubProbeLatency    time.Duration
	PubSubSuccessRate     float64
	PubSubLastSuccess     time.Time
	PubSubConsecutiveFails int

	// DHT metrics
	DHTHealthScore        float64
	DHTProbeLatency       time.Duration
	DHTSuccessRate        float64
	DHTLastSuccess        time.Time
	DHTConsecutiveFails   int
	DHTReplicationStatus  map[string]*dht.ReplicationStatus

	// Election metrics
	ElectionHealthScore   float64
	ElectionStability     float64
	HeartbeatLatency      time.Duration
	LeadershipChanges     int64
	LastLeadershipChange  time.Time
	AdminUptime           time.Duration

	// Network metrics
	P2PConnectedPeers     int
	P2PConnectivityScore  float64
	NetworkLatency        time.Duration

	// Resource metrics
	CPUUsage             float64
	MemoryUsage          float64
	DiskUsage            float64

	// Service-specific metrics
	ActiveTasks          int
	QueuedTasks          int
	TaskSuccessRate      float64
}

// DefaultHealthConfig returns default health check configuration
func DefaultHealthConfig() *HealthConfig {
	return &HealthConfig{
		PubSubProbeInterval:     30 * time.Second,
		DHTProbeInterval:        60 * time.Second,
		ElectionProbeInterval:   15 * time.Second,
		PubSubProbeTimeout:      10 * time.Second,
		DHTProbeTimeout:         20 * time.Second,
		ElectionProbeTimeout:    5 * time.Second,
		MaxFailedProbes:         3,
		HealthyThreshold:        0.95,
		DegradedThreshold:       0.75,
		MaxHistoryEntries:       1000,
		HistoryCleanupInterval:  1 * time.Hour,
		EnablePubSubProbes:      true,
		EnableDHTProbes:         true,
		EnableElectionProbes:    true,
		EnableReplicationProbes: true,
	}
}

// NewEnhancedHealthChecks creates a new enhanced health check system
func NewEnhancedHealthChecks(
	manager *Manager,
	election *election.ElectionManager,
	dht *dht.LibP2PDHT,
	pubsub *pubsub.PubSub,
	replication *dht.ReplicationManager,
	logger Logger,
) *EnhancedHealthChecks {
	ehc := &EnhancedHealthChecks{
		manager:     manager,
		election:    election,
		dht:         dht,
		pubsub:      pubsub,
		replication: replication,
		metrics:     &HealthMetrics{},
		checkHistory: make(map[string][]*CheckResult),
		maxHistory:  1000,
		config:      DefaultHealthConfig(),
		logger:      logger,
	}

	// Initialize metrics
	ehc.initializeMetrics()

	// Register enhanced health checks
	ehc.registerHealthChecks()

	// Start background monitoring
	go ehc.startBackgroundMonitoring()

	return ehc
}

// initializeMetrics initializes the metrics system
func (ehc *EnhancedHealthChecks) initializeMetrics() {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.DHTReplicationStatus = make(map[string]*dht.ReplicationStatus)
	ehc.metrics.LastFullHealthCheck = time.Now()
}

// registerHealthChecks registers all enhanced health checks with the manager
func (ehc *EnhancedHealthChecks) registerHealthChecks() {
	if ehc.config.EnablePubSubProbes {
		ehc.manager.RegisterCheck(ehc.createEnhancedPubSubCheck())
	}

	// Temporarily disable DHT health check to prevent shutdown issues
	// TODO: Fix DHT configuration and re-enable this check
	// if ehc.config.EnableDHTProbes {
	// 	ehc.manager.RegisterCheck(ehc.createEnhancedDHTCheck())
	// }

	if ehc.config.EnableElectionProbes {
		ehc.manager.RegisterCheck(ehc.createElectionHealthCheck())
	}

	if ehc.config.EnableReplicationProbes {
		ehc.manager.RegisterCheck(ehc.createReplicationHealthCheck())
	}

	// System-level checks
	ehc.manager.RegisterCheck(ehc.createP2PConnectivityCheck())
	ehc.manager.RegisterCheck(ehc.createResourceHealthCheck())
	ehc.manager.RegisterCheck(ehc.createTaskManagerHealthCheck())
}

// createEnhancedPubSubCheck creates an enhanced PubSub health check
func (ehc *EnhancedHealthChecks) createEnhancedPubSubCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "pubsub-enhanced",
		Description: "Enhanced PubSub health check with comprehensive probing",
		Enabled:     true,
		Critical:    true,
		Interval:    ehc.config.PubSubProbeInterval,
		Timeout:     ehc.config.PubSubProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Generate unique test data
			testID := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
			testTopic := "CHORUS/health/enhanced/v1"

			testData := map[string]interface{}{
				"test_id":    testID,
				"timestamp":  time.Now().Unix(),
				"node_id":    ehc.getNodeID(),
				"check_type": "enhanced_pubsub_probe",
			}

			// Test message publishing and subscription
			result := ehc.testPubSubRoundTrip(ctx, testTopic, testData)
			result.Latency = time.Since(start)

			// Update metrics
			ehc.updatePubSubMetrics(result)

			// Add comprehensive details
			result.Details = map[string]interface{}{
				"test_id":           testID,
				"topic":             testTopic,
				"probe_latency_ms":  result.Latency.Milliseconds(),
				"success_rate":      ehc.metrics.PubSubSuccessRate,
				"consecutive_fails": ehc.metrics.PubSubConsecutiveFails,
				"last_success":      ehc.metrics.PubSubLastSuccess,
			}

			return result
		},
	}
}

// createEnhancedDHTCheck creates an enhanced DHT health check
func (ehc *EnhancedHealthChecks) createEnhancedDHTCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "dht-enhanced",
		Description: "Enhanced DHT health check with replication monitoring",
		Enabled:     true,
		Critical:    true,
		Interval:    ehc.config.DHTProbeInterval,
		Timeout:     ehc.config.DHTProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Test DHT operations
			result := ehc.testDHTOperations(ctx)
			result.Latency = time.Since(start)

			// Check replication status
			replicationHealth := ehc.checkReplicationHealth(ctx)

			// Combine results
			if !result.Healthy || !replicationHealth.Healthy {
				result.Healthy = false
				result.Message = fmt.Sprintf("DHT: %s | Replication: %s",
					result.Message, replicationHealth.Message)
			}

			// Update metrics
			ehc.updateDHTMetrics(result, replicationHealth)

			// Add comprehensive details
			result.Details = map[string]interface{}{
				"dht_latency_ms":       result.Latency.Milliseconds(),
				"replication_health":   replicationHealth.Healthy,
				"success_rate":         ehc.metrics.DHTSuccessRate,
				"consecutive_fails":    ehc.metrics.DHTConsecutiveFails,
				"replication_status":   ehc.metrics.DHTReplicationStatus,
			}

			return result
		},
	}
}

// createElectionHealthCheck creates election system health check
func (ehc *EnhancedHealthChecks) createElectionHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "election-health",
		Description: "Election system health and leadership stability check",
		Enabled:     false, // Temporarily disabled to prevent shutdown loops
		Critical:    false,
		Interval:    ehc.config.ElectionProbeInterval,
		Timeout:     ehc.config.ElectionProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Check election state and heartbeat status
			currentAdmin := ehc.election.GetCurrentAdmin()
			electionState := ehc.election.GetElectionState()
			heartbeatStatus := ehc.election.GetHeartbeatStatus()

			result := CheckResult{
				Timestamp: time.Now(),
			}

			// Determine health based on election state
			switch electionState {
			case election.StateIdle:
				if currentAdmin != "" {
					result.Healthy = true
					result.Message = fmt.Sprintf("Election stable, admin: %s", currentAdmin)
				} else {
					result.Healthy = false
					result.Message = "No admin elected"
				}
			case election.StateElecting:
				result.Healthy = false
				result.Message = "Election in progress"
			case election.StateDiscovering:
				result.Healthy = false
				result.Message = "Admin discovery in progress"
			default:
				result.Healthy = false
				result.Message = fmt.Sprintf("Unknown election state: %s", electionState)
			}

			result.Latency = time.Since(start)

			// Update metrics
			ehc.updateElectionMetrics(result, currentAdmin, heartbeatStatus)

			result.Details = map[string]interface{}{
				"current_admin":     currentAdmin,
				"election_state":    electionState,
				"heartbeat_status":  heartbeatStatus,
				"leadership_changes": ehc.metrics.LeadershipChanges,
				"admin_uptime":      ehc.metrics.AdminUptime.String(),
				"stability_score":   ehc.metrics.ElectionStability,
			}

			return result
		},
	}
}

// createReplicationHealthCheck creates replication system health check
func (ehc *EnhancedHealthChecks) createReplicationHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "replication-health",
		Description: "DHT replication system health monitoring",
		Enabled:     true,
		Critical:    false,
		Interval:    120 * time.Second,
		Timeout:     30 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			if ehc.replication == nil {
				return CheckResult{
					Healthy:   false,
					Message:   "Replication manager not available",
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			metrics := ehc.replication.GetMetrics()

			result := CheckResult{
				Healthy:   true,
				Message:   fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
					metrics.TotalKeys, metrics.AverageReplication),
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check for replication health issues
			if metrics.FailedReplications > metrics.SuccessfulReplications/10 {
				result.Healthy = false
				result.Message = fmt.Sprintf("High replication failure rate: %d/%d failed",
					metrics.FailedReplications, metrics.SuccessfulReplications)
			}

			result.Details = map[string]interface{}{
				"total_keys":          metrics.TotalKeys,
				"total_providers":     metrics.TotalProviders,
				"successful_replicas": metrics.SuccessfulReplications,
				"failed_replicas":     metrics.FailedReplications,
				"average_replication": metrics.AverageReplication,
				"last_reprovide":      metrics.LastReprovideTime,
			}

			return result
		},
	}
}

// createP2PConnectivityCheck creates P2P network connectivity health check
func (ehc *EnhancedHealthChecks) createP2PConnectivityCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "p2p-connectivity",
		Description: "P2P network connectivity and peer quality check",
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     15 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// This would integrate with the P2P node
			// For now, we'll use placeholder values
			connectedPeers := 5 // Would get from actual P2P node
			targetPeers := 3

			result := CheckResult{
				Timestamp: time.Now(),
			}

			if connectedPeers >= targetPeers {
				result.Healthy = true
				result.Message = fmt.Sprintf("P2P connectivity healthy: %d peers connected", connectedPeers)
			} else {
				result.Healthy = false
				result.Message = fmt.Sprintf("Insufficient P2P peers: %d < %d required",
					connectedPeers, targetPeers)
			}

			result.Latency = time.Since(start)

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.P2PConnectedPeers = connectedPeers
			ehc.metrics.P2PConnectivityScore = float64(connectedPeers) / float64(targetPeers)
			if ehc.metrics.P2PConnectivityScore > 1.0 {
				ehc.metrics.P2PConnectivityScore = 1.0
			}
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"connected_peers":    connectedPeers,
				"target_peers":       targetPeers,
				"connectivity_score": ehc.metrics.P2PConnectivityScore,
			}

			return result
		},
	}
}

// createResourceHealthCheck creates system resource health check
func (ehc *EnhancedHealthChecks) createResourceHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "resource-health",
		Description: "System resource utilization health check",
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// In a real implementation, these would be actual system metrics
			cpuUsage := 0.45    // 45%
			memoryUsage := 0.62 // 62%
			diskUsage := 0.73   // 73%

			result := CheckResult{
				Healthy:   true,
				Message:   "Resource utilization within normal ranges",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check thresholds
			if cpuUsage > 0.85 || memoryUsage > 0.90 || diskUsage > 0.90 {
				result.Healthy = false
				result.Message = fmt.Sprintf("High resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
					cpuUsage*100, memoryUsage*100, diskUsage*100)
			} else if cpuUsage > 0.70 || memoryUsage > 0.80 || diskUsage > 0.80 {
				result.Message = fmt.Sprintf("Elevated resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
					cpuUsage*100, memoryUsage*100, diskUsage*100)
			}

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.CPUUsage = cpuUsage
			ehc.metrics.MemoryUsage = memoryUsage
			ehc.metrics.DiskUsage = diskUsage
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"cpu_usage":    cpuUsage,
				"memory_usage": memoryUsage,
				"disk_usage":   diskUsage,
			}

			return result
		},
	}
}

// createTaskManagerHealthCheck creates task management health check
func (ehc *EnhancedHealthChecks) createTaskManagerHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "task-manager",
		Description: "Task coordination and management health check",
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// In a real implementation, these would come from the task coordinator
			activeTasks := 3
			queuedTasks := 1
			maxTasks := 10
			successRate := 0.95

			result := CheckResult{
				Healthy:   true,
				Message:   fmt.Sprintf("Task management healthy: %d active, %d queued", activeTasks, queuedTasks),
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check for task management issues
			if activeTasks >= maxTasks {
				result.Healthy = false
				result.Message = "Task manager at capacity"
			} else if successRate < 0.80 {
				result.Healthy = false
				result.Message = fmt.Sprintf("Low task success rate: %.1f%%", successRate*100)
			}

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.ActiveTasks = activeTasks
			ehc.metrics.QueuedTasks = queuedTasks
			ehc.metrics.TaskSuccessRate = successRate
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"active_tasks":   activeTasks,
				"queued_tasks":   queuedTasks,
				"max_tasks":      maxTasks,
				"success_rate":   successRate,
				"utilization":    float64(activeTasks) / float64(maxTasks),
			}

			return result
		},
	}
}

// testPubSubRoundTrip tests PubSub publish/subscribe functionality
func (ehc *EnhancedHealthChecks) testPubSubRoundTrip(ctx context.Context, topic string, testData map[string]interface{}) CheckResult {
	// This would implement actual PubSub round-trip testing
	// For now, we simulate the test

	// Simulate test latency
	time.Sleep(50 * time.Millisecond)

	return CheckResult{
		Healthy:   true,
		Message:   "PubSub round-trip test successful",
		Timestamp: time.Now(),
	}
}

// testDHTOperations tests DHT put/get operations
func (ehc *EnhancedHealthChecks) testDHTOperations(ctx context.Context) CheckResult {
	if ehc.dht == nil {
		return CheckResult{
			Healthy:   false,
			Message:   "DHT not available",
			Timestamp: time.Now(),
		}
	}

	// This would implement actual DHT testing using the adapter
	adapter := NewDHTAdapter(ehc.dht)

	testKey := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
	testValue := []byte(fmt.Sprintf(`{"test":true,"timestamp":%d}`, time.Now().Unix()))

	// Test put operation
	if err := adapter.PutValue(ctx, testKey, testValue); err != nil {
		return CheckResult{
			Healthy:   false,
			Message:   fmt.Sprintf("DHT put failed: %v", err),
			Error:     err,
			Timestamp: time.Now(),
		}
	}

	// Test get operation
	retrievedValue, err := adapter.GetValue(ctx, testKey)
	if err != nil {
		return CheckResult{
			Healthy:   false,
			Message:   fmt.Sprintf("DHT get failed: %v", err),
			Error:     err,
			Timestamp: time.Now(),
		}
	}

	// Verify data integrity
	if string(retrievedValue) != string(testValue) {
		return CheckResult{
			Healthy:   false,
			Message:   "DHT data integrity check failed",
			Timestamp: time.Now(),
		}
	}

	return CheckResult{
		Healthy:   true,
		Message:   "DHT operations successful",
		Timestamp: time.Now(),
	}
}

// checkReplicationHealth checks the health of DHT replication
func (ehc *EnhancedHealthChecks) checkReplicationHealth(ctx context.Context) CheckResult {
	if ehc.replication == nil {
		return CheckResult{
			Healthy:   true,
			Message:   "Replication manager not configured",
			Timestamp: time.Now(),
		}
	}

	metrics := ehc.replication.GetMetrics()

	// Check replication health
	if metrics.TotalKeys == 0 {
		return CheckResult{
			Healthy:   true,
			Message:   "No content to replicate",
			Timestamp: time.Now(),
		}
	}

	// Check failure rate
	totalOperations := metrics.SuccessfulReplications + metrics.FailedReplications
	if totalOperations > 0 {
		failureRate := float64(metrics.FailedReplications) / float64(totalOperations)
		if failureRate > 0.1 { // More than 10% failure rate
			return CheckResult{
				Healthy:   false,
				Message:   fmt.Sprintf("High replication failure rate: %.1f%%", failureRate*100),
				Timestamp: time.Now(),
			}
		}
	}

	return CheckResult{
		Healthy:   true,
		Message:   fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
			metrics.TotalKeys, metrics.AverageReplication),
		Timestamp: time.Now(),
	}
}

// updatePubSubMetrics updates PubSub health metrics
func (ehc *EnhancedHealthChecks) updatePubSubMetrics(result CheckResult) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.PubSubProbeLatency = result.Latency

	if result.Healthy {
		ehc.metrics.PubSubLastSuccess = result.Timestamp
		ehc.metrics.PubSubConsecutiveFails = 0

		// Update success rate (simple exponential moving average)
		ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate*0.9 + 0.1
	} else {
		ehc.metrics.PubSubConsecutiveFails++
		ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate * 0.9
	}

	// Calculate health score
	ehc.metrics.PubSubHealthScore = ehc.metrics.PubSubSuccessRate *
		(1.0 - float64(ehc.metrics.PubSubConsecutiveFails)*0.1)
	if ehc.metrics.PubSubHealthScore < 0 {
		ehc.metrics.PubSubHealthScore = 0
	}
}

// updateDHTMetrics updates DHT health metrics
func (ehc *EnhancedHealthChecks) updateDHTMetrics(result CheckResult, replicationResult CheckResult) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.DHTProbeLatency = result.Latency

	if result.Healthy {
		ehc.metrics.DHTLastSuccess = result.Timestamp
		ehc.metrics.DHTConsecutiveFails = 0
		ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate*0.9 + 0.1
	} else {
		ehc.metrics.DHTConsecutiveFails++
		ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate * 0.9
	}

	// Calculate health score
	ehc.metrics.DHTHealthScore = ehc.metrics.DHTSuccessRate *
		(1.0 - float64(ehc.metrics.DHTConsecutiveFails)*0.1)
	if ehc.metrics.DHTHealthScore < 0 {
		ehc.metrics.DHTHealthScore = 0
	}

	// Include replication health in overall DHT health
	if replicationResult.Healthy {
		ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore*0.8 + 0.2
	} else {
		ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore * 0.8
	}
}

// updateElectionMetrics updates election health metrics
func (ehc *EnhancedHealthChecks) updateElectionMetrics(result CheckResult, currentAdmin string, heartbeatStatus map[string]interface{}) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	// Track leadership changes
	if ehc.metrics.LastLeadershipChange.IsZero() {
		ehc.metrics.LastLeadershipChange = time.Now()
	}

	// Calculate admin uptime
	if currentAdmin != "" {
		ehc.metrics.AdminUptime = time.Since(ehc.metrics.LastLeadershipChange)
	} else {
		ehc.metrics.AdminUptime = 0
	}

	// Calculate election stability (higher is better)
	timeSinceLastChange := time.Since(ehc.metrics.LastLeadershipChange)
	ehc.metrics.ElectionStability = math.Min(1.0, timeSinceLastChange.Hours()/24.0)

	// Extract heartbeat latency if available
	if latencyStr, ok := heartbeatStatus["interval"].(string); ok {
		if interval, err := time.ParseDuration(latencyStr); err == nil {
			ehc.metrics.HeartbeatLatency = interval / 2 // Approximate latency
		}
	}

	// Calculate election health score
	if result.Healthy && currentAdmin != "" {
		ehc.metrics.ElectionHealthScore = 1.0 * ehc.metrics.ElectionStability
	} else {
		ehc.metrics.ElectionHealthScore = 0.3 // Degraded but not critical
	}
}

// startBackgroundMonitoring starts background health monitoring
func (ehc *EnhancedHealthChecks) startBackgroundMonitoring() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		ehc.calculateOverallSystemHealth()
		ehc.cleanupHistory()
	}
}

// calculateOverallSystemHealth calculates overall system health score
func (ehc *EnhancedHealthChecks) calculateOverallSystemHealth() {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	// Weight different components
	weights := map[string]float64{
		"pubsub":       0.25,
		"dht":          0.25,
		"election":     0.15,
		"p2p":          0.20,
		"resources":    0.10,
		"tasks":        0.05,
	}

	// Calculate weighted average
	totalScore := 0.0
	totalWeight := 0.0

	if ehc.config.EnablePubSubProbes {
		totalScore += ehc.metrics.PubSubHealthScore * weights["pubsub"]
		totalWeight += weights["pubsub"]
	}

	if ehc.config.EnableDHTProbes {
		totalScore += ehc.metrics.DHTHealthScore * weights["dht"]
		totalWeight += weights["dht"]
	}

	if ehc.config.EnableElectionProbes {
		totalScore += ehc.metrics.ElectionHealthScore * weights["election"]
		totalWeight += weights["election"]
	}

	totalScore += ehc.metrics.P2PConnectivityScore * weights["p2p"]
	totalWeight += weights["p2p"]

	// Resource health (inverse of utilization)
	resourceHealth := 1.0 - math.Max(ehc.metrics.CPUUsage,
		math.Max(ehc.metrics.MemoryUsage, ehc.metrics.DiskUsage))
	totalScore += resourceHealth * weights["resources"]
	totalWeight += weights["resources"]

	// Task health
	taskHealth := ehc.metrics.TaskSuccessRate
	totalScore += taskHealth * weights["tasks"]
	totalWeight += weights["tasks"]

	if totalWeight > 0 {
		ehc.metrics.SystemHealthScore = totalScore / totalWeight
	} else {
		ehc.metrics.SystemHealthScore = 0.5 // Unknown health
	}

	ehc.metrics.LastFullHealthCheck = time.Now()
	ehc.metrics.TotalHealthChecks++
}

// cleanupHistory cleans up old health check history
func (ehc *EnhancedHealthChecks) cleanupHistory() {
	ehc.mu.Lock()
	defer ehc.mu.Unlock()

	cutoff := time.Now().Add(-24 * time.Hour) // Keep last 24 hours

	for checkName, history := range ehc.checkHistory {
		var newHistory []*CheckResult
		for _, result := range history {
			if result.Timestamp.After(cutoff) {
				newHistory = append(newHistory, result)
			}
		}
		ehc.checkHistory[checkName] = newHistory
	}
}

// GetHealthMetrics returns comprehensive health metrics
func (ehc *EnhancedHealthChecks) GetHealthMetrics() *HealthMetrics {
	ehc.metrics.mu.RLock()
	defer ehc.metrics.mu.RUnlock()

	// Create a deep copy to avoid race conditions
	metrics := &HealthMetrics{}
	*metrics = *ehc.metrics

	// Copy the map
	metrics.DHTReplicationStatus = make(map[string]*dht.ReplicationStatus)
	for k, v := range ehc.metrics.DHTReplicationStatus {
		statusCopy := *v
		metrics.DHTReplicationStatus[k] = &statusCopy
	}

	return metrics
}

// GetHealthSummary returns a summary of system health
func (ehc *EnhancedHealthChecks) GetHealthSummary() map[string]interface{} {
	metrics := ehc.GetHealthMetrics()

	status := "healthy"
	if metrics.SystemHealthScore < ehc.config.DegradedThreshold {
		status = "degraded"
	}
	if metrics.SystemHealthScore < ehc.config.DegradedThreshold*0.5 {
		status = "critical"
	}

	return map[string]interface{}{
		"status":               status,
		"overall_score":        metrics.SystemHealthScore,
		"last_check":           metrics.LastFullHealthCheck,
		"total_checks":         metrics.TotalHealthChecks,
		"component_scores": map[string]float64{
			"pubsub":         metrics.PubSubHealthScore,
			"dht":            metrics.DHTHealthScore,
			"election":       metrics.ElectionHealthScore,
			"p2p":            metrics.P2PConnectivityScore,
		},
		"key_metrics": map[string]interface{}{
			"connected_peers":      metrics.P2PConnectedPeers,
			"active_tasks":         metrics.ActiveTasks,
			"admin_uptime":         metrics.AdminUptime.String(),
			"leadership_changes":   metrics.LeadershipChanges,
			"resource_utilization": map[string]float64{
				"cpu":    metrics.CPUUsage,
				"memory": metrics.MemoryUsage,
				"disk":   metrics.DiskUsage,
			},
		},
	}
}

// getNodeID returns the current node ID (placeholder implementation)
func (ehc *EnhancedHealthChecks) getNodeID() string {
	return "node-placeholder" // Would get from actual node
}