package health

import (
	"context"
	"fmt"
	"math"
	"sync"
	"time"

	"chorus.services/bzzz/pkg/dht"
	"chorus.services/bzzz/pkg/election"
	"chorus.services/bzzz/pubsub"
)

// EnhancedHealthChecks provides comprehensive health monitoring for BZZZ infrastructure.
//
// It registers a set of HealthCheck probes with a Manager, keeps aggregated
// HealthMetrics up to date as those probes run, and periodically recomputes an
// overall system health score in a background goroutine. Call Stop to terminate
// that goroutine.
type EnhancedHealthChecks struct {
	mu          sync.RWMutex // guards checkHistory
	manager     *Manager
	election    *election.ElectionManager
	dht         *dht.LibP2PDHT
	pubsub      *pubsub.PubSub
	replication *dht.ReplicationManager

	// Metrics storage
	metrics      *HealthMetrics
	checkHistory map[string][]*CheckResult
	maxHistory   int

	// Configuration
	config *HealthConfig
	logger Logger

	// Background monitor lifecycle: stopCh is closed (once) by Stop.
	stopCh   chan struct{}
	stopOnce sync.Once
}

// HealthConfig configures health check behavior.
type HealthConfig struct {
	// Active probe intervals
	PubSubProbeInterval   time.Duration
	DHTProbeInterval      time.Duration
	ElectionProbeInterval time.Duration

	// Probe timeouts
	PubSubProbeTimeout   time.Duration
	DHTProbeTimeout      time.Duration
	ElectionProbeTimeout time.Duration

	// Thresholds
	MaxFailedProbes   int
	HealthyThreshold  float64
	DegradedThreshold float64

	// History retention
	MaxHistoryEntries      int
	HistoryCleanupInterval time.Duration

	// Enable/disable specific checks
	EnablePubSubProbes      bool
	EnableDHTProbes         bool
	EnableElectionProbes    bool
	EnableReplicationProbes bool
}

// HealthMetrics tracks comprehensive health metrics.
//
// All fields are guarded by mu when accessed through EnhancedHealthChecks.
// Values returned by GetHealthMetrics are detached snapshots.
type HealthMetrics struct {
	mu sync.RWMutex

	// Overall system health
	SystemHealthScore   float64
	LastFullHealthCheck time.Time
	TotalHealthChecks   int64
	FailedHealthChecks  int64

	// PubSub metrics
	PubSubHealthScore      float64
	PubSubProbeLatency     time.Duration
	PubSubSuccessRate      float64
	PubSubLastSuccess      time.Time
	PubSubConsecutiveFails int

	// DHT metrics
	DHTHealthScore       float64
	DHTProbeLatency      time.Duration
	DHTSuccessRate       float64
	DHTLastSuccess       time.Time
	DHTConsecutiveFails  int
	DHTReplicationStatus map[string]*dht.ReplicationStatus

	// Election metrics
	ElectionHealthScore  float64
	ElectionStability    float64
	HeartbeatLatency     time.Duration
	LeadershipChanges    int64
	LastLeadershipChange time.Time
	AdminUptime          time.Duration

	// lastObservedAdmin is the admin seen by the previous election probe; it is
	// internal bookkeeping used to detect leadership changes.
	lastObservedAdmin string

	// Network metrics
	P2PConnectedPeers    int
	P2PConnectivityScore float64
	NetworkLatency       time.Duration

	// Resource metrics
	CPUUsage    float64
	MemoryUsage float64
	DiskUsage   float64

	// Service-specific metrics
	ActiveTasks     int
	QueuedTasks     int
	TaskSuccessRate float64
}

// DefaultHealthConfig returns default health check configuration.
func DefaultHealthConfig() *HealthConfig {
	return &HealthConfig{
		PubSubProbeInterval:     30 * time.Second,
		DHTProbeInterval:        60 * time.Second,
		ElectionProbeInterval:   15 * time.Second,
		PubSubProbeTimeout:      10 * time.Second,
		DHTProbeTimeout:         20 * time.Second,
		ElectionProbeTimeout:    5 * time.Second,
		MaxFailedProbes:         3,
		HealthyThreshold:        0.95,
		DegradedThreshold:       0.75,
		MaxHistoryEntries:       1000,
		HistoryCleanupInterval:  1 * time.Hour,
		EnablePubSubProbes:      true,
		EnableDHTProbes:         true,
		EnableElectionProbes:    true,
		EnableReplicationProbes: true,
	}
}

// NewEnhancedHealthChecks creates a new enhanced health check system.
//
// It uses DefaultHealthConfig, registers all enabled checks with manager, and
// starts the background monitoring goroutine. Call Stop on the returned value
// to terminate that goroutine.
func NewEnhancedHealthChecks(
	manager *Manager,
	election *election.ElectionManager,
	dht *dht.LibP2PDHT,
	pubsub *pubsub.PubSub,
	replication *dht.ReplicationManager,
	logger Logger,
) *EnhancedHealthChecks {
	cfg := DefaultHealthConfig()

	ehc := &EnhancedHealthChecks{
		manager:      manager,
		election:     election,
		dht:          dht,
		pubsub:       pubsub,
		replication:  replication,
		metrics:      &HealthMetrics{},
		checkHistory: make(map[string][]*CheckResult),
		// Take the limit from the config instead of duplicating the literal.
		maxHistory: cfg.MaxHistoryEntries,
		config:     cfg,
		logger:     logger,
		stopCh:     make(chan struct{}),
	}

	// Initialize metrics
	ehc.initializeMetrics()

	// Register enhanced health checks
	ehc.registerHealthChecks()

	// Start background monitoring
	go ehc.startBackgroundMonitoring()

	return ehc
}

// Stop terminates the background monitoring goroutine started by
// NewEnhancedHealthChecks. It is safe to call more than once. Checks already
// registered with the manager are not unregistered.
func (ehc *EnhancedHealthChecks) Stop() {
	ehc.stopOnce.Do(func() {
		// stopCh is nil only if the value was not built by the constructor.
		if ehc.stopCh != nil {
			close(ehc.stopCh)
		}
	})
}

// initializeMetrics initializes the metrics system.
func (ehc *EnhancedHealthChecks) initializeMetrics() {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.DHTReplicationStatus = make(map[string]*dht.ReplicationStatus)
	ehc.metrics.LastFullHealthCheck = time.Now()
}

// registerHealthChecks registers all enhanced health checks with the manager.
// Probe-style checks are registered only when enabled in the config; the
// system-level checks are always registered.
func (ehc *EnhancedHealthChecks) registerHealthChecks() {
	if ehc.config.EnablePubSubProbes {
		ehc.manager.RegisterCheck(ehc.createEnhancedPubSubCheck())
	}

	if ehc.config.EnableDHTProbes {
		ehc.manager.RegisterCheck(ehc.createEnhancedDHTCheck())
	}

	if ehc.config.EnableElectionProbes {
		ehc.manager.RegisterCheck(ehc.createElectionHealthCheck())
	}

	if ehc.config.EnableReplicationProbes {
		ehc.manager.RegisterCheck(ehc.createReplicationHealthCheck())
	}

	// System-level checks
	ehc.manager.RegisterCheck(ehc.createP2PConnectivityCheck())
	ehc.manager.RegisterCheck(ehc.createResourceHealthCheck())
	ehc.manager.RegisterCheck(ehc.createTaskManagerHealthCheck())
}

// createEnhancedPubSubCheck creates an enhanced PubSub health check.
func (ehc *EnhancedHealthChecks) createEnhancedPubSubCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "pubsub-enhanced",
		Description: "Enhanced PubSub health check with comprehensive probing",
		Enabled:     true,
		Critical:    true,
		Interval:    ehc.config.PubSubProbeInterval,
		Timeout:     ehc.config.PubSubProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Generate unique test data
			testID := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
			testTopic := "bzzz/health/enhanced/v1"

			testData := map[string]interface{}{
				"test_id":    testID,
				"timestamp":  time.Now().Unix(),
				"node_id":    ehc.getNodeID(),
				"check_type": "enhanced_pubsub_probe",
			}

			// Test message publishing and subscription
			result := ehc.testPubSubRoundTrip(ctx, testTopic, testData)
			result.Latency = time.Since(start)

			// Update metrics
			ehc.updatePubSubMetrics(result)

			// Snapshot under the read lock: other checks and the background
			// monitor mutate the metrics concurrently.
			ehc.metrics.mu.RLock()
			successRate := ehc.metrics.PubSubSuccessRate
			consecutiveFails := ehc.metrics.PubSubConsecutiveFails
			lastSuccess := ehc.metrics.PubSubLastSuccess
			ehc.metrics.mu.RUnlock()

			// Add comprehensive details
			result.Details = map[string]interface{}{
				"test_id":           testID,
				"topic":             testTopic,
				"probe_latency_ms":  result.Latency.Milliseconds(),
				"success_rate":      successRate,
				"consecutive_fails": consecutiveFails,
				"last_success":      lastSuccess,
			}

			return result
		},
	}
}

// createEnhancedDHTCheck creates an enhanced DHT health check that combines a
// put/get probe with the replication health summary.
func (ehc *EnhancedHealthChecks) createEnhancedDHTCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "dht-enhanced",
		Description: "Enhanced DHT health check with replication monitoring",
		Enabled:     true,
		Critical:    true,
		Interval:    ehc.config.DHTProbeInterval,
		Timeout:     ehc.config.DHTProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Test DHT operations
			result := ehc.testDHTOperations(ctx)
			result.Latency = time.Since(start)

			// Check replication status
			replicationHealth := ehc.checkReplicationHealth(ctx)

			// Combine results
			if !result.Healthy || !replicationHealth.Healthy {
				result.Healthy = false
				result.Message = fmt.Sprintf("DHT: %s | Replication: %s",
					result.Message, replicationHealth.Message)
			}

			// Update metrics
			ehc.updateDHTMetrics(result, replicationHealth)

			// Snapshot under the read lock; the status map is copied so the
			// returned details do not alias the shared map.
			ehc.metrics.mu.RLock()
			successRate := ehc.metrics.DHTSuccessRate
			consecutiveFails := ehc.metrics.DHTConsecutiveFails
			replicationStatus := make(map[string]*dht.ReplicationStatus, len(ehc.metrics.DHTReplicationStatus))
			for key, status := range ehc.metrics.DHTReplicationStatus {
				replicationStatus[key] = status
			}
			ehc.metrics.mu.RUnlock()

			// Add comprehensive details
			result.Details = map[string]interface{}{
				"dht_latency_ms":     result.Latency.Milliseconds(),
				"replication_health": replicationHealth.Healthy,
				"success_rate":       successRate,
				"consecutive_fails":  consecutiveFails,
				"replication_status": replicationStatus,
			}

			return result
		},
	}
}

// createElectionHealthCheck creates election system health check.
func (ehc *EnhancedHealthChecks) createElectionHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "election-health",
		Description: "Election system health and leadership stability check",
		Enabled:     true,
		Critical:    false,
		Interval:    ehc.config.ElectionProbeInterval,
		Timeout:     ehc.config.ElectionProbeTimeout,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Mirror the nil handling of the DHT and replication checks
			// instead of dereferencing a missing election manager.
			if ehc.election == nil {
				return CheckResult{
					Healthy:   false,
					Message:   "Election manager not available",
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			// Check election state and heartbeat status
			currentAdmin := ehc.election.GetCurrentAdmin()
			electionState := ehc.election.GetElectionState()
			heartbeatStatus := ehc.election.GetHeartbeatStatus()

			result := CheckResult{
				Timestamp: time.Now(),
			}

			// Determine health based on election state
			switch electionState {
			case election.StateIdle:
				if currentAdmin != "" {
					result.Healthy = true
					result.Message = fmt.Sprintf("Election stable, admin: %s", currentAdmin)
				} else {
					result.Healthy = false
					result.Message = "No admin elected"
				}
			case election.StateElecting:
				result.Healthy = false
				result.Message = "Election in progress"
			case election.StateDiscovering:
				result.Healthy = false
				result.Message = "Admin discovery in progress"
			default:
				result.Healthy = false
				result.Message = fmt.Sprintf("Unknown election state: %s", electionState)
			}

			result.Latency = time.Since(start)

			// Update metrics
			ehc.updateElectionMetrics(result, currentAdmin, heartbeatStatus)

			// Snapshot under the read lock before building the details.
			ehc.metrics.mu.RLock()
			leadershipChanges := ehc.metrics.LeadershipChanges
			adminUptime := ehc.metrics.AdminUptime
			stability := ehc.metrics.ElectionStability
			ehc.metrics.mu.RUnlock()

			result.Details = map[string]interface{}{
				"current_admin":      currentAdmin,
				"election_state":     electionState,
				"heartbeat_status":   heartbeatStatus,
				"leadership_changes": leadershipChanges,
				"admin_uptime":       adminUptime.String(),
				"stability_score":    stability,
			}

			return result
		},
	}
}

// createReplicationHealthCheck creates replication system health check.
func (ehc *EnhancedHealthChecks) createReplicationHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "replication-health",
		Description: "DHT replication system health monitoring",
		Enabled:     true,
		Critical:    false,
		Interval:    120 * time.Second,
		Timeout:     30 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			if ehc.replication == nil {
				return CheckResult{
					Healthy:   false,
					Message:   "Replication manager not available",
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			rm := ehc.replication.GetMetrics()

			result := CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
					rm.TotalKeys, rm.AverageReplication),
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check for replication health issues: unhealthy when failures
			// exceed one tenth of successes. Multiplying instead of dividing
			// avoids integer truncation of SuccessfulReplications/10.
			if rm.FailedReplications*10 > rm.SuccessfulReplications {
				result.Healthy = false
				result.Message = fmt.Sprintf("High replication failure rate: %d/%d failed",
					rm.FailedReplications, rm.SuccessfulReplications)
			}

			result.Details = map[string]interface{}{
				"total_keys":          rm.TotalKeys,
				"total_providers":     rm.TotalProviders,
				"successful_replicas": rm.SuccessfulReplications,
				"failed_replicas":     rm.FailedReplications,
				"average_replication": rm.AverageReplication,
				"last_reprovide":      rm.LastReprovideTime,
			}

			return result
		},
	}
}

// createP2PConnectivityCheck creates P2P network connectivity health check.
func (ehc *EnhancedHealthChecks) createP2PConnectivityCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "p2p-connectivity",
		Description: "P2P network connectivity and peer quality check",
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     15 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// This would integrate with the P2P node
			// For now, we'll use placeholder values
			connectedPeers := 5 // Would get from actual P2P node
			targetPeers := 3

			result := CheckResult{
				Timestamp: time.Now(),
			}

			if connectedPeers >= targetPeers {
				result.Healthy = true
				result.Message = fmt.Sprintf("P2P connectivity healthy: %d peers connected", connectedPeers)
			} else {
				result.Healthy = false
				result.Message = fmt.Sprintf("Insufficient P2P peers: %d < %d required",
					connectedPeers, targetPeers)
			}

			result.Latency = time.Since(start)

			// Compute the score locally (capped at 1.0) so the details below
			// never read the shared metrics outside the lock.
			score := math.Min(1.0, float64(connectedPeers)/float64(targetPeers))

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.P2PConnectedPeers = connectedPeers
			ehc.metrics.P2PConnectivityScore = score
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"connected_peers":    connectedPeers,
				"target_peers":       targetPeers,
				"connectivity_score": score,
			}

			return result
		},
	}
}

// createResourceHealthCheck creates system resource health check.
func (ehc *EnhancedHealthChecks) createResourceHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "resource-health",
		Description: "System resource utilization health check",
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// In a real implementation, these would be actual system metrics
			cpuUsage := 0.45    // 45%
			memoryUsage := 0.62 // 62%
			diskUsage := 0.73   // 73%

			result := CheckResult{
				Healthy:   true,
				Message:   "Resource utilization within normal ranges",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check thresholds: the first tier marks the check unhealthy, the
			// second only changes the message.
			switch {
			case cpuUsage > 0.85 || memoryUsage > 0.90 || diskUsage > 0.90:
				result.Healthy = false
				result.Message = fmt.Sprintf("High resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
					cpuUsage*100, memoryUsage*100, diskUsage*100)
			case cpuUsage > 0.70 || memoryUsage > 0.80 || diskUsage > 0.80:
				result.Message = fmt.Sprintf("Elevated resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
					cpuUsage*100, memoryUsage*100, diskUsage*100)
			}

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.CPUUsage = cpuUsage
			ehc.metrics.MemoryUsage = memoryUsage
			ehc.metrics.DiskUsage = diskUsage
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"cpu_usage":    cpuUsage,
				"memory_usage": memoryUsage,
				"disk_usage":   diskUsage,
			}

			return result
		},
	}
}

// createTaskManagerHealthCheck creates task management health check.
func (ehc *EnhancedHealthChecks) createTaskManagerHealthCheck() *HealthCheck {
	return &HealthCheck{
		Name:        "task-manager",
		Description: "Task coordination and management health check",
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// In a real implementation, these would come from the task coordinator
			activeTasks := 3
			queuedTasks := 1
			maxTasks := 10
			successRate := 0.95

			result := CheckResult{
				Healthy:   true,
				Message:   fmt.Sprintf("Task management healthy: %d active, %d queued", activeTasks, queuedTasks),
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}

			// Check for task management issues
			switch {
			case activeTasks >= maxTasks:
				result.Healthy = false
				result.Message = "Task manager at capacity"
			case successRate < 0.80:
				result.Healthy = false
				result.Message = fmt.Sprintf("Low task success rate: %.1f%%", successRate*100)
			}

			// Update metrics
			ehc.metrics.mu.Lock()
			ehc.metrics.ActiveTasks = activeTasks
			ehc.metrics.QueuedTasks = queuedTasks
			ehc.metrics.TaskSuccessRate = successRate
			ehc.metrics.mu.Unlock()

			result.Details = map[string]interface{}{
				"active_tasks": activeTasks,
				"queued_tasks": queuedTasks,
				"max_tasks":    maxTasks,
				"success_rate": successRate,
				"utilization":  float64(activeTasks) / float64(maxTasks),
			}

			return result
		},
	}
}

// testPubSubRoundTrip tests PubSub publish/subscribe functionality.
//
// The round trip is currently simulated with a fixed 50ms delay; topic and
// testData are not used yet. If ctx is done before the delay elapses, an
// unhealthy result carrying ctx.Err() is returned.
func (ehc *EnhancedHealthChecks) testPubSubRoundTrip(ctx context.Context, topic string, testData map[string]interface{}) CheckResult {
	// This would implement actual PubSub round-trip testing
	// For now, we simulate the test

	// Simulate test latency while still honoring the probe's context.
	timer := time.NewTimer(50 * time.Millisecond)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		err := ctx.Err()
		return CheckResult{
			Healthy:   false,
			Message:   fmt.Sprintf("PubSub round-trip test aborted: %v", err),
			Error:     err,
			Timestamp: time.Now(),
		}
	case <-timer.C:
	}

	return CheckResult{
		Healthy:   true,
		Message:   "PubSub round-trip test successful",
		Timestamp: time.Now(),
	}
}

// testDHTOperations tests DHT put/get operations by writing a uniquely keyed
// value through the DHT adapter, reading it back, and comparing the bytes.
//
// NOTE(review): the probe key is not removed afterwards, so each run leaves
// one more test entry behind - confirm whether the adapter offers a delete.
func (ehc *EnhancedHealthChecks) testDHTOperations(ctx context.Context) CheckResult {
	if ehc.dht == nil {
		return CheckResult{
			Healthy:   false,
			Message:   "DHT not available",
			Timestamp: time.Now(),
		}
	}

	// This would implement actual DHT testing using the adapter
	adapter := NewDHTAdapter(ehc.dht)

	testKey := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
	testValue := []byte(fmt.Sprintf(`{"test":true,"timestamp":%d}`, time.Now().Unix()))

	// Test put operation
	if err := adapter.PutValue(ctx, testKey, testValue); err != nil {
		return CheckResult{
			Healthy:   false,
			Message:   fmt.Sprintf("DHT put failed: %v", err),
			Error:     err,
			Timestamp: time.Now(),
		}
	}

	// Test get operation
	retrievedValue, err := adapter.GetValue(ctx, testKey)
	if err != nil {
		return CheckResult{
			Healthy:   false,
			Message:   fmt.Sprintf("DHT get failed: %v", err),
			Error:     err,
			Timestamp: time.Now(),
		}
	}

	// Verify data integrity
	if string(retrievedValue) != string(testValue) {
		return CheckResult{
			Healthy:   false,
			Message:   "DHT data integrity check failed",
			Timestamp: time.Now(),
		}
	}

	return CheckResult{
		Healthy:   true,
		Message:   "DHT operations successful",
		Timestamp: time.Now(),
	}
}

// checkReplicationHealth checks the health of DHT replication.
//
// A missing replication manager or an empty key set is reported as healthy;
// otherwise the result is unhealthy when more than 10% of all replication
// operations failed.
func (ehc *EnhancedHealthChecks) checkReplicationHealth(ctx context.Context) CheckResult {
	if ehc.replication == nil {
		return CheckResult{
			Healthy:   true,
			Message:   "Replication manager not configured",
			Timestamp: time.Now(),
		}
	}

	rm := ehc.replication.GetMetrics()

	// Check replication health
	if rm.TotalKeys == 0 {
		return CheckResult{
			Healthy:   true,
			Message:   "No content to replicate",
			Timestamp: time.Now(),
		}
	}

	// Check failure rate
	totalOperations := rm.SuccessfulReplications + rm.FailedReplications
	if totalOperations > 0 {
		failureRate := float64(rm.FailedReplications) / float64(totalOperations)
		if failureRate > 0.1 { // More than 10% failure rate
			return CheckResult{
				Healthy:   false,
				Message:   fmt.Sprintf("High replication failure rate: %.1f%%", failureRate*100),
				Timestamp: time.Now(),
			}
		}
	}

	return CheckResult{
		Healthy: true,
		Message: fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
			rm.TotalKeys, rm.AverageReplication),
		Timestamp: time.Now(),
	}
}

// updatePubSubMetrics updates PubSub health metrics from a probe result.
func (ehc *EnhancedHealthChecks) updatePubSubMetrics(result CheckResult) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.PubSubProbeLatency = result.Latency

	if result.Healthy {
		ehc.metrics.PubSubLastSuccess = result.Timestamp
		ehc.metrics.PubSubConsecutiveFails = 0

		// Update success rate (simple exponential moving average)
		ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate*0.9 + 0.1
	} else {
		ehc.metrics.PubSubConsecutiveFails++
		ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate * 0.9
	}

	// Calculate health score: the success rate, reduced by 10% per
	// consecutive failure and floored at zero.
	ehc.metrics.PubSubHealthScore = ehc.metrics.PubSubSuccessRate *
		(1.0 - float64(ehc.metrics.PubSubConsecutiveFails)*0.1)
	if ehc.metrics.PubSubHealthScore < 0 {
		ehc.metrics.PubSubHealthScore = 0
	}
}

// updateDHTMetrics updates DHT health metrics from the DHT probe result and
// the replication health result.
func (ehc *EnhancedHealthChecks) updateDHTMetrics(result CheckResult, replicationResult CheckResult) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	ehc.metrics.DHTProbeLatency = result.Latency

	if result.Healthy {
		ehc.metrics.DHTLastSuccess = result.Timestamp
		ehc.metrics.DHTConsecutiveFails = 0
		ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate*0.9 + 0.1
	} else {
		ehc.metrics.DHTConsecutiveFails++
		ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate * 0.9
	}

	// Calculate health score
	ehc.metrics.DHTHealthScore = ehc.metrics.DHTSuccessRate *
		(1.0 - float64(ehc.metrics.DHTConsecutiveFails)*0.1)
	if ehc.metrics.DHTHealthScore < 0 {
		ehc.metrics.DHTHealthScore = 0
	}

	// Include replication health in overall DHT health (20% of the score)
	if replicationResult.Healthy {
		ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore*0.8 + 0.2
	} else {
		ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore * 0.8
	}
}

// updateElectionMetrics updates election health metrics.
//
// The first call records a baseline. On later calls, any difference between
// currentAdmin and the previously observed admin counts as a leadership
// change: LeadershipChanges is incremented and LastLeadershipChange is reset,
// which in turn resets AdminUptime and ElectionStability.
func (ehc *EnhancedHealthChecks) updateElectionMetrics(result CheckResult, currentAdmin string, heartbeatStatus map[string]interface{}) {
	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	now := time.Now()

	// Track leadership changes
	switch {
	case ehc.metrics.LastLeadershipChange.IsZero():
		// First observation: establish the baseline without counting it.
		ehc.metrics.LastLeadershipChange = now
		ehc.metrics.lastObservedAdmin = currentAdmin
	case currentAdmin != ehc.metrics.lastObservedAdmin:
		ehc.metrics.LeadershipChanges++
		ehc.metrics.LastLeadershipChange = now
		ehc.metrics.lastObservedAdmin = currentAdmin
	}

	// Calculate admin uptime
	timeSinceLastChange := now.Sub(ehc.metrics.LastLeadershipChange)
	if currentAdmin != "" {
		ehc.metrics.AdminUptime = timeSinceLastChange
	} else {
		ehc.metrics.AdminUptime = 0
	}

	// Calculate election stability (higher is better); reaches 1.0 after 24
	// hours without a leadership change.
	ehc.metrics.ElectionStability = math.Min(1.0, timeSinceLastChange.Hours()/24.0)

	// Extract heartbeat latency if available
	if latencyStr, ok := heartbeatStatus["interval"].(string); ok {
		if interval, err := time.ParseDuration(latencyStr); err == nil {
			ehc.metrics.HeartbeatLatency = interval / 2 // Approximate latency
		}
	}

	// Calculate election health score
	if result.Healthy && currentAdmin != "" {
		ehc.metrics.ElectionHealthScore = 1.0 * ehc.metrics.ElectionStability
	} else {
		ehc.metrics.ElectionHealthScore = 0.3 // Degraded but not critical
	}
}

// startBackgroundMonitoring recomputes the overall health score and prunes
// history every 30 seconds until Stop is called.
func (ehc *EnhancedHealthChecks) startBackgroundMonitoring() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ehc.stopCh:
			return
		case <-ticker.C:
			ehc.calculateOverallSystemHealth()
			ehc.cleanupHistory()
		}
	}
}

// calculateOverallSystemHealth calculates overall system health score as a
// weighted average of the component scores. Components whose probes are
// disabled in the config are left out of both the sum and the total weight.
func (ehc *EnhancedHealthChecks) calculateOverallSystemHealth() {
	// Weight different components
	const (
		weightPubSub    = 0.25
		weightDHT       = 0.25
		weightElection  = 0.15
		weightP2P       = 0.20
		weightResources = 0.10
		weightTasks     = 0.05
	)

	ehc.metrics.mu.Lock()
	defer ehc.metrics.mu.Unlock()

	// Calculate weighted average
	totalScore := 0.0
	totalWeight := 0.0

	if ehc.config.EnablePubSubProbes {
		totalScore += ehc.metrics.PubSubHealthScore * weightPubSub
		totalWeight += weightPubSub
	}

	if ehc.config.EnableDHTProbes {
		totalScore += ehc.metrics.DHTHealthScore * weightDHT
		totalWeight += weightDHT
	}

	if ehc.config.EnableElectionProbes {
		totalScore += ehc.metrics.ElectionHealthScore * weightElection
		totalWeight += weightElection
	}

	totalScore += ehc.metrics.P2PConnectivityScore * weightP2P
	totalWeight += weightP2P

	// Resource health (inverse of the highest utilization)
	resourceHealth := 1.0 - math.Max(ehc.metrics.CPUUsage,
		math.Max(ehc.metrics.MemoryUsage, ehc.metrics.DiskUsage))
	totalScore += resourceHealth * weightResources
	totalWeight += weightResources

	// Task health
	totalScore += ehc.metrics.TaskSuccessRate * weightTasks
	totalWeight += weightTasks

	if totalWeight > 0 {
		ehc.metrics.SystemHealthScore = totalScore / totalWeight
	} else {
		ehc.metrics.SystemHealthScore = 0.5 // Unknown health
	}

	ehc.metrics.LastFullHealthCheck = time.Now()
	ehc.metrics.TotalHealthChecks++
}

// cleanupHistory drops health check history entries older than 24 hours.
// Nil entries are dropped as well rather than dereferenced.
func (ehc *EnhancedHealthChecks) cleanupHistory() {
	ehc.mu.Lock()
	defer ehc.mu.Unlock()

	cutoff := time.Now().Add(-24 * time.Hour) // Keep last 24 hours

	for checkName, history := range ehc.checkHistory {
		var kept []*CheckResult
		for _, entry := range history {
			if entry != nil && entry.Timestamp.After(cutoff) {
				kept = append(kept, entry)
			}
		}
		ehc.checkHistory[checkName] = kept
	}
}

// GetHealthMetrics returns a snapshot of the comprehensive health metrics.
//
// The snapshot is built field by field so the source's mutex is never copied,
// and the replication status map and its values are duplicated so the caller
// cannot observe or cause later mutations. Internal bookkeeping fields are
// not included.
func (ehc *EnhancedHealthChecks) GetHealthMetrics() *HealthMetrics {
	ehc.metrics.mu.RLock()
	defer ehc.metrics.mu.RUnlock()

	src := ehc.metrics

	snapshot := &HealthMetrics{
		SystemHealthScore:   src.SystemHealthScore,
		LastFullHealthCheck: src.LastFullHealthCheck,
		TotalHealthChecks:   src.TotalHealthChecks,
		FailedHealthChecks:  src.FailedHealthChecks,

		PubSubHealthScore:      src.PubSubHealthScore,
		PubSubProbeLatency:     src.PubSubProbeLatency,
		PubSubSuccessRate:      src.PubSubSuccessRate,
		PubSubLastSuccess:      src.PubSubLastSuccess,
		PubSubConsecutiveFails: src.PubSubConsecutiveFails,

		DHTHealthScore:       src.DHTHealthScore,
		DHTProbeLatency:      src.DHTProbeLatency,
		DHTSuccessRate:       src.DHTSuccessRate,
		DHTLastSuccess:       src.DHTLastSuccess,
		DHTConsecutiveFails:  src.DHTConsecutiveFails,
		DHTReplicationStatus: make(map[string]*dht.ReplicationStatus, len(src.DHTReplicationStatus)),

		ElectionHealthScore:  src.ElectionHealthScore,
		ElectionStability:    src.ElectionStability,
		HeartbeatLatency:     src.HeartbeatLatency,
		LeadershipChanges:    src.LeadershipChanges,
		LastLeadershipChange: src.LastLeadershipChange,
		AdminUptime:          src.AdminUptime,

		P2PConnectedPeers:    src.P2PConnectedPeers,
		P2PConnectivityScore: src.P2PConnectivityScore,
		NetworkLatency:       src.NetworkLatency,

		CPUUsage:    src.CPUUsage,
		MemoryUsage: src.MemoryUsage,
		DiskUsage:   src.DiskUsage,

		ActiveTasks:     src.ActiveTasks,
		QueuedTasks:     src.QueuedTasks,
		TaskSuccessRate: src.TaskSuccessRate,
	}

	// Copy the map; nil values are preserved as nil instead of dereferenced.
	for key, status := range src.DHTReplicationStatus {
		if status == nil {
			snapshot.DHTReplicationStatus[key] = nil
			continue
		}
		statusCopy := *status
		snapshot.DHTReplicationStatus[key] = &statusCopy
	}

	return snapshot
}

// GetHealthSummary returns a summary of system health.
//
// Status is "critical" below half of DegradedThreshold, "degraded" below
// DegradedThreshold, and "healthy" otherwise.
//
// NOTE(review): HealthyThreshold is not consulted here - confirm whether
// scores between DegradedThreshold and HealthyThreshold should be "healthy".
func (ehc *EnhancedHealthChecks) GetHealthSummary() map[string]interface{} {
	metrics := ehc.GetHealthMetrics()

	status := "healthy"
	switch {
	case metrics.SystemHealthScore < ehc.config.DegradedThreshold*0.5:
		status = "critical"
	case metrics.SystemHealthScore < ehc.config.DegradedThreshold:
		status = "degraded"
	}

	return map[string]interface{}{
		"status":        status,
		"overall_score": metrics.SystemHealthScore,
		"last_check":    metrics.LastFullHealthCheck,
		"total_checks":  metrics.TotalHealthChecks,
		"component_scores": map[string]float64{
			"pubsub":   metrics.PubSubHealthScore,
			"dht":      metrics.DHTHealthScore,
			"election": metrics.ElectionHealthScore,
			"p2p":      metrics.P2PConnectivityScore,
		},
		"key_metrics": map[string]interface{}{
			"connected_peers":    metrics.P2PConnectedPeers,
			"active_tasks":       metrics.ActiveTasks,
			"admin_uptime":       metrics.AdminUptime.String(),
			"leadership_changes": metrics.LeadershipChanges,
			"resource_utilization": map[string]float64{
				"cpu":    metrics.CPUUsage,
				"memory": metrics.MemoryUsage,
				"disk":   metrics.DiskUsage,
			},
		},
	}
}

// getNodeID returns the current node ID (placeholder implementation).
func (ehc *EnhancedHealthChecks) getNodeID() string {
	return "node-placeholder" // Would get from actual node
}