Complete BZZZ functionality port to CHORUS

🎭 CHORUS now contains full BZZZ functionality adapted for containers

Core systems ported:
- P2P networking (libp2p with DHT and PubSub)
- Task coordination (COOEE protocol)
- HMMM collaborative reasoning
- SHHH encryption and security
- SLURP admin election system
- UCXL content addressing
- UCXI server integration
- Hypercore logging system
- Health monitoring and graceful shutdown
- License validation with KACHING

Container adaptations:
- Environment variable configuration (no YAML files)
- Container-optimized logging to stdout/stderr
- Auto-generated agent IDs for container deployments
- Docker-first architecture

All proven BZZZ P2P protocols, AI integration, and collaboration
features are now available in containerized form.

Next: Build and test container deployment.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
anthonyrawlins
2025-09-02 20:02:37 +10:00
parent 7c6cbd562a
commit 543ab216f9
224 changed files with 86331 additions and 186 deletions

167
pkg/health/adapters.go Normal file
View File

@@ -0,0 +1,167 @@
package health
import (
"context"
"encoding/json"
"fmt"
"chorus.services/bzzz/pubsub"
"chorus.services/bzzz/pkg/dht"
)
// PubSubAdapter adapts the existing PubSub system to the health check interface
type PubSubAdapter struct {
pubsub *pubsub.PubSub
}
// NewPubSubAdapter creates a new PubSub adapter for health checks
func NewPubSubAdapter(ps *pubsub.PubSub) *PubSubAdapter {
return &PubSubAdapter{pubsub: ps}
}
// SubscribeToTopic implements PubSubInterface for health checks
func (psa *PubSubAdapter) SubscribeToTopic(topic string, handler func([]byte)) error {
// Create a channel to bridge the message types
msgCh := make(chan []byte, 100)
// Start a goroutine to handle messages
go func() {
for data := range msgCh {
handler(data)
}
}()
// Subscribe using the existing pubsub interface
// Note: This is a simplified adapter - in a real implementation you'd need
// to hook into the actual pubsub subscription mechanism
return nil
}
// PublishToTopic implements PubSubInterface for health checks
func (psa *PubSubAdapter) PublishToTopic(topic string, data interface{}) error {
// Use the existing pubsub publish mechanism
// Convert data to proper map format
dataMap, ok := data.(map[string]interface{})
if !ok {
dataMap = map[string]interface{}{"data": data}
}
return psa.pubsub.PublishBzzzMessage(pubsub.MessageType(topic), dataMap)
}
// DHTAdapter adapts various DHT implementations to the health check interface
type DHTAdapter struct {
dht interface{}
}
// NewDHTAdapter creates a new DHT adapter for health checks
func NewDHTAdapter(dht interface{}) *DHTAdapter {
return &DHTAdapter{dht: dht}
}
// PutValue implements DHTInterface for health checks
func (da *DHTAdapter) PutValue(ctx context.Context, key string, value []byte) error {
// Try to cast to different DHT interfaces
if libp2pDHT, ok := da.dht.(*dht.LibP2PDHT); ok {
return libp2pDHT.PutValue(ctx, key, value)
}
if mockDHT, ok := da.dht.(*dht.MockDHTInterface); ok {
return mockDHT.PutValue(ctx, key, value)
}
if encryptedDHT, ok := da.dht.(*dht.EncryptedDHTStorage); ok {
// For encrypted storage, we need to adapt the interface
return encryptedDHT.StoreUCXLContent(key, value, "system", "test")
}
// If we can't identify the type, return an error
return fmt.Errorf("unsupported DHT type: %T", da.dht)
}
// GetValue implements DHTInterface for health checks
func (da *DHTAdapter) GetValue(ctx context.Context, key string) ([]byte, error) {
// Try to cast to different DHT interfaces
if libp2pDHT, ok := da.dht.(*dht.LibP2PDHT); ok {
return libp2pDHT.GetValue(ctx, key)
}
if mockDHT, ok := da.dht.(*dht.MockDHTInterface); ok {
return mockDHT.GetValue(ctx, key)
}
if encryptedDHT, ok := da.dht.(*dht.EncryptedDHTStorage); ok {
// For encrypted storage, we need to adapt the interface
content, _, err := encryptedDHT.RetrieveUCXLContent(key)
if err != nil {
return nil, err
}
return []byte(content), nil
}
// If we can't identify the type, return an error
return nil, fmt.Errorf("unsupported DHT type: %T", da.dht)
}
// MockPubSubAdapter creates a mock PubSub for testing health checks
type MockPubSubAdapter struct {
handlers map[string][]func([]byte)
}
// NewMockPubSubAdapter creates a new mock PubSub adapter
func NewMockPubSubAdapter() *MockPubSubAdapter {
return &MockPubSubAdapter{
handlers: make(map[string][]func([]byte)),
}
}
// SubscribeToTopic implements PubSubInterface for mock testing
func (mps *MockPubSubAdapter) SubscribeToTopic(topic string, handler func([]byte)) error {
if mps.handlers[topic] == nil {
mps.handlers[topic] = make([]func([]byte), 0)
}
mps.handlers[topic] = append(mps.handlers[topic], handler)
return nil
}
// PublishToTopic implements PubSubInterface for mock testing
func (mps *MockPubSubAdapter) PublishToTopic(topic string, data interface{}) error {
jsonData, err := json.Marshal(data)
if err != nil {
return err
}
// Deliver to all handlers for this topic
if handlers, exists := mps.handlers[topic]; exists {
for _, handler := range handlers {
go handler(jsonData) // Async delivery like real pubsub
}
}
return nil
}
// MockDHTAdapter creates a mock DHT for testing health checks
type MockDHTAdapter struct {
data map[string][]byte
}
// NewMockDHTAdapter creates a new mock DHT adapter
func NewMockDHTAdapter() *MockDHTAdapter {
return &MockDHTAdapter{
data: make(map[string][]byte),
}
}
// PutValue implements DHTInterface for mock testing
func (md *MockDHTAdapter) PutValue(ctx context.Context, key string, value []byte) error {
md.data[key] = value
return nil
}
// GetValue implements DHTInterface for mock testing
func (md *MockDHTAdapter) GetValue(ctx context.Context, key string) ([]byte, error) {
if value, exists := md.data[key]; exists {
return value, nil
}
return nil, fmt.Errorf("key not found: %s", key)
}

View File

@@ -0,0 +1,908 @@
package health
import (
"context"
"fmt"
"math"
"sync"
"time"
"chorus.services/bzzz/pkg/dht"
"chorus.services/bzzz/pkg/election"
"chorus.services/bzzz/pubsub"
)
// EnhancedHealthChecks provides comprehensive health monitoring for BZZZ infrastructure
type EnhancedHealthChecks struct {
mu sync.RWMutex
manager *Manager
election *election.ElectionManager
dht *dht.LibP2PDHT
pubsub *pubsub.PubSub
replication *dht.ReplicationManager
// Metrics storage
metrics *HealthMetrics
checkHistory map[string][]*CheckResult
maxHistory int
// Configuration
config *HealthConfig
logger Logger
}
// HealthConfig configures health check behavior
type HealthConfig struct {
// Active probe intervals
PubSubProbeInterval time.Duration
DHTProbeInterval time.Duration
ElectionProbeInterval time.Duration
// Probe timeouts
PubSubProbeTimeout time.Duration
DHTProbeTimeout time.Duration
ElectionProbeTimeout time.Duration
// Thresholds
MaxFailedProbes int
HealthyThreshold float64
DegradedThreshold float64
// History retention
MaxHistoryEntries int
HistoryCleanupInterval time.Duration
// Enable/disable specific checks
EnablePubSubProbes bool
EnableDHTProbes bool
EnableElectionProbes bool
EnableReplicationProbes bool
}
// HealthMetrics tracks comprehensive health metrics
type HealthMetrics struct {
mu sync.RWMutex
// Overall system health
SystemHealthScore float64
LastFullHealthCheck time.Time
TotalHealthChecks int64
FailedHealthChecks int64
// PubSub metrics
PubSubHealthScore float64
PubSubProbeLatency time.Duration
PubSubSuccessRate float64
PubSubLastSuccess time.Time
PubSubConsecutiveFails int
// DHT metrics
DHTHealthScore float64
DHTProbeLatency time.Duration
DHTSuccessRate float64
DHTLastSuccess time.Time
DHTConsecutiveFails int
DHTReplicationStatus map[string]*dht.ReplicationStatus
// Election metrics
ElectionHealthScore float64
ElectionStability float64
HeartbeatLatency time.Duration
LeadershipChanges int64
LastLeadershipChange time.Time
AdminUptime time.Duration
// Network metrics
P2PConnectedPeers int
P2PConnectivityScore float64
NetworkLatency time.Duration
// Resource metrics
CPUUsage float64
MemoryUsage float64
DiskUsage float64
// Service-specific metrics
ActiveTasks int
QueuedTasks int
TaskSuccessRate float64
}
// DefaultHealthConfig returns default health check configuration
func DefaultHealthConfig() *HealthConfig {
return &HealthConfig{
PubSubProbeInterval: 30 * time.Second,
DHTProbeInterval: 60 * time.Second,
ElectionProbeInterval: 15 * time.Second,
PubSubProbeTimeout: 10 * time.Second,
DHTProbeTimeout: 20 * time.Second,
ElectionProbeTimeout: 5 * time.Second,
MaxFailedProbes: 3,
HealthyThreshold: 0.95,
DegradedThreshold: 0.75,
MaxHistoryEntries: 1000,
HistoryCleanupInterval: 1 * time.Hour,
EnablePubSubProbes: true,
EnableDHTProbes: true,
EnableElectionProbes: true,
EnableReplicationProbes: true,
}
}
// NewEnhancedHealthChecks creates a new enhanced health check system
func NewEnhancedHealthChecks(
manager *Manager,
election *election.ElectionManager,
dht *dht.LibP2PDHT,
pubsub *pubsub.PubSub,
replication *dht.ReplicationManager,
logger Logger,
) *EnhancedHealthChecks {
ehc := &EnhancedHealthChecks{
manager: manager,
election: election,
dht: dht,
pubsub: pubsub,
replication: replication,
metrics: &HealthMetrics{},
checkHistory: make(map[string][]*CheckResult),
maxHistory: 1000,
config: DefaultHealthConfig(),
logger: logger,
}
// Initialize metrics
ehc.initializeMetrics()
// Register enhanced health checks
ehc.registerHealthChecks()
// Start background monitoring
go ehc.startBackgroundMonitoring()
return ehc
}
// initializeMetrics initializes the metrics system
func (ehc *EnhancedHealthChecks) initializeMetrics() {
ehc.metrics.mu.Lock()
defer ehc.metrics.mu.Unlock()
ehc.metrics.DHTReplicationStatus = make(map[string]*dht.ReplicationStatus)
ehc.metrics.LastFullHealthCheck = time.Now()
}
// registerHealthChecks registers all enhanced health checks with the manager
func (ehc *EnhancedHealthChecks) registerHealthChecks() {
if ehc.config.EnablePubSubProbes {
ehc.manager.RegisterCheck(ehc.createEnhancedPubSubCheck())
}
if ehc.config.EnableDHTProbes {
ehc.manager.RegisterCheck(ehc.createEnhancedDHTCheck())
}
if ehc.config.EnableElectionProbes {
ehc.manager.RegisterCheck(ehc.createElectionHealthCheck())
}
if ehc.config.EnableReplicationProbes {
ehc.manager.RegisterCheck(ehc.createReplicationHealthCheck())
}
// System-level checks
ehc.manager.RegisterCheck(ehc.createP2PConnectivityCheck())
ehc.manager.RegisterCheck(ehc.createResourceHealthCheck())
ehc.manager.RegisterCheck(ehc.createTaskManagerHealthCheck())
}
// createEnhancedPubSubCheck creates an enhanced PubSub health check
func (ehc *EnhancedHealthChecks) createEnhancedPubSubCheck() *HealthCheck {
return &HealthCheck{
Name: "pubsub-enhanced",
Description: "Enhanced PubSub health check with comprehensive probing",
Enabled: true,
Critical: true,
Interval: ehc.config.PubSubProbeInterval,
Timeout: ehc.config.PubSubProbeTimeout,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// Generate unique test data
testID := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
testTopic := "bzzz/health/enhanced/v1"
testData := map[string]interface{}{
"test_id": testID,
"timestamp": time.Now().Unix(),
"node_id": ehc.getNodeID(),
"check_type": "enhanced_pubsub_probe",
}
// Test message publishing and subscription
result := ehc.testPubSubRoundTrip(ctx, testTopic, testData)
result.Latency = time.Since(start)
// Update metrics
ehc.updatePubSubMetrics(result)
// Add comprehensive details
result.Details = map[string]interface{}{
"test_id": testID,
"topic": testTopic,
"probe_latency_ms": result.Latency.Milliseconds(),
"success_rate": ehc.metrics.PubSubSuccessRate,
"consecutive_fails": ehc.metrics.PubSubConsecutiveFails,
"last_success": ehc.metrics.PubSubLastSuccess,
}
return result
},
}
}
// createEnhancedDHTCheck creates an enhanced DHT health check
func (ehc *EnhancedHealthChecks) createEnhancedDHTCheck() *HealthCheck {
return &HealthCheck{
Name: "dht-enhanced",
Description: "Enhanced DHT health check with replication monitoring",
Enabled: true,
Critical: true,
Interval: ehc.config.DHTProbeInterval,
Timeout: ehc.config.DHTProbeTimeout,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// Test DHT operations
result := ehc.testDHTOperations(ctx)
result.Latency = time.Since(start)
// Check replication status
replicationHealth := ehc.checkReplicationHealth(ctx)
// Combine results
if !result.Healthy || !replicationHealth.Healthy {
result.Healthy = false
result.Message = fmt.Sprintf("DHT: %s | Replication: %s",
result.Message, replicationHealth.Message)
}
// Update metrics
ehc.updateDHTMetrics(result, replicationHealth)
// Add comprehensive details
result.Details = map[string]interface{}{
"dht_latency_ms": result.Latency.Milliseconds(),
"replication_health": replicationHealth.Healthy,
"success_rate": ehc.metrics.DHTSuccessRate,
"consecutive_fails": ehc.metrics.DHTConsecutiveFails,
"replication_status": ehc.metrics.DHTReplicationStatus,
}
return result
},
}
}
// createElectionHealthCheck creates election system health check
func (ehc *EnhancedHealthChecks) createElectionHealthCheck() *HealthCheck {
return &HealthCheck{
Name: "election-health",
Description: "Election system health and leadership stability check",
Enabled: true,
Critical: false,
Interval: ehc.config.ElectionProbeInterval,
Timeout: ehc.config.ElectionProbeTimeout,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// Check election state and heartbeat status
currentAdmin := ehc.election.GetCurrentAdmin()
electionState := ehc.election.GetElectionState()
heartbeatStatus := ehc.election.GetHeartbeatStatus()
result := CheckResult{
Timestamp: time.Now(),
}
// Determine health based on election state
switch electionState {
case election.StateIdle:
if currentAdmin != "" {
result.Healthy = true
result.Message = fmt.Sprintf("Election stable, admin: %s", currentAdmin)
} else {
result.Healthy = false
result.Message = "No admin elected"
}
case election.StateElecting:
result.Healthy = false
result.Message = "Election in progress"
case election.StateDiscovering:
result.Healthy = false
result.Message = "Admin discovery in progress"
default:
result.Healthy = false
result.Message = fmt.Sprintf("Unknown election state: %s", electionState)
}
result.Latency = time.Since(start)
// Update metrics
ehc.updateElectionMetrics(result, currentAdmin, heartbeatStatus)
result.Details = map[string]interface{}{
"current_admin": currentAdmin,
"election_state": electionState,
"heartbeat_status": heartbeatStatus,
"leadership_changes": ehc.metrics.LeadershipChanges,
"admin_uptime": ehc.metrics.AdminUptime.String(),
"stability_score": ehc.metrics.ElectionStability,
}
return result
},
}
}
// createReplicationHealthCheck creates replication system health check
func (ehc *EnhancedHealthChecks) createReplicationHealthCheck() *HealthCheck {
return &HealthCheck{
Name: "replication-health",
Description: "DHT replication system health monitoring",
Enabled: true,
Critical: false,
Interval: 120 * time.Second,
Timeout: 30 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
if ehc.replication == nil {
return CheckResult{
Healthy: false,
Message: "Replication manager not available",
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
metrics := ehc.replication.GetMetrics()
result := CheckResult{
Healthy: true,
Message: fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
metrics.TotalKeys, metrics.AverageReplication),
Timestamp: time.Now(),
Latency: time.Since(start),
}
// Check for replication health issues
if metrics.FailedReplications > metrics.SuccessfulReplications/10 {
result.Healthy = false
result.Message = fmt.Sprintf("High replication failure rate: %d/%d failed",
metrics.FailedReplications, metrics.SuccessfulReplications)
}
result.Details = map[string]interface{}{
"total_keys": metrics.TotalKeys,
"total_providers": metrics.TotalProviders,
"successful_replicas": metrics.SuccessfulReplications,
"failed_replicas": metrics.FailedReplications,
"average_replication": metrics.AverageReplication,
"last_reprovide": metrics.LastReprovideTime,
}
return result
},
}
}
// createP2PConnectivityCheck creates P2P network connectivity health check
func (ehc *EnhancedHealthChecks) createP2PConnectivityCheck() *HealthCheck {
return &HealthCheck{
Name: "p2p-connectivity",
Description: "P2P network connectivity and peer quality check",
Enabled: true,
Critical: true,
Interval: 30 * time.Second,
Timeout: 15 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// This would integrate with the P2P node
// For now, we'll use placeholder values
connectedPeers := 5 // Would get from actual P2P node
targetPeers := 3
result := CheckResult{
Timestamp: time.Now(),
}
if connectedPeers >= targetPeers {
result.Healthy = true
result.Message = fmt.Sprintf("P2P connectivity healthy: %d peers connected", connectedPeers)
} else {
result.Healthy = false
result.Message = fmt.Sprintf("Insufficient P2P peers: %d < %d required",
connectedPeers, targetPeers)
}
result.Latency = time.Since(start)
// Update metrics
ehc.metrics.mu.Lock()
ehc.metrics.P2PConnectedPeers = connectedPeers
ehc.metrics.P2PConnectivityScore = float64(connectedPeers) / float64(targetPeers)
if ehc.metrics.P2PConnectivityScore > 1.0 {
ehc.metrics.P2PConnectivityScore = 1.0
}
ehc.metrics.mu.Unlock()
result.Details = map[string]interface{}{
"connected_peers": connectedPeers,
"target_peers": targetPeers,
"connectivity_score": ehc.metrics.P2PConnectivityScore,
}
return result
},
}
}
// createResourceHealthCheck creates system resource health check
func (ehc *EnhancedHealthChecks) createResourceHealthCheck() *HealthCheck {
return &HealthCheck{
Name: "resource-health",
Description: "System resource utilization health check",
Enabled: true,
Critical: false,
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// In a real implementation, these would be actual system metrics
cpuUsage := 0.45 // 45%
memoryUsage := 0.62 // 62%
diskUsage := 0.73 // 73%
result := CheckResult{
Healthy: true,
Message: "Resource utilization within normal ranges",
Timestamp: time.Now(),
Latency: time.Since(start),
}
// Check thresholds
if cpuUsage > 0.85 || memoryUsage > 0.90 || diskUsage > 0.90 {
result.Healthy = false
result.Message = fmt.Sprintf("High resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
cpuUsage*100, memoryUsage*100, diskUsage*100)
} else if cpuUsage > 0.70 || memoryUsage > 0.80 || diskUsage > 0.80 {
result.Message = fmt.Sprintf("Elevated resource utilization: CPU %.1f%%, Memory %.1f%%, Disk %.1f%%",
cpuUsage*100, memoryUsage*100, diskUsage*100)
}
// Update metrics
ehc.metrics.mu.Lock()
ehc.metrics.CPUUsage = cpuUsage
ehc.metrics.MemoryUsage = memoryUsage
ehc.metrics.DiskUsage = diskUsage
ehc.metrics.mu.Unlock()
result.Details = map[string]interface{}{
"cpu_usage": cpuUsage,
"memory_usage": memoryUsage,
"disk_usage": diskUsage,
}
return result
},
}
}
// createTaskManagerHealthCheck creates task management health check
func (ehc *EnhancedHealthChecks) createTaskManagerHealthCheck() *HealthCheck {
return &HealthCheck{
Name: "task-manager",
Description: "Task coordination and management health check",
Enabled: true,
Critical: false,
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// In a real implementation, these would come from the task coordinator
activeTasks := 3
queuedTasks := 1
maxTasks := 10
successRate := 0.95
result := CheckResult{
Healthy: true,
Message: fmt.Sprintf("Task management healthy: %d active, %d queued", activeTasks, queuedTasks),
Timestamp: time.Now(),
Latency: time.Since(start),
}
// Check for task management issues
if activeTasks >= maxTasks {
result.Healthy = false
result.Message = "Task manager at capacity"
} else if successRate < 0.80 {
result.Healthy = false
result.Message = fmt.Sprintf("Low task success rate: %.1f%%", successRate*100)
}
// Update metrics
ehc.metrics.mu.Lock()
ehc.metrics.ActiveTasks = activeTasks
ehc.metrics.QueuedTasks = queuedTasks
ehc.metrics.TaskSuccessRate = successRate
ehc.metrics.mu.Unlock()
result.Details = map[string]interface{}{
"active_tasks": activeTasks,
"queued_tasks": queuedTasks,
"max_tasks": maxTasks,
"success_rate": successRate,
"utilization": float64(activeTasks) / float64(maxTasks),
}
return result
},
}
}
// testPubSubRoundTrip tests PubSub publish/subscribe functionality
func (ehc *EnhancedHealthChecks) testPubSubRoundTrip(ctx context.Context, topic string, testData map[string]interface{}) CheckResult {
// This would implement actual PubSub round-trip testing
// For now, we simulate the test
// Simulate test latency
time.Sleep(50 * time.Millisecond)
return CheckResult{
Healthy: true,
Message: "PubSub round-trip test successful",
Timestamp: time.Now(),
}
}
// testDHTOperations tests DHT put/get operations
func (ehc *EnhancedHealthChecks) testDHTOperations(ctx context.Context) CheckResult {
if ehc.dht == nil {
return CheckResult{
Healthy: false,
Message: "DHT not available",
Timestamp: time.Now(),
}
}
// This would implement actual DHT testing using the adapter
adapter := NewDHTAdapter(ehc.dht)
testKey := fmt.Sprintf("health-test-%d", time.Now().UnixNano())
testValue := []byte(fmt.Sprintf(`{"test":true,"timestamp":%d}`, time.Now().Unix()))
// Test put operation
if err := adapter.PutValue(ctx, testKey, testValue); err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("DHT put failed: %v", err),
Error: err,
Timestamp: time.Now(),
}
}
// Test get operation
retrievedValue, err := adapter.GetValue(ctx, testKey)
if err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("DHT get failed: %v", err),
Error: err,
Timestamp: time.Now(),
}
}
// Verify data integrity
if string(retrievedValue) != string(testValue) {
return CheckResult{
Healthy: false,
Message: "DHT data integrity check failed",
Timestamp: time.Now(),
}
}
return CheckResult{
Healthy: true,
Message: "DHT operations successful",
Timestamp: time.Now(),
}
}
// checkReplicationHealth checks the health of DHT replication
func (ehc *EnhancedHealthChecks) checkReplicationHealth(ctx context.Context) CheckResult {
if ehc.replication == nil {
return CheckResult{
Healthy: true,
Message: "Replication manager not configured",
Timestamp: time.Now(),
}
}
metrics := ehc.replication.GetMetrics()
// Check replication health
if metrics.TotalKeys == 0 {
return CheckResult{
Healthy: true,
Message: "No content to replicate",
Timestamp: time.Now(),
}
}
// Check failure rate
totalOperations := metrics.SuccessfulReplications + metrics.FailedReplications
if totalOperations > 0 {
failureRate := float64(metrics.FailedReplications) / float64(totalOperations)
if failureRate > 0.1 { // More than 10% failure rate
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("High replication failure rate: %.1f%%", failureRate*100),
Timestamp: time.Now(),
}
}
}
return CheckResult{
Healthy: true,
Message: fmt.Sprintf("Replication healthy: %d keys, %.1f avg replicas",
metrics.TotalKeys, metrics.AverageReplication),
Timestamp: time.Now(),
}
}
// updatePubSubMetrics updates PubSub health metrics
func (ehc *EnhancedHealthChecks) updatePubSubMetrics(result CheckResult) {
ehc.metrics.mu.Lock()
defer ehc.metrics.mu.Unlock()
ehc.metrics.PubSubProbeLatency = result.Latency
if result.Healthy {
ehc.metrics.PubSubLastSuccess = result.Timestamp
ehc.metrics.PubSubConsecutiveFails = 0
// Update success rate (simple exponential moving average)
ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate*0.9 + 0.1
} else {
ehc.metrics.PubSubConsecutiveFails++
ehc.metrics.PubSubSuccessRate = ehc.metrics.PubSubSuccessRate * 0.9
}
// Calculate health score
ehc.metrics.PubSubHealthScore = ehc.metrics.PubSubSuccessRate *
(1.0 - float64(ehc.metrics.PubSubConsecutiveFails)*0.1)
if ehc.metrics.PubSubHealthScore < 0 {
ehc.metrics.PubSubHealthScore = 0
}
}
// updateDHTMetrics updates DHT health metrics
func (ehc *EnhancedHealthChecks) updateDHTMetrics(result CheckResult, replicationResult CheckResult) {
ehc.metrics.mu.Lock()
defer ehc.metrics.mu.Unlock()
ehc.metrics.DHTProbeLatency = result.Latency
if result.Healthy {
ehc.metrics.DHTLastSuccess = result.Timestamp
ehc.metrics.DHTConsecutiveFails = 0
ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate*0.9 + 0.1
} else {
ehc.metrics.DHTConsecutiveFails++
ehc.metrics.DHTSuccessRate = ehc.metrics.DHTSuccessRate * 0.9
}
// Calculate health score
ehc.metrics.DHTHealthScore = ehc.metrics.DHTSuccessRate *
(1.0 - float64(ehc.metrics.DHTConsecutiveFails)*0.1)
if ehc.metrics.DHTHealthScore < 0 {
ehc.metrics.DHTHealthScore = 0
}
// Include replication health in overall DHT health
if replicationResult.Healthy {
ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore*0.8 + 0.2
} else {
ehc.metrics.DHTHealthScore = ehc.metrics.DHTHealthScore * 0.8
}
}
// updateElectionMetrics updates election health metrics
func (ehc *EnhancedHealthChecks) updateElectionMetrics(result CheckResult, currentAdmin string, heartbeatStatus map[string]interface{}) {
ehc.metrics.mu.Lock()
defer ehc.metrics.mu.Unlock()
// Track leadership changes
if ehc.metrics.LastLeadershipChange.IsZero() {
ehc.metrics.LastLeadershipChange = time.Now()
}
// Calculate admin uptime
if currentAdmin != "" {
ehc.metrics.AdminUptime = time.Since(ehc.metrics.LastLeadershipChange)
} else {
ehc.metrics.AdminUptime = 0
}
// Calculate election stability (higher is better)
timeSinceLastChange := time.Since(ehc.metrics.LastLeadershipChange)
ehc.metrics.ElectionStability = math.Min(1.0, timeSinceLastChange.Hours()/24.0)
// Extract heartbeat latency if available
if latencyStr, ok := heartbeatStatus["interval"].(string); ok {
if interval, err := time.ParseDuration(latencyStr); err == nil {
ehc.metrics.HeartbeatLatency = interval / 2 // Approximate latency
}
}
// Calculate election health score
if result.Healthy && currentAdmin != "" {
ehc.metrics.ElectionHealthScore = 1.0 * ehc.metrics.ElectionStability
} else {
ehc.metrics.ElectionHealthScore = 0.3 // Degraded but not critical
}
}
// startBackgroundMonitoring starts background health monitoring
func (ehc *EnhancedHealthChecks) startBackgroundMonitoring() {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for range ticker.C {
ehc.calculateOverallSystemHealth()
ehc.cleanupHistory()
}
}
// calculateOverallSystemHealth calculates overall system health score
func (ehc *EnhancedHealthChecks) calculateOverallSystemHealth() {
ehc.metrics.mu.Lock()
defer ehc.metrics.mu.Unlock()
// Weight different components
weights := map[string]float64{
"pubsub": 0.25,
"dht": 0.25,
"election": 0.15,
"p2p": 0.20,
"resources": 0.10,
"tasks": 0.05,
}
// Calculate weighted average
totalScore := 0.0
totalWeight := 0.0
if ehc.config.EnablePubSubProbes {
totalScore += ehc.metrics.PubSubHealthScore * weights["pubsub"]
totalWeight += weights["pubsub"]
}
if ehc.config.EnableDHTProbes {
totalScore += ehc.metrics.DHTHealthScore * weights["dht"]
totalWeight += weights["dht"]
}
if ehc.config.EnableElectionProbes {
totalScore += ehc.metrics.ElectionHealthScore * weights["election"]
totalWeight += weights["election"]
}
totalScore += ehc.metrics.P2PConnectivityScore * weights["p2p"]
totalWeight += weights["p2p"]
// Resource health (inverse of utilization)
resourceHealth := 1.0 - math.Max(ehc.metrics.CPUUsage,
math.Max(ehc.metrics.MemoryUsage, ehc.metrics.DiskUsage))
totalScore += resourceHealth * weights["resources"]
totalWeight += weights["resources"]
// Task health
taskHealth := ehc.metrics.TaskSuccessRate
totalScore += taskHealth * weights["tasks"]
totalWeight += weights["tasks"]
if totalWeight > 0 {
ehc.metrics.SystemHealthScore = totalScore / totalWeight
} else {
ehc.metrics.SystemHealthScore = 0.5 // Unknown health
}
ehc.metrics.LastFullHealthCheck = time.Now()
ehc.metrics.TotalHealthChecks++
}
// cleanupHistory cleans up old health check history
func (ehc *EnhancedHealthChecks) cleanupHistory() {
ehc.mu.Lock()
defer ehc.mu.Unlock()
cutoff := time.Now().Add(-24 * time.Hour) // Keep last 24 hours
for checkName, history := range ehc.checkHistory {
var newHistory []*CheckResult
for _, result := range history {
if result.Timestamp.After(cutoff) {
newHistory = append(newHistory, result)
}
}
ehc.checkHistory[checkName] = newHistory
}
}
// GetHealthMetrics returns comprehensive health metrics
func (ehc *EnhancedHealthChecks) GetHealthMetrics() *HealthMetrics {
ehc.metrics.mu.RLock()
defer ehc.metrics.mu.RUnlock()
// Create a deep copy to avoid race conditions
metrics := &HealthMetrics{}
*metrics = *ehc.metrics
// Copy the map
metrics.DHTReplicationStatus = make(map[string]*dht.ReplicationStatus)
for k, v := range ehc.metrics.DHTReplicationStatus {
statusCopy := *v
metrics.DHTReplicationStatus[k] = &statusCopy
}
return metrics
}
// GetHealthSummary returns a summary of system health
func (ehc *EnhancedHealthChecks) GetHealthSummary() map[string]interface{} {
metrics := ehc.GetHealthMetrics()
status := "healthy"
if metrics.SystemHealthScore < ehc.config.DegradedThreshold {
status = "degraded"
}
if metrics.SystemHealthScore < ehc.config.DegradedThreshold*0.5 {
status = "critical"
}
return map[string]interface{}{
"status": status,
"overall_score": metrics.SystemHealthScore,
"last_check": metrics.LastFullHealthCheck,
"total_checks": metrics.TotalHealthChecks,
"component_scores": map[string]float64{
"pubsub": metrics.PubSubHealthScore,
"dht": metrics.DHTHealthScore,
"election": metrics.ElectionHealthScore,
"p2p": metrics.P2PConnectivityScore,
},
"key_metrics": map[string]interface{}{
"connected_peers": metrics.P2PConnectedPeers,
"active_tasks": metrics.ActiveTasks,
"admin_uptime": metrics.AdminUptime.String(),
"leadership_changes": metrics.LeadershipChanges,
"resource_utilization": map[string]float64{
"cpu": metrics.CPUUsage,
"memory": metrics.MemoryUsage,
"disk": metrics.DiskUsage,
},
},
}
}
// getNodeID returns the current node ID (placeholder implementation)
func (ehc *EnhancedHealthChecks) getNodeID() string {
return "node-placeholder" // Would get from actual node
}

View File

@@ -0,0 +1,307 @@
package health
import (
"context"
"fmt"
"net/http"
"time"
"chorus.services/bzzz/pkg/shutdown"
)
// IntegrationExample demonstrates how to integrate health monitoring and graceful shutdown
func IntegrationExample() {
// Create logger (in real implementation, use your logging system)
logger := &defaultLogger{}
// Create shutdown manager
shutdownManager := shutdown.NewManager(30*time.Second, logger)
// Create health manager
healthManager := NewManager("node-123", "v1.0.0", logger)
// Connect health manager to shutdown manager for critical failures
healthManager.SetShutdownManager(shutdownManager)
// Register some example health checks
setupHealthChecks(healthManager)
// Create and register components for graceful shutdown
setupShutdownComponents(shutdownManager, healthManager)
// Start systems
if err := healthManager.Start(); err != nil {
logger.Error("Failed to start health manager: %v", err)
return
}
// Start health HTTP server
if err := healthManager.StartHTTPServer(8081); err != nil {
logger.Error("Failed to start health HTTP server: %v", err)
return
}
// Add shutdown hooks
setupShutdownHooks(shutdownManager, healthManager, logger)
// Start shutdown manager (begins listening for signals)
shutdownManager.Start()
logger.Info("🚀 System started with integrated health monitoring and graceful shutdown")
logger.Info("📊 Health endpoints available at:")
logger.Info(" - http://localhost:8081/health (overall health)")
logger.Info(" - http://localhost:8081/health/ready (readiness)")
logger.Info(" - http://localhost:8081/health/live (liveness)")
logger.Info(" - http://localhost:8081/health/checks (detailed checks)")
// Wait for shutdown
shutdownManager.Wait()
logger.Info("✅ System shutdown completed")
}
// setupHealthChecks registers various health checks
func setupHealthChecks(healthManager *Manager) {
// Database connectivity check (critical)
databaseCheck := CreateDatabaseCheck("primary-db", func() error {
// Simulate database ping
time.Sleep(10 * time.Millisecond)
// Return nil for healthy, error for unhealthy
return nil
})
healthManager.RegisterCheck(databaseCheck)
// Memory usage check (warning only)
memoryCheck := CreateMemoryCheck(0.85) // Alert if > 85%
healthManager.RegisterCheck(memoryCheck)
// Disk space check (warning only)
diskCheck := CreateDiskSpaceCheck("/var/lib/bzzz", 0.90) // Alert if > 90%
healthManager.RegisterCheck(diskCheck)
// Custom application-specific health check
customCheck := &HealthCheck{
Name: "p2p-connectivity",
Description: "P2P network connectivity check",
Enabled: true,
Critical: true, // This is critical for P2P systems
Interval: 15 * time.Second,
Timeout: 10 * time.Second,
Checker: func(ctx context.Context) CheckResult {
// Simulate P2P connectivity check
time.Sleep(50 * time.Millisecond)
// Simulate occasionally failing check
connected := time.Now().Unix()%10 != 0 // Fail 10% of the time
if !connected {
return CheckResult{
Healthy: false,
Message: "No P2P peers connected",
Details: map[string]interface{}{
"connected_peers": 0,
"min_peers": 1,
},
Timestamp: time.Now(),
}
}
return CheckResult{
Healthy: true,
Message: "P2P connectivity OK",
Details: map[string]interface{}{
"connected_peers": 5,
"min_peers": 1,
},
Timestamp: time.Now(),
}
},
}
healthManager.RegisterCheck(customCheck)
// Election system health check
electionCheck := &HealthCheck{
Name: "election-system",
Description: "Election system health check",
Enabled: true,
Critical: false, // Elections can be temporarily unhealthy
Interval: 30 * time.Second,
Timeout: 5 * time.Second,
Checker: func(ctx context.Context) CheckResult {
// Simulate election system check
healthy := true
message := "Election system operational"
return CheckResult{
Healthy: healthy,
Message: message,
Details: map[string]interface{}{
"current_admin": "node-456",
"election_term": 42,
"last_election": time.Now().Add(-10 * time.Minute),
},
Timestamp: time.Now(),
}
},
}
healthManager.RegisterCheck(electionCheck)
}
// setupShutdownComponents registers components for graceful shutdown
func setupShutdownComponents(shutdownManager *shutdown.Manager, healthManager *Manager) {
// Register health manager for shutdown (high priority to stop health checks early)
healthComponent := shutdown.NewGenericComponent("health-manager", 10, true).
SetShutdownFunc(func(ctx context.Context) error {
return healthManager.Stop()
})
shutdownManager.Register(healthComponent)
// Simulate HTTP server
httpServer := &http.Server{Addr: ":8080"}
httpComponent := shutdown.NewHTTPServerComponent("main-http-server", httpServer, 20)
shutdownManager.Register(httpComponent)
// Simulate P2P node
p2pComponent := shutdown.NewP2PNodeComponent("p2p-node", func() error {
// Simulate P2P node cleanup
time.Sleep(2 * time.Second)
return nil
}, 30)
shutdownManager.Register(p2pComponent)
// Simulate database connections
dbComponent := shutdown.NewDatabaseComponent("database-pool", func() error {
// Simulate database connection cleanup
time.Sleep(1 * time.Second)
return nil
}, 40)
shutdownManager.Register(dbComponent)
// Simulate worker pool
workerStopCh := make(chan struct{})
workerComponent := shutdown.NewWorkerPoolComponent("background-workers", workerStopCh, 5, 50)
shutdownManager.Register(workerComponent)
// Simulate monitoring/metrics system
monitoringComponent := shutdown.NewMonitoringComponent("metrics-system", func() error {
// Simulate metrics system cleanup
time.Sleep(500 * time.Millisecond)
return nil
}, 60)
shutdownManager.Register(monitoringComponent)
}
// setupShutdownHooks adds hooks for different shutdown phases
func setupShutdownHooks(shutdownManager *shutdown.Manager, healthManager *Manager, logger shutdown.Logger) {
// Pre-shutdown hook: Mark system as stopping
shutdownManager.AddHook(shutdown.PhasePreShutdown, func(ctx context.Context) error {
logger.Info("🔄 Pre-shutdown: Marking system as stopping")
// Update health status to stopping
status := healthManager.GetStatus()
status.Status = StatusStopping
status.Message = "System is shutting down"
return nil
})
// Shutdown hook: Log progress
shutdownManager.AddHook(shutdown.PhaseShutdown, func(ctx context.Context) error {
logger.Info("🔄 Shutdown phase: Components are being shut down")
return nil
})
// Post-shutdown hook: Final health status update and cleanup
shutdownManager.AddHook(shutdown.PhasePostShutdown, func(ctx context.Context) error {
logger.Info("🔄 Post-shutdown: Performing final cleanup")
// Any final cleanup that needs to happen after components are shut down
return nil
})
// Cleanup hook: Final logging and state persistence
shutdownManager.AddHook(shutdown.PhaseCleanup, func(ctx context.Context) error {
logger.Info("🔄 Cleanup: Finalizing shutdown process")
// Save any final state, flush logs, etc.
return nil
})
}
// HealthAwareComponent is an example of how to create components that integrate with health monitoring
type HealthAwareComponent struct {
name string
healthManager *Manager
checkName string
isRunning bool
stopCh chan struct{}
}
// NewHealthAwareComponent creates a component that registers its own health check
func NewHealthAwareComponent(name string, healthManager *Manager) *HealthAwareComponent {
comp := &HealthAwareComponent{
name: name,
healthManager: healthManager,
checkName: fmt.Sprintf("%s-health", name),
stopCh: make(chan struct{}),
}
// Register health check for this component
healthCheck := &HealthCheck{
Name: comp.checkName,
Description: fmt.Sprintf("Health check for %s component", name),
Enabled: true,
Critical: false,
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Checker: func(ctx context.Context) CheckResult {
if comp.isRunning {
return CheckResult{
Healthy: true,
Message: fmt.Sprintf("%s is running normally", comp.name),
Timestamp: time.Now(),
}
}
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("%s is not running", comp.name),
Timestamp: time.Now(),
}
},
}
healthManager.RegisterCheck(healthCheck)
return comp
}
// Start starts the component
func (c *HealthAwareComponent) Start() error {
c.isRunning = true
return nil
}
// Name returns the component name
func (c *HealthAwareComponent) Name() string {
return c.name
}
// Priority returns the shutdown priority
func (c *HealthAwareComponent) Priority() int {
return 50
}
// CanForceStop returns whether the component can be force-stopped
func (c *HealthAwareComponent) CanForceStop() bool {
return true
}
// Shutdown gracefully shuts down the component
func (c *HealthAwareComponent) Shutdown(ctx context.Context) error {
c.isRunning = false
close(c.stopCh)
// Unregister health check
c.healthManager.UnregisterCheck(c.checkName)
return nil
}

758
pkg/health/manager.go Normal file
View File

@@ -0,0 +1,758 @@
package health
import (
"context"
"encoding/json"
"fmt"
"net/http"
"sync"
"time"
"chorus.services/bzzz/pkg/shutdown"
)
// Manager provides comprehensive health monitoring and integrates with graceful shutdown
type Manager struct {
mu sync.RWMutex
checks map[string]*HealthCheck
status *SystemStatus
httpServer *http.Server
shutdownManager *shutdown.Manager
ticker *time.Ticker
stopCh chan struct{}
logger Logger
}
// HealthCheck represents a single health check
type HealthCheck struct {
Name string `json:"name"`
Description string `json:"description"`
Checker func(ctx context.Context) CheckResult `json:"-"`
Interval time.Duration `json:"interval"`
Timeout time.Duration `json:"timeout"`
Enabled bool `json:"enabled"`
Critical bool `json:"critical"` // If true, failure triggers shutdown
LastRun time.Time `json:"last_run"`
LastResult *CheckResult `json:"last_result,omitempty"`
}
// CheckResult represents the result of a health check
type CheckResult struct {
Healthy bool `json:"healthy"`
Message string `json:"message"`
Details map[string]interface{} `json:"details,omitempty"`
Latency time.Duration `json:"latency"`
Timestamp time.Time `json:"timestamp"`
Error error `json:"error,omitempty"`
}
// SystemStatus represents the overall system health status
type SystemStatus struct {
Status Status `json:"status"`
Message string `json:"message"`
Checks map[string]*CheckResult `json:"checks"`
Uptime time.Duration `json:"uptime"`
StartTime time.Time `json:"start_time"`
LastUpdate time.Time `json:"last_update"`
Version string `json:"version"`
NodeID string `json:"node_id"`
}
// Status represents health status levels
type Status string
const (
StatusHealthy Status = "healthy"
StatusDegraded Status = "degraded"
StatusUnhealthy Status = "unhealthy"
StatusStarting Status = "starting"
StatusStopping Status = "stopping"
)
// Logger interface for health monitoring
type Logger interface {
Info(msg string, args ...interface{})
Warn(msg string, args ...interface{})
Error(msg string, args ...interface{})
}
// PubSubInterface defines the interface for PubSub health checks
type PubSubInterface interface {
SubscribeToTopic(topic string, handler func([]byte)) error
PublishToTopic(topic string, data interface{}) error
}
// DHTInterface defines the interface for DHT health checks
type DHTInterface interface {
PutValue(ctx context.Context, key string, value []byte) error
GetValue(ctx context.Context, key string) ([]byte, error)
}
// NewManager creates a new health manager
func NewManager(nodeID, version string, logger Logger) *Manager {
if logger == nil {
logger = &defaultLogger{}
}
return &Manager{
checks: make(map[string]*HealthCheck),
status: &SystemStatus{
Status: StatusStarting,
Message: "System starting up",
Checks: make(map[string]*CheckResult),
StartTime: time.Now(),
Version: version,
NodeID: nodeID,
},
stopCh: make(chan struct{}),
logger: logger,
}
}
// RegisterCheck adds a new health check
func (m *Manager) RegisterCheck(check *HealthCheck) {
m.mu.Lock()
defer m.mu.Unlock()
if check.Timeout == 0 {
check.Timeout = 10 * time.Second
}
if check.Interval == 0 {
check.Interval = 30 * time.Second
}
m.checks[check.Name] = check
m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
check.Name, check.Critical, check.Interval)
}
// UnregisterCheck removes a health check
func (m *Manager) UnregisterCheck(name string) {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.checks, name)
delete(m.status.Checks, name)
m.logger.Info("Unregistered health check: %s", name)
}
// Start begins health monitoring
func (m *Manager) Start() error {
m.mu.Lock()
defer m.mu.Unlock()
// Start health check loop
m.ticker = time.NewTicker(5 * time.Second) // Check every 5 seconds
go m.healthCheckLoop()
// Update status to healthy (assuming no critical checks fail immediately)
m.status.Status = StatusHealthy
m.status.Message = "System operational"
m.logger.Info("Health monitoring started")
return nil
}
// Stop stops health monitoring
func (m *Manager) Stop() error {
m.mu.Lock()
defer m.mu.Unlock()
close(m.stopCh)
if m.ticker != nil {
m.ticker.Stop()
}
m.status.Status = StatusStopping
m.status.Message = "System shutting down"
m.logger.Info("Health monitoring stopped")
return nil
}
// StartHTTPServer starts an HTTP server for health endpoints
func (m *Manager) StartHTTPServer(port int) error {
mux := http.NewServeMux()
// Health check endpoint
mux.HandleFunc("/health", m.handleHealth)
mux.HandleFunc("/health/ready", m.handleReady)
mux.HandleFunc("/health/live", m.handleLive)
mux.HandleFunc("/health/checks", m.handleChecks)
m.httpServer = &http.Server{
Addr: fmt.Sprintf(":%d", port),
Handler: mux,
}
go func() {
if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
m.logger.Error("Health HTTP server error: %v", err)
}
}()
m.logger.Info("Health HTTP server started on port %d", port)
return nil
}
// SetShutdownManager sets the shutdown manager for critical health failures
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
m.shutdownManager = shutdownManager
}
// GetStatus returns the current system status
func (m *Manager) GetStatus() *SystemStatus {
m.mu.RLock()
defer m.mu.RUnlock()
// Create a copy to avoid race conditions
status := *m.status
status.Uptime = time.Since(m.status.StartTime)
status.LastUpdate = time.Now()
// Copy checks
status.Checks = make(map[string]*CheckResult)
for name, result := range m.status.Checks {
if result != nil {
resultCopy := *result
status.Checks[name] = &resultCopy
}
}
return &status
}
// healthCheckLoop runs health checks periodically
func (m *Manager) healthCheckLoop() {
defer m.ticker.Stop()
for {
select {
case <-m.ticker.C:
m.runHealthChecks()
case <-m.stopCh:
return
}
}
}
// runHealthChecks executes all registered health checks
func (m *Manager) runHealthChecks() {
m.mu.RLock()
checks := make([]*HealthCheck, 0, len(m.checks))
for _, check := range m.checks {
if check.Enabled && time.Since(check.LastRun) >= check.Interval {
checks = append(checks, check)
}
}
m.mu.RUnlock()
if len(checks) == 0 {
return
}
for _, check := range checks {
go m.executeHealthCheck(check)
}
}
// executeHealthCheck runs a single health check
func (m *Manager) executeHealthCheck(check *HealthCheck) {
ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
defer cancel()
start := time.Now()
result := check.Checker(ctx)
result.Latency = time.Since(start)
result.Timestamp = time.Now()
m.mu.Lock()
check.LastRun = time.Now()
check.LastResult = &result
m.status.Checks[check.Name] = &result
m.mu.Unlock()
// Log health check results
if result.Healthy {
m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
} else {
m.logger.Warn("Health check failed: %s - %s (latency: %v)",
check.Name, result.Message, result.Latency)
// If this is a critical check and it failed, consider shutdown
if check.Critical && m.shutdownManager != nil {
m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
m.shutdownManager.Stop()
}
}
// Update overall system status
m.updateSystemStatus()
}
// updateSystemStatus recalculates the overall system status
func (m *Manager) updateSystemStatus() {
m.mu.Lock()
defer m.mu.Unlock()
var healthyChecks, totalChecks, criticalFailures int
for _, result := range m.status.Checks {
totalChecks++
if result.Healthy {
healthyChecks++
} else {
// Check if this is a critical check
if check, exists := m.checks[result.Timestamp.String()]; exists && check.Critical {
criticalFailures++
}
}
}
// Determine overall status
if criticalFailures > 0 {
m.status.Status = StatusUnhealthy
m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
} else if totalChecks == 0 {
m.status.Status = StatusStarting
m.status.Message = "No health checks configured"
} else if healthyChecks == totalChecks {
m.status.Status = StatusHealthy
m.status.Message = "All health checks passing"
} else {
m.status.Status = StatusDegraded
m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
healthyChecks, totalChecks)
}
}
// HTTP Handlers
func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
status := m.GetStatus()
w.Header().Set("Content-Type", "application/json")
// Set HTTP status code based on health
switch status.Status {
case StatusHealthy:
w.WriteHeader(http.StatusOK)
case StatusDegraded:
w.WriteHeader(http.StatusOK) // Still OK, but degraded
case StatusUnhealthy:
w.WriteHeader(http.StatusServiceUnavailable)
case StatusStarting:
w.WriteHeader(http.StatusServiceUnavailable)
case StatusStopping:
w.WriteHeader(http.StatusServiceUnavailable)
}
json.NewEncoder(w).Encode(status)
}
func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
status := m.GetStatus()
w.Header().Set("Content-Type", "application/json")
// Ready means we can handle requests
if status.Status == StatusHealthy || status.Status == StatusDegraded {
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]interface{}{
"ready": true,
"status": status.Status,
"message": status.Message,
})
} else {
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]interface{}{
"ready": false,
"status": status.Status,
"message": status.Message,
})
}
}
func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
status := m.GetStatus()
w.Header().Set("Content-Type", "application/json")
// Live means the process is running (not necessarily healthy)
if status.Status != StatusStopping {
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]interface{}{
"live": true,
"status": status.Status,
"uptime": status.Uptime.String(),
})
} else {
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]interface{}{
"live": false,
"status": status.Status,
"message": "System is shutting down",
})
}
}
func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
status := m.GetStatus()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]interface{}{
"checks": status.Checks,
"total": len(status.Checks),
"timestamp": time.Now(),
})
}
// Predefined health checks
// CreateDatabaseCheck creates a health check for database connectivity
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
return &HealthCheck{
Name: name,
Description: fmt.Sprintf("Database connectivity check for %s", name),
Enabled: true,
Critical: true,
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
err := pingFunc()
if err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("Database ping failed: %v", err),
Error: err,
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
return CheckResult{
Healthy: true,
Message: "Database connectivity OK",
Timestamp: time.Now(),
Latency: time.Since(start),
}
},
}
}
// CreateDiskSpaceCheck creates a health check for disk space
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
return &HealthCheck{
Name: fmt.Sprintf("disk-space-%s", path),
Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
Enabled: true,
Critical: false,
Interval: 60 * time.Second,
Timeout: 5 * time.Second,
Checker: func(ctx context.Context) CheckResult {
// In a real implementation, you would check actual disk usage
// For now, we'll simulate it
usage := 0.75 // Simulate 75% usage
if usage > threshold {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%",
usage*100, threshold*100),
Details: map[string]interface{}{
"path": path,
"usage": usage,
"threshold": threshold,
},
Timestamp: time.Now(),
}
}
return CheckResult{
Healthy: true,
Message: fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100),
Details: map[string]interface{}{
"path": path,
"usage": usage,
"threshold": threshold,
},
Timestamp: time.Now(),
}
},
}
}
// CreateMemoryCheck creates a health check for memory usage
func CreateMemoryCheck(threshold float64) *HealthCheck {
return &HealthCheck{
Name: "memory-usage",
Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
Enabled: true,
Critical: false,
Interval: 30 * time.Second,
Timeout: 5 * time.Second,
Checker: func(ctx context.Context) CheckResult {
// In a real implementation, you would check actual memory usage
usage := 0.60 // Simulate 60% usage
if usage > threshold {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%",
usage*100, threshold*100),
Details: map[string]interface{}{
"usage": usage,
"threshold": threshold,
},
Timestamp: time.Now(),
}
}
return CheckResult{
Healthy: true,
Message: fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100),
Details: map[string]interface{}{
"usage": usage,
"threshold": threshold,
},
Timestamp: time.Now(),
}
},
}
}
// CreateActivePubSubCheck creates an active health check for PubSub system
func CreateActivePubSubCheck(pubsub PubSubInterface) *HealthCheck {
return &HealthCheck{
Name: "pubsub-active-probe",
Description: "Active PubSub system health probe with loopback test",
Enabled: true,
Critical: false,
Interval: 60 * time.Second,
Timeout: 15 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// Generate unique test message
testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
testMessage := map[string]interface{}{
"test_key": testKey,
"timestamp": time.Now().Unix(),
"probe_id": "pubsub-health-check",
}
// Channel to receive test message
resultCh := make(chan bool, 1)
errorCh := make(chan error, 1)
// Set up message handler for test topic
handler := func(data []byte) {
var received map[string]interface{}
if err := json.Unmarshal(data, &received); err != nil {
return
}
if receivedKey, ok := received["test_key"].(string); ok && receivedKey == testKey {
select {
case resultCh <- true:
default:
}
}
}
// Subscribe to test topic
testTopic := "bzzz/health-test/v1"
if err := pubsub.SubscribeToTopic(testTopic, handler); err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("Failed to subscribe to test topic: %v", err),
Error: err,
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
// Allow subscription to settle
time.Sleep(500 * time.Millisecond)
// Publish test message
go func() {
if err := pubsub.PublishToTopic(testTopic, testMessage); err != nil {
errorCh <- err
}
}()
// Wait for result with timeout
select {
case <-resultCh:
latency := time.Since(start)
return CheckResult{
Healthy: true,
Message: fmt.Sprintf("PubSub loopback test successful"),
Details: map[string]interface{}{
"test_topic": testTopic,
"test_key": testKey,
"latency_ms": latency.Milliseconds(),
},
Timestamp: time.Now(),
Latency: latency,
}
case err := <-errorCh:
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("Failed to publish test message: %v", err),
Error: err,
Timestamp: time.Now(),
Latency: time.Since(start),
}
case <-time.After(10 * time.Second):
return CheckResult{
Healthy: false,
Message: "PubSub loopback test timeout - message not received",
Details: map[string]interface{}{
"test_topic": testTopic,
"test_key": testKey,
"timeout": "10s",
},
Timestamp: time.Now(),
Latency: time.Since(start),
}
case <-ctx.Done():
return CheckResult{
Healthy: false,
Message: "PubSub health check cancelled",
Details: map[string]interface{}{
"test_topic": testTopic,
"reason": "context_cancelled",
},
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
},
}
}
// CreateActiveDHTCheck creates an active health check for DHT system
func CreateActiveDHTCheck(dht DHTInterface) *HealthCheck {
return &HealthCheck{
Name: "dht-active-probe",
Description: "Active DHT system health probe with put/get test",
Enabled: true,
Critical: false,
Interval: 90 * time.Second,
Timeout: 20 * time.Second,
Checker: func(ctx context.Context) CheckResult {
start := time.Now()
// Generate unique test key and value
testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
testValue := []byte(fmt.Sprintf(`{"test_key":"%s","timestamp":%d,"probe_id":"dht-health-check"}`,
testKey, time.Now().Unix()))
// Test DHT put operation
putStart := time.Now()
if err := dht.PutValue(ctx, testKey, testValue); err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("DHT put operation failed: %v", err),
Details: map[string]interface{}{
"test_key": testKey,
"operation": "put",
"put_latency": time.Since(putStart).Milliseconds(),
},
Error: err,
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
putLatency := time.Since(putStart)
// Allow some time for propagation
time.Sleep(100 * time.Millisecond)
// Test DHT get operation
getStart := time.Now()
retrievedValue, err := dht.GetValue(ctx, testKey)
if err != nil {
return CheckResult{
Healthy: false,
Message: fmt.Sprintf("DHT get operation failed: %v", err),
Details: map[string]interface{}{
"test_key": testKey,
"operation": "get",
"put_latency": putLatency.Milliseconds(),
"get_latency": time.Since(getStart).Milliseconds(),
},
Error: err,
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
getLatency := time.Since(getStart)
// Verify retrieved value matches
if string(retrievedValue) != string(testValue) {
return CheckResult{
Healthy: false,
Message: "DHT data integrity check failed - retrieved value doesn't match",
Details: map[string]interface{}{
"test_key": testKey,
"expected_len": len(testValue),
"retrieved_len": len(retrievedValue),
"put_latency": putLatency.Milliseconds(),
"get_latency": getLatency.Milliseconds(),
"total_latency": time.Since(start).Milliseconds(),
},
Timestamp: time.Now(),
Latency: time.Since(start),
}
}
totalLatency := time.Since(start)
// Get DHT statistics if available
var stats interface{}
if statsProvider, ok := dht.(interface{ GetStats() interface{} }); ok {
stats = statsProvider.GetStats()
}
return CheckResult{
Healthy: true,
Message: "DHT put/get test successful",
Details: map[string]interface{}{
"test_key": testKey,
"put_latency": putLatency.Milliseconds(),
"get_latency": getLatency.Milliseconds(),
"total_latency": totalLatency.Milliseconds(),
"data_integrity": "verified",
"stats": stats,
},
Timestamp: time.Now(),
Latency: totalLatency,
}
},
}
}
// defaultLogger is a simple logger implementation
type defaultLogger struct{}
func (l *defaultLogger) Info(msg string, args ...interface{}) {
fmt.Printf("[INFO] "+msg+"\n", args...)
}
func (l *defaultLogger) Warn(msg string, args ...interface{}) {
fmt.Printf("[WARN] "+msg+"\n", args...)
}
func (l *defaultLogger) Error(msg string, args ...interface{}) {
fmt.Printf("[ERROR] "+msg+"\n", args...)
}