// Package health provides periodic health checking, an aggregated system
// status, and HTTP endpoints that expose that status.
package health

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/anthonyrawlins/bzzz/pkg/shutdown"
)

const (
	// defaultCheckTimeout is applied by RegisterCheck when a check has no Timeout.
	defaultCheckTimeout = 10 * time.Second
	// defaultCheckInterval is applied by RegisterCheck when a check has no Interval.
	defaultCheckInterval = 30 * time.Second
	// schedulerTick is how often the loop looks for checks that are due.
	schedulerTick = 5 * time.Second
)

// Manager runs registered health checks on a schedule, aggregates their
// results into a SystemStatus, serves that status over HTTP, and can ask a
// shutdown manager to stop the process when a critical check fails.
//
// A Manager must be created with NewManager.
type Manager struct {
	mu              sync.RWMutex                // guards every field below except stopCh, stopOnce and logger
	checks          map[string]*HealthCheck     // registered checks keyed by name
	inFlight        map[*HealthCheck]struct{}   // checks currently executing
	status          *SystemStatus               // aggregated status and latest results
	httpServer      *http.Server                // set by StartHTTPServer
	shutdownManager *shutdown.Manager           // optional; set by SetShutdownManager
	ticker          *time.Ticker                // set by Start
	stopCh          chan struct{}               // closed once by Stop
	stopOnce        sync.Once                   // makes Stop safe to call repeatedly
	logger          Logger
}

// HealthCheck represents a single health check.
type HealthCheck struct {
	Name        string                                `json:"name"`
	Description string                                `json:"description"`
	Checker     func(ctx context.Context) CheckResult `json:"-"`
	Interval    time.Duration                         `json:"interval"`
	Timeout     time.Duration                         `json:"timeout"`
	Enabled     bool                                  `json:"enabled"`
	Critical    bool                                  `json:"critical"` // If true, failure triggers shutdown
	LastRun     time.Time                             `json:"last_run"`
	LastResult  *CheckResult                          `json:"last_result,omitempty"`
}

// CheckResult represents the result of a health check.
type CheckResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Details   map[string]interface{} `json:"details,omitempty"`
	Latency   time.Duration          `json:"latency"`
	Timestamp time.Time              `json:"timestamp"`
	Error     error                  `json:"error,omitempty"`
}

// MarshalJSON encodes the result with Error rendered as its message string.
// Without this, encoding/json would encode most error values as "{}" because
// they have no exported fields.
func (r CheckResult) MarshalJSON() ([]byte, error) {
	// plain has the same fields as CheckResult but no methods, so marshaling
	// it does not call back into this function.
	type plain CheckResult
	wire := struct {
		plain
		// Error shadows plain.Error because it is declared at a shallower depth.
		Error string `json:"error,omitempty"`
	}{plain: plain(r)}
	if r.Error != nil {
		wire.Error = r.Error.Error()
	}
	return json.Marshal(wire)
}

// SystemStatus represents the overall system health status.
type SystemStatus struct {
	Status     Status                  `json:"status"`
	Message    string                  `json:"message"`
	Checks     map[string]*CheckResult `json:"checks"`
	Uptime     time.Duration           `json:"uptime"`
	StartTime  time.Time               `json:"start_time"`
	LastUpdate time.Time               `json:"last_update"`
	Version    string                  `json:"version"`
	NodeID     string                  `json:"node_id"`
}

// Status represents health status levels.
type Status string

// Health status levels reported in SystemStatus.Status.
const (
	StatusHealthy   Status = "healthy"
	StatusDegraded  Status = "degraded"
	StatusUnhealthy Status = "unhealthy"
	StatusStarting  Status = "starting"
	StatusStopping  Status = "stopping"
)

// Logger is the logging interface used by the health manager. The msg
// argument is a fmt-style format string and args are its operands.
type Logger interface {
	Info(msg string, args ...interface{})
	Warn(msg string, args ...interface{})
	Error(msg string, args ...interface{})
}

// NewManager creates a health manager in the "starting" state. When logger is
// nil a logger that prints to standard output is used.
func NewManager(nodeID, version string, logger Logger) *Manager {
	if logger == nil {
		logger = &defaultLogger{}
	}

	return &Manager{
		checks:   make(map[string]*HealthCheck),
		inFlight: make(map[*HealthCheck]struct{}),
		status: &SystemStatus{
			Status:    StatusStarting,
			Message:   "System starting up",
			Checks:    make(map[string]*CheckResult),
			StartTime: time.Now(),
			Version:   version,
			NodeID:    nodeID,
		},
		stopCh: make(chan struct{}),
		logger: logger,
	}
}

// RegisterCheck adds a health check, replacing any existing check with the
// same name. A zero Timeout or Interval is replaced with a default (10s and
// 30s respectively); the defaults are written into check itself. A nil check
// is ignored.
func (m *Manager) RegisterCheck(check *HealthCheck) {
	if check == nil {
		m.logger.Warn("Ignoring nil health check registration")
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if check.Timeout == 0 {
		check.Timeout = defaultCheckTimeout
	}
	if check.Interval == 0 {
		check.Interval = defaultCheckInterval
	}

	m.checks[check.Name] = check
	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
		check.Name, check.Critical, check.Interval)
}

// UnregisterCheck removes a health check and its last recorded result.
func (m *Manager) UnregisterCheck(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	delete(m.checks, name)
	delete(m.status.Checks, name)
	m.logger.Info("Unregistered health check: %s", name)
}

// Start begins health monitoring and marks the system healthy. It returns an
// error if the manager was already started or has already been stopped.
func (m *Manager) Start() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	select {
	case <-m.stopCh:
		// stopCh is closed for good; a new loop would exit immediately.
		return errors.New("health manager already stopped")
	default:
	}
	if m.ticker != nil {
		// A second ticker and loop goroutine would leak the first ones.
		return errors.New("health manager already started")
	}

	m.ticker = time.NewTicker(schedulerTick)
	go m.healthCheckLoop()

	// Update status to healthy (assuming no critical checks fail immediately).
	m.status.Status = StatusHealthy
	m.status.Message = "System operational"

	m.logger.Info("Health monitoring started")
	return nil
}

// Stop stops health monitoring and marks the system as stopping. It is safe
// to call more than once. The HTTP server is left running so the endpoints
// can report the stopping state; use StopHTTPServer to shut it down.
func (m *Manager) Stop() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Closing an already-closed channel panics, so close exactly once.
	m.stopOnce.Do(func() {
		if m.stopCh != nil {
			close(m.stopCh)
		}
	})
	if m.ticker != nil {
		m.ticker.Stop()
	}

	m.status.Status = StatusStopping
	m.status.Message = "System shutting down"

	m.logger.Info("Health monitoring stopped")
	return nil
}

// StartHTTPServer starts an HTTP server exposing /health, /health/ready,
// /health/live and /health/checks on the given port. The listener is opened
// synchronously, so a bind failure (for example a port already in use) is
// returned to the caller instead of only being logged.
func (m *Manager) StartHTTPServer(port int) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/health", m.handleHealth)
	mux.HandleFunc("/health/ready", m.handleReady)
	mux.HandleFunc("/health/live", m.handleLive)
	mux.HandleFunc("/health/checks", m.handleChecks)

	srv := &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
		// Bound how long a slow or stalled client can hold a connection.
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		WriteTimeout:      10 * time.Second,
		IdleTimeout:       60 * time.Second,
	}

	m.mu.Lock()
	if m.httpServer != nil {
		m.mu.Unlock()
		return errors.New("health HTTP server already started")
	}
	ln, err := net.Listen("tcp", srv.Addr)
	if err != nil {
		m.mu.Unlock()
		return fmt.Errorf("starting health HTTP server on port %d: %w", port, err)
	}
	m.httpServer = srv
	m.mu.Unlock()

	go func() {
		if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
			m.logger.Error("Health HTTP server error: %v", err)
		}
	}()

	m.logger.Info("Health HTTP server started on port %d", port)
	return nil
}

// StopHTTPServer gracefully shuts down the server started by StartHTTPServer,
// waiting for in-flight requests until ctx is done. It returns nil when no
// server is running.
func (m *Manager) StopHTTPServer(ctx context.Context) error {
	m.mu.Lock()
	srv := m.httpServer
	m.httpServer = nil
	m.mu.Unlock()

	if srv == nil {
		return nil
	}
	// Shutdown runs without m.mu held: handlers take the read lock, so waiting
	// for them while holding the write lock could stall until ctx expires.
	return srv.Shutdown(ctx)
}

// SetShutdownManager sets the shutdown manager that is stopped when a
// critical health check fails.
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.shutdownManager = shutdownManager
}

// GetStatus returns a snapshot of the current system status. Uptime and
// LastUpdate are computed at call time. Each CheckResult is copied, but the
// Details maps are shared with the stored results.
func (m *Manager) GetStatus() *SystemStatus {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// Copy so callers never alias the manager's mutable state.
	snapshot := *m.status
	snapshot.Uptime = time.Since(m.status.StartTime)
	snapshot.LastUpdate = time.Now()

	snapshot.Checks = make(map[string]*CheckResult, len(m.status.Checks))
	for name, result := range m.status.Checks {
		if result != nil {
			resultCopy := *result
			snapshot.Checks[name] = &resultCopy
		}
	}

	return &snapshot
}

// healthCheckLoop runs due health checks on every scheduler tick until Stop
// closes stopCh.
func (m *Manager) healthCheckLoop() {
	defer m.ticker.Stop()

	for {
		select {
		case <-m.ticker.C:
			m.runHealthChecks()
		case <-m.stopCh:
			return
		}
	}
}

// runHealthChecks launches every enabled check whose interval has elapsed and
// that is not already executing, each in its own goroutine.
func (m *Manager) runHealthChecks() {
	m.mu.Lock()
	due := make([]*HealthCheck, 0, len(m.checks))
	for _, check := range m.checks {
		if !check.Enabled {
			continue
		}
		// LastRun is only updated when a run finishes, so without this guard a
		// check slower than the scheduler tick would be started again while
		// the previous run is still in progress.
		if _, busy := m.inFlight[check]; busy {
			continue
		}
		if time.Since(check.LastRun) < check.Interval {
			continue
		}
		m.inFlight[check] = struct{}{}
		due = append(due, check)
	}
	m.mu.Unlock()

	for _, check := range due {
		go m.executeHealthCheck(check)
	}
}

// executeHealthCheck runs a single health check with its configured timeout,
// records the result, recalculates the overall status, and stops the shutdown
// manager when a critical check fails.
func (m *Manager) executeHealthCheck(check *HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	start := time.Now()
	var result CheckResult
	if check.Checker == nil {
		// Report a misconfigured check as a failure instead of panicking.
		result = CheckResult{Message: "health check has no checker function"}
	} else {
		result = check.Checker(ctx)
	}
	result.Latency = time.Since(start)
	result.Timestamp = time.Now()

	m.mu.Lock()
	delete(m.inFlight, check)
	// The check may have been unregistered or replaced while it was running;
	// in that case its result must not reappear in the status.
	registered := m.checks[check.Name] == check
	if registered {
		check.LastRun = result.Timestamp
		check.LastResult = &result
		m.status.Checks[check.Name] = &result
		m.recomputeStatusLocked()
	}
	shutdownManager := m.shutdownManager
	m.mu.Unlock()

	if !registered {
		return
	}

	if result.Healthy {
		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
		return
	}

	m.logger.Warn("Health check failed: %s - %s (latency: %v)",
		check.Name, result.Message, result.Latency)

	// If this is a critical check and it failed, initiate shutdown.
	if check.Critical && shutdownManager != nil {
		m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
		shutdownManager.Stop()
	}
}

// updateSystemStatus recalculates the overall system status from the latest
// check results.
func (m *Manager) updateSystemStatus() {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.recomputeStatusLocked()
}

// recomputeStatusLocked derives the overall status from the recorded results.
// The caller must hold m.mu for writing.
func (m *Manager) recomputeStatusLocked() {
	// Once stopping, a late check result must not flip the status back.
	if m.status.Status == StatusStopping {
		return
	}

	var healthyChecks, totalChecks, criticalFailures int
	for name, result := range m.status.Checks {
		if result == nil {
			continue
		}
		totalChecks++
		if result.Healthy {
			healthyChecks++
			continue
		}
		// Results are keyed by check name, which is also the key of m.checks.
		if check, exists := m.checks[name]; exists && check.Critical {
			criticalFailures++
		}
	}

	switch {
	case criticalFailures > 0:
		m.status.Status = StatusUnhealthy
		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
	case totalChecks == 0:
		m.status.Status = StatusStarting
		m.status.Message = "No health checks configured"
	case healthyChecks == totalChecks:
		m.status.Status = StatusHealthy
		m.status.Message = "All health checks passing"
	default:
		m.status.Status = StatusDegraded
		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
			healthyChecks, totalChecks)
	}
}

// HTTP Handlers

// writeJSON sends payload as a JSON response with the given status code and
// logs any encoding failure.
func (m *Manager) writeJSON(w http.ResponseWriter, code int, payload interface{}) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	if err := json.NewEncoder(w).Encode(payload); err != nil {
		m.logger.Error("Failed to encode health response: %v", err)
	}
}

// handleHealth serves the full system status. Healthy and degraded map to
// 200; unhealthy, starting and stopping map to 503.
func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	code := http.StatusOK // degraded is still OK
	switch status.Status {
	case StatusUnhealthy, StatusStarting, StatusStopping:
		code = http.StatusServiceUnavailable
	}

	m.writeJSON(w, code, status)
}

// handleReady reports readiness: 200 when the system is healthy or degraded,
// 503 otherwise.
func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	// Ready means we can handle requests.
	ready := status.Status == StatusHealthy || status.Status == StatusDegraded
	code := http.StatusServiceUnavailable
	if ready {
		code = http.StatusOK
	}

	m.writeJSON(w, code, map[string]interface{}{
		"ready":   ready,
		"status":  status.Status,
		"message": status.Message,
	})
}

// handleLive reports liveness: 200 unless the system is stopping, in which
// case 503.
func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	// Live means the process is running (not necessarily healthy).
	if status.Status != StatusStopping {
		m.writeJSON(w, http.StatusOK, map[string]interface{}{
			"live":   true,
			"status": status.Status,
			"uptime": status.Uptime.String(),
		})
		return
	}

	m.writeJSON(w, http.StatusServiceUnavailable, map[string]interface{}{
		"live":    false,
		"status":  status.Status,
		"message": "System is shutting down",
	})
}

// handleChecks serves the latest result of every check; it always responds
// with 200.
func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	m.writeJSON(w, http.StatusOK, map[string]interface{}{
		"checks":    status.Checks,
		"total":     len(status.Checks),
		"timestamp": time.Now(),
	})
}

// Predefined health checks

// CreateDatabaseCheck creates a critical health check that calls pingFunc
// every 30 seconds. pingFunc cannot be cancelled, so it runs in its own
// goroutine and the check reports failure when the check's context ends
// first; a ping that never returns leaves that goroutine blocked.
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
	return &HealthCheck{
		Name:        name,
		Description: fmt.Sprintf("Database connectivity check for %s", name),
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			var err error
			if pingFunc == nil {
				err = errors.New("no ping function configured")
			} else {
				// Buffered so the goroutine can finish even after a timeout.
				done := make(chan error, 1)
				go func() { done <- pingFunc() }()
				select {
				case err = <-done:
				case <-ctx.Done():
					err = ctx.Err()
				}
			}

			if err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping failed: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			return CheckResult{
				Healthy:   true,
				Message:   "Database connectivity OK",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}
		},
	}
}

// usageResult builds the result shared by the usage-threshold checks: the
// check is unhealthy when usage is strictly greater than threshold. Both
// values are fractions (0.75 means 75%).
func usageResult(resource string, usage, threshold float64, details map[string]interface{}) CheckResult {
	result := CheckResult{
		Details:   details,
		Timestamp: time.Now(),
	}
	if usage > threshold {
		result.Message = fmt.Sprintf("%s usage %.1f%% exceeds threshold %.1f%%",
			resource, usage*100, threshold*100)
		return result
	}

	result.Healthy = true
	result.Message = fmt.Sprintf("%s usage %.1f%% is within threshold", resource, usage*100)
	return result
}

// CreateDiskSpaceCheck creates a non-critical health check for disk space on
// path, run every 60 seconds. threshold is a fraction (0.9 means 90%).
//
// The check does not measure the disk yet: usage is a fixed simulated 75%.
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        fmt.Sprintf("disk-space-%s", path),
		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual disk usage.
			usage := 0.75 // Simulate 75% usage

			return usageResult("Disk", usage, threshold, map[string]interface{}{
				"path":      path,
				"usage":     usage,
				"threshold": threshold,
			})
		},
	}
}

// CreateMemoryCheck creates a non-critical health check for memory usage, run
// every 30 seconds. threshold is a fraction (0.9 means 90%).
//
// The check does not measure memory yet: usage is a fixed simulated 60%.
func CreateMemoryCheck(threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        "memory-usage",
		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual memory usage.
			usage := 0.60 // Simulate 60% usage

			return usageResult("Memory", usage, threshold, map[string]interface{}{
				"usage":     usage,
				"threshold": threshold,
			})
		},
	}
}

// defaultLogger is a simple Logger that prints to standard output.
type defaultLogger struct{}

// Info prints an informational message.
func (l *defaultLogger) Info(msg string, args ...interface{}) {
	fmt.Printf("[INFO] "+msg+"\n", args...)
}

// Warn prints a warning message.
func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	fmt.Printf("[WARN] "+msg+"\n", args...)
}

// Error prints an error message.
func (l *defaultLogger) Error(msg string, args ...interface{}) {
	fmt.Printf("[ERROR] "+msg+"\n", args...)
}