This comprehensive refactoring addresses critical architectural issues:

IMPORT CYCLE RESOLUTION:
• pkg/crypto ↔ pkg/slurp/roles: Created pkg/security/access_levels.go
• pkg/ucxl → pkg/dht: Created pkg/storage/interfaces.go
• pkg/slurp/leader → pkg/election → pkg/slurp/storage: Moved types to pkg/election/interfaces.go

MODULE PATH MIGRATION:
• Changed from github.com/anthonyrawlins/bzzz to chorus.services/bzzz
• Updated all import statements across 115+ files
• Maintains compatibility while removing the personal GitHub account dependency

TYPE SYSTEM IMPROVEMENTS:
• Resolved duplicate type declarations in the crypto package
• Added missing type definitions (RoleStatus, TimeRestrictions, KeyStatus, KeyRotationResult)
• Proper interface segregation to prevent future cycles

ARCHITECTURAL BENEFITS:
• Build now progresses past structural issues to normal dependency resolution
• Cleaner separation of concerns between packages
• Eliminates circular dependencies that prevented compilation
• Establishes a foundation for scalable codebase growth

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
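
The cycle fixes all follow the same interface-segregation pattern: shared types move into a leaf package that both former dependants import. A minimal sketch of the idea behind pkg/security/access_levels.go (the AccessLevel type and RoleAccessor interface below are illustrative assumptions, not the file's actual contents):

// Package security is a leaf package: it imports neither pkg/crypto nor
// pkg/slurp/roles, so both can depend on it without re-forming the cycle.
package security

// AccessLevel is the kind of shared type that previously forced
// pkg/crypto and pkg/slurp/roles to import each other. (Illustrative.)
type AccessLevel int

const (
	AccessPublic AccessLevel = iota
	AccessInternal
	AccessRestricted
)

// RoleAccessor is the narrow interface a consumer like pkg/crypto needs;
// pkg/slurp/roles can implement it without being imported directly.
type RoleAccessor interface {
	AccessLevelFor(role string) AccessLevel
}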
529 lines
14 KiB
Go
package health

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"sync"
	"time"

	"chorus.services/bzzz/pkg/shutdown"
)

// Manager provides comprehensive health monitoring and integrates with graceful shutdown
type Manager struct {
	mu              sync.RWMutex
	checks          map[string]*HealthCheck
	status          *SystemStatus
	httpServer      *http.Server
	shutdownManager *shutdown.Manager
	ticker          *time.Ticker
	stopCh          chan struct{}
	logger          Logger
}

// HealthCheck represents a single health check
type HealthCheck struct {
	Name        string                                `json:"name"`
	Description string                                `json:"description"`
	Checker     func(ctx context.Context) CheckResult `json:"-"`
	Interval    time.Duration                         `json:"interval"`
	Timeout     time.Duration                         `json:"timeout"`
	Enabled     bool                                  `json:"enabled"`
	Critical    bool                                  `json:"critical"` // If true, failure triggers shutdown
	LastRun     time.Time                             `json:"last_run"`
	LastResult  *CheckResult                          `json:"last_result,omitempty"`
}

// CheckResult represents the result of a health check
type CheckResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Details   map[string]interface{} `json:"details,omitempty"`
	Latency   time.Duration          `json:"latency"`
	Timestamp time.Time              `json:"timestamp"`
	Error     error                  `json:"error,omitempty"`
}

// SystemStatus represents the overall system health status
type SystemStatus struct {
	Status     Status                  `json:"status"`
	Message    string                  `json:"message"`
	Checks     map[string]*CheckResult `json:"checks"`
	Uptime     time.Duration           `json:"uptime"`
	StartTime  time.Time               `json:"start_time"`
	LastUpdate time.Time               `json:"last_update"`
	Version    string                  `json:"version"`
	NodeID     string                  `json:"node_id"`
}

// Status represents health status levels
type Status string

const (
	StatusHealthy   Status = "healthy"
	StatusDegraded  Status = "degraded"
	StatusUnhealthy Status = "unhealthy"
	StatusStarting  Status = "starting"
	StatusStopping  Status = "stopping"
)

// Logger interface for health monitoring
type Logger interface {
	Info(msg string, args ...interface{})
	Warn(msg string, args ...interface{})
	Error(msg string, args ...interface{})
}

// NewManager creates a new health manager
func NewManager(nodeID, version string, logger Logger) *Manager {
	if logger == nil {
		logger = &defaultLogger{}
	}

	return &Manager{
		checks: make(map[string]*HealthCheck),
		status: &SystemStatus{
			Status:    StatusStarting,
			Message:   "System starting up",
			Checks:    make(map[string]*CheckResult),
			StartTime: time.Now(),
			Version:   version,
			NodeID:    nodeID,
		},
		stopCh: make(chan struct{}),
		logger: logger,
	}
}
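
// Usage sketch (illustrative; the node ID, version string, and memory
// threshold below are assumptions, not values from this repository):
//
//	mgr := NewManager("node-1", "v0.1.0", nil) // nil logger falls back to defaultLogger
//	mgr.RegisterCheck(CreateMemoryCheck(0.90))
//	if err := mgr.Start(); err != nil {
//		log.Fatal(err)
//	}
//	defer mgr.Stop()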

// RegisterCheck adds a new health check
func (m *Manager) RegisterCheck(check *HealthCheck) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if check.Timeout == 0 {
		check.Timeout = 10 * time.Second
	}
	if check.Interval == 0 {
		check.Interval = 30 * time.Second
	}

	m.checks[check.Name] = check
	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
		check.Name, check.Critical, check.Interval)
}

// UnregisterCheck removes a health check
func (m *Manager) UnregisterCheck(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	delete(m.checks, name)
	delete(m.status.Checks, name)
	m.logger.Info("Unregistered health check: %s", name)
}

// Start begins health monitoring
func (m *Manager) Start() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Start health check loop
	m.ticker = time.NewTicker(5 * time.Second) // Check every 5 seconds
	go m.healthCheckLoop()

	// Update status to healthy (assuming no critical checks fail immediately)
	m.status.Status = StatusHealthy
	m.status.Message = "System operational"

	m.logger.Info("Health monitoring started")
	return nil
}

// Stop stops health monitoring
func (m *Manager) Stop() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	close(m.stopCh)
	if m.ticker != nil {
		m.ticker.Stop()
	}

	m.status.Status = StatusStopping
	m.status.Message = "System shutting down"

	m.logger.Info("Health monitoring stopped")
	return nil
}

// StartHTTPServer starts an HTTP server for health endpoints
func (m *Manager) StartHTTPServer(port int) error {
	mux := http.NewServeMux()

	// Health check endpoints
	mux.HandleFunc("/health", m.handleHealth)
	mux.HandleFunc("/health/ready", m.handleReady)
	mux.HandleFunc("/health/live", m.handleLive)
	mux.HandleFunc("/health/checks", m.handleChecks)

	m.httpServer = &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
	}

	go func() {
		if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			m.logger.Error("Health HTTP server error: %v", err)
		}
	}()

	m.logger.Info("Health HTTP server started on port %d", port)
	return nil
}
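
// Endpoint summary (paths as registered above; status codes follow the
// handlers further down in this file):
//
//	GET /health         full SystemStatus JSON; 503 when unhealthy, starting, or stopping
//	GET /health/ready   readiness: 200 while healthy or degraded, else 503
//	GET /health/live    liveness: 200 unless the system is stopping
//	GET /health/checks  individual check results, always 200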

// SetShutdownManager sets the shutdown manager for critical health failures
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
	m.shutdownManager = shutdownManager
}

// GetStatus returns the current system status
func (m *Manager) GetStatus() *SystemStatus {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// Create a copy to avoid race conditions
	status := *m.status
	status.Uptime = time.Since(m.status.StartTime)
	status.LastUpdate = time.Now()

	// Copy checks
	status.Checks = make(map[string]*CheckResult)
	for name, result := range m.status.Checks {
		if result != nil {
			resultCopy := *result
			status.Checks[name] = &resultCopy
		}
	}

	return &status
}

// healthCheckLoop runs health checks periodically
func (m *Manager) healthCheckLoop() {
	defer m.ticker.Stop()

	for {
		select {
		case <-m.ticker.C:
			m.runHealthChecks()
		case <-m.stopCh:
			return
		}
	}
}

// runHealthChecks executes all registered health checks
func (m *Manager) runHealthChecks() {
	m.mu.RLock()
	checks := make([]*HealthCheck, 0, len(m.checks))
	for _, check := range m.checks {
		if check.Enabled && time.Since(check.LastRun) >= check.Interval {
			checks = append(checks, check)
		}
	}
	m.mu.RUnlock()

	if len(checks) == 0 {
		return
	}

	for _, check := range checks {
		go m.executeHealthCheck(check)
	}
}

// executeHealthCheck runs a single health check
func (m *Manager) executeHealthCheck(check *HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	start := time.Now()
	result := check.Checker(ctx)
	result.Latency = time.Since(start)
	result.Timestamp = time.Now()

	m.mu.Lock()
	check.LastRun = time.Now()
	check.LastResult = &result
	m.status.Checks[check.Name] = &result
	m.mu.Unlock()

	// Log health check results
	if result.Healthy {
		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
	} else {
		m.logger.Warn("Health check failed: %s - %s (latency: %v)",
			check.Name, result.Message, result.Latency)

		// If this is a critical check and it failed, consider shutdown
		if check.Critical && m.shutdownManager != nil {
			m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
			m.shutdownManager.Stop()
		}
	}

	// Update overall system status
	m.updateSystemStatus()
}

// updateSystemStatus recalculates the overall system status
func (m *Manager) updateSystemStatus() {
	m.mu.Lock()
	defer m.mu.Unlock()

	var healthyChecks, totalChecks, criticalFailures int

	for name, result := range m.status.Checks {
		totalChecks++
		if result.Healthy {
			healthyChecks++
		} else {
			// Check if this is a critical check; the results map is keyed
			// by check name, so look the check up by that name
			if check, exists := m.checks[name]; exists && check.Critical {
				criticalFailures++
			}
		}
	}

	// Determine overall status
	if criticalFailures > 0 {
		m.status.Status = StatusUnhealthy
		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
	} else if totalChecks == 0 {
		m.status.Status = StatusStarting
		m.status.Message = "No health checks configured"
	} else if healthyChecks == totalChecks {
		m.status.Status = StatusHealthy
		m.status.Message = "All health checks passing"
	} else {
		m.status.Status = StatusDegraded
		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
			healthyChecks, totalChecks)
	}
}

// HTTP Handlers

func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Set HTTP status code based on health
	switch status.Status {
	case StatusHealthy:
		w.WriteHeader(http.StatusOK)
	case StatusDegraded:
		w.WriteHeader(http.StatusOK) // Still OK, but degraded
	case StatusUnhealthy, StatusStarting, StatusStopping:
		w.WriteHeader(http.StatusServiceUnavailable)
	}

	json.NewEncoder(w).Encode(status)
}

func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Ready means we can handle requests
	if status.Status == StatusHealthy || status.Status == StatusDegraded {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   true,
			"status":  status.Status,
			"message": status.Message,
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   false,
			"status":  status.Status,
			"message": status.Message,
		})
	}
}

func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Live means the process is running (not necessarily healthy)
	if status.Status != StatusStopping {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":   true,
			"status": status.Status,
			"uptime": status.Uptime.String(),
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":    false,
			"status":  status.Status,
			"message": "System is shutting down",
		})
	}
}

func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)

	json.NewEncoder(w).Encode(map[string]interface{}{
		"checks":    status.Checks,
		"total":     len(status.Checks),
		"timestamp": time.Now(),
	})
}
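
// Probe wiring sketch (Kubernetes-style; port 8081 is an assumed example,
// not a value from this repository):
//
//	livenessProbe:  httpGet { path: /health/live,  port: 8081 }
//	readinessProbe: httpGet { path: /health/ready, port: 8081 }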

// Predefined health checks

// CreateDatabaseCheck creates a health check for database connectivity
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
	return &HealthCheck{
		Name:        name,
		Description: fmt.Sprintf("Database connectivity check for %s", name),
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()
			err := pingFunc()

			if err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping failed: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			return CheckResult{
				Healthy:   true,
				Message:   "Database connectivity OK",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}
		},
	}
}
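
// Wiring sketch (assumes a *sql.DB named db from database/sql; the method
// value db.Ping already has the func() error signature pingFunc expects):
//
//	mgr.RegisterCheck(CreateDatabaseCheck("postgres", db.Ping))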

// CreateDiskSpaceCheck creates a health check for disk space
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        fmt.Sprintf("disk-space-%s", path),
		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual disk usage.
			// For now, we simulate it.
			usage := 0.75 // Simulate 75% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"path":      path,
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"path":      path,
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}
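
// Real-measurement sketch (an assumption, not part of this package: on
// Linux the simulated value could come from a statfs call via
// golang.org/x/sys/unix):
//
//	var st unix.Statfs_t
//	if err := unix.Statfs(path, &st); err != nil {
//		return CheckResult{Healthy: false, Message: err.Error(), Error: err, Timestamp: time.Now()}
//	}
//	usage := 1 - float64(st.Bavail)/float64(st.Blocks)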

// CreateMemoryCheck creates a health check for memory usage
func CreateMemoryCheck(threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        "memory-usage",
		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual memory usage
			usage := 0.60 // Simulate 60% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}
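
// Real-measurement sketch (an assumption: runtime.ReadMemStats reports Go
// heap usage, not system-wide memory, so it is only a rough stand-in):
//
//	var ms runtime.MemStats
//	runtime.ReadMemStats(&ms)
//	usage := float64(ms.HeapAlloc) / float64(ms.HeapSys)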

// defaultLogger is a simple logger implementation
type defaultLogger struct{}

func (l *defaultLogger) Info(msg string, args ...interface{}) {
	fmt.Printf("[INFO] "+msg+"\n", args...)
}

func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	fmt.Printf("[WARN] "+msg+"\n", args...)
}

func (l *defaultLogger) Error(msg string, args ...interface{}) {
	fmt.Printf("[ERROR] "+msg+"\n", args...)
}
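
// End-to-end wiring sketch (illustrative assumptions: the node ID, version,
// port, mount path, and the db and shutdownMgr variables are examples, not
// values from this repository):
//
//	mgr := health.NewManager("node-1", "v1.0.0", nil)
//	mgr.SetShutdownManager(shutdownMgr) // critical check failures then trigger graceful shutdown
//	mgr.RegisterCheck(health.CreateDatabaseCheck("postgres", db.Ping))
//	mgr.RegisterCheck(health.CreateDiskSpaceCheck("/var/lib/bzzz", 0.90))
//	mgr.RegisterCheck(health.CreateMemoryCheck(0.85))
//	_ = mgr.StartHTTPServer(8081)
//	_ = mgr.Start()
//	defer mgr.Stop()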