This comprehensive refactoring addresses critical architectural issues:

IMPORT CYCLE RESOLUTION:
• pkg/crypto ↔ pkg/slurp/roles: Created pkg/security/access_levels.go
• pkg/ucxl → pkg/dht: Created pkg/storage/interfaces.go
• pkg/slurp/leader → pkg/election → pkg/slurp/storage: Moved types to pkg/election/interfaces.go

MODULE PATH MIGRATION:
• Changed from github.com/anthonyrawlins/bzzz to chorus.services/bzzz
• Updated all import statements across 115+ files
• Maintains compatibility while removing the personal GitHub account dependency

TYPE SYSTEM IMPROVEMENTS:
• Resolved duplicate type declarations in the crypto package
• Added missing type definitions (RoleStatus, TimeRestrictions, KeyStatus, KeyRotationResult)
• Proper interface segregation to prevent future cycles

ARCHITECTURAL BENEFITS:
• Build now progresses past structural issues to normal dependency resolution
• Cleaner separation of concerns between packages
• Eliminates circular dependencies that prevented compilation
• Establishes a foundation for scalable codebase growth

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
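
The cycle fixes all follow the same interface-segregation pattern: shared types move into a leaf package that both former dependants import. A minimal sketch of the idea behind pkg/security/access_levels.go (the AccessLevel type and RoleAccessor interface below are illustrative assumptions, not the file's actual contents):

// Package security is a leaf package: it imports neither pkg/crypto nor
// pkg/slurp/roles, so both can depend on it without re-forming the cycle.
package security

// AccessLevel is the kind of shared type that previously forced
// pkg/crypto and pkg/slurp/roles to import each other. (Illustrative.)
type AccessLevel int

const (
	AccessPublic AccessLevel = iota
	AccessInternal
	AccessRestricted
)

// RoleAccessor is the narrow interface a consumer like pkg/crypto needs;
// pkg/slurp/roles can implement it without being imported directly.
type RoleAccessor interface {
	AccessLevelFor(role string) AccessLevel
}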
529 lines
14 KiB
Go
package health

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"sync"
	"time"

	"chorus.services/bzzz/pkg/shutdown"
)

// Manager provides comprehensive health monitoring and integrates with graceful shutdown
type Manager struct {
	mu              sync.RWMutex
	checks          map[string]*HealthCheck
	status          *SystemStatus
	httpServer      *http.Server
	shutdownManager *shutdown.Manager
	ticker          *time.Ticker
	stopCh          chan struct{}
	logger          Logger
}

// HealthCheck represents a single health check
type HealthCheck struct {
	Name        string                                `json:"name"`
	Description string                                `json:"description"`
	Checker     func(ctx context.Context) CheckResult `json:"-"`
	Interval    time.Duration                         `json:"interval"`
	Timeout     time.Duration                         `json:"timeout"`
	Enabled     bool                                  `json:"enabled"`
	Critical    bool                                  `json:"critical"` // If true, failure triggers shutdown
	LastRun     time.Time                             `json:"last_run"`
	LastResult  *CheckResult                          `json:"last_result,omitempty"`
}

// CheckResult represents the result of a health check
type CheckResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Details   map[string]interface{} `json:"details,omitempty"`
	Latency   time.Duration          `json:"latency"`
	Timestamp time.Time              `json:"timestamp"`
	Error     error                  `json:"error,omitempty"`
}

// SystemStatus represents the overall system health status
type SystemStatus struct {
	Status     Status                  `json:"status"`
	Message    string                  `json:"message"`
	Checks     map[string]*CheckResult `json:"checks"`
	Uptime     time.Duration           `json:"uptime"`
	StartTime  time.Time               `json:"start_time"`
	LastUpdate time.Time               `json:"last_update"`
	Version    string                  `json:"version"`
	NodeID     string                  `json:"node_id"`
}

// Status represents health status levels
type Status string

const (
	StatusHealthy   Status = "healthy"
	StatusDegraded  Status = "degraded"
	StatusUnhealthy Status = "unhealthy"
	StatusStarting  Status = "starting"
	StatusStopping  Status = "stopping"
)

// Logger interface for health monitoring
type Logger interface {
	Info(msg string, args ...interface{})
	Warn(msg string, args ...interface{})
	Error(msg string, args ...interface{})
}

// NewManager creates a new health manager
func NewManager(nodeID, version string, logger Logger) *Manager {
	if logger == nil {
		logger = &defaultLogger{}
	}

	return &Manager{
		checks: make(map[string]*HealthCheck),
		status: &SystemStatus{
			Status:    StatusStarting,
			Message:   "System starting up",
			Checks:    make(map[string]*CheckResult),
			StartTime: time.Now(),
			Version:   version,
			NodeID:    nodeID,
		},
		stopCh: make(chan struct{}),
		logger: logger,
	}
}
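
// Usage sketch (illustrative; the node ID, version string, and memory
// threshold below are assumptions, not values from this repository):
//
//	mgr := NewManager("node-1", "v0.1.0", nil) // nil logger falls back to defaultLogger
//	mgr.RegisterCheck(CreateMemoryCheck(0.90))
//	if err := mgr.Start(); err != nil {
//		log.Fatal(err)
//	}
//	defer mgr.Stop()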

// RegisterCheck adds a new health check
func (m *Manager) RegisterCheck(check *HealthCheck) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if check.Timeout == 0 {
		check.Timeout = 10 * time.Second
	}
	if check.Interval == 0 {
		check.Interval = 30 * time.Second
	}

	m.checks[check.Name] = check
	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
		check.Name, check.Critical, check.Interval)
}

// UnregisterCheck removes a health check
func (m *Manager) UnregisterCheck(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	delete(m.checks, name)
	delete(m.status.Checks, name)
	m.logger.Info("Unregistered health check: %s", name)
}

// Start begins health monitoring
func (m *Manager) Start() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Start health check loop
	m.ticker = time.NewTicker(5 * time.Second) // Check every 5 seconds
	go m.healthCheckLoop()

	// Update status to healthy (assuming no critical checks fail immediately)
	m.status.Status = StatusHealthy
	m.status.Message = "System operational"

	m.logger.Info("Health monitoring started")
	return nil
}

// Stop stops health monitoring
func (m *Manager) Stop() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	close(m.stopCh)
	if m.ticker != nil {
		m.ticker.Stop()
	}

	m.status.Status = StatusStopping
	m.status.Message = "System shutting down"

	m.logger.Info("Health monitoring stopped")
	return nil
}

// StartHTTPServer starts an HTTP server for health endpoints
func (m *Manager) StartHTTPServer(port int) error {
	mux := http.NewServeMux()

	// Health check endpoints
	mux.HandleFunc("/health", m.handleHealth)
	mux.HandleFunc("/health/ready", m.handleReady)
	mux.HandleFunc("/health/live", m.handleLive)
	mux.HandleFunc("/health/checks", m.handleChecks)

	m.httpServer = &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
	}

	go func() {
		if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			m.logger.Error("Health HTTP server error: %v", err)
		}
	}()

	m.logger.Info("Health HTTP server started on port %d", port)
	return nil
}
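
// Endpoint summary (paths as registered above; status codes follow the
// handlers further down in this file):
//
//	GET /health         full SystemStatus JSON; 503 when unhealthy, starting, or stopping
//	GET /health/ready   readiness: 200 while healthy or degraded, else 503
//	GET /health/live    liveness: 200 unless the system is stopping
//	GET /health/checks  individual check results, always 200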

// SetShutdownManager sets the shutdown manager for critical health failures
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
	m.shutdownManager = shutdownManager
}

// GetStatus returns the current system status
func (m *Manager) GetStatus() *SystemStatus {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// Create a copy to avoid race conditions
	status := *m.status
	status.Uptime = time.Since(m.status.StartTime)
	status.LastUpdate = time.Now()

	// Copy checks
	status.Checks = make(map[string]*CheckResult)
	for name, result := range m.status.Checks {
		if result != nil {
			resultCopy := *result
			status.Checks[name] = &resultCopy
		}
	}

	return &status
}

// healthCheckLoop runs health checks periodically
func (m *Manager) healthCheckLoop() {
	defer m.ticker.Stop()

	for {
		select {
		case <-m.ticker.C:
			m.runHealthChecks()
		case <-m.stopCh:
			return
		}
	}
}

// runHealthChecks executes all registered health checks
func (m *Manager) runHealthChecks() {
	m.mu.RLock()
	checks := make([]*HealthCheck, 0, len(m.checks))
	for _, check := range m.checks {
		if check.Enabled && time.Since(check.LastRun) >= check.Interval {
			checks = append(checks, check)
		}
	}
	m.mu.RUnlock()

	if len(checks) == 0 {
		return
	}

	for _, check := range checks {
		go m.executeHealthCheck(check)
	}
}

// executeHealthCheck runs a single health check
func (m *Manager) executeHealthCheck(check *HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	start := time.Now()
	result := check.Checker(ctx)
	result.Latency = time.Since(start)
	result.Timestamp = time.Now()

	m.mu.Lock()
	check.LastRun = time.Now()
	check.LastResult = &result
	m.status.Checks[check.Name] = &result
	m.mu.Unlock()

	// Log health check results
	if result.Healthy {
		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
	} else {
		m.logger.Warn("Health check failed: %s - %s (latency: %v)",
			check.Name, result.Message, result.Latency)

		// If this is a critical check and it failed, consider shutdown
		if check.Critical && m.shutdownManager != nil {
			m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
			m.shutdownManager.Stop()
		}
	}

	// Update overall system status
	m.updateSystemStatus()
}

// updateSystemStatus recalculates the overall system status
func (m *Manager) updateSystemStatus() {
	m.mu.Lock()
	defer m.mu.Unlock()

	var healthyChecks, totalChecks, criticalFailures int

	for name, result := range m.status.Checks {
		totalChecks++
		if result.Healthy {
			healthyChecks++
		} else {
			// Check if this is a critical check; the results map is keyed
			// by check name, so look the check up by that name
			if check, exists := m.checks[name]; exists && check.Critical {
				criticalFailures++
			}
		}
	}

	// Determine overall status
	if criticalFailures > 0 {
		m.status.Status = StatusUnhealthy
		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
	} else if totalChecks == 0 {
		m.status.Status = StatusStarting
		m.status.Message = "No health checks configured"
	} else if healthyChecks == totalChecks {
		m.status.Status = StatusHealthy
		m.status.Message = "All health checks passing"
	} else {
		m.status.Status = StatusDegraded
		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
			healthyChecks, totalChecks)
	}
}

// HTTP Handlers

func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Set HTTP status code based on health
	switch status.Status {
	case StatusHealthy:
		w.WriteHeader(http.StatusOK)
	case StatusDegraded:
		w.WriteHeader(http.StatusOK) // Still OK, but degraded
	case StatusUnhealthy, StatusStarting, StatusStopping:
		w.WriteHeader(http.StatusServiceUnavailable)
	}

	json.NewEncoder(w).Encode(status)
}

func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Ready means we can handle requests
	if status.Status == StatusHealthy || status.Status == StatusDegraded {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   true,
			"status":  status.Status,
			"message": status.Message,
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   false,
			"status":  status.Status,
			"message": status.Message,
		})
	}
}

func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Live means the process is running (not necessarily healthy)
	if status.Status != StatusStopping {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":   true,
			"status": status.Status,
			"uptime": status.Uptime.String(),
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":    false,
			"status":  status.Status,
			"message": "System is shutting down",
		})
	}
}

func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)

	json.NewEncoder(w).Encode(map[string]interface{}{
		"checks":    status.Checks,
		"total":     len(status.Checks),
		"timestamp": time.Now(),
	})
}
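
// Probe wiring sketch (Kubernetes-style; port 8081 is an assumed example,
// not a value from this repository):
//
//	livenessProbe:  httpGet { path: /health/live,  port: 8081 }
//	readinessProbe: httpGet { path: /health/ready, port: 8081 }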

// Predefined health checks

// CreateDatabaseCheck creates a health check for database connectivity
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
	return &HealthCheck{
		Name:        name,
		Description: fmt.Sprintf("Database connectivity check for %s", name),
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()
			err := pingFunc()

			if err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping failed: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			return CheckResult{
				Healthy:   true,
				Message:   "Database connectivity OK",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}
		},
	}
}
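
// Wiring sketch (assumes a *sql.DB named db from database/sql; the method
// value db.Ping already has the func() error signature pingFunc expects):
//
//	mgr.RegisterCheck(CreateDatabaseCheck("postgres", db.Ping))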

// CreateDiskSpaceCheck creates a health check for disk space
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        fmt.Sprintf("disk-space-%s", path),
		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual disk usage.
			// For now, we simulate it.
			usage := 0.75 // Simulate 75% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"path":      path,
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"path":      path,
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}
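
// Real-measurement sketch (an assumption, not part of this package: on
// Linux the simulated value could come from a statfs call via
// golang.org/x/sys/unix):
//
//	var st unix.Statfs_t
//	if err := unix.Statfs(path, &st); err != nil {
//		return CheckResult{Healthy: false, Message: err.Error(), Error: err, Timestamp: time.Now()}
//	}
//	usage := 1 - float64(st.Bavail)/float64(st.Blocks)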

// CreateMemoryCheck creates a health check for memory usage
func CreateMemoryCheck(threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        "memory-usage",
		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual memory usage
			usage := 0.60 // Simulate 60% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}
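
// Real-measurement sketch (an assumption: runtime.ReadMemStats reports Go
// heap usage, not system-wide memory, so it is only a rough stand-in):
//
//	var ms runtime.MemStats
//	runtime.ReadMemStats(&ms)
//	usage := float64(ms.HeapAlloc) / float64(ms.HeapSys)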

// defaultLogger is a simple logger implementation
type defaultLogger struct{}

func (l *defaultLogger) Info(msg string, args ...interface{}) {
	fmt.Printf("[INFO] "+msg+"\n", args...)
}

func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	fmt.Printf("[WARN] "+msg+"\n", args...)
}

func (l *defaultLogger) Error(msg string, args ...interface{}) {
	fmt.Printf("[ERROR] "+msg+"\n", args...)
}
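
// End-to-end wiring sketch (illustrative assumptions: the node ID, version,
// port, mount path, and the db and shutdownMgr variables are examples, not
// values from this repository):
//
//	mgr := health.NewManager("node-1", "v1.0.0", nil)
//	mgr.SetShutdownManager(shutdownMgr) // critical check failures then trigger graceful shutdown
//	mgr.RegisterCheck(health.CreateDatabaseCheck("postgres", db.Ping))
//	mgr.RegisterCheck(health.CreateDiskSpaceCheck("/var/lib/bzzz", 0.90))
//	mgr.RegisterCheck(health.CreateMemoryCheck(0.85))
//	_ = mgr.StartHTTPServer(8081)
//	_ = mgr.Start()
//	defer mgr.Stop()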