bzzz/pkg/health/manager.go

package health

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"sync"
	"time"

	"chorus.services/bzzz/pkg/shutdown"
)

// Manager provides comprehensive health monitoring and integrates with graceful shutdown
type Manager struct {
	mu              sync.RWMutex
	checks          map[string]*HealthCheck
	status          *SystemStatus
	httpServer      *http.Server
	shutdownManager *shutdown.Manager
	ticker          *time.Ticker
	stopCh          chan struct{}
	logger          Logger
}

// HealthCheck represents a single health check
type HealthCheck struct {
	Name        string                      `json:"name"`
	Description string                      `json:"description"`
	Checker     func(ctx context.Context) CheckResult `json:"-"`
	Interval    time.Duration               `json:"interval"`
	Timeout     time.Duration               `json:"timeout"`
	Enabled     bool                        `json:"enabled"`
	Critical    bool                        `json:"critical"` // If true, failure triggers shutdown
	LastRun     time.Time                   `json:"last_run"`
	LastResult  *CheckResult                `json:"last_result,omitempty"`
}

// CheckResult represents the result of a health check
type CheckResult struct {
	Healthy    bool                   `json:"healthy"`
	Message    string                 `json:"message"`
	Details    map[string]interface{} `json:"details,omitempty"`
	Latency    time.Duration          `json:"latency"`
	Timestamp  time.Time              `json:"timestamp"`
	Error      error                  `json:"error,omitempty"`
}

// SystemStatus represents the overall system health status
type SystemStatus struct {
	Status     Status                     `json:"status"`
	Message    string                     `json:"message"`
	Checks     map[string]*CheckResult    `json:"checks"`
	Uptime     time.Duration              `json:"uptime"`
	StartTime  time.Time                  `json:"start_time"`
	LastUpdate time.Time                  `json:"last_update"`
	Version    string                     `json:"version"`
	NodeID     string                     `json:"node_id"`
}

// Status represents health status levels
type Status string

const (
	StatusHealthy   Status = "healthy"
	StatusDegraded  Status = "degraded"
	StatusUnhealthy Status = "unhealthy"
	StatusStarting  Status = "starting"
	StatusStopping  Status = "stopping"
)

// Logger interface for health monitoring
type Logger interface {
	Info(msg string, args ...interface{})
	Warn(msg string, args ...interface{})
	Error(msg string, args ...interface{})
}

// PubSubInterface defines the interface for PubSub health checks
type PubSubInterface interface {
	SubscribeToTopic(topic string, handler func([]byte)) error
	PublishToTopic(topic string, data interface{}) error
}

// DHTInterface defines the interface for DHT health checks
type DHTInterface interface {
	PutValue(ctx context.Context, key string, value []byte) error
	GetValue(ctx context.Context, key string) ([]byte, error)
}

// NewManager creates a new health manager
func NewManager(nodeID, version string, logger Logger) *Manager {
	if logger == nil {
		logger = &defaultLogger{}
	}

	return &Manager{
		checks: make(map[string]*HealthCheck),
		status: &SystemStatus{
			Status:    StatusStarting,
			Message:   "System starting up",
			Checks:    make(map[string]*CheckResult),
			StartTime: time.Now(),
			Version:   version,
			NodeID:    nodeID,
		},
		stopCh: make(chan struct{}),
		logger: logger,
	}
}

// RegisterCheck adds a new health check
func (m *Manager) RegisterCheck(check *HealthCheck) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if check.Timeout == 0 {
		check.Timeout = 10 * time.Second
	}
	if check.Interval == 0 {
		check.Interval = 30 * time.Second
	}

	m.checks[check.Name] = check
	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
		check.Name, check.Critical, check.Interval)
}

// UnregisterCheck removes a health check
func (m *Manager) UnregisterCheck(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	delete(m.checks, name)
	delete(m.status.Checks, name)
	m.logger.Info("Unregistered health check: %s", name)
}

// Start begins health monitoring
func (m *Manager) Start() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Start health check loop
	m.ticker = time.NewTicker(5 * time.Second) // Check every 5 seconds
	go m.healthCheckLoop()

	// Update status to healthy (assuming no critical checks fail immediately)
	m.status.Status = StatusHealthy
	m.status.Message = "System operational"

	m.logger.Info("Health monitoring started")
	return nil
}

// Stop stops health monitoring
func (m *Manager) Stop() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	close(m.stopCh)
	if m.ticker != nil {
		m.ticker.Stop()
	}

	m.status.Status = StatusStopping
	m.status.Message = "System shutting down"

	m.logger.Info("Health monitoring stopped")
	return nil
}

// StartHTTPServer starts an HTTP server for health endpoints
func (m *Manager) StartHTTPServer(port int) error {
	mux := http.NewServeMux()

	// Health check endpoint
	mux.HandleFunc("/health", m.handleHealth)
	mux.HandleFunc("/health/ready", m.handleReady)
	mux.HandleFunc("/health/live", m.handleLive)
	mux.HandleFunc("/health/checks", m.handleChecks)

	m.httpServer = &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
	}

	go func() {
		if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			m.logger.Error("Health HTTP server error: %v", err)
		}
	}()

	m.logger.Info("Health HTTP server started on port %d", port)
	return nil
}

// SetShutdownManager sets the shutdown manager for critical health failures
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
	m.shutdownManager = shutdownManager
}

// GetStatus returns the current system status
func (m *Manager) GetStatus() *SystemStatus {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// Create a copy to avoid race conditions
	status := *m.status
	status.Uptime = time.Since(m.status.StartTime)
	status.LastUpdate = time.Now()

	// Copy checks
	status.Checks = make(map[string]*CheckResult)
	for name, result := range m.status.Checks {
		if result != nil {
			resultCopy := *result
			status.Checks[name] = &resultCopy
		}
	}

	return &status
}

// healthCheckLoop runs health checks periodically
func (m *Manager) healthCheckLoop() {
	defer m.ticker.Stop()

	for {
		select {
		case <-m.ticker.C:
			m.runHealthChecks()
		case <-m.stopCh:
			return
		}
	}
}

// runHealthChecks executes all registered health checks
func (m *Manager) runHealthChecks() {
	m.mu.RLock()
	checks := make([]*HealthCheck, 0, len(m.checks))
	for _, check := range m.checks {
		if check.Enabled && time.Since(check.LastRun) >= check.Interval {
			checks = append(checks, check)
		}
	}
	m.mu.RUnlock()

	if len(checks) == 0 {
		return
	}

	for _, check := range checks {
		go m.executeHealthCheck(check)
	}
}

// executeHealthCheck runs a single health check
func (m *Manager) executeHealthCheck(check *HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	start := time.Now()
	result := check.Checker(ctx)
	result.Latency = time.Since(start)
	result.Timestamp = time.Now()

	m.mu.Lock()
	check.LastRun = time.Now()
	check.LastResult = &result
	m.status.Checks[check.Name] = &result
	m.mu.Unlock()

	// Log health check results
	if result.Healthy {
		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
	} else {
		m.logger.Warn("Health check failed: %s - %s (latency: %v)",
			check.Name, result.Message, result.Latency)

		// If this is a critical check and it failed, consider shutdown
		if check.Critical && m.shutdownManager != nil {
			m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
			m.shutdownManager.Stop()
		}
	}

	// Update overall system status
	m.updateSystemStatus()
}

// updateSystemStatus recalculates the overall system status
func (m *Manager) updateSystemStatus() {
	m.mu.Lock()
	defer m.mu.Unlock()

	var healthyChecks, totalChecks, criticalFailures int

	for _, result := range m.status.Checks {
		totalChecks++
		if result.Healthy {
			healthyChecks++
		} else {
			// Check if this is a critical check
			if check, exists := m.checks[result.Timestamp.String()]; exists && check.Critical {
				criticalFailures++
			}
		}
	}

	// Determine overall status
	if criticalFailures > 0 {
		m.status.Status = StatusUnhealthy
		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
	} else if totalChecks == 0 {
		m.status.Status = StatusStarting
		m.status.Message = "No health checks configured"
	} else if healthyChecks == totalChecks {
		m.status.Status = StatusHealthy
		m.status.Message = "All health checks passing"
	} else {
		m.status.Status = StatusDegraded
		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
			healthyChecks, totalChecks)
	}
}

// HTTP Handlers

func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Set HTTP status code based on health
	switch status.Status {
	case StatusHealthy:
		w.WriteHeader(http.StatusOK)
	case StatusDegraded:
		w.WriteHeader(http.StatusOK) // Still OK, but degraded
	case StatusUnhealthy:
		w.WriteHeader(http.StatusServiceUnavailable)
	case StatusStarting:
		w.WriteHeader(http.StatusServiceUnavailable)
	case StatusStopping:
		w.WriteHeader(http.StatusServiceUnavailable)
	}

	json.NewEncoder(w).Encode(status)
}

func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Ready means we can handle requests
	if status.Status == StatusHealthy || status.Status == StatusDegraded {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   true,
			"status":  status.Status,
			"message": status.Message,
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"ready":   false,
			"status":  status.Status,
			"message": status.Message,
		})
	}
}

func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	// Live means the process is running (not necessarily healthy)
	if status.Status != StatusStopping {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":    true,
			"status":  status.Status,
			"uptime":  status.Uptime.String(),
		})
	} else {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]interface{}{
			"live":    false,
			"status":  status.Status,
			"message": "System is shutting down",
		})
	}
}

func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
	status := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)

	json.NewEncoder(w).Encode(map[string]interface{}{
		"checks":     status.Checks,
		"total":      len(status.Checks),
		"timestamp":  time.Now(),
	})
}

// Predefined health checks

// CreateDatabaseCheck creates a health check for database connectivity
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
	return &HealthCheck{
		Name:        name,
		Description: fmt.Sprintf("Database connectivity check for %s", name),
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()
			err := pingFunc()

			if err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping failed: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			return CheckResult{
				Healthy:   true,
				Message:   "Database connectivity OK",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}
		},
	}
}

// CreateDiskSpaceCheck creates a health check for disk space
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        fmt.Sprintf("disk-space-%s", path),
		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual disk usage
			// For now, we'll simulate it
			usage := 0.75 // Simulate 75% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"path":      path,
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"path":      path,
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}

// CreateMemoryCheck creates a health check for memory usage
func CreateMemoryCheck(threshold float64) *HealthCheck {
	return &HealthCheck{
		Name:        "memory-usage",
		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     5 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			// In a real implementation, you would check actual memory usage
			usage := 0.60 // Simulate 60% usage

			if usage > threshold {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%",
						usage*100, threshold*100),
					Details: map[string]interface{}{
						"usage":     usage,
						"threshold": threshold,
					},
					Timestamp: time.Now(),
				}
			}

			return CheckResult{
				Healthy: true,
				Message: fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100),
				Details: map[string]interface{}{
					"usage":     usage,
					"threshold": threshold,
				},
				Timestamp: time.Now(),
			}
		},
	}
}

// CreateActivePubSubCheck creates an active health check for PubSub system
func CreateActivePubSubCheck(pubsub PubSubInterface) *HealthCheck {
	return &HealthCheck{
		Name:        "pubsub-active-probe",
		Description: "Active PubSub system health probe with loopback test",
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     15 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Generate unique test message
			testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
			testMessage := map[string]interface{}{
				"test_key":  testKey,
				"timestamp": time.Now().Unix(),
				"probe_id":  "pubsub-health-check",
			}

			// Channel to receive test message
			resultCh := make(chan bool, 1)
			errorCh := make(chan error, 1)

			// Set up message handler for test topic
			handler := func(data []byte) {
				var received map[string]interface{}
				if err := json.Unmarshal(data, &received); err != nil {
					return
				}

				if receivedKey, ok := received["test_key"].(string); ok && receivedKey == testKey {
					select {
					case resultCh <- true:
					default:
					}
				}
			}

			// Subscribe to test topic
			testTopic := "bzzz/health-test/v1"
			if err := pubsub.SubscribeToTopic(testTopic, handler); err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Failed to subscribe to test topic: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			// Allow subscription to settle
			time.Sleep(500 * time.Millisecond)

			// Publish test message
			go func() {
				if err := pubsub.PublishToTopic(testTopic, testMessage); err != nil {
					errorCh <- err
				}
			}()

			// Wait for result with timeout
			select {
			case <-resultCh:
				latency := time.Since(start)
				return CheckResult{
					Healthy: true,
					Message: fmt.Sprintf("PubSub loopback test successful"),
					Details: map[string]interface{}{
						"test_topic": testTopic,
						"test_key":   testKey,
						"latency_ms": latency.Milliseconds(),
					},
					Timestamp: time.Now(),
					Latency:   latency,
				}

			case err := <-errorCh:
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Failed to publish test message: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}

			case <-time.After(10 * time.Second):
				return CheckResult{
					Healthy: false,
					Message: "PubSub loopback test timeout - message not received",
					Details: map[string]interface{}{
						"test_topic": testTopic,
						"test_key":   testKey,
						"timeout":    "10s",
					},
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}

			case <-ctx.Done():
				return CheckResult{
					Healthy: false,
					Message: "PubSub health check cancelled",
					Details: map[string]interface{}{
						"test_topic": testTopic,
						"reason":     "context_cancelled",
					},
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}
		},
	}
}

// CreateActiveDHTCheck creates an active health check for DHT system
func CreateActiveDHTCheck(dht DHTInterface) *HealthCheck {
	return &HealthCheck{
		Name:        "dht-active-probe",
		Description: "Active DHT system health probe with put/get test",
		Enabled:     true,
		Critical:    false,
		Interval:    90 * time.Second,
		Timeout:     20 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			// Generate unique test key and value
			testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
			testValue := []byte(fmt.Sprintf(`{"test_key":"%s","timestamp":%d,"probe_id":"dht-health-check"}`,
				testKey, time.Now().Unix()))

			// Test DHT put operation
			putStart := time.Now()
			if err := dht.PutValue(ctx, testKey, testValue); err != nil {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("DHT put operation failed: %v", err),
					Details: map[string]interface{}{
						"test_key":    testKey,
						"operation":   "put",
						"put_latency": time.Since(putStart).Milliseconds(),
					},
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}
			putLatency := time.Since(putStart)

			// Allow some time for propagation
			time.Sleep(100 * time.Millisecond)

			// Test DHT get operation
			getStart := time.Now()
			retrievedValue, err := dht.GetValue(ctx, testKey)
			if err != nil {
				return CheckResult{
					Healthy: false,
					Message: fmt.Sprintf("DHT get operation failed: %v", err),
					Details: map[string]interface{}{
						"test_key":    testKey,
						"operation":   "get",
						"put_latency": putLatency.Milliseconds(),
						"get_latency": time.Since(getStart).Milliseconds(),
					},
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}
			getLatency := time.Since(getStart)

			// Verify retrieved value matches
			if string(retrievedValue) != string(testValue) {
				return CheckResult{
					Healthy: false,
					Message: "DHT data integrity check failed - retrieved value doesn't match",
					Details: map[string]interface{}{
						"test_key":       testKey,
						"expected_len":   len(testValue),
						"retrieved_len":  len(retrievedValue),
						"put_latency":    putLatency.Milliseconds(),
						"get_latency":    getLatency.Milliseconds(),
						"total_latency":  time.Since(start).Milliseconds(),
					},
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			totalLatency := time.Since(start)

			// Get DHT statistics if available
			var stats interface{}
			if statsProvider, ok := dht.(interface{ GetStats() interface{} }); ok {
				stats = statsProvider.GetStats()
			}

			return CheckResult{
				Healthy: true,
				Message: "DHT put/get test successful",
				Details: map[string]interface{}{
					"test_key":       testKey,
					"put_latency":    putLatency.Milliseconds(),
					"get_latency":    getLatency.Milliseconds(),
					"total_latency":  totalLatency.Milliseconds(),
					"data_integrity": "verified",
					"stats":          stats,
				},
				Timestamp: time.Now(),
				Latency:   totalLatency,
			}
		},
	}
}

// defaultLogger is a simple logger implementation
type defaultLogger struct{}

func (l *defaultLogger) Info(msg string, args ...interface{}) {
	fmt.Printf("[INFO] "+msg+"\n", args...)
}

func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	fmt.Printf("[WARN] "+msg+"\n", args...)
}

func (l *defaultLogger) Error(msg string, args ...interface{}) {
	fmt.Printf("[ERROR] "+msg+"\n", args...)
}