 9bdcbe0447
			
		
	
	9bdcbe0447
	
	
	
		
			
			Major integrations and fixes: - Added BACKBEAT SDK integration for P2P operation timing - Implemented beat-aware status tracking for distributed operations - Added Docker secrets support for secure license management - Resolved KACHING license validation via HTTPS/TLS - Updated docker-compose configuration for clean stack deployment - Disabled rollback policies to prevent deployment failures - Added license credential storage (CHORUS-DEV-MULTI-001) Technical improvements: - BACKBEAT P2P operation tracking with phase management - Enhanced configuration system with file-based secrets - Improved error handling for license validation - Clean separation of KACHING and CHORUS deployment stacks 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			758 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			758 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package health
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"net/http"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| 
 | |
| 	"chorus/pkg/shutdown"
 | |
| )
 | |
| 
 | |
| // Manager provides comprehensive health monitoring and integrates with graceful shutdown
 | |
| type Manager struct {
 | |
| 	mu              sync.RWMutex
 | |
| 	checks          map[string]*HealthCheck
 | |
| 	status          *SystemStatus
 | |
| 	httpServer      *http.Server
 | |
| 	shutdownManager *shutdown.Manager
 | |
| 	ticker          *time.Ticker
 | |
| 	stopCh          chan struct{}
 | |
| 	logger          Logger
 | |
| }
 | |
| 
 | |
| // HealthCheck represents a single health check
 | |
| type HealthCheck struct {
 | |
| 	Name        string                      `json:"name"`
 | |
| 	Description string                      `json:"description"`
 | |
| 	Checker     func(ctx context.Context) CheckResult `json:"-"`
 | |
| 	Interval    time.Duration               `json:"interval"`
 | |
| 	Timeout     time.Duration               `json:"timeout"`
 | |
| 	Enabled     bool                        `json:"enabled"`
 | |
| 	Critical    bool                        `json:"critical"` // If true, failure triggers shutdown
 | |
| 	LastRun     time.Time                   `json:"last_run"`
 | |
| 	LastResult  *CheckResult                `json:"last_result,omitempty"`
 | |
| }
 | |
| 
 | |
// CheckResult represents the result of a health check.
type CheckResult struct {
	// Healthy reports whether the probe succeeded.
	Healthy bool `json:"healthy"`
	// Message is a human-readable summary of the outcome.
	Message string `json:"message"`
	// Details carries optional structured data about the probe.
	Details map[string]interface{} `json:"details,omitempty"`
	// Latency is how long the probe took.
	Latency time.Duration `json:"latency"`
	// Timestamp is when the result was produced.
	Timestamp time.Time `json:"timestamp"`
	// Error is the underlying failure, if any. It is serialised as its
	// message text by MarshalJSON.
	Error error `json:"error,omitempty"`
}

// MarshalJSON encodes the result with the Error field rendered as its message
// string. Without this, encoding/json serialises most error values as an empty
// object because they expose no exported fields, which hides the failure
// reason from consumers of the health endpoints.
func (r CheckResult) MarshalJSON() ([]byte, error) {
	// plain has the same fields as CheckResult but no methods, which avoids
	// recursing back into this MarshalJSON.
	type plain CheckResult

	aux := struct {
		plain
		// Error shadows plain.Error: the shallower field wins in encoding/json.
		Error string `json:"error,omitempty"`
	}{plain: plain(r)}

	if r.Error != nil {
		aux.Error = r.Error.Error()
	}
	return json.Marshal(aux)
}
 | |
| 
 | |
| // SystemStatus represents the overall system health status
 | |
| type SystemStatus struct {
 | |
| 	Status     Status                     `json:"status"`
 | |
| 	Message    string                     `json:"message"`
 | |
| 	Checks     map[string]*CheckResult    `json:"checks"`
 | |
| 	Uptime     time.Duration              `json:"uptime"`
 | |
| 	StartTime  time.Time                  `json:"start_time"`
 | |
| 	LastUpdate time.Time                  `json:"last_update"`
 | |
| 	Version    string                     `json:"version"`
 | |
| 	NodeID     string                     `json:"node_id"`
 | |
| }
 | |
| 
 | |
// Status is the coarse health level reported for the system.
type Status string

// Health levels a system can report.
const (
	// StatusHealthy means all checks are passing.
	StatusHealthy Status = "healthy"
	// StatusDegraded means some checks are failing.
	StatusDegraded Status = "degraded"
	// StatusUnhealthy means critical checks are failing.
	StatusUnhealthy Status = "unhealthy"
	// StatusStarting is the state before monitoring has produced results.
	StatusStarting Status = "starting"
	// StatusStopping is the state once shutdown has begun.
	StatusStopping Status = "stopping"
)
 | |
| 
 | |
// Logger is the minimal logging surface the health package needs. Each method
// takes a message plus optional arguments for that message.
type Logger interface {
	// Info records routine events.
	Info(msg string, args ...interface{})
	// Warn records non-fatal problems.
	Warn(msg string, args ...interface{})
	// Error records failures.
	Error(msg string, args ...interface{})
}
 | |
| 
 | |
// PubSubInterface is the subset of PubSub behaviour the active PubSub health
// probe relies on.
type PubSubInterface interface {
	// SubscribeToTopic registers handler for messages on topic.
	SubscribeToTopic(topic string, handler func([]byte)) error
	// PublishToTopic sends data on topic.
	PublishToTopic(topic string, data interface{}) error
}
 | |
| 
 | |
// DHTInterface is the subset of DHT behaviour the active DHT health probe
// relies on.
type DHTInterface interface {
	// PutValue stores value under key.
	PutValue(ctx context.Context, key string, value []byte) error
	// GetValue fetches the value stored under key.
	GetValue(ctx context.Context, key string) ([]byte, error)
}
 | |
| 
 | |
| // NewManager creates a new health manager
 | |
| func NewManager(nodeID, version string, logger Logger) *Manager {
 | |
| 	if logger == nil {
 | |
| 		logger = &defaultLogger{}
 | |
| 	}
 | |
| 
 | |
| 	return &Manager{
 | |
| 		checks: make(map[string]*HealthCheck),
 | |
| 		status: &SystemStatus{
 | |
| 			Status:    StatusStarting,
 | |
| 			Message:   "System starting up",
 | |
| 			Checks:    make(map[string]*CheckResult),
 | |
| 			StartTime: time.Now(),
 | |
| 			Version:   version,
 | |
| 			NodeID:    nodeID,
 | |
| 		},
 | |
| 		stopCh: make(chan struct{}),
 | |
| 		logger: logger,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // RegisterCheck adds a new health check
 | |
| func (m *Manager) RegisterCheck(check *HealthCheck) {
 | |
| 	m.mu.Lock()
 | |
| 	defer m.mu.Unlock()
 | |
| 
 | |
| 	if check.Timeout == 0 {
 | |
| 		check.Timeout = 10 * time.Second
 | |
| 	}
 | |
| 	if check.Interval == 0 {
 | |
| 		check.Interval = 30 * time.Second
 | |
| 	}
 | |
| 
 | |
| 	m.checks[check.Name] = check
 | |
| 	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
 | |
| 		check.Name, check.Critical, check.Interval)
 | |
| }
 | |
| 
 | |
| // UnregisterCheck removes a health check
 | |
| func (m *Manager) UnregisterCheck(name string) {
 | |
| 	m.mu.Lock()
 | |
| 	defer m.mu.Unlock()
 | |
| 
 | |
| 	delete(m.checks, name)
 | |
| 	delete(m.status.Checks, name)
 | |
| 	m.logger.Info("Unregistered health check: %s", name)
 | |
| }
 | |
| 
 | |
| // Start begins health monitoring
 | |
| func (m *Manager) Start() error {
 | |
| 	m.mu.Lock()
 | |
| 	defer m.mu.Unlock()
 | |
| 
 | |
| 	// Start health check loop
 | |
| 	m.ticker = time.NewTicker(5 * time.Second) // Check every 5 seconds
 | |
| 	go m.healthCheckLoop()
 | |
| 
 | |
| 	// Update status to healthy (assuming no critical checks fail immediately)
 | |
| 	m.status.Status = StatusHealthy
 | |
| 	m.status.Message = "System operational"
 | |
| 
 | |
| 	m.logger.Info("Health monitoring started")
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Stop stops health monitoring
 | |
| func (m *Manager) Stop() error {
 | |
| 	m.mu.Lock()
 | |
| 	defer m.mu.Unlock()
 | |
| 
 | |
| 	close(m.stopCh)
 | |
| 	if m.ticker != nil {
 | |
| 		m.ticker.Stop()
 | |
| 	}
 | |
| 
 | |
| 	m.status.Status = StatusStopping
 | |
| 	m.status.Message = "System shutting down"
 | |
| 
 | |
| 	m.logger.Info("Health monitoring stopped")
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // StartHTTPServer starts an HTTP server for health endpoints
 | |
| func (m *Manager) StartHTTPServer(port int) error {
 | |
| 	mux := http.NewServeMux()
 | |
| 	
 | |
| 	// Health check endpoint
 | |
| 	mux.HandleFunc("/health", m.handleHealth)
 | |
| 	mux.HandleFunc("/health/ready", m.handleReady)
 | |
| 	mux.HandleFunc("/health/live", m.handleLive)
 | |
| 	mux.HandleFunc("/health/checks", m.handleChecks)
 | |
| 
 | |
| 	m.httpServer = &http.Server{
 | |
| 		Addr:    fmt.Sprintf(":%d", port),
 | |
| 		Handler: mux,
 | |
| 	}
 | |
| 
 | |
| 	go func() {
 | |
| 		if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
 | |
| 			m.logger.Error("Health HTTP server error: %v", err)
 | |
| 		}
 | |
| 	}()
 | |
| 
 | |
| 	m.logger.Info("Health HTTP server started on port %d", port)
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // SetShutdownManager sets the shutdown manager for critical health failures
 | |
| func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
 | |
| 	m.shutdownManager = shutdownManager
 | |
| }
 | |
| 
 | |
| // GetStatus returns the current system status
 | |
| func (m *Manager) GetStatus() *SystemStatus {
 | |
| 	m.mu.RLock()
 | |
| 	defer m.mu.RUnlock()
 | |
| 
 | |
| 	// Create a copy to avoid race conditions
 | |
| 	status := *m.status
 | |
| 	status.Uptime = time.Since(m.status.StartTime)
 | |
| 	status.LastUpdate = time.Now()
 | |
| 
 | |
| 	// Copy checks
 | |
| 	status.Checks = make(map[string]*CheckResult)
 | |
| 	for name, result := range m.status.Checks {
 | |
| 		if result != nil {
 | |
| 			resultCopy := *result
 | |
| 			status.Checks[name] = &resultCopy
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return &status
 | |
| }
 | |
| 
 | |
| // healthCheckLoop runs health checks periodically
 | |
| func (m *Manager) healthCheckLoop() {
 | |
| 	defer m.ticker.Stop()
 | |
| 
 | |
| 	for {
 | |
| 		select {
 | |
| 		case <-m.ticker.C:
 | |
| 			m.runHealthChecks()
 | |
| 		case <-m.stopCh:
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // runHealthChecks executes all registered health checks
 | |
| func (m *Manager) runHealthChecks() {
 | |
| 	m.mu.RLock()
 | |
| 	checks := make([]*HealthCheck, 0, len(m.checks))
 | |
| 	for _, check := range m.checks {
 | |
| 		if check.Enabled && time.Since(check.LastRun) >= check.Interval {
 | |
| 			checks = append(checks, check)
 | |
| 		}
 | |
| 	}
 | |
| 	m.mu.RUnlock()
 | |
| 
 | |
| 	if len(checks) == 0 {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	for _, check := range checks {
 | |
| 		go m.executeHealthCheck(check)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // executeHealthCheck runs a single health check
 | |
| func (m *Manager) executeHealthCheck(check *HealthCheck) {
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
 | |
| 	defer cancel()
 | |
| 
 | |
| 	start := time.Now()
 | |
| 	result := check.Checker(ctx)
 | |
| 	result.Latency = time.Since(start)
 | |
| 	result.Timestamp = time.Now()
 | |
| 
 | |
| 	m.mu.Lock()
 | |
| 	check.LastRun = time.Now()
 | |
| 	check.LastResult = &result
 | |
| 	m.status.Checks[check.Name] = &result
 | |
| 	m.mu.Unlock()
 | |
| 
 | |
| 	// Log health check results
 | |
| 	if result.Healthy {
 | |
| 		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
 | |
| 	} else {
 | |
| 		m.logger.Warn("Health check failed: %s - %s (latency: %v)", 
 | |
| 			check.Name, result.Message, result.Latency)
 | |
| 		
 | |
| 		// If this is a critical check and it failed, consider shutdown
 | |
| 		if check.Critical && m.shutdownManager != nil {
 | |
| 			m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
 | |
| 			m.shutdownManager.Stop()
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Update overall system status
 | |
| 	m.updateSystemStatus()
 | |
| }
 | |
| 
 | |
| // updateSystemStatus recalculates the overall system status
 | |
| func (m *Manager) updateSystemStatus() {
 | |
| 	m.mu.Lock()
 | |
| 	defer m.mu.Unlock()
 | |
| 
 | |
| 	var healthyChecks, totalChecks, criticalFailures int
 | |
| 
 | |
| 	for _, result := range m.status.Checks {
 | |
| 		totalChecks++
 | |
| 		if result.Healthy {
 | |
| 			healthyChecks++
 | |
| 		} else {
 | |
| 			// Check if this is a critical check
 | |
| 			if check, exists := m.checks[result.Timestamp.String()]; exists && check.Critical {
 | |
| 				criticalFailures++
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Determine overall status
 | |
| 	if criticalFailures > 0 {
 | |
| 		m.status.Status = StatusUnhealthy
 | |
| 		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
 | |
| 	} else if totalChecks == 0 {
 | |
| 		m.status.Status = StatusStarting
 | |
| 		m.status.Message = "No health checks configured"
 | |
| 	} else if healthyChecks == totalChecks {
 | |
| 		m.status.Status = StatusHealthy
 | |
| 		m.status.Message = "All health checks passing"
 | |
| 	} else {
 | |
| 		m.status.Status = StatusDegraded
 | |
| 		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)", 
 | |
| 			healthyChecks, totalChecks)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // HTTP Handlers
 | |
| 
 | |
| func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
 | |
| 	status := m.GetStatus()
 | |
| 	
 | |
| 	w.Header().Set("Content-Type", "application/json")
 | |
| 	
 | |
| 	// Set HTTP status code based on health
 | |
| 	switch status.Status {
 | |
| 	case StatusHealthy:
 | |
| 		w.WriteHeader(http.StatusOK)
 | |
| 	case StatusDegraded:
 | |
| 		w.WriteHeader(http.StatusOK) // Still OK, but degraded
 | |
| 	case StatusUnhealthy:
 | |
| 		w.WriteHeader(http.StatusServiceUnavailable)
 | |
| 	case StatusStarting:
 | |
| 		w.WriteHeader(http.StatusServiceUnavailable)
 | |
| 	case StatusStopping:
 | |
| 		w.WriteHeader(http.StatusServiceUnavailable)
 | |
| 	}
 | |
| 	
 | |
| 	json.NewEncoder(w).Encode(status)
 | |
| }
 | |
| 
 | |
| func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
 | |
| 	status := m.GetStatus()
 | |
| 	
 | |
| 	w.Header().Set("Content-Type", "application/json")
 | |
| 	
 | |
| 	// Ready means we can handle requests
 | |
| 	if status.Status == StatusHealthy || status.Status == StatusDegraded {
 | |
| 		w.WriteHeader(http.StatusOK)
 | |
| 		json.NewEncoder(w).Encode(map[string]interface{}{
 | |
| 			"ready":   true,
 | |
| 			"status":  status.Status,
 | |
| 			"message": status.Message,
 | |
| 		})
 | |
| 	} else {
 | |
| 		w.WriteHeader(http.StatusServiceUnavailable)
 | |
| 		json.NewEncoder(w).Encode(map[string]interface{}{
 | |
| 			"ready":   false,
 | |
| 			"status":  status.Status,
 | |
| 			"message": status.Message,
 | |
| 		})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
 | |
| 	status := m.GetStatus()
 | |
| 	
 | |
| 	w.Header().Set("Content-Type", "application/json")
 | |
| 	
 | |
| 	// Live means the process is running (not necessarily healthy)
 | |
| 	if status.Status != StatusStopping {
 | |
| 		w.WriteHeader(http.StatusOK)
 | |
| 		json.NewEncoder(w).Encode(map[string]interface{}{
 | |
| 			"live":    true,
 | |
| 			"status":  status.Status,
 | |
| 			"uptime":  status.Uptime.String(),
 | |
| 		})
 | |
| 	} else {
 | |
| 		w.WriteHeader(http.StatusServiceUnavailable)
 | |
| 		json.NewEncoder(w).Encode(map[string]interface{}{
 | |
| 			"live":    false,
 | |
| 			"status":  status.Status,
 | |
| 			"message": "System is shutting down",
 | |
| 		})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
 | |
| 	status := m.GetStatus()
 | |
| 	
 | |
| 	w.Header().Set("Content-Type", "application/json")
 | |
| 	w.WriteHeader(http.StatusOK)
 | |
| 	
 | |
| 	json.NewEncoder(w).Encode(map[string]interface{}{
 | |
| 		"checks":     status.Checks,
 | |
| 		"total":      len(status.Checks),
 | |
| 		"timestamp":  time.Now(),
 | |
| 	})
 | |
| }
 | |
| 
 | |
| // Predefined health checks
 | |
| 
 | |
| // CreateDatabaseCheck creates a health check for database connectivity
 | |
| func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
 | |
| 	return &HealthCheck{
 | |
| 		Name:        name,
 | |
| 		Description: fmt.Sprintf("Database connectivity check for %s", name),
 | |
| 		Enabled:     true,
 | |
| 		Critical:    true,
 | |
| 		Interval:    30 * time.Second,
 | |
| 		Timeout:     10 * time.Second,
 | |
| 		Checker: func(ctx context.Context) CheckResult {
 | |
| 			start := time.Now()
 | |
| 			err := pingFunc()
 | |
| 			
 | |
| 			if err != nil {
 | |
| 				return CheckResult{
 | |
| 					Healthy:   false,
 | |
| 					Message:   fmt.Sprintf("Database ping failed: %v", err),
 | |
| 					Error:     err,
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			return CheckResult{
 | |
| 				Healthy:   true,
 | |
| 				Message:   "Database connectivity OK",
 | |
| 				Timestamp: time.Now(),
 | |
| 				Latency:   time.Since(start),
 | |
| 			}
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // CreateDiskSpaceCheck creates a health check for disk space
 | |
| func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
 | |
| 	return &HealthCheck{
 | |
| 		Name:        fmt.Sprintf("disk-space-%s", path),
 | |
| 		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
 | |
| 		Enabled:     true,
 | |
| 		Critical:    false,
 | |
| 		Interval:    60 * time.Second,
 | |
| 		Timeout:     5 * time.Second,
 | |
| 		Checker: func(ctx context.Context) CheckResult {
 | |
| 			// In a real implementation, you would check actual disk usage
 | |
| 			// For now, we'll simulate it
 | |
| 			usage := 0.75 // Simulate 75% usage
 | |
| 			
 | |
| 			if usage > threshold {
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%", 
 | |
| 						usage*100, threshold*100),
 | |
| 					Details: map[string]interface{}{
 | |
| 						"path":      path,
 | |
| 						"usage":     usage,
 | |
| 						"threshold": threshold,
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			return CheckResult{
 | |
| 				Healthy: true,
 | |
| 				Message: fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100),
 | |
| 				Details: map[string]interface{}{
 | |
| 					"path":      path,
 | |
| 					"usage":     usage,
 | |
| 					"threshold": threshold,
 | |
| 				},
 | |
| 				Timestamp: time.Now(),
 | |
| 			}
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // CreateMemoryCheck creates a health check for memory usage
 | |
| func CreateMemoryCheck(threshold float64) *HealthCheck {
 | |
| 	return &HealthCheck{
 | |
| 		Name:        "memory-usage",
 | |
| 		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
 | |
| 		Enabled:     true,
 | |
| 		Critical:    false,
 | |
| 		Interval:    30 * time.Second,
 | |
| 		Timeout:     5 * time.Second,
 | |
| 		Checker: func(ctx context.Context) CheckResult {
 | |
| 			// In a real implementation, you would check actual memory usage
 | |
| 			usage := 0.60 // Simulate 60% usage
 | |
| 			
 | |
| 			if usage > threshold {
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%", 
 | |
| 						usage*100, threshold*100),
 | |
| 					Details: map[string]interface{}{
 | |
| 						"usage":     usage,
 | |
| 						"threshold": threshold,
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			return CheckResult{
 | |
| 				Healthy: true,
 | |
| 				Message: fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100),
 | |
| 				Details: map[string]interface{}{
 | |
| 					"usage":     usage,
 | |
| 					"threshold": threshold,
 | |
| 				},
 | |
| 				Timestamp: time.Now(),
 | |
| 			}
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // CreateActivePubSubCheck creates an active health check for PubSub system
 | |
| func CreateActivePubSubCheck(pubsub PubSubInterface) *HealthCheck {
 | |
| 	return &HealthCheck{
 | |
| 		Name:        "pubsub-active-probe",
 | |
| 		Description: "Active PubSub system health probe with loopback test",
 | |
| 		Enabled:     true,
 | |
| 		Critical:    false,
 | |
| 		Interval:    60 * time.Second,
 | |
| 		Timeout:     15 * time.Second,
 | |
| 		Checker: func(ctx context.Context) CheckResult {
 | |
| 			start := time.Now()
 | |
| 			
 | |
| 			// Generate unique test message
 | |
| 			testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
 | |
| 			testMessage := map[string]interface{}{
 | |
| 				"test_key":  testKey,
 | |
| 				"timestamp": time.Now().Unix(),
 | |
| 				"probe_id":  "pubsub-health-check",
 | |
| 			}
 | |
| 			
 | |
| 			// Channel to receive test message
 | |
| 			resultCh := make(chan bool, 1)
 | |
| 			errorCh := make(chan error, 1)
 | |
| 			
 | |
| 			// Set up message handler for test topic
 | |
| 			handler := func(data []byte) {
 | |
| 				var received map[string]interface{}
 | |
| 				if err := json.Unmarshal(data, &received); err != nil {
 | |
| 					return
 | |
| 				}
 | |
| 				
 | |
| 				if receivedKey, ok := received["test_key"].(string); ok && receivedKey == testKey {
 | |
| 					select {
 | |
| 					case resultCh <- true:
 | |
| 					default:
 | |
| 					}
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			// Subscribe to test topic
 | |
| 			testTopic := "CHORUS/health-test/v1"
 | |
| 			if err := pubsub.SubscribeToTopic(testTopic, handler); err != nil {
 | |
| 				return CheckResult{
 | |
| 					Healthy:   false,
 | |
| 					Message:   fmt.Sprintf("Failed to subscribe to test topic: %v", err),
 | |
| 					Error:     err,
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			// Allow subscription to settle
 | |
| 			time.Sleep(500 * time.Millisecond)
 | |
| 			
 | |
| 			// Publish test message
 | |
| 			go func() {
 | |
| 				if err := pubsub.PublishToTopic(testTopic, testMessage); err != nil {
 | |
| 					errorCh <- err
 | |
| 				}
 | |
| 			}()
 | |
| 			
 | |
| 			// Wait for result with timeout
 | |
| 			select {
 | |
| 			case <-resultCh:
 | |
| 				latency := time.Since(start)
 | |
| 				return CheckResult{
 | |
| 					Healthy: true,
 | |
| 					Message: fmt.Sprintf("PubSub loopback test successful"),
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_topic": testTopic,
 | |
| 						"test_key":   testKey,
 | |
| 						"latency_ms": latency.Milliseconds(),
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   latency,
 | |
| 				}
 | |
| 				
 | |
| 			case err := <-errorCh:
 | |
| 				return CheckResult{
 | |
| 					Healthy:   false,
 | |
| 					Message:   fmt.Sprintf("Failed to publish test message: %v", err),
 | |
| 					Error:     err,
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 				
 | |
| 			case <-time.After(10 * time.Second):
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: "PubSub loopback test timeout - message not received",
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_topic": testTopic,
 | |
| 						"test_key":   testKey,
 | |
| 						"timeout":    "10s",
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 				
 | |
| 			case <-ctx.Done():
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: "PubSub health check cancelled",
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_topic": testTopic,
 | |
| 						"reason":     "context_cancelled",
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // CreateActiveDHTCheck creates an active health check for DHT system
 | |
| func CreateActiveDHTCheck(dht DHTInterface) *HealthCheck {
 | |
| 	return &HealthCheck{
 | |
| 		Name:        "dht-active-probe",
 | |
| 		Description: "Active DHT system health probe with put/get test",
 | |
| 		Enabled:     true,
 | |
| 		Critical:    false,
 | |
| 		Interval:    90 * time.Second,
 | |
| 		Timeout:     20 * time.Second,
 | |
| 		Checker: func(ctx context.Context) CheckResult {
 | |
| 			start := time.Now()
 | |
| 			
 | |
| 			// Generate unique test key and value
 | |
| 			testKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
 | |
| 			testValue := []byte(fmt.Sprintf(`{"test_key":"%s","timestamp":%d,"probe_id":"dht-health-check"}`, 
 | |
| 				testKey, time.Now().Unix()))
 | |
| 			
 | |
| 			// Test DHT put operation
 | |
| 			putStart := time.Now()
 | |
| 			if err := dht.PutValue(ctx, testKey, testValue); err != nil {
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: fmt.Sprintf("DHT put operation failed: %v", err),
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_key":    testKey,
 | |
| 						"operation":   "put",
 | |
| 						"put_latency": time.Since(putStart).Milliseconds(),
 | |
| 					},
 | |
| 					Error:     err,
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 			putLatency := time.Since(putStart)
 | |
| 			
 | |
| 			// Allow some time for propagation
 | |
| 			time.Sleep(100 * time.Millisecond)
 | |
| 			
 | |
| 			// Test DHT get operation
 | |
| 			getStart := time.Now()
 | |
| 			retrievedValue, err := dht.GetValue(ctx, testKey)
 | |
| 			if err != nil {
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: fmt.Sprintf("DHT get operation failed: %v", err),
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_key":    testKey,
 | |
| 						"operation":   "get",
 | |
| 						"put_latency": putLatency.Milliseconds(),
 | |
| 						"get_latency": time.Since(getStart).Milliseconds(),
 | |
| 					},
 | |
| 					Error:     err,
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 			getLatency := time.Since(getStart)
 | |
| 			
 | |
| 			// Verify retrieved value matches
 | |
| 			if string(retrievedValue) != string(testValue) {
 | |
| 				return CheckResult{
 | |
| 					Healthy: false,
 | |
| 					Message: "DHT data integrity check failed - retrieved value doesn't match",
 | |
| 					Details: map[string]interface{}{
 | |
| 						"test_key":       testKey,
 | |
| 						"expected_len":   len(testValue),
 | |
| 						"retrieved_len":  len(retrievedValue),
 | |
| 						"put_latency":    putLatency.Milliseconds(),
 | |
| 						"get_latency":    getLatency.Milliseconds(),
 | |
| 						"total_latency":  time.Since(start).Milliseconds(),
 | |
| 					},
 | |
| 					Timestamp: time.Now(),
 | |
| 					Latency:   time.Since(start),
 | |
| 				}
 | |
| 			}
 | |
| 			
 | |
| 			totalLatency := time.Since(start)
 | |
| 			
 | |
| 			// Get DHT statistics if available
 | |
| 			var stats interface{}
 | |
| 			if statsProvider, ok := dht.(interface{ GetStats() interface{} }); ok {
 | |
| 				stats = statsProvider.GetStats()
 | |
| 			}
 | |
| 			
 | |
| 			return CheckResult{
 | |
| 				Healthy: true,
 | |
| 				Message: "DHT put/get test successful",
 | |
| 				Details: map[string]interface{}{
 | |
| 					"test_key":       testKey,
 | |
| 					"put_latency":    putLatency.Milliseconds(),
 | |
| 					"get_latency":    getLatency.Milliseconds(),
 | |
| 					"total_latency":  totalLatency.Milliseconds(),
 | |
| 					"data_integrity": "verified",
 | |
| 					"stats":          stats,
 | |
| 				},
 | |
| 				Timestamp: time.Now(),
 | |
| 				Latency:   totalLatency,
 | |
| 			}
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
// defaultLogger is the fallback Logger: it formats each message with fmt and
// prints it to standard output, prefixed with the level in square brackets.
type defaultLogger struct{}

// write prints one line at the given level; msg is a fmt format string and
// args are its arguments.
func (l *defaultLogger) write(level, msg string, args []interface{}) {
	fmt.Printf("["+level+"] "+msg+"\n", args...)
}

// Info prints an [INFO] line.
func (l *defaultLogger) Info(msg string, args ...interface{}) {
	l.write("INFO", msg, args)
}

// Warn prints a [WARN] line.
func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	l.write("WARN", msg, args)
}

// Error prints an [ERROR] line.
func (l *defaultLogger) Error(msg string, args ...interface{}) {
	l.write("ERROR", msg, args)
}