Complete BZZZ functionality port to CHORUS

🎭 CHORUS now contains full BZZZ functionality adapted for containers

Core systems ported:
- P2P networking (libp2p with DHT and PubSub)
- Task coordination (COOEE protocol)
- HMMM collaborative reasoning
- SHHH encryption and security
- SLURP admin election system
- UCXL content addressing
- UCXI server integration
- Hypercore logging system
- Health monitoring and graceful shutdown
- License validation with KACHING

Container adaptations:
- Environment variable configuration (no YAML files)
- Container-optimized logging to stdout/stderr
- Auto-generated agent IDs for container deployments
- Docker-first architecture

All proven BZZZ P2P protocols, AI integration, and collaboration
features are now available in containerized form.

Next: Build and test container deployment.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
anthonyrawlins
2025-09-02 20:02:37 +10:00
parent 7c6cbd562a
commit 543ab216f9
224 changed files with 86331 additions and 186 deletions

758
pkg/health/manager.go Normal file
View File

@@ -0,0 +1,758 @@
package health
import (
"context"
"encoding/json"
"fmt"
"net/http"
"sync"
"time"
"chorus.services/bzzz/pkg/shutdown"
)
// Manager provides comprehensive health monitoring and integrates with graceful shutdown.
//
// It holds the registered checks, the aggregated SystemStatus, the optional
// HTTP server exposing the /health endpoints, and the ticker that drives the
// background scheduling loop.
type Manager struct {
// mu is taken by the Manager methods before they read or write checks and status.
mu sync.RWMutex
// checks maps a check name to its definition (populated by RegisterCheck).
checks map[string]*HealthCheck
// status is the aggregated state; GetStatus hands out copies of it.
status *SystemStatus
// httpServer is assigned by StartHTTPServer and is nil until then.
httpServer *http.Server
// shutdownManager is optional (see SetShutdownManager); when set, its Stop
// method is called after a critical check reports a failure.
shutdownManager *shutdown.Manager
// ticker drives healthCheckLoop; Start creates it with a 5 second period.
ticker *time.Ticker
// stopCh is closed by Stop to make healthCheckLoop return.
stopCh chan struct{}
// logger receives all diagnostics; NewManager substitutes a defaultLogger for nil.
logger Logger
}
// HealthCheck represents a single health check: its identity, the probe
// function, its scheduling parameters and the outcome of its most recent run.
type HealthCheck struct {
// Name is the key under which the check is registered and its results are reported.
Name string `json:"name"`
Description string `json:"description"`
// Checker is the probe itself. The manager calls it with a context that
// expires after Timeout. It is excluded from JSON output.
Checker func(ctx context.Context) CheckResult `json:"-"`
// Interval is the minimum time between runs; RegisterCheck replaces 0 with 30s.
Interval time.Duration `json:"interval"`
// Timeout bounds the context handed to Checker; RegisterCheck replaces 0 with 10s.
Timeout time.Duration `json:"timeout"`
// Enabled must be true for the scheduling loop to pick the check up.
Enabled bool `json:"enabled"`
Critical bool `json:"critical"` // If true, failure triggers shutdown
// LastRun and LastResult are written by the manager after each execution.
LastRun time.Time `json:"last_run"`
LastResult *CheckResult `json:"last_result,omitempty"`
}
// CheckResult represents the result of a health check.
//
// Latency and Timestamp are overwritten by the manager after the checker
// returns, so checkers may leave them unset. Error carries the underlying
// failure, if any; it is rendered as its message text in JSON (see MarshalJSON).
type CheckResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Details   map[string]interface{} `json:"details,omitempty"`
	Latency   time.Duration          `json:"latency"`
	Timestamp time.Time              `json:"timestamp"`
	Error     error                  `json:"error,omitempty"`
}

// MarshalJSON encodes the result with Error rendered as a string.
//
// encoding/json serialises an error interface value by reflecting over the
// concrete type; typical error types have no exported fields, so a non-nil
// Error would otherwise appear as "error":{} and the cause would be lost.
// Every other field keeps its default encoding, and "error" is still omitted
// when Error is nil.
func (r CheckResult) MarshalJSON() ([]byte, error) {
	// plain has CheckResult's fields but none of its methods, which prevents
	// json.Marshal from recursing back into this method.
	type plain CheckResult

	wire := struct {
		plain
		// Error shadows plain.Error: for equal JSON names the shallower field wins.
		Error string `json:"error,omitempty"`
	}{plain: plain(r)}

	if r.Error != nil {
		wire.Error = r.Error.Error()
	}
	return json.Marshal(wire)
}
// SystemStatus represents the overall system health status, as returned by
// Manager.GetStatus and served by the /health endpoint.
type SystemStatus struct {
// Status is the aggregate level; Message is its human-readable explanation.
Status Status `json:"status"`
Message string `json:"message"`
// Checks holds the most recent result of each executed check, keyed by check name.
Checks map[string]*CheckResult `json:"checks"`
// Uptime is computed by GetStatus as the time elapsed since StartTime.
Uptime time.Duration `json:"uptime"`
// StartTime is set once, when NewManager builds the manager.
StartTime time.Time `json:"start_time"`
// LastUpdate is set by GetStatus to the time of the call.
LastUpdate time.Time `json:"last_update"`
// Version and NodeID are the values passed to NewManager.
Version string `json:"version"`
NodeID string `json:"node_id"`
}
// Status represents health status levels reported in SystemStatus.Status.
type Status string
const (
// StatusHealthy is set by Start, and whenever every recorded check result is healthy.
StatusHealthy Status = "healthy"
// StatusDegraded is set when some recorded results are unhealthy but no
// critical failure was counted.
StatusDegraded Status = "degraded"
// StatusUnhealthy is set when the status recalculation counts at least one
// critical failure.
StatusUnhealthy Status = "unhealthy"
// StatusStarting is the initial state from NewManager, and the state used
// while no check results have been recorded.
StatusStarting Status = "starting"
// StatusStopping is set by Stop.
StatusStopping Status = "stopping"
)
// Logger is the logging interface used by the health manager.
//
// Each method takes a printf-style format string followed by its arguments;
// the manager calls them with verbs such as %s, %t, %d and %v.
type Logger interface {
Info(msg string, args ...interface{})
Warn(msg string, args ...interface{})
Error(msg string, args ...interface{})
}
// PubSubInterface defines the interface for PubSub health checks. It is the
// subset of PubSub operations that CreateActivePubSubCheck relies on.
type PubSubInterface interface {
// SubscribeToTopic registers handler for topic. The loopback probe
// JSON-decodes the bytes passed to handler, so implementations are
// presumably expected to deliver the raw message payload — confirm
// against the concrete PubSub type.
SubscribeToTopic(topic string, handler func([]byte)) error
// PublishToTopic publishes data on topic. The probe passes a map and
// expects JSON on the receiving side, so the implementation presumably
// JSON-encodes data — TODO confirm.
PublishToTopic(topic string, data interface{}) error
}
// DHTInterface defines the interface for DHT health checks. It is the subset
// of DHT operations that CreateActiveDHTCheck relies on: the probe stores a
// value under a fresh key and expects to read the identical bytes back.
type DHTInterface interface {
// PutValue stores value under key.
PutValue(ctx context.Context, key string, value []byte) error
// GetValue returns the value stored under key.
GetValue(ctx context.Context, key string) ([]byte, error)
}
// NewManager creates a new health manager for the node identified by nodeID
// running the given version.
//
// A nil logger is replaced by a defaultLogger. The returned manager starts in
// StatusStarting with no checks registered; its start time is the moment of
// this call.
func NewManager(nodeID, version string, logger Logger) *Manager {
	if logger == nil {
		logger = &defaultLogger{}
	}

	initial := &SystemStatus{
		Status:    StatusStarting,
		Message:   "System starting up",
		Checks:    make(map[string]*CheckResult),
		StartTime: time.Now(),
		Version:   version,
		NodeID:    nodeID,
	}

	mgr := new(Manager)
	mgr.checks = make(map[string]*HealthCheck)
	mgr.status = initial
	mgr.stopCh = make(chan struct{})
	mgr.logger = logger
	return mgr
}
// RegisterCheck adds a new health check, keyed by check.Name. A check already
// registered under the same name is replaced.
//
// Zero Timeout and Interval values are filled in on the caller's struct with
// 10 seconds and 30 seconds respectively.
func (m *Manager) RegisterCheck(check *HealthCheck) {
	const (
		fallbackTimeout  = 10 * time.Second
		fallbackInterval = 30 * time.Second
	)

	m.mu.Lock()
	defer m.mu.Unlock()

	if check.Interval == 0 {
		check.Interval = fallbackInterval
	}
	if check.Timeout == 0 {
		check.Timeout = fallbackTimeout
	}

	m.checks[check.Name] = check
	m.logger.Info("Registered health check: %s (critical: %t, interval: %v)",
		check.Name, check.Critical, check.Interval)
}
// UnregisterCheck removes the health check registered under name together
// with its last recorded result. Unknown names are a no-op apart from the
// log line. The overall status is not recalculated here.
func (m *Manager) UnregisterCheck(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Forget the reported result as well as the definition so the check no
	// longer shows up in status output.
	delete(m.status.Checks, name)
	delete(m.checks, name)

	m.logger.Info("Unregistered health check: %s", name)
}
// Start begins health monitoring.
//
// It creates a 5 second ticker, launches the background scheduling loop and
// marks the system healthy right away, on the assumption that no critical
// check fails immediately. Always returns nil.
func (m *Manager) Start() error {
	const tickEvery = 5 * time.Second

	m.mu.Lock()
	defer m.mu.Unlock()

	// The ticker must exist before the loop goroutine starts reading it.
	m.ticker = time.NewTicker(tickEvery)
	go m.healthCheckLoop()

	m.status.Status, m.status.Message = StatusHealthy, "System operational"

	m.logger.Info("Health monitoring started")
	return nil
}
// Stop stops health monitoring.
//
// It closes stopCh so the scheduling loop exits, stops the ticker and marks
// the system status as stopping. The HTTP server started by StartHTTPServer
// is left running so the endpoints can keep reporting the stopping state.
//
// Stop is idempotent: calls after the first return nil without doing
// anything, where previously a second call panicked on closing an already
// closed channel. Always returns nil.
func (m *Manager) Stop() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	select {
	case <-m.stopCh:
		// Already closed by an earlier Stop; closing again would panic.
		return nil
	default:
	}
	close(m.stopCh)

	if m.ticker != nil {
		m.ticker.Stop()
	}

	m.status.Status = StatusStopping
	m.status.Message = "System shutting down"

	m.logger.Info("Health monitoring stopped")
	return nil
}
// StartHTTPServer starts an HTTP server for health endpoints on the given
// port (all interfaces).
//
// Routes: /health (full status), /health/ready, /health/live and
// /health/checks. The server runs on its own goroutine, so this function
// returns nil immediately; a listen failure (for example, port already in
// use) is only reported through the logger.
//
// The server is configured with read, write and idle timeouts so that slow or
// stalled clients cannot hold connections open indefinitely.
func (m *Manager) StartHTTPServer(port int) error {
	mux := http.NewServeMux()

	// Health check endpoints
	mux.HandleFunc("/health", m.handleHealth)
	mux.HandleFunc("/health/ready", m.handleReady)
	mux.HandleFunc("/health/live", m.handleLive)
	mux.HandleFunc("/health/checks", m.handleChecks)

	srv := &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
		// The handlers only serialise an in-memory snapshot, so short
		// timeouts are ample.
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		WriteTimeout:      10 * time.Second,
		IdleTimeout:       60 * time.Second,
	}
	m.httpServer = srv

	// Serve from the local variable so the goroutine does not depend on later
	// reassignment of m.httpServer.
	go func() {
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			m.logger.Error("Health HTTP server error: %v", err)
		}
	}()

	m.logger.Info("Health HTTP server started on port %d", port)
	return nil
}
// SetShutdownManager sets the shutdown manager for critical health failures.
//
// The field is read by the goroutines that execute health checks, so the
// write is made under the manager's lock, like the other mutators.
// Passing nil disables shutdown on critical failures.
func (m *Manager) SetShutdownManager(shutdownManager *shutdown.Manager) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.shutdownManager = shutdownManager
}
// GetStatus returns the current system status as an independent snapshot.
//
// Uptime and LastUpdate are computed at call time. The Checks map and each
// CheckResult in it are copied so callers cannot race with the manager; the
// Details maps inside the results are shared, not deep-copied.
func (m *Manager) GetStatus() *SystemStatus {
	m.mu.RLock()
	defer m.mu.RUnlock()

	snapshot := new(SystemStatus)
	*snapshot = *m.status
	snapshot.Uptime = time.Since(m.status.StartTime)
	snapshot.LastUpdate = time.Now()

	// Replace the shared map with per-entry copies.
	snapshot.Checks = make(map[string]*CheckResult, len(m.status.Checks))
	for name, recorded := range m.status.Checks {
		if recorded == nil {
			continue
		}
		clone := *recorded
		snapshot.Checks[name] = &clone
	}

	return snapshot
}
// healthCheckLoop runs health checks periodically: on every ticker tick it
// schedules the checks that are due, and it returns once stopCh is closed.
// The ticker is stopped when the loop exits.
func (m *Manager) healthCheckLoop() {
	defer m.ticker.Stop()

	for {
		select {
		case <-m.stopCh:
			return
		case <-m.ticker.C:
			m.runHealthChecks()
		}
	}
}
// runHealthChecks executes all registered health checks that are due.
//
// A check is due when it is enabled and at least Interval has elapsed since
// its LastRun. Each due check runs on its own goroutine.
//
// LastRun is stamped here, at scheduling time, under the write lock.
// Previously it was only updated when a run finished, so any check slower
// than the 5 second tick was launched again on every tick and the probes
// piled up. The stamp is overwritten with the completion time when the run
// finishes.
func (m *Manager) runHealthChecks() {
	now := time.Now()

	m.mu.Lock()
	due := make([]*HealthCheck, 0, len(m.checks))
	for _, check := range m.checks {
		if !check.Enabled || now.Sub(check.LastRun) < check.Interval {
			continue
		}
		// Mark as scheduled so the next tick does not start a second,
		// overlapping run of the same check.
		check.LastRun = now
		due = append(due, check)
	}
	m.mu.Unlock()

	for _, check := range due {
		go m.executeHealthCheck(check)
	}
}
// executeHealthCheck runs a single health check and records its outcome.
//
// The checker is called with a context that expires after check.Timeout.
// Latency and Timestamp on the returned result are overwritten with values
// measured here. On a failed critical check, the configured shutdown manager
// (if any) is stopped. Finally the overall system status is recalculated.
//
// If the check was unregistered (or replaced under the same name) while it
// was running, its result is discarded: writing it back would resurrect a
// stale entry in the status map, and a removed check must not trigger a
// shutdown.
func (m *Manager) executeHealthCheck(check *HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	start := time.Now()
	result := check.Checker(ctx)
	result.Latency = time.Since(start)
	result.Timestamp = time.Now()

	m.mu.Lock()
	stillRegistered := m.checks[check.Name] == check
	if stillRegistered {
		check.LastRun = time.Now()
		check.LastResult = &result
		m.status.Checks[check.Name] = &result
	}
	// Read under the lock: SetShutdownManager may run concurrently.
	shutdownManager := m.shutdownManager
	m.mu.Unlock()

	if !stillRegistered {
		m.logger.Info("Discarding result of unregistered health check: %s", check.Name)
		return
	}

	// Log health check results
	if result.Healthy {
		m.logger.Info("Health check passed: %s (latency: %v)", check.Name, result.Latency)
	} else {
		m.logger.Warn("Health check failed: %s - %s (latency: %v)",
			check.Name, result.Message, result.Latency)

		// If this is a critical check and it failed, consider shutdown
		if check.Critical && shutdownManager != nil {
			m.logger.Error("Critical health check failed: %s - initiating graceful shutdown", check.Name)
			shutdownManager.Stop()
		}
	}

	// Update overall system status
	m.updateSystemStatus()
}
// updateSystemStatus recalculates the overall system status from the most
// recent result of every check:
//
//   - any failing critical check      -> StatusUnhealthy
//   - no results recorded yet         -> StatusStarting
//   - every result healthy            -> StatusHealthy
//   - otherwise                       -> StatusDegraded
//
// Results are matched to their check definition by name (the key of the
// status map). The earlier lookup used the result's timestamp string as the
// key, which never matched, so critical failures were never counted.
//
// Once Stop has put the manager into StatusStopping the status is left
// alone, so a check that finishes late cannot flip it back to healthy.
func (m *Manager) updateSystemStatus() {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.status.Status == StatusStopping {
		return
	}

	var healthyChecks, totalChecks, criticalFailures int
	for name, result := range m.status.Checks {
		if result == nil {
			continue
		}
		totalChecks++
		if result.Healthy {
			healthyChecks++
			continue
		}
		// A failure only counts as critical if the check is still
		// registered and flagged Critical.
		if check, exists := m.checks[name]; exists && check.Critical {
			criticalFailures++
		}
	}

	// Determine overall status
	switch {
	case criticalFailures > 0:
		m.status.Status = StatusUnhealthy
		m.status.Message = fmt.Sprintf("Critical health checks failing (%d)", criticalFailures)
	case totalChecks == 0:
		m.status.Status = StatusStarting
		m.status.Message = "No health checks configured"
	case healthyChecks == totalChecks:
		m.status.Status = StatusHealthy
		m.status.Message = "All health checks passing"
	default:
		m.status.Status = StatusDegraded
		m.status.Message = fmt.Sprintf("Some health checks failing (%d/%d healthy)",
			healthyChecks, totalChecks)
	}
}
// HTTP Handlers
// handleHealth serves /health: the full SystemStatus as JSON.
//
// Healthy and degraded map to 200 OK; unhealthy, starting and stopping map to
// 503 Service Unavailable. Any other status value writes no explicit header,
// leaving net/http's implicit 200.
func (m *Manager) handleHealth(w http.ResponseWriter, r *http.Request) {
	snapshot := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")

	switch snapshot.Status {
	case StatusHealthy, StatusDegraded:
		// Degraded is still considered serviceable.
		w.WriteHeader(http.StatusOK)
	case StatusUnhealthy, StatusStarting, StatusStopping:
		w.WriteHeader(http.StatusServiceUnavailable)
	}

	json.NewEncoder(w).Encode(snapshot)
}
// handleReady serves /health/ready. The system counts as ready — able to
// handle requests — when its status is healthy or degraded; that yields 200,
// anything else yields 503. The JSON body carries "ready", "status" and
// "message".
func (m *Manager) handleReady(w http.ResponseWriter, r *http.Request) {
	snapshot := m.GetStatus()

	ready := snapshot.Status == StatusHealthy || snapshot.Status == StatusDegraded
	code := http.StatusServiceUnavailable
	if ready {
		code = http.StatusOK
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	json.NewEncoder(w).Encode(map[string]interface{}{
		"ready":   ready,
		"status":  snapshot.Status,
		"message": snapshot.Message,
	})
}
// handleLive serves /health/live. The process is reported live (200) in every
// state except stopping, which yields 503. Live does not imply healthy.
func (m *Manager) handleLive(w http.ResponseWriter, r *http.Request) {
	snapshot := m.GetStatus()

	w.Header().Set("Content-Type", "application/json")
	out := json.NewEncoder(w)

	if snapshot.Status == StatusStopping {
		w.WriteHeader(http.StatusServiceUnavailable)
		out.Encode(map[string]interface{}{
			"live":    false,
			"status":  snapshot.Status,
			"message": "System is shutting down",
		})
		return
	}

	w.WriteHeader(http.StatusOK)
	out.Encode(map[string]interface{}{
		"live":   true,
		"status": snapshot.Status,
		"uptime": snapshot.Uptime.String(),
	})
}
// handleChecks serves /health/checks: the latest result of every executed
// check, the number of results, and the time of the response. Always 200.
func (m *Manager) handleChecks(w http.ResponseWriter, r *http.Request) {
	snapshot := m.GetStatus()

	payload := map[string]interface{}{
		"checks":    snapshot.Checks,
		"total":     len(snapshot.Checks),
		"timestamp": time.Now(),
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	json.NewEncoder(w).Encode(payload)
}
// Predefined health checks
// CreateDatabaseCheck creates a health check for database connectivity.
//
// The returned check is enabled, critical, runs every 30 seconds with a
// 10 second timeout, and is healthy when pingFunc returns nil.
//
// pingFunc takes no context, so it is run on its own goroutine and raced
// against the checker's context. Previously a ping that never returned
// blocked the check forever and the configured Timeout had no effect. If the
// context ends first the check reports unhealthy; the ping goroutine exits
// whenever pingFunc eventually returns. A nil pingFunc yields an unhealthy
// result instead of a nil-function panic.
func CreateDatabaseCheck(name string, pingFunc func() error) *HealthCheck {
	return &HealthCheck{
		Name:        name,
		Description: fmt.Sprintf("Database connectivity check for %s", name),
		Enabled:     true,
		Critical:    true,
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Checker: func(ctx context.Context) CheckResult {
			start := time.Now()

			if pingFunc == nil {
				return CheckResult{
					Healthy:   false,
					Message:   "Database ping failed: no ping function configured",
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			// Buffered so the goroutine can deliver and exit even when
			// nobody is receiving any more.
			done := make(chan error, 1)
			go func() { done <- pingFunc() }()

			var err error
			select {
			case err = <-done:
			case <-ctx.Done():
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping did not complete: %v", ctx.Err()),
					Error:     ctx.Err(),
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			if err != nil {
				return CheckResult{
					Healthy:   false,
					Message:   fmt.Sprintf("Database ping failed: %v", err),
					Error:     err,
					Timestamp: time.Now(),
					Latency:   time.Since(start),
				}
			}

			return CheckResult{
				Healthy:   true,
				Message:   "Database connectivity OK",
				Timestamp: time.Now(),
				Latency:   time.Since(start),
			}
		},
	}
}
// CreateDiskSpaceCheck creates a health check for disk space on path.
// threshold is a fraction (0.9 means 90%); the check is unhealthy when usage
// exceeds it. The check is enabled, non-critical, runs every 60 seconds and
// has a 5 second timeout.
//
// NOTE: no real disk measurement is made yet — usage is a fixed simulated
// value of 75%.
func CreateDiskSpaceCheck(path string, threshold float64) *HealthCheck {
	probe := func(ctx context.Context) CheckResult {
		// Placeholder reading until actual disk usage is wired in.
		var usage float64 = 0.75

		outcome := CheckResult{
			Details: map[string]interface{}{
				"path":      path,
				"usage":     usage,
				"threshold": threshold,
			},
		}

		if usage > threshold {
			outcome.Healthy = false
			outcome.Message = fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%",
				usage*100, threshold*100)
		} else {
			outcome.Healthy = true
			outcome.Message = fmt.Sprintf("Disk usage %.1f%% is within threshold", usage*100)
		}

		outcome.Timestamp = time.Now()
		return outcome
	}

	return &HealthCheck{
		Name:        fmt.Sprintf("disk-space-%s", path),
		Description: fmt.Sprintf("Disk space check for %s (threshold: %.1f%%)", path, threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     5 * time.Second,
		Checker:     probe,
	}
}
// CreateMemoryCheck creates a health check for memory usage. threshold is a
// fraction (0.8 means 80%); the check is unhealthy when usage exceeds it. The
// check is enabled, non-critical, runs every 30 seconds and has a 5 second
// timeout.
//
// NOTE: no real memory measurement is made yet — usage is a fixed simulated
// value of 60%.
func CreateMemoryCheck(threshold float64) *HealthCheck {
	probe := func(ctx context.Context) CheckResult {
		// Placeholder reading until actual memory usage is wired in.
		var usage float64 = 0.60

		outcome := CheckResult{
			Details: map[string]interface{}{
				"usage":     usage,
				"threshold": threshold,
			},
		}

		if usage > threshold {
			outcome.Healthy = false
			outcome.Message = fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%",
				usage*100, threshold*100)
		} else {
			outcome.Healthy = true
			outcome.Message = fmt.Sprintf("Memory usage %.1f%% is within threshold", usage*100)
		}

		outcome.Timestamp = time.Now()
		return outcome
	}

	return &HealthCheck{
		Name:        "memory-usage",
		Description: fmt.Sprintf("Memory usage check (threshold: %.1f%%)", threshold*100),
		Enabled:     true,
		Critical:    false,
		Interval:    30 * time.Second,
		Timeout:     5 * time.Second,
		Checker:     probe,
	}
}
// CreateActivePubSubCheck creates an active health check for the PubSub
// system. Each run performs a loopback test: subscribe to a dedicated test
// topic, publish a uniquely keyed message on it, and wait for that message to
// come back.
//
// The check is enabled, non-critical, runs every 60 seconds and has a
// 15 second timeout. A run fails when subscribing or publishing returns an
// error, when the message is not received within 10 seconds, or when the
// context ends first.
//
// NOTE(review): a new handler is subscribed on every run and PubSubInterface
// offers no way to unsubscribe — confirm the implementation does not
// accumulate handlers for the test topic.
func CreateActivePubSubCheck(pubsub PubSubInterface) *HealthCheck {
	const probeTopic = "bzzz/health-test/v1"

	probe := func(ctx context.Context) CheckResult {
		began := time.Now()

		// unhealthy builds a failed result stamped with the elapsed time.
		unhealthy := func(msg string, err error, details map[string]interface{}) CheckResult {
			return CheckResult{
				Healthy:   false,
				Message:   msg,
				Details:   details,
				Error:     err,
				Timestamp: time.Now(),
				Latency:   time.Since(began),
			}
		}

		// A key unique to this run lets the handler ignore foreign traffic.
		probeKey := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
		outgoing := map[string]interface{}{
			"test_key":  probeKey,
			"timestamp": time.Now().Unix(),
			"probe_id":  "pubsub-health-check",
		}

		// Both channels are buffered so neither the handler nor the
		// publisher goroutine can block after this function has returned.
		echoed := make(chan bool, 1)
		publishFailed := make(chan error, 1)

		onMessage := func(data []byte) {
			var incoming map[string]interface{}
			if json.Unmarshal(data, &incoming) != nil {
				return
			}
			key, isString := incoming["test_key"].(string)
			if !isString || key != probeKey {
				return
			}
			select {
			case echoed <- true:
			default:
			}
		}

		if err := pubsub.SubscribeToTopic(probeTopic, onMessage); err != nil {
			return unhealthy(fmt.Sprintf("Failed to subscribe to test topic: %v", err), err, nil)
		}

		// Allow subscription to settle
		time.Sleep(500 * time.Millisecond)

		go func() {
			if err := pubsub.PublishToTopic(probeTopic, outgoing); err != nil {
				publishFailed <- err
			}
		}()

		select {
		case <-echoed:
			roundTrip := time.Since(began)
			return CheckResult{
				Healthy: true,
				Message: "PubSub loopback test successful",
				Details: map[string]interface{}{
					"test_topic": probeTopic,
					"test_key":   probeKey,
					"latency_ms": roundTrip.Milliseconds(),
				},
				Timestamp: time.Now(),
				Latency:   roundTrip,
			}
		case err := <-publishFailed:
			return unhealthy(fmt.Sprintf("Failed to publish test message: %v", err), err, nil)
		case <-time.After(10 * time.Second):
			return unhealthy("PubSub loopback test timeout - message not received", nil,
				map[string]interface{}{
					"test_topic": probeTopic,
					"test_key":   probeKey,
					"timeout":    "10s",
				})
		case <-ctx.Done():
			return unhealthy("PubSub health check cancelled", nil,
				map[string]interface{}{
					"test_topic": probeTopic,
					"reason":     "context_cancelled",
				})
		}
	}

	return &HealthCheck{
		Name:        "pubsub-active-probe",
		Description: "Active PubSub system health probe with loopback test",
		Enabled:     true,
		Critical:    false,
		Interval:    60 * time.Second,
		Timeout:     15 * time.Second,
		Checker:     probe,
	}
}
// CreateActiveDHTCheck creates an active health check for the DHT system.
// Each run stores a uniquely keyed value, waits 100ms for propagation, reads
// the key back and verifies that the bytes match.
//
// The check is enabled, non-critical, runs every 90 seconds and has a
// 20 second timeout. Put, get and total latencies are reported in the result
// details in milliseconds. If dht also implements GetStats() interface{},
// its output is attached under "stats" on success.
func CreateActiveDHTCheck(dht DHTInterface) *HealthCheck {
	probe := func(ctx context.Context) CheckResult {
		began := time.Now()

		// unhealthy builds a failed result stamped with the elapsed time.
		unhealthy := func(msg string, err error, details map[string]interface{}) CheckResult {
			return CheckResult{
				Healthy:   false,
				Message:   msg,
				Details:   details,
				Error:     err,
				Timestamp: time.Now(),
				Latency:   time.Since(began),
			}
		}

		// Unique key and payload for this run.
		key := fmt.Sprintf("health-check-%d", time.Now().UnixNano())
		want := []byte(fmt.Sprintf(`{"test_key":"%s","timestamp":%d,"probe_id":"dht-health-check"}`,
			key, time.Now().Unix()))

		// Put
		putBegan := time.Now()
		if err := dht.PutValue(ctx, key, want); err != nil {
			return unhealthy(fmt.Sprintf("DHT put operation failed: %v", err), err,
				map[string]interface{}{
					"test_key":    key,
					"operation":   "put",
					"put_latency": time.Since(putBegan).Milliseconds(),
				})
		}
		putTook := time.Since(putBegan)

		// Allow some time for propagation
		time.Sleep(100 * time.Millisecond)

		// Get
		getBegan := time.Now()
		got, err := dht.GetValue(ctx, key)
		if err != nil {
			return unhealthy(fmt.Sprintf("DHT get operation failed: %v", err), err,
				map[string]interface{}{
					"test_key":    key,
					"operation":   "get",
					"put_latency": putTook.Milliseconds(),
					"get_latency": time.Since(getBegan).Milliseconds(),
				})
		}
		getTook := time.Since(getBegan)

		// Integrity: what came back must be exactly what was stored.
		if string(got) != string(want) {
			return unhealthy("DHT data integrity check failed - retrieved value doesn't match", nil,
				map[string]interface{}{
					"test_key":      key,
					"expected_len":  len(want),
					"retrieved_len": len(got),
					"put_latency":   putTook.Milliseconds(),
					"get_latency":   getTook.Milliseconds(),
					"total_latency": time.Since(began).Milliseconds(),
				})
		}

		total := time.Since(began)

		// Optional statistics; stays nil when the DHT does not provide them.
		var stats interface{}
		if provider, ok := dht.(interface{ GetStats() interface{} }); ok {
			stats = provider.GetStats()
		}

		return CheckResult{
			Healthy: true,
			Message: "DHT put/get test successful",
			Details: map[string]interface{}{
				"test_key":       key,
				"put_latency":    putTook.Milliseconds(),
				"get_latency":    getTook.Milliseconds(),
				"total_latency":  total.Milliseconds(),
				"data_integrity": "verified",
				"stats":          stats,
			},
			Timestamp: time.Now(),
			Latency:   total,
		}
	}

	return &HealthCheck{
		Name:        "dht-active-probe",
		Description: "Active DHT system health probe with put/get test",
		Enabled:     true,
		Critical:    false,
		Interval:    90 * time.Second,
		Timeout:     20 * time.Second,
		Checker:     probe,
	}
}
// defaultLogger is a simple logger implementation that NewManager falls back
// to when it is given a nil Logger. Every message is written to standard
// output as "[LEVEL] <formatted message>" followed by a newline.
type defaultLogger struct{}

// printLine formats msg with args printf-style and prints it behind the
// bracketed level tag.
func (l *defaultLogger) printLine(level, msg string, args []interface{}) {
	fmt.Printf("["+level+"] "+msg+"\n", args...)
}

// Info prints an informational message.
func (l *defaultLogger) Info(msg string, args ...interface{}) {
	l.printLine("INFO", msg, args)
}

// Warn prints a warning message.
func (l *defaultLogger) Warn(msg string, args ...interface{}) {
	l.printLine("WARN", msg, args)
}

// Error prints an error message.
func (l *defaultLogger) Error(msg string, args ...interface{}) {
	l.printLine("ERROR", msg, args)
}