Implement wave-based scaling system for CHORUS Docker Swarm orchestration

- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:51:34 +10:00
parent 55dd5951ea
commit 564852dc91
9 changed files with 3381 additions and 87 deletions
--- a/internal/orchestrator/health_gates.go
+++ b/internal/orchestrator/health_gates.go
@@ -0,0 +1,408 @@
+package orchestrator
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/rs/zerolog/log"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/trace"
+
+	"github.com/chorus-services/whoosh/internal/tracing"
+)
+
+// HealthGates manages health checks that gate scaling operations.
+//
+// It holds the base URLs of the three services it probes, the HTTP client
+// used for those probes, and the thresholds the probe results are compared
+// against.
+type HealthGates struct {
+	// kachingURL is the KACHING base URL; checkKachingHealth appends "/health/burst".
+	kachingURL    string
+	// backbeatURL is the BACKBEAT base URL; checkBackbeatHealth appends "/metrics".
+	backbeatURL   string
+	// chorusURL is the CHORUS base URL; checkBootstrapHealth appends "/peers".
+	chorusURL     string
+	// httpClient is shared by every gate probe.
+	httpClient    *http.Client
+	// thresholds holds the limits applied to probe results; see SetThresholds.
+	thresholds    HealthThresholds
+}
+
+// HealthThresholds defines the health criteria for allowing scaling.
+// Each field is the limit one gate compares a reported metric against.
+type HealthThresholds struct {
+	KachingMaxLatencyMS     int `json:"kaching_max_latency_ms"`     // Gate fails when KACHING's reported p95 latency (ms) exceeds this
+	KachingMinRateRemaining int `json:"kaching_min_rate_remaining"` // Gate fails when KACHING's remaining rate limit is below this
+	BackbeatMaxLagSeconds   int `json:"backbeat_max_lag_seconds"`   // Gate fails when BACKBEAT's reported max lag (seconds) exceeds this
+	BootstrapMinHealthyPeers int `json:"bootstrap_min_healthy_peers"` // Gate fails when fewer healthy bootstrap peers than this are reported
+	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"` // Gate fails when the join success rate (0.0-1.0) is below this
+}
+
+// HealthStatus represents the current health status across all gates.
+// It is the aggregate result produced by CheckHealth.
+type HealthStatus struct {
+	// Healthy is true only when every evaluated gate is healthy.
+	Healthy         bool                   `json:"healthy"`
+	// Timestamp is taken when CheckHealth starts building the status.
+	Timestamp       time.Time             `json:"timestamp"`
+	// Gates maps a gate name ("kaching", "backbeat", "bootstrap",
+	// "scaling_metrics") to that gate's individual result.
+	Gates           map[string]GateStatus `json:"gates"`
+	// OverallReason summarizes the failing gates; it is left empty when healthy.
+	OverallReason   string                `json:"overall_reason,omitempty"`
+}
+
+// GateStatus represents the status of an individual health gate.
+type GateStatus struct {
+	// Name identifies the gate; it matches the key used in HealthStatus.Gates.
+	Name        string                 `json:"name"`
+	// Healthy reports whether the gate passed.
+	Healthy     bool                   `json:"healthy"`
+	// Reason explains a failure; it is omitted from JSON when empty.
+	Reason      string                 `json:"reason,omitempty"`
+	// Metrics carries the raw values the gate reported, keyed by metric name.
+	Metrics     map[string]interface{} `json:"metrics,omitempty"`
+	// LastChecked is the time the gate result was produced.
+	LastChecked time.Time             `json:"last_checked"`
+}
+
+// KachingHealth represents KACHING health metrics.
+// It is the JSON shape checkKachingHealth decodes from the KACHING
+// "/health/burst" response.
+type KachingHealth struct {
+	// Healthy is the service's own verdict; a false value fails the gate.
+	Healthy              bool    `json:"healthy"`
+	// LatencyP95MS is compared against HealthThresholds.KachingMaxLatencyMS.
+	LatencyP95MS         float64 `json:"latency_p95_ms"`
+	// QueueDepth is reported in the gate metrics but not evaluated.
+	QueueDepth          int     `json:"queue_depth"`
+	// RateLimitRemaining is compared against HealthThresholds.KachingMinRateRemaining.
+	RateLimitRemaining  int     `json:"rate_limit_remaining"`
+	// ActiveLeases is reported in the gate metrics but not evaluated.
+	ActiveLeases        int     `json:"active_leases"`
+	// ClusterCapacity is reported in the gate metrics but not evaluated.
+	ClusterCapacity     int     `json:"cluster_capacity"`
+}
+
+// BackbeatHealth represents BACKBEAT health metrics.
+// It is the JSON shape checkBackbeatHealth decodes from the BACKBEAT
+// "/metrics" response.
+type BackbeatHealth struct {
+	// Healthy is the service's own verdict; a false value fails the gate.
+	Healthy          bool              `json:"healthy"`
+	// SubjectLags is reported in the gate metrics but not evaluated.
+	// NOTE(review): presumably keyed by subject name with lag values — confirm units.
+	SubjectLags      map[string]int    `json:"subject_lags"`
+	// MaxLagSeconds is compared against HealthThresholds.BackbeatMaxLagSeconds.
+	MaxLagSeconds    int               `json:"max_lag_seconds"`
+	// ConsumerHealth is reported in the gate metrics but not evaluated.
+	ConsumerHealth   map[string]bool   `json:"consumer_health"`
+}
+
+// BootstrapHealth represents bootstrap peer pool health.
+// It is the JSON shape checkBootstrapHealth decodes from the CHORUS
+// "/peers" response.
+type BootstrapHealth struct {
+	// Healthy is decoded but not consulted by checkBootstrapHealth.
+	Healthy       bool `json:"healthy"`
+	// TotalPeers is reported in the gate metrics but not evaluated.
+	TotalPeers    int  `json:"total_peers"`
+	// HealthyPeers is compared against HealthThresholds.BootstrapMinHealthyPeers.
+	HealthyPeers  int  `json:"healthy_peers"`
+	// ReachablePeers is reported in the gate metrics but not evaluated.
+	ReachablePeers int `json:"reachable_peers"`
+}
+
+// ScalingMetrics represents recent scaling operation metrics.
+// Callers may pass it to CheckHealth to enable the "scaling_metrics" gate;
+// only JoinSuccessRate is evaluated there, the other counters are echoed in
+// the gate's metrics.
+type ScalingMetrics struct {
+	// LastWaveSize is reported in the gate metrics but not evaluated.
+	LastWaveSize        int     `json:"last_wave_size"`
+	// LastWaveStarted is not read by the health gates in this file.
+	LastWaveStarted     time.Time `json:"last_wave_started"`
+	// LastWaveCompleted is not read by the health gates in this file.
+	LastWaveCompleted   time.Time `json:"last_wave_completed"`
+	// JoinSuccessRate is a fraction (1.0 == 100%) compared against
+	// HealthThresholds.JoinSuccessRateThreshold.
+	JoinSuccessRate     float64 `json:"join_success_rate"`
+	// SuccessfulJoins is reported in the gate metrics but not evaluated.
+	SuccessfulJoins     int     `json:"successful_joins"`
+	// FailedJoins is reported in the gate metrics but not evaluated.
+	FailedJoins         int     `json:"failed_joins"`
+}
+
+// NewHealthGates builds a health gates manager for the given KACHING,
+// BACKBEAT and CHORUS base URLs.
+//
+// The manager starts with an HTTP client that times out after 10 seconds and
+// with the default thresholds listed below; SetThresholds replaces them.
+func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
+	defaults := HealthThresholds{
+		KachingMaxLatencyMS:      500, // 500ms max latency
+		KachingMinRateRemaining:  20,  // at least 20 requests remaining
+		BackbeatMaxLagSeconds:    30,  // max 30 seconds lag
+		BootstrapMinHealthyPeers: 3,   // at least 3 healthy bootstrap peers
+		JoinSuccessRateThreshold: 0.8, // 80% join success rate
+	}
+
+	gates := new(HealthGates)
+	gates.kachingURL = kachingURL
+	gates.backbeatURL = backbeatURL
+	gates.chorusURL = chorusURL
+	gates.httpClient = &http.Client{Timeout: 10 * time.Second}
+	gates.thresholds = defaults
+	return gates
+}
+
+// SetThresholds replaces the complete set of health thresholds with the
+// supplied value. The struct is stored as given; no field is validated or
+// defaulted here.
+// NOTE(review): the field is assigned without any locking — confirm callers
+// never invoke this concurrently with CheckHealth.
+func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
+	hg.thresholds = thresholds
+}
+
+// CheckHealth evaluates every health gate and returns the combined status.
+//
+// The KACHING, BACKBEAT and bootstrap gates are always probed, in that order.
+// The scaling-metrics gate is evaluated only when recentMetrics is non-nil.
+// A probe that errors does not abort the check: it is logged, recorded on the
+// tracing span tagged with the gate name, and stored as an unhealthy gate.
+// As a result the returned status is never nil and the returned error is
+// always nil; callers decide via HealthStatus.Healthy.
+func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
+	ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
+	defer span.End()
+
+	status := &HealthStatus{
+		Timestamp: time.Now(),
+		Gates:     make(map[string]GateStatus),
+		Healthy:   true,
+	}
+
+	var failReasons []string
+
+	// record stores the outcome of one gate probe under name. On a probe
+	// error it synthesizes an unhealthy GateStatus and uses the short
+	// unreachable text as the fail reason; otherwise it keeps the gate's own
+	// status and reason.
+	record := func(name, logMsg, unreachable string, gate *GateStatus, err error) {
+		if err != nil {
+			log.Warn().Err(err).Msg(logMsg)
+			// Attach the error to the span so traces show which probe failed.
+			span.RecordError(err, trace.WithAttributes(attribute.String("health.gate", name)))
+			status.Gates[name] = GateStatus{
+				Name:        name,
+				Healthy:     false,
+				Reason:      fmt.Sprintf("Health check failed: %v", err),
+				LastChecked: time.Now(),
+			}
+			status.Healthy = false
+			failReasons = append(failReasons, unreachable)
+			return
+		}
+
+		status.Gates[name] = *gate
+		if !gate.Healthy {
+			status.Healthy = false
+			failReasons = append(failReasons, gate.Reason)
+		}
+	}
+
+	// Check KACHING health
+	kachingStatus, err := hg.checkKachingHealth(ctx)
+	record("kaching", "Failed to check KACHING health", "KACHING unreachable", kachingStatus, err)
+
+	// Check BACKBEAT health
+	backbeatStatus, err := hg.checkBackbeatHealth(ctx)
+	record("backbeat", "Failed to check BACKBEAT health", "BACKBEAT unreachable", backbeatStatus, err)
+
+	// Check bootstrap peer health
+	bootstrapStatus, err := hg.checkBootstrapHealth(ctx)
+	record("bootstrap", "Failed to check bootstrap health", "Bootstrap peers unreachable", bootstrapStatus, err)
+
+	// Check recent scaling metrics if provided
+	if recentMetrics != nil {
+		metricsStatus := hg.checkScalingMetrics(recentMetrics)
+		status.Gates["scaling_metrics"] = *metricsStatus
+		if !metricsStatus.Healthy {
+			status.Healthy = false
+			failReasons = append(failReasons, metricsStatus.Reason)
+		}
+	}
+
+	// Set overall reason if unhealthy
+	if !status.Healthy && len(failReasons) > 0 {
+		status.OverallReason = fmt.Sprintf("Health gates failed: %v", failReasons)
+	}
+
+	// Add tracing attributes
+	span.SetAttributes(
+		attribute.Bool("health.overall_healthy", status.Healthy),
+		attribute.Int("health.gate_count", len(status.Gates)),
+	)
+
+	if !status.Healthy {
+		span.SetAttributes(attribute.String("health.fail_reason", status.OverallReason))
+	}
+
+	return status, nil
+}
+
+// checkKachingHealth probes KACHING's "/health/burst" endpoint and evaluates
+// the response against the latency and rate-limit thresholds.
+//
+// It returns an error when the request cannot be built or sent, when the
+// endpoint answers with anything other than 200 OK, or when the body cannot
+// be decoded into KachingHealth. Otherwise it returns a GateStatus carrying
+// the reported metrics. The criteria are tried in order — latency, rate
+// limit, then the service's own healthy flag — and the first one that fails
+// supplies the Reason.
+func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
+	endpoint := hg.kachingURL + "/health/burst"
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create KACHING health request: %w", err)
+	}
+
+	resp, err := hg.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("KACHING health request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; code != http.StatusOK {
+		return nil, fmt.Errorf("KACHING health check returned status %d", code)
+	}
+
+	var report KachingHealth
+	if err := json.NewDecoder(resp.Body).Decode(&report); err != nil {
+		return nil, fmt.Errorf("failed to decode KACHING health response: %w", err)
+	}
+
+	// Healthy starts out false and is only flipped when every criterion passes.
+	gate := &GateStatus{
+		Name:        "kaching",
+		LastChecked: time.Now(),
+		Metrics: map[string]interface{}{
+			"latency_p95_ms":       report.LatencyP95MS,
+			"queue_depth":          report.QueueDepth,
+			"rate_limit_remaining": report.RateLimitRemaining,
+			"active_leases":        report.ActiveLeases,
+			"cluster_capacity":     report.ClusterCapacity,
+		},
+	}
+
+	maxLatency := hg.thresholds.KachingMaxLatencyMS
+	minRemaining := hg.thresholds.KachingMinRateRemaining
+
+	switch {
+	case report.LatencyP95MS > float64(maxLatency):
+		gate.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
+			report.LatencyP95MS, maxLatency)
+	case report.RateLimitRemaining < minRemaining:
+		gate.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
+			report.RateLimitRemaining, minRemaining)
+	case !report.Healthy:
+		gate.Reason = "KACHING reports unhealthy status"
+	default:
+		gate.Healthy = true
+	}
+
+	return gate, nil
+}
+
+// checkBackbeatHealth probes BACKBEAT's "/metrics" endpoint and evaluates the
+// reported maximum lag against the configured threshold.
+//
+// It returns an error when the request cannot be built or sent, when the
+// endpoint answers with anything other than 200 OK, or when the body cannot
+// be decoded into BackbeatHealth. Otherwise it returns a GateStatus carrying
+// the reported metrics. The lag threshold is checked first, then the
+// service's own healthy flag; the first failing criterion supplies the Reason.
+func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
+	endpoint := hg.backbeatURL + "/metrics"
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create BACKBEAT health request: %w", err)
+	}
+
+	resp, err := hg.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("BACKBEAT health request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; code != http.StatusOK {
+		return nil, fmt.Errorf("BACKBEAT health check returned status %d", code)
+	}
+
+	var report BackbeatHealth
+	if err := json.NewDecoder(resp.Body).Decode(&report); err != nil {
+		return nil, fmt.Errorf("failed to decode BACKBEAT health response: %w", err)
+	}
+
+	// Healthy starts out false and is only flipped when every criterion passes.
+	gate := &GateStatus{
+		Name:        "backbeat",
+		LastChecked: time.Now(),
+		Metrics: map[string]interface{}{
+			"subject_lags":    report.SubjectLags,
+			"max_lag_seconds": report.MaxLagSeconds,
+			"consumer_health": report.ConsumerHealth,
+		},
+	}
+
+	maxLag := hg.thresholds.BackbeatMaxLagSeconds
+
+	switch {
+	case report.MaxLagSeconds > maxLag:
+		gate.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
+			report.MaxLagSeconds, maxLag)
+	case !report.Healthy:
+		gate.Reason = "BACKBEAT reports unhealthy status"
+	default:
+		gate.Healthy = true
+	}
+
+	return gate, nil
+}
+
+// checkBootstrapHealth probes the CHORUS "/peers" endpoint and verifies that
+// enough healthy bootstrap peers are reported.
+//
+// It returns an error when the request cannot be built or sent, when the
+// endpoint answers with anything other than 200 OK, or when the body cannot
+// be decoded into BootstrapHealth. Otherwise it returns a GateStatus carrying
+// the reported peer counts; the gate passes when HealthyPeers is at least
+// BootstrapMinHealthyPeers.
+// NOTE(review): unlike the KACHING and BACKBEAT gates, the decoded Healthy
+// flag is not consulted here — confirm that is intentional.
+func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
+	endpoint := hg.chorusURL + "/peers"
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create bootstrap health request: %w", err)
+	}
+
+	resp, err := hg.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("bootstrap health request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; code != http.StatusOK {
+		return nil, fmt.Errorf("bootstrap health check returned status %d", code)
+	}
+
+	var pool BootstrapHealth
+	if err := json.NewDecoder(resp.Body).Decode(&pool); err != nil {
+		return nil, fmt.Errorf("failed to decode bootstrap health response: %w", err)
+	}
+
+	gate := &GateStatus{
+		Name:        "bootstrap",
+		LastChecked: time.Now(),
+		Metrics: map[string]interface{}{
+			"total_peers":     pool.TotalPeers,
+			"healthy_peers":   pool.HealthyPeers,
+			"reachable_peers": pool.ReachablePeers,
+		},
+	}
+
+	required := hg.thresholds.BootstrapMinHealthyPeers
+	if pool.HealthyPeers >= required {
+		gate.Healthy = true
+		return gate, nil
+	}
+
+	gate.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
+		pool.HealthyPeers, required)
+	return gate, nil
+}
+
+// checkScalingMetrics evaluates the join success rate of recent scaling.
+//
+// The gate fails when metrics.JoinSuccessRate is below the configured
+// JoinSuccessRateThreshold. When the metrics carry no data at all — no wave
+// size, no successful or failed joins, and a zero rate — there is nothing to
+// judge, so the gate passes instead of reading the zero-valued rate as a 0%
+// success rate and blocking scaling before any wave has run. A nil metrics
+// pointer is treated the same way and yields a healthy status without
+// metrics.
+func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
+	status := &GateStatus{
+		Name:        "scaling_metrics",
+		LastChecked: time.Now(),
+	}
+
+	// Nothing supplied: nothing to evaluate.
+	if metrics == nil {
+		status.Healthy = true
+		return status
+	}
+
+	status.Metrics = map[string]interface{}{
+		"join_success_rate": metrics.JoinSuccessRate,
+		"successful_joins":  metrics.SuccessfulJoins,
+		"failed_joins":      metrics.FailedJoins,
+		"last_wave_size":    metrics.LastWaveSize,
+	}
+
+	// A rate of exactly zero with no wave and no join outcomes recorded means
+	// "no data yet", not "every join failed".
+	noData := metrics.LastWaveSize == 0 &&
+		metrics.SuccessfulJoins == 0 &&
+		metrics.FailedJoins == 0 &&
+		metrics.JoinSuccessRate == 0
+	if noData {
+		status.Healthy = true
+		return status
+	}
+
+	// Check join success rate threshold
+	if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
+		status.Healthy = false
+		status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
+			metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
+		return status
+	}
+
+	status.Healthy = true
+	return status
+}
+
+// GetThresholds returns a copy of the thresholds currently in effect.
+// Because HealthThresholds is returned by value, changing the result does not
+// affect the manager; use SetThresholds for that.
+func (hg *HealthGates) GetThresholds() HealthThresholds {
+	current := hg.thresholds
+	return current
+}
+
+// IsHealthy runs CheckHealth and collapses its result to a single boolean.
+// It reports true only when CheckHealth returns no error and the overall
+// status is healthy; recentMetrics is passed through unchanged.
+func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
+	result, err := hg.CheckHealth(ctx, recentMetrics)
+	return err == nil && result.Healthy
+}