Implement wave-based scaling system for CHORUS Docker Swarm orchestration

- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Claude Code
2025-09-22 13:51:34 +10:00
parent 55dd5951ea
commit 564852dc91
9 changed files with 3381 additions and 87 deletions

View File

@@ -0,0 +1,408 @@
package orchestrator
import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"

	"github.com/chorus-services/whoosh/internal/tracing"
)
// HealthGates bundles the endpoints, HTTP client and thresholds used to decide
// whether a scaling operation may proceed.
type HealthGates struct {
	// Base URLs of the services probed by the individual gates.
	kachingURL  string
	backbeatURL string
	chorusURL   string

	// httpClient issues every gate request.
	httpClient *http.Client

	// thresholds holds the limits that gate responses are compared against.
	thresholds HealthThresholds
}
// HealthThresholds lists the limits the health gates enforce before scaling
// is allowed.
type HealthThresholds struct {
	// KachingMaxLatencyMS is the maximum acceptable KACHING latency, in
	// milliseconds.
	KachingMaxLatencyMS int `json:"kaching_max_latency_ms"`

	// KachingMinRateRemaining is the minimum KACHING rate limit that must
	// remain.
	KachingMinRateRemaining int `json:"kaching_min_rate_remaining"`

	// BackbeatMaxLagSeconds is the maximum BACKBEAT subject lag, in seconds.
	BackbeatMaxLagSeconds int `json:"backbeat_max_lag_seconds"`

	// BootstrapMinHealthyPeers is the minimum number of healthy bootstrap
	// peers.
	BootstrapMinHealthyPeers int `json:"bootstrap_min_healthy_peers"`

	// JoinSuccessRateThreshold is the minimum join success rate, expressed as
	// a fraction between 0.0 and 1.0.
	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"`
}
// HealthStatus is the combined result of one pass over the health gates.
type HealthStatus struct {
	// Healthy is the overall verdict for the pass.
	Healthy bool `json:"healthy"`

	// Timestamp records when the pass was started.
	Timestamp time.Time `json:"timestamp"`

	// Gates maps a gate name to the status recorded for that gate.
	Gates map[string]GateStatus `json:"gates"`

	// OverallReason summarises the failures; it is omitted from JSON when
	// empty.
	OverallReason string `json:"overall_reason,omitempty"`
}
// GateStatus is the outcome of a single health gate.
type GateStatus struct {
	// Name identifies the gate.
	Name string `json:"name"`

	// Healthy reports whether the gate passed.
	Healthy bool `json:"healthy"`

	// Reason explains the verdict; it is omitted from JSON when empty.
	Reason string `json:"reason,omitempty"`

	// Metrics carries the raw figures the verdict was based on; it is omitted
	// from JSON when empty.
	Metrics map[string]interface{} `json:"metrics,omitempty"`

	// LastChecked is the time at which the gate was evaluated.
	LastChecked time.Time `json:"last_checked"`
}
// KachingHealth is the JSON payload decoded from the KACHING health endpoint.
type KachingHealth struct {
	// Healthy is KACHING's own health verdict.
	Healthy bool `json:"healthy"`

	// LatencyP95MS is the reported latency_p95_ms figure.
	LatencyP95MS float64 `json:"latency_p95_ms"`

	// QueueDepth is the reported queue depth.
	QueueDepth int `json:"queue_depth"`

	// RateLimitRemaining is the reported remaining rate limit.
	RateLimitRemaining int `json:"rate_limit_remaining"`

	// ActiveLeases is the reported number of active leases.
	ActiveLeases int `json:"active_leases"`

	// ClusterCapacity is the reported cluster capacity.
	ClusterCapacity int `json:"cluster_capacity"`
}
// BackbeatHealth is the JSON payload decoded from the BACKBEAT metrics
// endpoint.
type BackbeatHealth struct {
	// Healthy is BACKBEAT's own health verdict.
	Healthy bool `json:"healthy"`

	// SubjectLags maps a subject name to its reported lag.
	SubjectLags map[string]int `json:"subject_lags"`

	// MaxLagSeconds is the reported maximum lag, in seconds.
	MaxLagSeconds int `json:"max_lag_seconds"`

	// ConsumerHealth maps a consumer name to its reported health flag.
	ConsumerHealth map[string]bool `json:"consumer_health"`
}
// BootstrapHealth is the JSON payload describing the bootstrap peer pool.
type BootstrapHealth struct {
	// Healthy is the pool's reported health verdict.
	Healthy bool `json:"healthy"`

	// TotalPeers is the reported number of peers in the pool.
	TotalPeers int `json:"total_peers"`

	// HealthyPeers is the reported number of healthy peers.
	HealthyPeers int `json:"healthy_peers"`

	// ReachablePeers is the reported number of reachable peers.
	ReachablePeers int `json:"reachable_peers"`
}
// ScalingMetrics summarises the most recent scaling wave and its join
// results.
type ScalingMetrics struct {
	// LastWaveSize is the size of the last wave.
	LastWaveSize int `json:"last_wave_size"`

	// LastWaveStarted is when the last wave started.
	LastWaveStarted time.Time `json:"last_wave_started"`

	// LastWaveCompleted is when the last wave completed.
	LastWaveCompleted time.Time `json:"last_wave_completed"`

	// JoinSuccessRate is the join success rate as a fraction; it is compared
	// against HealthThresholds.JoinSuccessRateThreshold.
	JoinSuccessRate float64 `json:"join_success_rate"`

	// SuccessfulJoins counts the joins that succeeded.
	SuccessfulJoins int `json:"successful_joins"`

	// FailedJoins counts the joins that failed.
	FailedJoins int `json:"failed_joins"`
}
// NewHealthGates returns a HealthGates that probes the given KACHING, BACKBEAT
// and CHORUS base URLs. The returned value uses an HTTP client with a
// 10-second timeout and the default thresholds listed below; use
// SetThresholds to replace them.
func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
	defaults := HealthThresholds{
		KachingMaxLatencyMS:      500, // milliseconds
		KachingMinRateRemaining:  20,  // requests that must remain
		BackbeatMaxLagSeconds:    30,  // seconds
		BootstrapMinHealthyPeers: 3,   // peers
		JoinSuccessRateThreshold: 0.8, // i.e. 80%
	}
	client := &http.Client{Timeout: 10 * time.Second}

	return &HealthGates{
		kachingURL:  kachingURL,
		backbeatURL: backbeatURL,
		chorusURL:   chorusURL,
		httpClient:  client,
		thresholds:  defaults,
	}
}
// SetThresholds replaces every health threshold with the supplied values.
//
// The assignment is not guarded by a lock.
// NOTE(review): confirm callers do not invoke this concurrently with
// CheckHealth.
func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
	hg.thresholds = thresholds
}
// CheckHealth evaluates every health gate and reports the combined result.
//
// The KACHING, BACKBEAT and bootstrap gates always run, in that order. The
// scaling-metrics gate runs only when recentMetrics is non-nil. A gate whose
// check returns an error is logged and recorded as unhealthy instead of
// aborting the pass, so the returned error is currently always nil; callers
// should inspect HealthStatus.Healthy.
//
// When at least one gate fails, OverallReason lists the per-gate reasons
// separated by "; ". The overall verdict and the gate count are attached to
// the tracing span.
func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
	ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
	defer span.End()

	status := &HealthStatus{
		Timestamp: time.Now(),
		Gates:     make(map[string]GateStatus),
		Healthy:   true,
	}
	var failReasons []string

	// The remote gates share identical bookkeeping, so they are driven from
	// one table rather than three copies of the same block.
	remoteGates := []struct {
		name       string // key in status.Gates
		logMsg     string // warning logged when the check itself errors
		failReason string // summary reason used when the check itself errors
		check      func(context.Context) (*GateStatus, error)
	}{
		{"kaching", "Failed to check KACHING health", "KACHING unreachable", hg.checkKachingHealth},
		{"backbeat", "Failed to check BACKBEAT health", "BACKBEAT unreachable", hg.checkBackbeatHealth},
		{"bootstrap", "Failed to check bootstrap health", "Bootstrap peers unreachable", hg.checkBootstrapHealth},
	}

	for _, gate := range remoteGates {
		result, err := gate.check(ctx)
		if err != nil {
			// The check could not produce a verdict; record that as a failed
			// gate so the caller still sees an entry for it.
			log.Warn().Err(err).Msg(gate.logMsg)
			status.Gates[gate.name] = GateStatus{
				Name:        gate.name,
				Healthy:     false,
				Reason:      fmt.Sprintf("Health check failed: %v", err),
				LastChecked: time.Now(),
			}
			status.Healthy = false
			failReasons = append(failReasons, gate.failReason)
			continue
		}

		status.Gates[gate.name] = *result
		if !result.Healthy {
			status.Healthy = false
			failReasons = append(failReasons, result.Reason)
		}
	}

	// The scaling-metrics gate is optional and needs no network call.
	if recentMetrics != nil {
		metricsStatus := hg.checkScalingMetrics(recentMetrics)
		status.Gates["scaling_metrics"] = *metricsStatus
		if !metricsStatus.Healthy {
			status.Healthy = false
			failReasons = append(failReasons, metricsStatus.Reason)
		}
	}

	if !status.Healthy && len(failReasons) > 0 {
		// Individual reasons contain spaces, so join them with an explicit
		// delimiter instead of relying on %v's "[a b c]" slice formatting.
		status.OverallReason = "Health gates failed: " + strings.Join(failReasons, "; ")
	}

	span.SetAttributes(
		attribute.Bool("health.overall_healthy", status.Healthy),
		attribute.Int("health.gate_count", len(status.Gates)),
	)
	if !status.Healthy {
		span.SetAttributes(attribute.String("health.fail_reason", status.OverallReason))
	}

	return status, nil
}
// checkKachingHealth queries the KACHING burst-health endpoint and grades the
// response against the configured latency and rate-limit thresholds.
//
// It issues GET <kachingURL>/health/burst and decodes the JSON body into a
// KachingHealth. An error is returned when the request cannot be built or
// sent, when the response status is not 200 OK, or when the body cannot be
// decoded. Otherwise the returned gate carries the decoded figures in Metrics
// and is unhealthy for the first of these that applies: latency_p95_ms above
// KachingMaxLatencyMS, remaining rate limit below KachingMinRateRemaining, or
// KACHING itself reporting unhealthy.
func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.kachingURL + "/health/burst"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create KACHING health request: %w", err)
	}

	resp, err := hg.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("KACHING health request failed: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("KACHING health check returned status %d", code)
	}

	var report KachingHealth
	if decodeErr := json.NewDecoder(resp.Body).Decode(&report); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode KACHING health response: %w", decodeErr)
	}

	limits := hg.thresholds
	gate := &GateStatus{
		Name:        "kaching",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"latency_p95_ms":       report.LatencyP95MS,
			"queue_depth":          report.QueueDepth,
			"rate_limit_remaining": report.RateLimitRemaining,
			"active_leases":        report.ActiveLeases,
			"cluster_capacity":     report.ClusterCapacity,
		},
	}

	// Only the first failing criterion is reported; the gate stays unhealthy
	// (the zero value) unless every criterion passes.
	switch {
	case report.LatencyP95MS > float64(limits.KachingMaxLatencyMS):
		gate.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
			report.LatencyP95MS, limits.KachingMaxLatencyMS)
	case report.RateLimitRemaining < limits.KachingMinRateRemaining:
		gate.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
			report.RateLimitRemaining, limits.KachingMinRateRemaining)
	case !report.Healthy:
		gate.Reason = "KACHING reports unhealthy status"
	default:
		gate.Healthy = true
	}

	return gate, nil
}
// checkBackbeatHealth queries the BACKBEAT metrics endpoint and grades the
// response against the configured lag threshold.
//
// It issues GET <backbeatURL>/metrics and decodes the JSON body into a
// BackbeatHealth. An error is returned when the request cannot be built or
// sent, when the response status is not 200 OK, or when the body cannot be
// decoded. Otherwise the returned gate carries the decoded figures in Metrics
// and is unhealthy for the first of these that applies: max_lag_seconds above
// BackbeatMaxLagSeconds, or BACKBEAT itself reporting unhealthy.
func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.backbeatURL + "/metrics"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create BACKBEAT health request: %w", err)
	}

	resp, err := hg.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("BACKBEAT health request failed: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("BACKBEAT health check returned status %d", code)
	}

	var report BackbeatHealth
	if decodeErr := json.NewDecoder(resp.Body).Decode(&report); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode BACKBEAT health response: %w", decodeErr)
	}

	maxLag := hg.thresholds.BackbeatMaxLagSeconds
	gate := &GateStatus{
		Name:        "backbeat",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"subject_lags":    report.SubjectLags,
			"max_lag_seconds": report.MaxLagSeconds,
			"consumer_health": report.ConsumerHealth,
		},
	}

	// Only the first failing criterion is reported; the gate stays unhealthy
	// (the zero value) unless every criterion passes.
	switch {
	case report.MaxLagSeconds > maxLag:
		gate.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
			report.MaxLagSeconds, maxLag)
	case !report.Healthy:
		gate.Reason = "BACKBEAT reports unhealthy status"
	default:
		gate.Healthy = true
	}

	return gate, nil
}
// checkBootstrapHealth queries the CHORUS peers endpoint and checks that
// enough bootstrap peers are healthy.
//
// It issues GET <chorusURL>/peers and decodes the JSON body into a
// BootstrapHealth. An error is returned when the request cannot be built or
// sent, when the response status is not 200 OK, or when the body cannot be
// decoded. Otherwise the returned gate carries the peer counts in Metrics and
// is healthy exactly when healthy_peers is at least BootstrapMinHealthyPeers.
// The decoded Healthy flag is not consulted, and ReachablePeers is reported
// in Metrics only.
func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.chorusURL + "/peers"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create bootstrap health request: %w", err)
	}

	resp, err := hg.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("bootstrap health request failed: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("bootstrap health check returned status %d", code)
	}

	var pool BootstrapHealth
	if decodeErr := json.NewDecoder(resp.Body).Decode(&pool); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode bootstrap health response: %w", decodeErr)
	}

	required := hg.thresholds.BootstrapMinHealthyPeers
	gate := &GateStatus{
		Name:        "bootstrap",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"total_peers":     pool.TotalPeers,
			"healthy_peers":   pool.HealthyPeers,
			"reachable_peers": pool.ReachablePeers,
		},
	}

	if pool.HealthyPeers >= required {
		gate.Healthy = true
		return gate, nil
	}

	gate.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
		pool.HealthyPeers, required)
	return gate, nil
}
// checkScalingMetrics grades the recent join success rate against
// JoinSuccessRateThreshold and returns the resulting gate status.
//
// metrics must be non-nil. The join counters, success rate and last wave size
// are copied into the gate's Metrics.
//
// When no join attempt has been recorded (both counters are zero) and the
// success rate is still at its zero value, there is nothing to grade. The
// gate passes in that case, with a Reason saying so; otherwise a metrics
// value with no data would fail the gate (0 is below the threshold) and block
// scaling before any wave could produce data.
func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
	status := &GateStatus{
		Name:        "scaling_metrics",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"join_success_rate": metrics.JoinSuccessRate,
			"successful_joins":  metrics.SuccessfulJoins,
			"failed_joins":      metrics.FailedJoins,
			"last_wave_size":    metrics.LastWaveSize,
		},
	}

	// No attempts and an unset rate: treat as "no data" rather than as a 0%
	// success rate.
	attempts := metrics.SuccessfulJoins + metrics.FailedJoins
	if attempts == 0 && metrics.JoinSuccessRate == 0 {
		status.Healthy = true
		status.Reason = "No join attempts recorded; join success rate not evaluated"
		return status
	}

	// Check join success rate threshold
	if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
		status.Healthy = false
		status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
			metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
		return status
	}

	status.Healthy = true
	return status
}
// GetThresholds returns a copy of the thresholds currently in effect; changing
// the returned value does not affect the HealthGates.
func (hg *HealthGates) GetThresholds() HealthThresholds {
	return hg.thresholds
}
// IsHealthy runs CheckHealth and collapses its result to a single boolean. It
// reports true only when CheckHealth returns no error and the overall status
// is healthy.
func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
	result, err := hg.CheckHealth(ctx, recentMetrics)
	// Short-circuiting keeps result from being dereferenced after an error.
	return err == nil && result.Healthy
}