Implement wave-based scaling system for CHORUS Docker Swarm orchestration
- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
408
internal/orchestrator/health_gates.go
Normal file
408
internal/orchestrator/health_gates.go
Normal file
@@ -0,0 +1,408 @@
|
||||
package orchestrator
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"

	"github.com/chorus-services/whoosh/internal/tracing"
)
|
||||
|
||||
// HealthGates manages health checks that gate scaling operations
|
||||
type HealthGates struct {
|
||||
kachingURL string
|
||||
backbeatURL string
|
||||
chorusURL string
|
||||
httpClient *http.Client
|
||||
thresholds HealthThresholds
|
||||
}
|
||||
|
||||
// HealthThresholds defines the health criteria for allowing scaling.
type HealthThresholds struct {
	// KachingMaxLatencyMS is the maximum acceptable KACHING p95 latency, in
	// milliseconds.
	KachingMaxLatencyMS int `json:"kaching_max_latency_ms"`
	// KachingMinRateRemaining is the minimum KACHING rate-limit budget that
	// must remain.
	KachingMinRateRemaining int `json:"kaching_min_rate_remaining"`
	// BackbeatMaxLagSeconds is the maximum acceptable BACKBEAT subject lag, in
	// seconds.
	BackbeatMaxLagSeconds int `json:"backbeat_max_lag_seconds"`
	// BootstrapMinHealthyPeers is the minimum number of healthy bootstrap
	// peers.
	BootstrapMinHealthyPeers int `json:"bootstrap_min_healthy_peers"`
	// JoinSuccessRateThreshold is the minimum join success rate, expressed as
	// a fraction in the range 0.0-1.0.
	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"`
}
|
||||
|
||||
// HealthStatus represents the current health status across all gates
|
||||
type HealthStatus struct {
|
||||
Healthy bool `json:"healthy"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Gates map[string]GateStatus `json:"gates"`
|
||||
OverallReason string `json:"overall_reason,omitempty"`
|
||||
}
|
||||
|
||||
// GateStatus represents the status of an individual health gate.
type GateStatus struct {
	// Name identifies the gate (for example "kaching").
	Name string `json:"name"`
	// Healthy reports whether the gate passed.
	Healthy bool `json:"healthy"`
	// Reason explains why the gate failed; it is omitted from JSON when empty.
	Reason string `json:"reason,omitempty"`
	// Metrics carries the raw values the gate decision was based on.
	Metrics map[string]interface{} `json:"metrics,omitempty"`
	// LastChecked is when the gate was evaluated.
	LastChecked time.Time `json:"last_checked"`
}
|
||||
|
||||
// KachingHealth represents KACHING health metrics, as decoded from the JSON
// body of the KACHING health endpoint.
type KachingHealth struct {
	// Healthy is KACHING's self-reported health flag.
	Healthy bool `json:"healthy"`
	// LatencyP95MS is compared against HealthThresholds.KachingMaxLatencyMS.
	LatencyP95MS float64 `json:"latency_p95_ms"`
	// QueueDepth is reported in the gate metrics but not thresholded.
	QueueDepth int `json:"queue_depth"`
	// RateLimitRemaining is compared against
	// HealthThresholds.KachingMinRateRemaining.
	RateLimitRemaining int `json:"rate_limit_remaining"`
	// ActiveLeases is reported in the gate metrics but not thresholded.
	ActiveLeases int `json:"active_leases"`
	// ClusterCapacity is reported in the gate metrics but not thresholded.
	ClusterCapacity int `json:"cluster_capacity"`
}
|
||||
|
||||
// BackbeatHealth represents BACKBEAT health metrics, as decoded from the JSON
// body of the BACKBEAT metrics endpoint.
type BackbeatHealth struct {
	// Healthy is BACKBEAT's self-reported health flag.
	Healthy bool `json:"healthy"`
	// SubjectLags maps a subject name to its lag.
	SubjectLags map[string]int `json:"subject_lags"`
	// MaxLagSeconds is compared against
	// HealthThresholds.BackbeatMaxLagSeconds.
	MaxLagSeconds int `json:"max_lag_seconds"`
	// ConsumerHealth maps a consumer name to its health flag.
	ConsumerHealth map[string]bool `json:"consumer_health"`
}
|
||||
|
||||
// BootstrapHealth represents bootstrap peer pool health, as decoded from the
// JSON body of the CHORUS peers endpoint.
type BootstrapHealth struct {
	// Healthy is the pool's self-reported health flag.
	Healthy bool `json:"healthy"`
	// TotalPeers is the number of peers in the pool.
	TotalPeers int `json:"total_peers"`
	// HealthyPeers is compared against
	// HealthThresholds.BootstrapMinHealthyPeers.
	HealthyPeers int `json:"healthy_peers"`
	// ReachablePeers is reported in the gate metrics but not thresholded.
	ReachablePeers int `json:"reachable_peers"`
}
|
||||
|
||||
// ScalingMetrics represents recent scaling operation metrics.
type ScalingMetrics struct {
	// LastWaveSize is the size of the most recent scaling wave.
	LastWaveSize int `json:"last_wave_size"`
	// LastWaveStarted is when the most recent wave began.
	LastWaveStarted time.Time `json:"last_wave_started"`
	// LastWaveCompleted is when the most recent wave finished.
	LastWaveCompleted time.Time `json:"last_wave_completed"`
	// JoinSuccessRate is compared against
	// HealthThresholds.JoinSuccessRateThreshold, which is a 0.0-1.0 fraction.
	JoinSuccessRate float64 `json:"join_success_rate"`
	// SuccessfulJoins counts joins that succeeded.
	SuccessfulJoins int `json:"successful_joins"`
	// FailedJoins counts joins that failed.
	FailedJoins int `json:"failed_joins"`
}
|
||||
|
||||
// NewHealthGates creates a new health gates manager
|
||||
func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
|
||||
return &HealthGates{
|
||||
kachingURL: kachingURL,
|
||||
backbeatURL: backbeatURL,
|
||||
chorusURL: chorusURL,
|
||||
httpClient: &http.Client{Timeout: 10 * time.Second},
|
||||
thresholds: HealthThresholds{
|
||||
KachingMaxLatencyMS: 500, // 500ms max latency
|
||||
KachingMinRateRemaining: 20, // At least 20 requests remaining
|
||||
BackbeatMaxLagSeconds: 30, // Max 30 seconds lag
|
||||
BootstrapMinHealthyPeers: 3, // At least 3 healthy bootstrap peers
|
||||
JoinSuccessRateThreshold: 0.8, // 80% join success rate
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// SetThresholds updates the health thresholds
|
||||
func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
|
||||
hg.thresholds = thresholds
|
||||
}
|
||||
|
||||
// CheckHealth checks all health gates and returns overall status
|
||||
func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
|
||||
ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
|
||||
defer span.End()
|
||||
|
||||
status := &HealthStatus{
|
||||
Timestamp: time.Now(),
|
||||
Gates: make(map[string]GateStatus),
|
||||
Healthy: true,
|
||||
}
|
||||
|
||||
var failReasons []string
|
||||
|
||||
// Check KACHING health
|
||||
if kachingStatus, err := hg.checkKachingHealth(ctx); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to check KACHING health")
|
||||
status.Gates["kaching"] = GateStatus{
|
||||
Name: "kaching",
|
||||
Healthy: false,
|
||||
Reason: fmt.Sprintf("Health check failed: %v", err),
|
||||
LastChecked: time.Now(),
|
||||
}
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, "KACHING unreachable")
|
||||
} else {
|
||||
status.Gates["kaching"] = *kachingStatus
|
||||
if !kachingStatus.Healthy {
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, kachingStatus.Reason)
|
||||
}
|
||||
}
|
||||
|
||||
// Check BACKBEAT health
|
||||
if backbeatStatus, err := hg.checkBackbeatHealth(ctx); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to check BACKBEAT health")
|
||||
status.Gates["backbeat"] = GateStatus{
|
||||
Name: "backbeat",
|
||||
Healthy: false,
|
||||
Reason: fmt.Sprintf("Health check failed: %v", err),
|
||||
LastChecked: time.Now(),
|
||||
}
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, "BACKBEAT unreachable")
|
||||
} else {
|
||||
status.Gates["backbeat"] = *backbeatStatus
|
||||
if !backbeatStatus.Healthy {
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, backbeatStatus.Reason)
|
||||
}
|
||||
}
|
||||
|
||||
// Check bootstrap peer health
|
||||
if bootstrapStatus, err := hg.checkBootstrapHealth(ctx); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to check bootstrap health")
|
||||
status.Gates["bootstrap"] = GateStatus{
|
||||
Name: "bootstrap",
|
||||
Healthy: false,
|
||||
Reason: fmt.Sprintf("Health check failed: %v", err),
|
||||
LastChecked: time.Now(),
|
||||
}
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, "Bootstrap peers unreachable")
|
||||
} else {
|
||||
status.Gates["bootstrap"] = *bootstrapStatus
|
||||
if !bootstrapStatus.Healthy {
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, bootstrapStatus.Reason)
|
||||
}
|
||||
}
|
||||
|
||||
// Check recent scaling metrics if provided
|
||||
if recentMetrics != nil {
|
||||
if metricsStatus := hg.checkScalingMetrics(recentMetrics); !metricsStatus.Healthy {
|
||||
status.Gates["scaling_metrics"] = *metricsStatus
|
||||
status.Healthy = false
|
||||
failReasons = append(failReasons, metricsStatus.Reason)
|
||||
} else {
|
||||
status.Gates["scaling_metrics"] = *metricsStatus
|
||||
}
|
||||
}
|
||||
|
||||
// Set overall reason if unhealthy
|
||||
if !status.Healthy && len(failReasons) > 0 {
|
||||
status.OverallReason = fmt.Sprintf("Health gates failed: %v", failReasons)
|
||||
}
|
||||
|
||||
// Add tracing attributes
|
||||
span.SetAttributes(
|
||||
attribute.Bool("health.overall_healthy", status.Healthy),
|
||||
attribute.Int("health.gate_count", len(status.Gates)),
|
||||
)
|
||||
|
||||
if !status.Healthy {
|
||||
span.SetAttributes(attribute.String("health.fail_reason", status.OverallReason))
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// checkKachingHealth checks KACHING health and rate limits
|
||||
func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
|
||||
url := fmt.Sprintf("%s/health/burst", hg.kachingURL)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create KACHING health request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := hg.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("KACHING health request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("KACHING health check returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var health KachingHealth
|
||||
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode KACHING health response: %w", err)
|
||||
}
|
||||
|
||||
status := &GateStatus{
|
||||
Name: "kaching",
|
||||
LastChecked: time.Now(),
|
||||
Metrics: map[string]interface{}{
|
||||
"latency_p95_ms": health.LatencyP95MS,
|
||||
"queue_depth": health.QueueDepth,
|
||||
"rate_limit_remaining": health.RateLimitRemaining,
|
||||
"active_leases": health.ActiveLeases,
|
||||
"cluster_capacity": health.ClusterCapacity,
|
||||
},
|
||||
}
|
||||
|
||||
// Check latency threshold
|
||||
if health.LatencyP95MS > float64(hg.thresholds.KachingMaxLatencyMS) {
|
||||
status.Healthy = false
|
||||
status.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
|
||||
health.LatencyP95MS, hg.thresholds.KachingMaxLatencyMS)
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// Check rate limit threshold
|
||||
if health.RateLimitRemaining < hg.thresholds.KachingMinRateRemaining {
|
||||
status.Healthy = false
|
||||
status.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
|
||||
health.RateLimitRemaining, hg.thresholds.KachingMinRateRemaining)
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// Check overall KACHING health
|
||||
if !health.Healthy {
|
||||
status.Healthy = false
|
||||
status.Reason = "KACHING reports unhealthy status"
|
||||
return status, nil
|
||||
}
|
||||
|
||||
status.Healthy = true
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// checkBackbeatHealth checks BACKBEAT subject lag and consumer health
|
||||
func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
|
||||
url := fmt.Sprintf("%s/metrics", hg.backbeatURL)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create BACKBEAT health request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := hg.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("BACKBEAT health request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("BACKBEAT health check returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var health BackbeatHealth
|
||||
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode BACKBEAT health response: %w", err)
|
||||
}
|
||||
|
||||
status := &GateStatus{
|
||||
Name: "backbeat",
|
||||
LastChecked: time.Now(),
|
||||
Metrics: map[string]interface{}{
|
||||
"subject_lags": health.SubjectLags,
|
||||
"max_lag_seconds": health.MaxLagSeconds,
|
||||
"consumer_health": health.ConsumerHealth,
|
||||
},
|
||||
}
|
||||
|
||||
// Check subject lag threshold
|
||||
if health.MaxLagSeconds > hg.thresholds.BackbeatMaxLagSeconds {
|
||||
status.Healthy = false
|
||||
status.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
|
||||
health.MaxLagSeconds, hg.thresholds.BackbeatMaxLagSeconds)
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// Check overall BACKBEAT health
|
||||
if !health.Healthy {
|
||||
status.Healthy = false
|
||||
status.Reason = "BACKBEAT reports unhealthy status"
|
||||
return status, nil
|
||||
}
|
||||
|
||||
status.Healthy = true
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// checkBootstrapHealth checks bootstrap peer pool health
|
||||
func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
|
||||
url := fmt.Sprintf("%s/peers", hg.chorusURL)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create bootstrap health request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := hg.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("bootstrap health request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("bootstrap health check returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var health BootstrapHealth
|
||||
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode bootstrap health response: %w", err)
|
||||
}
|
||||
|
||||
status := &GateStatus{
|
||||
Name: "bootstrap",
|
||||
LastChecked: time.Now(),
|
||||
Metrics: map[string]interface{}{
|
||||
"total_peers": health.TotalPeers,
|
||||
"healthy_peers": health.HealthyPeers,
|
||||
"reachable_peers": health.ReachablePeers,
|
||||
},
|
||||
}
|
||||
|
||||
// Check minimum healthy peers threshold
|
||||
if health.HealthyPeers < hg.thresholds.BootstrapMinHealthyPeers {
|
||||
status.Healthy = false
|
||||
status.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
|
||||
health.HealthyPeers, hg.thresholds.BootstrapMinHealthyPeers)
|
||||
return status, nil
|
||||
}
|
||||
|
||||
status.Healthy = true
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// checkScalingMetrics checks recent scaling success rate
|
||||
func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
|
||||
status := &GateStatus{
|
||||
Name: "scaling_metrics",
|
||||
LastChecked: time.Now(),
|
||||
Metrics: map[string]interface{}{
|
||||
"join_success_rate": metrics.JoinSuccessRate,
|
||||
"successful_joins": metrics.SuccessfulJoins,
|
||||
"failed_joins": metrics.FailedJoins,
|
||||
"last_wave_size": metrics.LastWaveSize,
|
||||
},
|
||||
}
|
||||
|
||||
// Check join success rate threshold
|
||||
if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
|
||||
status.Healthy = false
|
||||
status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
|
||||
metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
|
||||
return status
|
||||
}
|
||||
|
||||
status.Healthy = true
|
||||
return status
|
||||
}
|
||||
|
||||
// GetThresholds returns the current health thresholds
|
||||
func (hg *HealthGates) GetThresholds() HealthThresholds {
|
||||
return hg.thresholds
|
||||
}
|
||||
|
||||
// IsHealthy performs a quick health check and returns boolean result
|
||||
func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
|
||||
status, err := hg.CheckHealth(ctx, recentMetrics)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return status.Healthy
|
||||
}
|
||||
Reference in New Issue
Block a user