- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers) - Assignment broker API for per-replica configuration management - Bootstrap pool management with weighted peer selection and health monitoring - Wave-based scaling algorithm with exponential backoff and failure recovery - Enhanced SwarmManager with Docker service scaling capabilities - Comprehensive scaling metrics collection and reporting system - RESTful HTTP API for external scaling operations and monitoring - Integration with CHORUS P2P networking and assignment systems 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
408 lines
13 KiB
Go
408 lines
13 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
|
|
"github.com/chorus-services/whoosh/internal/tracing"
|
|
)
|
|
|
|
// HealthGates manages health checks that gate scaling operations
|
|
type HealthGates struct {
|
|
kachingURL string
|
|
backbeatURL string
|
|
chorusURL string
|
|
httpClient *http.Client
|
|
thresholds HealthThresholds
|
|
}
|
|
|
|
// HealthThresholds holds the limits every gate must satisfy before scaling is
// allowed.
type HealthThresholds struct {
	// KachingMaxLatencyMS is the maximum acceptable KACHING latency, in milliseconds.
	KachingMaxLatencyMS int `json:"kaching_max_latency_ms"`

	// KachingMinRateRemaining is the minimum rate limit that must remain on KACHING.
	KachingMinRateRemaining int `json:"kaching_min_rate_remaining"`

	// BackbeatMaxLagSeconds is the maximum acceptable BACKBEAT subject lag, in seconds.
	BackbeatMaxLagSeconds int `json:"backbeat_max_lag_seconds"`

	// BootstrapMinHealthyPeers is the minimum number of healthy bootstrap peers.
	BootstrapMinHealthyPeers int `json:"bootstrap_min_healthy_peers"`

	// JoinSuccessRateThreshold is the minimum join success rate, as a fraction (0.0-1.0).
	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"`
}
|
|
|
|
// HealthStatus represents the current health status across all gates
|
|
type HealthStatus struct {
|
|
Healthy bool `json:"healthy"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Gates map[string]GateStatus `json:"gates"`
|
|
OverallReason string `json:"overall_reason,omitempty"`
|
|
}
|
|
|
|
// GateStatus is the outcome of a single health gate.
type GateStatus struct {
	// Name identifies the gate.
	Name string `json:"name"`

	// Healthy reports whether the gate passed.
	Healthy bool `json:"healthy"`

	// Reason explains a failure; it is omitted from JSON when empty.
	Reason string `json:"reason,omitempty"`

	// Metrics carries the raw values the gate looked at; it is omitted from
	// JSON when empty.
	Metrics map[string]interface{} `json:"metrics,omitempty"`

	// LastChecked records when the gate was evaluated.
	LastChecked time.Time `json:"last_checked"`
}
|
|
|
|
// KachingHealth is the JSON payload decoded from the KACHING health endpoint.
type KachingHealth struct {
	// Healthy is the service's own verdict.
	Healthy bool `json:"healthy"`

	// LatencyP95MS is compared against the configured maximum latency.
	LatencyP95MS float64 `json:"latency_p95_ms"`

	// QueueDepth is reported in the gate metrics only.
	QueueDepth int `json:"queue_depth"`

	// RateLimitRemaining is compared against the configured minimum.
	RateLimitRemaining int `json:"rate_limit_remaining"`

	// ActiveLeases is reported in the gate metrics only.
	ActiveLeases int `json:"active_leases"`

	// ClusterCapacity is reported in the gate metrics only.
	ClusterCapacity int `json:"cluster_capacity"`
}
|
|
|
|
// BackbeatHealth is the JSON payload decoded from the BACKBEAT metrics endpoint.
type BackbeatHealth struct {
	// Healthy is the service's own verdict.
	Healthy bool `json:"healthy"`

	// SubjectLags is reported in the gate metrics only.
	SubjectLags map[string]int `json:"subject_lags"`

	// MaxLagSeconds is compared against the configured maximum lag.
	MaxLagSeconds int `json:"max_lag_seconds"`

	// ConsumerHealth is reported in the gate metrics only.
	ConsumerHealth map[string]bool `json:"consumer_health"`
}
|
|
|
|
// BootstrapHealth is the JSON payload decoded from the bootstrap peers endpoint.
type BootstrapHealth struct {
	// Healthy is the verdict reported in the payload.
	Healthy bool `json:"healthy"`

	// TotalPeers is reported in the gate metrics only.
	TotalPeers int `json:"total_peers"`

	// HealthyPeers is compared against the configured minimum.
	HealthyPeers int `json:"healthy_peers"`

	// ReachablePeers is reported in the gate metrics only.
	ReachablePeers int `json:"reachable_peers"`
}
|
|
|
|
// ScalingMetrics summarises recent scaling activity supplied by the caller.
type ScalingMetrics struct {
	// LastWaveSize is the size of the most recent scaling wave.
	LastWaveSize int `json:"last_wave_size"`

	// LastWaveStarted is when the most recent wave began.
	LastWaveStarted time.Time `json:"last_wave_started"`

	// LastWaveCompleted is when the most recent wave finished.
	LastWaveCompleted time.Time `json:"last_wave_completed"`

	// JoinSuccessRate is compared, as a fraction, against the configured threshold.
	JoinSuccessRate float64 `json:"join_success_rate"`

	// SuccessfulJoins counts joins that succeeded.
	SuccessfulJoins int `json:"successful_joins"`

	// FailedJoins counts joins that failed.
	FailedJoins int `json:"failed_joins"`
}
|
|
|
|
// NewHealthGates creates a new health gates manager
|
|
func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
|
|
return &HealthGates{
|
|
kachingURL: kachingURL,
|
|
backbeatURL: backbeatURL,
|
|
chorusURL: chorusURL,
|
|
httpClient: &http.Client{Timeout: 10 * time.Second},
|
|
thresholds: HealthThresholds{
|
|
KachingMaxLatencyMS: 500, // 500ms max latency
|
|
KachingMinRateRemaining: 20, // At least 20 requests remaining
|
|
BackbeatMaxLagSeconds: 30, // Max 30 seconds lag
|
|
BootstrapMinHealthyPeers: 3, // At least 3 healthy bootstrap peers
|
|
JoinSuccessRateThreshold: 0.8, // 80% join success rate
|
|
},
|
|
}
|
|
}
|
|
|
|
// SetThresholds updates the health thresholds
|
|
func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
|
|
hg.thresholds = thresholds
|
|
}
|
|
|
|
// CheckHealth checks all health gates and returns overall status
|
|
func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
|
|
ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
|
|
defer span.End()
|
|
|
|
status := &HealthStatus{
|
|
Timestamp: time.Now(),
|
|
Gates: make(map[string]GateStatus),
|
|
Healthy: true,
|
|
}
|
|
|
|
var failReasons []string
|
|
|
|
// Check KACHING health
|
|
if kachingStatus, err := hg.checkKachingHealth(ctx); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to check KACHING health")
|
|
status.Gates["kaching"] = GateStatus{
|
|
Name: "kaching",
|
|
Healthy: false,
|
|
Reason: fmt.Sprintf("Health check failed: %v", err),
|
|
LastChecked: time.Now(),
|
|
}
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, "KACHING unreachable")
|
|
} else {
|
|
status.Gates["kaching"] = *kachingStatus
|
|
if !kachingStatus.Healthy {
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, kachingStatus.Reason)
|
|
}
|
|
}
|
|
|
|
// Check BACKBEAT health
|
|
if backbeatStatus, err := hg.checkBackbeatHealth(ctx); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to check BACKBEAT health")
|
|
status.Gates["backbeat"] = GateStatus{
|
|
Name: "backbeat",
|
|
Healthy: false,
|
|
Reason: fmt.Sprintf("Health check failed: %v", err),
|
|
LastChecked: time.Now(),
|
|
}
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, "BACKBEAT unreachable")
|
|
} else {
|
|
status.Gates["backbeat"] = *backbeatStatus
|
|
if !backbeatStatus.Healthy {
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, backbeatStatus.Reason)
|
|
}
|
|
}
|
|
|
|
// Check bootstrap peer health
|
|
if bootstrapStatus, err := hg.checkBootstrapHealth(ctx); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to check bootstrap health")
|
|
status.Gates["bootstrap"] = GateStatus{
|
|
Name: "bootstrap",
|
|
Healthy: false,
|
|
Reason: fmt.Sprintf("Health check failed: %v", err),
|
|
LastChecked: time.Now(),
|
|
}
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, "Bootstrap peers unreachable")
|
|
} else {
|
|
status.Gates["bootstrap"] = *bootstrapStatus
|
|
if !bootstrapStatus.Healthy {
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, bootstrapStatus.Reason)
|
|
}
|
|
}
|
|
|
|
// Check recent scaling metrics if provided
|
|
if recentMetrics != nil {
|
|
if metricsStatus := hg.checkScalingMetrics(recentMetrics); !metricsStatus.Healthy {
|
|
status.Gates["scaling_metrics"] = *metricsStatus
|
|
status.Healthy = false
|
|
failReasons = append(failReasons, metricsStatus.Reason)
|
|
} else {
|
|
status.Gates["scaling_metrics"] = *metricsStatus
|
|
}
|
|
}
|
|
|
|
// Set overall reason if unhealthy
|
|
if !status.Healthy && len(failReasons) > 0 {
|
|
status.OverallReason = fmt.Sprintf("Health gates failed: %v", failReasons)
|
|
}
|
|
|
|
// Add tracing attributes
|
|
span.SetAttributes(
|
|
attribute.Bool("health.overall_healthy", status.Healthy),
|
|
attribute.Int("health.gate_count", len(status.Gates)),
|
|
)
|
|
|
|
if !status.Healthy {
|
|
span.SetAttributes(attribute.String("health.fail_reason", status.OverallReason))
|
|
}
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// checkKachingHealth checks KACHING health and rate limits
|
|
func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
|
|
url := fmt.Sprintf("%s/health/burst", hg.kachingURL)
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create KACHING health request: %w", err)
|
|
}
|
|
|
|
resp, err := hg.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("KACHING health request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("KACHING health check returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
var health KachingHealth
|
|
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
|
return nil, fmt.Errorf("failed to decode KACHING health response: %w", err)
|
|
}
|
|
|
|
status := &GateStatus{
|
|
Name: "kaching",
|
|
LastChecked: time.Now(),
|
|
Metrics: map[string]interface{}{
|
|
"latency_p95_ms": health.LatencyP95MS,
|
|
"queue_depth": health.QueueDepth,
|
|
"rate_limit_remaining": health.RateLimitRemaining,
|
|
"active_leases": health.ActiveLeases,
|
|
"cluster_capacity": health.ClusterCapacity,
|
|
},
|
|
}
|
|
|
|
// Check latency threshold
|
|
if health.LatencyP95MS > float64(hg.thresholds.KachingMaxLatencyMS) {
|
|
status.Healthy = false
|
|
status.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
|
|
health.LatencyP95MS, hg.thresholds.KachingMaxLatencyMS)
|
|
return status, nil
|
|
}
|
|
|
|
// Check rate limit threshold
|
|
if health.RateLimitRemaining < hg.thresholds.KachingMinRateRemaining {
|
|
status.Healthy = false
|
|
status.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
|
|
health.RateLimitRemaining, hg.thresholds.KachingMinRateRemaining)
|
|
return status, nil
|
|
}
|
|
|
|
// Check overall KACHING health
|
|
if !health.Healthy {
|
|
status.Healthy = false
|
|
status.Reason = "KACHING reports unhealthy status"
|
|
return status, nil
|
|
}
|
|
|
|
status.Healthy = true
|
|
return status, nil
|
|
}
|
|
|
|
// checkBackbeatHealth checks BACKBEAT subject lag and consumer health
|
|
func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
|
|
url := fmt.Sprintf("%s/metrics", hg.backbeatURL)
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create BACKBEAT health request: %w", err)
|
|
}
|
|
|
|
resp, err := hg.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("BACKBEAT health request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("BACKBEAT health check returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
var health BackbeatHealth
|
|
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
|
return nil, fmt.Errorf("failed to decode BACKBEAT health response: %w", err)
|
|
}
|
|
|
|
status := &GateStatus{
|
|
Name: "backbeat",
|
|
LastChecked: time.Now(),
|
|
Metrics: map[string]interface{}{
|
|
"subject_lags": health.SubjectLags,
|
|
"max_lag_seconds": health.MaxLagSeconds,
|
|
"consumer_health": health.ConsumerHealth,
|
|
},
|
|
}
|
|
|
|
// Check subject lag threshold
|
|
if health.MaxLagSeconds > hg.thresholds.BackbeatMaxLagSeconds {
|
|
status.Healthy = false
|
|
status.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
|
|
health.MaxLagSeconds, hg.thresholds.BackbeatMaxLagSeconds)
|
|
return status, nil
|
|
}
|
|
|
|
// Check overall BACKBEAT health
|
|
if !health.Healthy {
|
|
status.Healthy = false
|
|
status.Reason = "BACKBEAT reports unhealthy status"
|
|
return status, nil
|
|
}
|
|
|
|
status.Healthy = true
|
|
return status, nil
|
|
}
|
|
|
|
// checkBootstrapHealth checks bootstrap peer pool health
|
|
func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
|
|
url := fmt.Sprintf("%s/peers", hg.chorusURL)
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create bootstrap health request: %w", err)
|
|
}
|
|
|
|
resp, err := hg.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("bootstrap health request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("bootstrap health check returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
var health BootstrapHealth
|
|
if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
|
|
return nil, fmt.Errorf("failed to decode bootstrap health response: %w", err)
|
|
}
|
|
|
|
status := &GateStatus{
|
|
Name: "bootstrap",
|
|
LastChecked: time.Now(),
|
|
Metrics: map[string]interface{}{
|
|
"total_peers": health.TotalPeers,
|
|
"healthy_peers": health.HealthyPeers,
|
|
"reachable_peers": health.ReachablePeers,
|
|
},
|
|
}
|
|
|
|
// Check minimum healthy peers threshold
|
|
if health.HealthyPeers < hg.thresholds.BootstrapMinHealthyPeers {
|
|
status.Healthy = false
|
|
status.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
|
|
health.HealthyPeers, hg.thresholds.BootstrapMinHealthyPeers)
|
|
return status, nil
|
|
}
|
|
|
|
status.Healthy = true
|
|
return status, nil
|
|
}
|
|
|
|
// checkScalingMetrics checks recent scaling success rate
|
|
func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
|
|
status := &GateStatus{
|
|
Name: "scaling_metrics",
|
|
LastChecked: time.Now(),
|
|
Metrics: map[string]interface{}{
|
|
"join_success_rate": metrics.JoinSuccessRate,
|
|
"successful_joins": metrics.SuccessfulJoins,
|
|
"failed_joins": metrics.FailedJoins,
|
|
"last_wave_size": metrics.LastWaveSize,
|
|
},
|
|
}
|
|
|
|
// Check join success rate threshold
|
|
if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
|
|
status.Healthy = false
|
|
status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
|
|
metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
|
|
return status
|
|
}
|
|
|
|
status.Healthy = true
|
|
return status
|
|
}
|
|
|
|
// GetThresholds returns the current health thresholds
|
|
func (hg *HealthGates) GetThresholds() HealthThresholds {
|
|
return hg.thresholds
|
|
}
|
|
|
|
// IsHealthy performs a quick health check and returns boolean result
|
|
func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
|
|
status, err := hg.CheckHealth(ctx, recentMetrics)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
return status.Healthy
|
|
} |