package orchestrator

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"

	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"

	"github.com/chorus-services/whoosh/internal/tracing"
)

// HealthGates manages health checks that gate scaling operations.
//
// It probes three HTTP endpoints (KACHING, BACKBEAT and the CHORUS peer
// pool) and compares the reported metrics against HealthThresholds.
type HealthGates struct {
	kachingURL  string
	backbeatURL string
	chorusURL   string
	httpClient  *http.Client
	thresholds  HealthThresholds
}

// HealthThresholds defines the health criteria for allowing scaling.
type HealthThresholds struct {
	KachingMaxLatencyMS      int     `json:"kaching_max_latency_ms"`      // Maximum acceptable KACHING latency
	KachingMinRateRemaining  int     `json:"kaching_min_rate_remaining"`  // Minimum rate limit remaining
	BackbeatMaxLagSeconds    int     `json:"backbeat_max_lag_seconds"`    // Maximum subject lag in seconds
	BootstrapMinHealthyPeers int     `json:"bootstrap_min_healthy_peers"` // Minimum healthy bootstrap peers
	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"` // Minimum join success rate (0.0-1.0)
}

// HealthStatus represents the current health status across all gates.
type HealthStatus struct {
	Healthy       bool                  `json:"healthy"`
	Timestamp     time.Time             `json:"timestamp"`
	Gates         map[string]GateStatus `json:"gates"`
	OverallReason string                `json:"overall_reason,omitempty"`
}

// GateStatus represents the status of an individual health gate.
type GateStatus struct {
	Name        string                 `json:"name"`
	Healthy     bool                   `json:"healthy"`
	Reason      string                 `json:"reason,omitempty"`
	Metrics     map[string]interface{} `json:"metrics,omitempty"`
	LastChecked time.Time              `json:"last_checked"`
}

// KachingHealth represents KACHING health metrics as decoded from the
// "/health/burst" response.
type KachingHealth struct {
	Healthy            bool    `json:"healthy"`
	LatencyP95MS       float64 `json:"latency_p95_ms"`
	QueueDepth         int     `json:"queue_depth"`
	RateLimitRemaining int     `json:"rate_limit_remaining"`
	ActiveLeases       int     `json:"active_leases"`
	ClusterCapacity    int     `json:"cluster_capacity"`
}

// BackbeatHealth represents BACKBEAT health metrics as decoded from the
// "/metrics" response.
type BackbeatHealth struct {
	Healthy        bool            `json:"healthy"`
	SubjectLags    map[string]int  `json:"subject_lags"`
	MaxLagSeconds  int             `json:"max_lag_seconds"`
	ConsumerHealth map[string]bool `json:"consumer_health"`
}

// BootstrapHealth represents bootstrap peer pool health as decoded from the
// "/peers" response.
type BootstrapHealth struct {
	Healthy        bool `json:"healthy"`
	TotalPeers     int  `json:"total_peers"`
	HealthyPeers   int  `json:"healthy_peers"`
	ReachablePeers int  `json:"reachable_peers"`
}

// ScalingMetrics represents recent scaling operation metrics.
type ScalingMetrics struct {
	LastWaveSize      int       `json:"last_wave_size"`
	LastWaveStarted   time.Time `json:"last_wave_started"`
	LastWaveCompleted time.Time `json:"last_wave_completed"`
	JoinSuccessRate   float64   `json:"join_success_rate"`
	SuccessfulJoins   int       `json:"successful_joins"`
	FailedJoins       int       `json:"failed_joins"`
}

// NewHealthGates creates a new health gates manager for the given service
// base URLs. The returned manager uses an HTTP client with a 10 second
// timeout and the default thresholds listed below.
func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
	return &HealthGates{
		kachingURL:  kachingURL,
		backbeatURL: backbeatURL,
		chorusURL:   chorusURL,
		httpClient:  &http.Client{Timeout: 10 * time.Second},
		thresholds: HealthThresholds{
			KachingMaxLatencyMS:      500, // 500ms max latency
			KachingMinRateRemaining:  20,  // At least 20 requests remaining
			BackbeatMaxLagSeconds:    30,  // Max 30 seconds lag
			BootstrapMinHealthyPeers: 3,   // At least 3 healthy bootstrap peers
			JoinSuccessRateThreshold: 0.8, // 80% join success rate
		},
	}
}

// SetThresholds updates the health thresholds.
//
// NOTE(review): this is a plain assignment with no locking; confirm callers
// do not invoke it concurrently with CheckHealth or GetThresholds.
func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
	hg.thresholds = thresholds
}

// CheckHealth checks all health gates and returns overall status.
//
// The KACHING, BACKBEAT and bootstrap gates are probed in that order; a probe
// that errors is recorded as an unhealthy gate rather than aborting the
// check. When recentMetrics is non-nil a "scaling_metrics" gate is evaluated
// as well. The returned error is always nil: every failure is reported
// through HealthStatus.Healthy, HealthStatus.Gates and OverallReason.
func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
	ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
	defer span.End()

	status := &HealthStatus{
		Timestamp: time.Now(),
		Gates:     make(map[string]GateStatus),
		Healthy:   true,
	}
	var failReasons []string

	// probe describes one remote gate: the key it is stored under, the log
	// message and summary used when the probe itself errors, and the check.
	type probe struct {
		gate    string
		warnMsg string
		summary string
		run     func(context.Context) (*GateStatus, error)
	}
	probes := []probe{
		{"kaching", "Failed to check KACHING health", "KACHING unreachable", hg.checkKachingHealth},
		{"backbeat", "Failed to check BACKBEAT health", "BACKBEAT unreachable", hg.checkBackbeatHealth},
		{"bootstrap", "Failed to check bootstrap health", "Bootstrap peers unreachable", hg.checkBootstrapHealth},
	}

	for _, p := range probes {
		gate, err := p.run(ctx)
		if err != nil {
			// The probe could not produce a verdict: record a synthetic
			// unhealthy gate carrying the error text.
			log.Warn().Err(err).Msg(p.warnMsg)
			status.Gates[p.gate] = GateStatus{
				Name:        p.gate,
				Healthy:     false,
				Reason:      fmt.Sprintf("Health check failed: %v", err),
				LastChecked: time.Now(),
			}
			status.Healthy = false
			failReasons = append(failReasons, p.summary)
			continue
		}

		status.Gates[p.gate] = *gate
		if !gate.Healthy {
			status.Healthy = false
			failReasons = append(failReasons, gate.Reason)
		}
	}

	// Check recent scaling metrics if provided.
	if recentMetrics != nil {
		gate := hg.checkScalingMetrics(recentMetrics)
		status.Gates["scaling_metrics"] = *gate
		if !gate.Healthy {
			status.Healthy = false
			failReasons = append(failReasons, gate.Reason)
		}
	}

	// Set overall reason if unhealthy.
	if !status.Healthy && len(failReasons) > 0 {
		status.OverallReason = fmt.Sprintf("Health gates failed: %v", failReasons)
	}

	hg.annotateSpan(span, status)
	return status, nil
}

// annotateSpan records the overall verdict, the number of gates and, when
// unhealthy, the overall failure reason as attributes on span.
func (hg *HealthGates) annotateSpan(span trace.Span, status *HealthStatus) {
	span.SetAttributes(
		attribute.Bool("health.overall_healthy", status.Healthy),
		attribute.Int("health.gate_count", len(status.Gates)),
	)
	if !status.Healthy {
		span.SetAttributes(attribute.String("health.fail_reason", status.OverallReason))
	}
}

// fetchJSON issues a GET to url and decodes a 200 response body as JSON into
// out. label names the upstream service in error messages. Any request
// failure, non-200 status or decode failure is returned as an error.
func (hg *HealthGates) fetchJSON(ctx context.Context, url, label string, out interface{}) error {
	// Upper bound on how much unread body is discarded before closing.
	const drainLimit = 64 << 10

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fmt.Errorf("failed to create %s health request: %w", label, err)
	}

	resp, err := hg.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("%s health request failed: %w", label, err)
	}
	defer func() {
		// Read what is left of the body (bounded) before closing so the
		// transport can reuse the keep-alive connection for the next probe.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, drainLimit))
		resp.Body.Close()
	}()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("%s health check returned status %d", label, resp.StatusCode)
	}

	if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
		return fmt.Errorf("failed to decode %s health response: %w", label, err)
	}
	return nil
}

// checkKachingHealth checks KACHING health and rate limits.
//
// It returns an error when the endpoint cannot be queried or decoded, and
// otherwise a GateStatus whose Reason names the first failing criterion:
// latency, then remaining rate limit, then the service's own healthy flag.
func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
	var health KachingHealth
	url := fmt.Sprintf("%s/health/burst", hg.kachingURL)
	if err := hg.fetchJSON(ctx, url, "KACHING", &health); err != nil {
		return nil, err
	}

	status := &GateStatus{
		Name:        "kaching",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"latency_p95_ms":       health.LatencyP95MS,
			"queue_depth":          health.QueueDepth,
			"rate_limit_remaining": health.RateLimitRemaining,
			"active_leases":        health.ActiveLeases,
			"cluster_capacity":     health.ClusterCapacity,
		},
	}

	switch {
	case health.LatencyP95MS > float64(hg.thresholds.KachingMaxLatencyMS):
		status.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
			health.LatencyP95MS, hg.thresholds.KachingMaxLatencyMS)
	case health.RateLimitRemaining < hg.thresholds.KachingMinRateRemaining:
		status.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
			health.RateLimitRemaining, hg.thresholds.KachingMinRateRemaining)
	case !health.Healthy:
		status.Reason = "KACHING reports unhealthy status"
	default:
		status.Healthy = true
	}
	return status, nil
}

// checkBackbeatHealth checks BACKBEAT subject lag and consumer health.
//
// It returns an error when the endpoint cannot be queried or decoded, and
// otherwise a GateStatus that is unhealthy when the reported maximum lag
// exceeds the threshold or the service reports itself unhealthy.
func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
	var health BackbeatHealth
	url := fmt.Sprintf("%s/metrics", hg.backbeatURL)
	if err := hg.fetchJSON(ctx, url, "BACKBEAT", &health); err != nil {
		return nil, err
	}

	status := &GateStatus{
		Name:        "backbeat",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"subject_lags":    health.SubjectLags,
			"max_lag_seconds": health.MaxLagSeconds,
			"consumer_health": health.ConsumerHealth,
		},
	}

	switch {
	case health.MaxLagSeconds > hg.thresholds.BackbeatMaxLagSeconds:
		status.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
			health.MaxLagSeconds, hg.thresholds.BackbeatMaxLagSeconds)
	case !health.Healthy:
		status.Reason = "BACKBEAT reports unhealthy status"
	default:
		status.Healthy = true
	}
	return status, nil
}

// checkBootstrapHealth checks bootstrap peer pool health.
//
// It returns an error when the endpoint cannot be queried or decoded, and
// otherwise a GateStatus that is unhealthy when fewer healthy peers are
// reported than the threshold requires. The response's own healthy flag is
// not consulted.
func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
	var health BootstrapHealth
	url := fmt.Sprintf("%s/peers", hg.chorusURL)
	if err := hg.fetchJSON(ctx, url, "bootstrap", &health); err != nil {
		return nil, err
	}

	status := &GateStatus{
		Name:        "bootstrap",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"total_peers":     health.TotalPeers,
			"healthy_peers":   health.HealthyPeers,
			"reachable_peers": health.ReachablePeers,
		},
	}

	if health.HealthyPeers < hg.thresholds.BootstrapMinHealthyPeers {
		status.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
			health.HealthyPeers, hg.thresholds.BootstrapMinHealthyPeers)
		return status, nil
	}

	status.Healthy = true
	return status, nil
}

// checkScalingMetrics checks recent scaling success rate against
// JoinSuccessRateThreshold. metrics must be non-nil.
//
// NOTE(review): a NaN JoinSuccessRate compares false against the threshold
// and therefore passes this gate, while zero-value metrics (rate 0) fail it;
// confirm callers supply a meaningful rate when no joins were attempted.
func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
	status := &GateStatus{
		Name:        "scaling_metrics",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"join_success_rate": metrics.JoinSuccessRate,
			"successful_joins":  metrics.SuccessfulJoins,
			"failed_joins":      metrics.FailedJoins,
			"last_wave_size":    metrics.LastWaveSize,
		},
	}

	if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
		status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
			metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
		return status
	}

	status.Healthy = true
	return status
}

// GetThresholds returns a copy of the current health thresholds.
func (hg *HealthGates) GetThresholds() HealthThresholds {
	return hg.thresholds
}

// IsHealthy performs a quick health check and returns boolean result. It is
// a convenience wrapper around CheckHealth that reports false when the check
// returns an error or any gate is unhealthy.
func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
	status, err := hg.CheckHealth(ctx, recentMetrics)
	if err != nil {
		return false
	}
	return status.Healthy
}