- Add scaling system components to server initialization - Register scaling API and assignment broker routes - Start bootstrap pool manager in server lifecycle - Add graceful shutdown for scaling controller - Update API routing to use chi.Router instead of gorilla/mux - Fix Docker API compatibility issues - Configure health gates with placeholder URLs for KACHING and BACKBEAT 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
454 lines
14 KiB
Go
454 lines
14 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
|
|
"github.com/chorus-services/whoosh/internal/tracing"
|
|
)
|
|
|
|
// ScalingMetricsCollector collects and manages scaling operation metrics
|
|
type ScalingMetricsCollector struct {
|
|
mu sync.RWMutex
|
|
operations []CompletedScalingOperation
|
|
maxHistory int
|
|
currentWave *WaveMetrics
|
|
}
|
|
|
|
// CompletedScalingOperation represents a completed scaling operation for metrics
|
|
type CompletedScalingOperation struct {
|
|
ID string `json:"id"`
|
|
ServiceName string `json:"service_name"`
|
|
WaveNumber int `json:"wave_number"`
|
|
StartedAt time.Time `json:"started_at"`
|
|
CompletedAt time.Time `json:"completed_at"`
|
|
Duration time.Duration `json:"duration"`
|
|
TargetReplicas int `json:"target_replicas"`
|
|
AchievedReplicas int `json:"achieved_replicas"`
|
|
Success bool `json:"success"`
|
|
FailureReason string `json:"failure_reason,omitempty"`
|
|
JoinAttempts []JoinAttempt `json:"join_attempts"`
|
|
HealthGateResults map[string]bool `json:"health_gate_results"`
|
|
BackoffLevel int `json:"backoff_level"`
|
|
}
|
|
|
|
// JoinAttempt records a single attempt by one replica to join, including its
// timing, outcome and the bootstrap peers it was given.
type JoinAttempt struct {
	// ReplicaID identifies the replica that attempted to join.
	ReplicaID string `json:"replica_id"`
	// AttemptedAt is when the attempt started.
	AttemptedAt time.Time `json:"attempted_at"`
	// CompletedAt is when the attempt finished.
	// NOTE(review): encoding/json's omitempty never treats a struct value such
	// as time.Time as empty, so this key is emitted even for the zero time.
	CompletedAt time.Time `json:"completed_at,omitempty"`
	// Duration is how long the attempt took (JSON: integer nanoseconds).
	Duration time.Duration `json:"duration"`
	// Success reports whether the replica joined.
	Success bool `json:"success"`
	// FailureReason describes the failure; omitted from JSON when empty.
	FailureReason string `json:"failure_reason,omitempty"`
	// BootstrapPeers lists the peers supplied to the replica for this attempt.
	BootstrapPeers []string `json:"bootstrap_peers"`
}
|
|
|
|
// WaveMetrics tracks metrics for the currently executing wave
|
|
type WaveMetrics struct {
|
|
WaveID string `json:"wave_id"`
|
|
ServiceName string `json:"service_name"`
|
|
StartedAt time.Time `json:"started_at"`
|
|
TargetReplicas int `json:"target_replicas"`
|
|
CurrentReplicas int `json:"current_replicas"`
|
|
JoinAttempts []JoinAttempt `json:"join_attempts"`
|
|
HealthChecks []HealthCheckResult `json:"health_checks"`
|
|
BackoffLevel int `json:"backoff_level"`
|
|
}
|
|
|
|
// HealthCheckResult is the outcome of evaluating one health gate at one point
// in time.
type HealthCheckResult struct {
	// Timestamp is when the result was recorded.
	Timestamp time.Time `json:"timestamp"`
	// GateName identifies the health gate that was checked.
	GateName string `json:"gate_name"`
	// Healthy reports whether the gate passed.
	Healthy bool `json:"healthy"`
	// Reason explains the result; omitted from JSON when empty.
	Reason string `json:"reason,omitempty"`
	// Metrics carries free-form gate data; omitted from JSON when empty.
	Metrics map[string]interface{} `json:"metrics,omitempty"`
	// CheckDuration is how long the check took (JSON: integer nanoseconds).
	CheckDuration time.Duration `json:"check_duration"`
}
|
|
|
|
// ScalingMetricsReport provides aggregated metrics for reporting
|
|
type ScalingMetricsReport struct {
|
|
WindowStart time.Time `json:"window_start"`
|
|
WindowEnd time.Time `json:"window_end"`
|
|
TotalOperations int `json:"total_operations"`
|
|
SuccessfulOps int `json:"successful_operations"`
|
|
FailedOps int `json:"failed_operations"`
|
|
SuccessRate float64 `json:"success_rate"`
|
|
AverageWaveTime time.Duration `json:"average_wave_time"`
|
|
AverageJoinTime time.Duration `json:"average_join_time"`
|
|
BackoffEvents int `json:"backoff_events"`
|
|
HealthGateFailures map[string]int `json:"health_gate_failures"`
|
|
ServiceMetrics map[string]ServiceMetrics `json:"service_metrics"`
|
|
CurrentWave *WaveMetrics `json:"current_wave,omitempty"`
|
|
}
|
|
|
|
// ServiceMetrics is the per-service slice of a scaling report.
type ServiceMetrics struct {
	// ServiceName is the service these figures belong to.
	ServiceName string `json:"service_name"`
	// TotalWaves is the number of waves counted for the service.
	TotalWaves int `json:"total_waves"`
	// SuccessfulWaves is how many of those waves succeeded.
	SuccessfulWaves int `json:"successful_waves"`
	// AverageWaveTime is the mean wave duration (JSON: integer nanoseconds).
	AverageWaveTime time.Duration `json:"average_wave_time"`
	// LastScaled is the completion time of the most recent counted wave.
	LastScaled time.Time `json:"last_scaled"`
	// CurrentReplicas is the achieved replica count of that most recent wave.
	CurrentReplicas int `json:"current_replicas"`
}
|
|
|
|
// NewScalingMetricsCollector creates a new metrics collector
|
|
func NewScalingMetricsCollector(maxHistory int) *ScalingMetricsCollector {
|
|
if maxHistory == 0 {
|
|
maxHistory = 1000 // Default to keeping 1000 operations
|
|
}
|
|
|
|
return &ScalingMetricsCollector{
|
|
operations: make([]CompletedScalingOperation, 0),
|
|
maxHistory: maxHistory,
|
|
}
|
|
}
|
|
|
|
// StartWave begins tracking a new scaling wave
|
|
func (smc *ScalingMetricsCollector) StartWave(ctx context.Context, waveID, serviceName string, targetReplicas int) {
|
|
ctx, span := tracing.Tracer.Start(ctx, "scaling_metrics.start_wave")
|
|
defer span.End()
|
|
|
|
smc.mu.Lock()
|
|
defer smc.mu.Unlock()
|
|
|
|
smc.currentWave = &WaveMetrics{
|
|
WaveID: waveID,
|
|
ServiceName: serviceName,
|
|
StartedAt: time.Now(),
|
|
TargetReplicas: targetReplicas,
|
|
JoinAttempts: make([]JoinAttempt, 0),
|
|
HealthChecks: make([]HealthCheckResult, 0),
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("wave.id", waveID),
|
|
attribute.String("wave.service", serviceName),
|
|
attribute.Int("wave.target_replicas", targetReplicas),
|
|
)
|
|
|
|
log.Info().
|
|
Str("wave_id", waveID).
|
|
Str("service_name", serviceName).
|
|
Int("target_replicas", targetReplicas).
|
|
Msg("Started tracking scaling wave")
|
|
}
|
|
|
|
// RecordJoinAttempt records a replica join attempt
|
|
func (smc *ScalingMetricsCollector) RecordJoinAttempt(replicaID string, bootstrapPeers []string, success bool, duration time.Duration, failureReason string) {
|
|
smc.mu.Lock()
|
|
defer smc.mu.Unlock()
|
|
|
|
if smc.currentWave == nil {
|
|
log.Warn().Str("replica_id", replicaID).Msg("No active wave to record join attempt")
|
|
return
|
|
}
|
|
|
|
attempt := JoinAttempt{
|
|
ReplicaID: replicaID,
|
|
AttemptedAt: time.Now().Add(-duration),
|
|
CompletedAt: time.Now(),
|
|
Duration: duration,
|
|
Success: success,
|
|
FailureReason: failureReason,
|
|
BootstrapPeers: bootstrapPeers,
|
|
}
|
|
|
|
smc.currentWave.JoinAttempts = append(smc.currentWave.JoinAttempts, attempt)
|
|
|
|
log.Debug().
|
|
Str("wave_id", smc.currentWave.WaveID).
|
|
Str("replica_id", replicaID).
|
|
Bool("success", success).
|
|
Dur("duration", duration).
|
|
Msg("Recorded join attempt")
|
|
}
|
|
|
|
// RecordHealthCheck records a health gate check result
|
|
func (smc *ScalingMetricsCollector) RecordHealthCheck(gateName string, healthy bool, reason string, metrics map[string]interface{}, duration time.Duration) {
|
|
smc.mu.Lock()
|
|
defer smc.mu.Unlock()
|
|
|
|
if smc.currentWave == nil {
|
|
log.Warn().Str("gate_name", gateName).Msg("No active wave to record health check")
|
|
return
|
|
}
|
|
|
|
result := HealthCheckResult{
|
|
Timestamp: time.Now(),
|
|
GateName: gateName,
|
|
Healthy: healthy,
|
|
Reason: reason,
|
|
Metrics: metrics,
|
|
CheckDuration: duration,
|
|
}
|
|
|
|
smc.currentWave.HealthChecks = append(smc.currentWave.HealthChecks, result)
|
|
|
|
log.Debug().
|
|
Str("wave_id", smc.currentWave.WaveID).
|
|
Str("gate_name", gateName).
|
|
Bool("healthy", healthy).
|
|
Dur("duration", duration).
|
|
Msg("Recorded health check")
|
|
}
|
|
|
|
// CompleteWave finishes tracking the current wave and archives it
|
|
func (smc *ScalingMetricsCollector) CompleteWave(ctx context.Context, success bool, achievedReplicas int, failureReason string, backoffLevel int) {
|
|
ctx, span := tracing.Tracer.Start(ctx, "scaling_metrics.complete_wave")
|
|
defer span.End()
|
|
|
|
smc.mu.Lock()
|
|
defer smc.mu.Unlock()
|
|
|
|
if smc.currentWave == nil {
|
|
log.Warn().Msg("No active wave to complete")
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
operation := CompletedScalingOperation{
|
|
ID: smc.currentWave.WaveID,
|
|
ServiceName: smc.currentWave.ServiceName,
|
|
WaveNumber: len(smc.operations) + 1,
|
|
StartedAt: smc.currentWave.StartedAt,
|
|
CompletedAt: now,
|
|
Duration: now.Sub(smc.currentWave.StartedAt),
|
|
TargetReplicas: smc.currentWave.TargetReplicas,
|
|
AchievedReplicas: achievedReplicas,
|
|
Success: success,
|
|
FailureReason: failureReason,
|
|
JoinAttempts: smc.currentWave.JoinAttempts,
|
|
HealthGateResults: smc.extractHealthGateResults(),
|
|
BackoffLevel: backoffLevel,
|
|
}
|
|
|
|
// Add to operations history
|
|
smc.operations = append(smc.operations, operation)
|
|
|
|
// Trim history if needed
|
|
if len(smc.operations) > smc.maxHistory {
|
|
smc.operations = smc.operations[len(smc.operations)-smc.maxHistory:]
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("wave.id", operation.ID),
|
|
attribute.String("wave.service", operation.ServiceName),
|
|
attribute.Bool("wave.success", success),
|
|
attribute.Int("wave.achieved_replicas", achievedReplicas),
|
|
attribute.Int("wave.backoff_level", backoffLevel),
|
|
attribute.String("wave.duration", operation.Duration.String()),
|
|
)
|
|
|
|
log.Info().
|
|
Str("wave_id", operation.ID).
|
|
Str("service_name", operation.ServiceName).
|
|
Bool("success", success).
|
|
Int("achieved_replicas", achievedReplicas).
|
|
Dur("duration", operation.Duration).
|
|
Msg("Completed scaling wave")
|
|
|
|
// Clear current wave
|
|
smc.currentWave = nil
|
|
}
|
|
|
|
// extractHealthGateResults extracts the final health gate results from checks
|
|
func (smc *ScalingMetricsCollector) extractHealthGateResults() map[string]bool {
|
|
results := make(map[string]bool)
|
|
|
|
// Get the latest result for each gate
|
|
for _, check := range smc.currentWave.HealthChecks {
|
|
results[check.GateName] = check.Healthy
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// GenerateReport generates a metrics report for the specified time window
|
|
func (smc *ScalingMetricsCollector) GenerateReport(ctx context.Context, windowStart, windowEnd time.Time) *ScalingMetricsReport {
|
|
ctx, span := tracing.Tracer.Start(ctx, "scaling_metrics.generate_report")
|
|
defer span.End()
|
|
|
|
smc.mu.RLock()
|
|
defer smc.mu.RUnlock()
|
|
|
|
report := &ScalingMetricsReport{
|
|
WindowStart: windowStart,
|
|
WindowEnd: windowEnd,
|
|
HealthGateFailures: make(map[string]int),
|
|
ServiceMetrics: make(map[string]ServiceMetrics),
|
|
CurrentWave: smc.currentWave,
|
|
}
|
|
|
|
// Filter operations within window
|
|
var windowOps []CompletedScalingOperation
|
|
for _, op := range smc.operations {
|
|
if op.StartedAt.After(windowStart) && op.StartedAt.Before(windowEnd) {
|
|
windowOps = append(windowOps, op)
|
|
}
|
|
}
|
|
|
|
report.TotalOperations = len(windowOps)
|
|
|
|
if len(windowOps) == 0 {
|
|
return report
|
|
}
|
|
|
|
// Calculate aggregated metrics
|
|
var totalDuration time.Duration
|
|
var totalJoinDuration time.Duration
|
|
var totalJoinAttempts int
|
|
serviceStats := make(map[string]*ServiceMetrics)
|
|
|
|
for _, op := range windowOps {
|
|
// Overall stats
|
|
if op.Success {
|
|
report.SuccessfulOps++
|
|
} else {
|
|
report.FailedOps++
|
|
}
|
|
|
|
totalDuration += op.Duration
|
|
|
|
// Backoff tracking
|
|
if op.BackoffLevel > 0 {
|
|
report.BackoffEvents++
|
|
}
|
|
|
|
// Health gate failures
|
|
for gate, healthy := range op.HealthGateResults {
|
|
if !healthy {
|
|
report.HealthGateFailures[gate]++
|
|
}
|
|
}
|
|
|
|
// Join attempt metrics
|
|
for _, attempt := range op.JoinAttempts {
|
|
totalJoinDuration += attempt.Duration
|
|
totalJoinAttempts++
|
|
}
|
|
|
|
// Service-specific metrics
|
|
if _, exists := serviceStats[op.ServiceName]; !exists {
|
|
serviceStats[op.ServiceName] = &ServiceMetrics{
|
|
ServiceName: op.ServiceName,
|
|
}
|
|
}
|
|
|
|
svc := serviceStats[op.ServiceName]
|
|
svc.TotalWaves++
|
|
if op.Success {
|
|
svc.SuccessfulWaves++
|
|
}
|
|
if op.CompletedAt.After(svc.LastScaled) {
|
|
svc.LastScaled = op.CompletedAt
|
|
svc.CurrentReplicas = op.AchievedReplicas
|
|
}
|
|
}
|
|
|
|
// Calculate rates and averages
|
|
report.SuccessRate = float64(report.SuccessfulOps) / float64(report.TotalOperations)
|
|
report.AverageWaveTime = totalDuration / time.Duration(len(windowOps))
|
|
|
|
if totalJoinAttempts > 0 {
|
|
report.AverageJoinTime = totalJoinDuration / time.Duration(totalJoinAttempts)
|
|
}
|
|
|
|
// Finalize service metrics
|
|
for serviceName, stats := range serviceStats {
|
|
if stats.TotalWaves > 0 {
|
|
// Calculate average wave time for this service
|
|
var serviceDuration time.Duration
|
|
serviceWaves := 0
|
|
for _, op := range windowOps {
|
|
if op.ServiceName == serviceName {
|
|
serviceDuration += op.Duration
|
|
serviceWaves++
|
|
}
|
|
}
|
|
stats.AverageWaveTime = serviceDuration / time.Duration(serviceWaves)
|
|
}
|
|
report.ServiceMetrics[serviceName] = *stats
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.Int("report.total_operations", report.TotalOperations),
|
|
attribute.Int("report.successful_operations", report.SuccessfulOps),
|
|
attribute.Float64("report.success_rate", report.SuccessRate),
|
|
attribute.String("report.window_duration", windowEnd.Sub(windowStart).String()),
|
|
)
|
|
|
|
return report
|
|
}
|
|
|
|
// GetCurrentWave returns the currently active wave metrics
|
|
func (smc *ScalingMetricsCollector) GetCurrentWave() *WaveMetrics {
|
|
smc.mu.RLock()
|
|
defer smc.mu.RUnlock()
|
|
|
|
if smc.currentWave == nil {
|
|
return nil
|
|
}
|
|
|
|
// Return a copy to avoid concurrent access issues
|
|
wave := *smc.currentWave
|
|
wave.JoinAttempts = make([]JoinAttempt, len(smc.currentWave.JoinAttempts))
|
|
copy(wave.JoinAttempts, smc.currentWave.JoinAttempts)
|
|
wave.HealthChecks = make([]HealthCheckResult, len(smc.currentWave.HealthChecks))
|
|
copy(wave.HealthChecks, smc.currentWave.HealthChecks)
|
|
|
|
return &wave
|
|
}
|
|
|
|
// GetRecentOperations returns the most recent scaling operations
|
|
func (smc *ScalingMetricsCollector) GetRecentOperations(limit int) []CompletedScalingOperation {
|
|
smc.mu.RLock()
|
|
defer smc.mu.RUnlock()
|
|
|
|
if limit <= 0 || limit > len(smc.operations) {
|
|
limit = len(smc.operations)
|
|
}
|
|
|
|
// Return most recent operations
|
|
start := len(smc.operations) - limit
|
|
operations := make([]CompletedScalingOperation, limit)
|
|
copy(operations, smc.operations[start:])
|
|
|
|
return operations
|
|
}
|
|
|
|
// ExportMetrics exports metrics in JSON format
|
|
func (smc *ScalingMetricsCollector) ExportMetrics(ctx context.Context) ([]byte, error) {
|
|
ctx, span := tracing.Tracer.Start(ctx, "scaling_metrics.export")
|
|
defer span.End()
|
|
|
|
smc.mu.RLock()
|
|
defer smc.mu.RUnlock()
|
|
|
|
export := struct {
|
|
Operations []CompletedScalingOperation `json:"operations"`
|
|
CurrentWave *WaveMetrics `json:"current_wave,omitempty"`
|
|
ExportedAt time.Time `json:"exported_at"`
|
|
}{
|
|
Operations: smc.operations,
|
|
CurrentWave: smc.currentWave,
|
|
ExportedAt: time.Now(),
|
|
}
|
|
|
|
data, err := json.MarshalIndent(export, "", " ")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal metrics: %w", err)
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.Int("export.operation_count", len(smc.operations)),
|
|
attribute.Bool("export.has_current_wave", smc.currentWave != nil),
|
|
)
|
|
|
|
return data, nil
|
|
} |