package orchestrator

import (
	"context"
	"encoding/json"
	"fmt"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"

	"github.com/chorus-services/whoosh/internal/tracing"
)

// ScalingMetricsCollector collects and manages scaling operation metrics.
//
// It tracks at most one in-progress wave (currentWave) plus a bounded history
// of completed operations. Every method acquires mu before touching state.
type ScalingMetricsCollector struct {
	mu         sync.RWMutex
	operations []ScalingOperation // completed waves, oldest first, capped at maxHistory
	maxHistory int

	// completedWaves counts every wave archived by CompleteWave. It keeps
	// increasing after old history entries are trimmed, so WaveNumber stays
	// unique even once the history is full.
	completedWaves int

	currentWave *WaveMetrics // nil when no wave is being tracked
}

// ScalingOperation represents a completed scaling operation.
type ScalingOperation struct {
	ID                string          `json:"id"`
	ServiceName       string          `json:"service_name"`
	WaveNumber        int             `json:"wave_number"`
	StartedAt         time.Time       `json:"started_at"`
	CompletedAt       time.Time       `json:"completed_at"`
	Duration          time.Duration   `json:"duration"`
	TargetReplicas    int             `json:"target_replicas"`
	AchievedReplicas  int             `json:"achieved_replicas"`
	Success           bool            `json:"success"`
	FailureReason     string          `json:"failure_reason,omitempty"`
	JoinAttempts      []JoinAttempt   `json:"join_attempts"`
	HealthGateResults map[string]bool `json:"health_gate_results"`
	BackoffLevel      int             `json:"backoff_level"`
}

// JoinAttempt represents an individual replica join attempt.
type JoinAttempt struct {
	ReplicaID      string        `json:"replica_id"`
	AttemptedAt    time.Time     `json:"attempted_at"`
	CompletedAt    time.Time     `json:"completed_at,omitempty"`
	Duration       time.Duration `json:"duration"`
	Success        bool          `json:"success"`
	FailureReason  string        `json:"failure_reason,omitempty"`
	BootstrapPeers []string      `json:"bootstrap_peers"`
}

// WaveMetrics tracks metrics for the currently executing wave.
type WaveMetrics struct {
	WaveID          string              `json:"wave_id"`
	ServiceName     string              `json:"service_name"`
	StartedAt       time.Time           `json:"started_at"`
	TargetReplicas  int                 `json:"target_replicas"`
	CurrentReplicas int                 `json:"current_replicas"`
	JoinAttempts    []JoinAttempt       `json:"join_attempts"`
	HealthChecks    []HealthCheckResult `json:"health_checks"`
	BackoffLevel    int                 `json:"backoff_level"`
}

// HealthCheckResult represents a health gate check result.
type HealthCheckResult struct {
	Timestamp     time.Time              `json:"timestamp"`
	GateName      string                 `json:"gate_name"`
	Healthy       bool                   `json:"healthy"`
	Reason        string                 `json:"reason,omitempty"`
	Metrics       map[string]interface{} `json:"metrics,omitempty"`
	CheckDuration time.Duration          `json:"check_duration"`
}

// ScalingMetricsReport provides aggregated metrics for reporting.
type ScalingMetricsReport struct {
	WindowStart        time.Time                 `json:"window_start"`
	WindowEnd          time.Time                 `json:"window_end"`
	TotalOperations    int                       `json:"total_operations"`
	SuccessfulOps      int                       `json:"successful_operations"`
	FailedOps          int                       `json:"failed_operations"`
	SuccessRate        float64                   `json:"success_rate"`
	AverageWaveTime    time.Duration             `json:"average_wave_time"`
	AverageJoinTime    time.Duration             `json:"average_join_time"`
	BackoffEvents      int                       `json:"backoff_events"`
	HealthGateFailures map[string]int            `json:"health_gate_failures"`
	ServiceMetrics     map[string]ServiceMetrics `json:"service_metrics"`
	CurrentWave        *WaveMetrics              `json:"current_wave,omitempty"`
}

// ServiceMetrics provides per-service scaling metrics.
type ServiceMetrics struct {
	ServiceName     string        `json:"service_name"`
	TotalWaves      int           `json:"total_waves"`
	SuccessfulWaves int           `json:"successful_waves"`
	AverageWaveTime time.Duration `json:"average_wave_time"`
	LastScaled      time.Time     `json:"last_scaled"`
	CurrentReplicas int           `json:"current_replicas"`
}

// NewScalingMetricsCollector creates a new metrics collector that retains at
// most maxHistory completed operations. A zero or negative maxHistory selects
// the default of 1000; a negative value must never reach CompleteWave, where
// it would produce an out-of-range slice expression when trimming.
func NewScalingMetricsCollector(maxHistory int) *ScalingMetricsCollector {
	if maxHistory <= 0 {
		maxHistory = 1000 // Default to keeping 1000 operations
	}

	return &ScalingMetricsCollector{
		operations: make([]ScalingOperation, 0),
		maxHistory: maxHistory,
	}
}

// StartWave begins tracking a new scaling wave. Any wave that was already
// being tracked is replaced without being archived.
func (smc *ScalingMetricsCollector) StartWave(ctx context.Context, waveID, serviceName string, targetReplicas int) {
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.start_wave")
	defer span.End()

	smc.mu.Lock()
	defer smc.mu.Unlock()

	smc.currentWave = &WaveMetrics{
		WaveID:         waveID,
		ServiceName:    serviceName,
		StartedAt:      time.Now(),
		TargetReplicas: targetReplicas,
		// Non-nil empty slices so the JSON form is [] rather than null.
		JoinAttempts: make([]JoinAttempt, 0),
		HealthChecks: make([]HealthCheckResult, 0),
	}

	span.SetAttributes(
		attribute.String("wave.id", waveID),
		attribute.String("wave.service", serviceName),
		attribute.Int("wave.target_replicas", targetReplicas),
	)

	log.Info().
		Str("wave_id", waveID).
		Str("service_name", serviceName).
		Int("target_replicas", targetReplicas).
		Msg("Started tracking scaling wave")
}

// RecordJoinAttempt records a replica join attempt against the current wave.
// The attempt is treated as having just completed: CompletedAt is the time of
// the call and AttemptedAt is derived by subtracting duration from it. If no
// wave is active, a warning is logged and nothing is recorded.
func (smc *ScalingMetricsCollector) RecordJoinAttempt(replicaID string, bootstrapPeers []string, success bool, duration time.Duration, failureReason string) {
	smc.mu.Lock()
	defer smc.mu.Unlock()

	if smc.currentWave == nil {
		log.Warn().Str("replica_id", replicaID).Msg("No active wave to record join attempt")
		return
	}

	// Read the clock once so CompletedAt - AttemptedAt equals Duration exactly.
	now := time.Now()
	attempt := JoinAttempt{
		ReplicaID:      replicaID,
		AttemptedAt:    now.Add(-duration),
		CompletedAt:    now,
		Duration:       duration,
		Success:        success,
		FailureReason:  failureReason,
		BootstrapPeers: bootstrapPeers,
	}
	smc.currentWave.JoinAttempts = append(smc.currentWave.JoinAttempts, attempt)

	log.Debug().
		Str("wave_id", smc.currentWave.WaveID).
		Str("replica_id", replicaID).
		Bool("success", success).
		Dur("duration", duration).
		Msg("Recorded join attempt")
}

// RecordHealthCheck records a health gate check result against the current
// wave. The metrics map is stored as passed (it is not copied). If no wave is
// active, a warning is logged and nothing is recorded.
func (smc *ScalingMetricsCollector) RecordHealthCheck(gateName string, healthy bool, reason string, metrics map[string]interface{}, duration time.Duration) {
	smc.mu.Lock()
	defer smc.mu.Unlock()

	if smc.currentWave == nil {
		log.Warn().Str("gate_name", gateName).Msg("No active wave to record health check")
		return
	}

	result := HealthCheckResult{
		Timestamp:     time.Now(),
		GateName:      gateName,
		Healthy:       healthy,
		Reason:        reason,
		Metrics:       metrics,
		CheckDuration: duration,
	}
	smc.currentWave.HealthChecks = append(smc.currentWave.HealthChecks, result)

	log.Debug().
		Str("wave_id", smc.currentWave.WaveID).
		Str("gate_name", gateName).
		Bool("healthy", healthy).
		Dur("duration", duration).
		Msg("Recorded health check")
}

// CompleteWave finishes tracking the current wave and archives it as a
// ScalingOperation in the history, dropping the oldest entries once the
// history exceeds maxHistory. If no wave is active, a warning is logged and
// nothing is archived.
func (smc *ScalingMetricsCollector) CompleteWave(ctx context.Context, success bool, achievedReplicas int, failureReason string, backoffLevel int) {
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.complete_wave")
	defer span.End()

	smc.mu.Lock()
	defer smc.mu.Unlock()

	if smc.currentWave == nil {
		log.Warn().Msg("No active wave to complete")
		return
	}

	wave := smc.currentWave
	now := time.Now()

	// Number waves from a running counter rather than the history length:
	// the length stops growing once trimming starts, which would hand the
	// same number to every subsequent wave.
	smc.completedWaves++

	operation := ScalingOperation{
		ID:                wave.WaveID,
		ServiceName:       wave.ServiceName,
		WaveNumber:        smc.completedWaves,
		StartedAt:         wave.StartedAt,
		CompletedAt:       now,
		Duration:          now.Sub(wave.StartedAt),
		TargetReplicas:    wave.TargetReplicas,
		AchievedReplicas:  achievedReplicas,
		Success:           success,
		FailureReason:     failureReason,
		JoinAttempts:      wave.JoinAttempts,
		HealthGateResults: smc.extractHealthGateResults(),
		BackoffLevel:      backoffLevel,
	}

	// Add to operations history
	smc.operations = append(smc.operations, operation)

	// Trim history if needed, keeping the most recent maxHistory entries.
	if len(smc.operations) > smc.maxHistory {
		smc.operations = smc.operations[len(smc.operations)-smc.maxHistory:]
	}

	span.SetAttributes(
		attribute.String("wave.id", operation.ID),
		attribute.String("wave.service", operation.ServiceName),
		attribute.Bool("wave.success", success),
		attribute.Int("wave.achieved_replicas", achievedReplicas),
		attribute.Int("wave.backoff_level", backoffLevel),
		attribute.String("wave.duration", operation.Duration.String()),
	)

	log.Info().
		Str("wave_id", operation.ID).
		Str("service_name", operation.ServiceName).
		Bool("success", success).
		Int("achieved_replicas", achievedReplicas).
		Dur("duration", operation.Duration).
		Msg("Completed scaling wave")

	// Clear current wave
	smc.currentWave = nil
}

// extractHealthGateResults extracts the final health gate results from the
// current wave's checks. Checks are visited in recorded order, so a later
// result for a gate overwrites an earlier one.
//
// The caller must hold smc.mu and must have verified smc.currentWave != nil.
func (smc *ScalingMetricsCollector) extractHealthGateResults() map[string]bool {
	results := make(map[string]bool)

	// Get the latest result for each gate
	for _, check := range smc.currentWave.HealthChecks {
		results[check.GateName] = check.Healthy
	}

	return results
}

// snapshotCurrentWave returns a copy of the current wave whose JoinAttempts
// and HealthChecks slices have their own backing arrays, or nil when no wave
// is active. Element contents are copied shallowly (for example, the Metrics
// maps inside health checks are still shared).
//
// The caller must hold smc.mu (read or write).
func (smc *ScalingMetricsCollector) snapshotCurrentWave() *WaveMetrics {
	if smc.currentWave == nil {
		return nil
	}

	wave := *smc.currentWave

	wave.JoinAttempts = make([]JoinAttempt, len(smc.currentWave.JoinAttempts))
	copy(wave.JoinAttempts, smc.currentWave.JoinAttempts)

	wave.HealthChecks = make([]HealthCheckResult, len(smc.currentWave.HealthChecks))
	copy(wave.HealthChecks, smc.currentWave.HealthChecks)

	return &wave
}

// GenerateReport generates a metrics report for the specified time window.
//
// An operation is included when its StartedAt is strictly after windowStart
// and strictly before windowEnd. When no operation matches, the report carries
// only the window bounds, empty maps, and the current-wave snapshot.
// CurrentWave in the report is a copy, so the caller may read it after this
// method returns while the collector keeps recording.
func (smc *ScalingMetricsCollector) GenerateReport(ctx context.Context, windowStart, windowEnd time.Time) *ScalingMetricsReport {
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.generate_report")
	defer span.End()

	smc.mu.RLock()
	defer smc.mu.RUnlock()

	report := &ScalingMetricsReport{
		WindowStart:        windowStart,
		WindowEnd:          windowEnd,
		HealthGateFailures: make(map[string]int),
		ServiceMetrics:     make(map[string]ServiceMetrics),
		CurrentWave:        smc.snapshotCurrentWave(),
	}

	var (
		totalDuration     time.Duration
		totalJoinDuration time.Duration
		totalJoinAttempts int
	)
	serviceStats := make(map[string]*ServiceMetrics)
	serviceDurations := make(map[string]time.Duration)

	// Single pass: filter by window and aggregate overall and per-service
	// figures together. Indexing avoids copying each ScalingOperation.
	for i := range smc.operations {
		op := &smc.operations[i]
		if !op.StartedAt.After(windowStart) || !op.StartedAt.Before(windowEnd) {
			continue
		}
		report.TotalOperations++

		// Overall stats
		if op.Success {
			report.SuccessfulOps++
		} else {
			report.FailedOps++
		}
		totalDuration += op.Duration

		// Backoff tracking
		if op.BackoffLevel > 0 {
			report.BackoffEvents++
		}

		// Health gate failures
		for gate, healthy := range op.HealthGateResults {
			if !healthy {
				report.HealthGateFailures[gate]++
			}
		}

		// Join attempt metrics
		for j := range op.JoinAttempts {
			totalJoinDuration += op.JoinAttempts[j].Duration
			totalJoinAttempts++
		}

		// Service-specific metrics
		svc, ok := serviceStats[op.ServiceName]
		if !ok {
			svc = &ServiceMetrics{ServiceName: op.ServiceName}
			serviceStats[op.ServiceName] = svc
		}
		svc.TotalWaves++
		if op.Success {
			svc.SuccessfulWaves++
		}
		serviceDurations[op.ServiceName] += op.Duration

		// The most recently completed operation defines the replica count.
		if op.CompletedAt.After(svc.LastScaled) {
			svc.LastScaled = op.CompletedAt
			svc.CurrentReplicas = op.AchievedReplicas
		}
	}

	if report.TotalOperations == 0 {
		return report
	}

	// Calculate rates and averages
	report.SuccessRate = float64(report.SuccessfulOps) / float64(report.TotalOperations)
	report.AverageWaveTime = totalDuration / time.Duration(report.TotalOperations)
	if totalJoinAttempts > 0 {
		report.AverageJoinTime = totalJoinDuration / time.Duration(totalJoinAttempts)
	}

	// Finalize service metrics. TotalWaves is at least 1 for every entry,
	// because entries are only created when an operation is counted.
	for serviceName, stats := range serviceStats {
		stats.AverageWaveTime = serviceDurations[serviceName] / time.Duration(stats.TotalWaves)
		report.ServiceMetrics[serviceName] = *stats
	}

	span.SetAttributes(
		attribute.Int("report.total_operations", report.TotalOperations),
		attribute.Int("report.successful_operations", report.SuccessfulOps),
		attribute.Float64("report.success_rate", report.SuccessRate),
		attribute.String("report.window_duration", windowEnd.Sub(windowStart).String()),
	)

	return report
}

// GetCurrentWave returns a copy of the currently active wave metrics, or nil
// when no wave is being tracked.
func (smc *ScalingMetricsCollector) GetCurrentWave() *WaveMetrics {
	smc.mu.RLock()
	defer smc.mu.RUnlock()

	// Return a copy to avoid concurrent access issues
	return smc.snapshotCurrentWave()
}

// GetRecentOperations returns the most recent scaling operations, oldest
// first, in a newly allocated slice. A limit that is zero, negative, or
// larger than the history returns the whole history.
func (smc *ScalingMetricsCollector) GetRecentOperations(limit int) []ScalingOperation {
	smc.mu.RLock()
	defer smc.mu.RUnlock()

	if limit <= 0 || limit > len(smc.operations) {
		limit = len(smc.operations)
	}

	// Return most recent operations
	start := len(smc.operations) - limit
	operations := make([]ScalingOperation, limit)
	copy(operations, smc.operations[start:])

	return operations
}

// ExportMetrics exports the operation history, the current wave (if any), and
// the export timestamp as indented JSON. Marshalling happens while the read
// lock is held, so the encoded state is consistent with respect to the
// collector's own methods.
func (smc *ScalingMetricsCollector) ExportMetrics(ctx context.Context) ([]byte, error) {
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.export")
	defer span.End()

	smc.mu.RLock()
	defer smc.mu.RUnlock()

	export := struct {
		Operations  []ScalingOperation `json:"operations"`
		CurrentWave *WaveMetrics       `json:"current_wave,omitempty"`
		ExportedAt  time.Time          `json:"exported_at"`
	}{
		Operations:  smc.operations,
		CurrentWave: smc.currentWave,
		ExportedAt:  time.Now(),
	}

	data, err := json.MarshalIndent(export, "", " ")
	if err != nil {
		return nil, fmt.Errorf("failed to marshal metrics: %w", err)
	}

	span.SetAttributes(
		attribute.Int("export.operation_count", len(smc.operations)),
		attribute.Bool("export.has_current_wave", smc.currentWave != nil),
	)

	return data, nil
}