Files
WHOOSH/internal/orchestrator/scaling_metrics.go
Claude Code 564852dc91 Implement wave-based scaling system for CHORUS Docker Swarm orchestration
- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:51:34 +10:00

454 lines
14 KiB
Go

package orchestrator
import (
"context"
"encoding/json"
"fmt"
"sync"
"time"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel/attribute"
"github.com/chorus-services/whoosh/internal/tracing"
)
// ScalingMetricsCollector accumulates metrics about scaling waves: the wave
// currently being tracked plus a bounded history of completed operations.
// Its methods take mu before touching any other field.
type ScalingMetricsCollector struct {
	// mu guards every field below.
	mu sync.RWMutex
	// operations is the archive of completed waves, oldest first.
	operations []ScalingOperation
	// maxHistory is the maximum number of entries kept in operations.
	maxHistory int
	// currentWave is the wave in progress, or nil when none is active.
	currentWave *WaveMetrics
}
// ScalingOperation is the archived record of one finished scaling wave, as
// built by CompleteWave.
type ScalingOperation struct {
	// ID is the wave ID the operation was tracked under.
	ID string `json:"id"`
	// ServiceName is the service name the wave was started with.
	ServiceName string `json:"service_name"`
	// WaveNumber is a sequence number assigned when the wave is archived.
	WaveNumber int `json:"wave_number"`
	// StartedAt is when tracking of the wave began.
	StartedAt time.Time `json:"started_at"`
	// CompletedAt is when the wave was archived.
	CompletedAt time.Time `json:"completed_at"`
	// Duration is CompletedAt minus StartedAt. encoding/json writes a
	// time.Duration as an integer number of nanoseconds.
	Duration time.Duration `json:"duration"`
	// TargetReplicas is the replica count the wave aimed for.
	TargetReplicas int `json:"target_replicas"`
	// AchievedReplicas is the replica count reported on completion.
	AchievedReplicas int `json:"achieved_replicas"`
	// Success is the outcome reported on completion.
	Success bool `json:"success"`
	// FailureReason is the caller-supplied failure text; omitted when empty.
	FailureReason string `json:"failure_reason,omitempty"`
	// JoinAttempts are the join attempts recorded while the wave was active.
	JoinAttempts []JoinAttempt `json:"join_attempts"`
	// HealthGateResults maps each gate name to the outcome of the last check
	// recorded for that gate during the wave.
	HealthGateResults map[string]bool `json:"health_gate_results"`
	// BackoffLevel is the backoff level reported on completion; reports
	// count any value above zero as a backoff event.
	BackoffLevel int `json:"backoff_level"`
}
// JoinAttempt describes a single attempt by one replica to join, as recorded
// by RecordJoinAttempt.
type JoinAttempt struct {
	// ReplicaID identifies the replica that made the attempt.
	ReplicaID string `json:"replica_id"`
	// AttemptedAt is when the attempt started.
	AttemptedAt time.Time `json:"attempted_at"`
	// CompletedAt is when the attempt finished. encoding/json never treats a
	// time.Time struct as empty, so omitempty does not suppress this key.
	CompletedAt time.Time `json:"completed_at,omitempty"`
	// Duration is how long the attempt took, encoded as nanoseconds.
	Duration time.Duration `json:"duration"`
	// Success reports whether the attempt succeeded.
	Success bool `json:"success"`
	// FailureReason is the caller-supplied failure text; omitted when empty.
	FailureReason string `json:"failure_reason,omitempty"`
	// BootstrapPeers lists the bootstrap peers supplied for this attempt.
	BootstrapPeers []string `json:"bootstrap_peers"`
}
// WaveMetrics holds the metrics gathered for the wave that is currently
// executing. StartWave creates it and CompleteWave archives and clears it.
type WaveMetrics struct {
	// WaveID is the caller-supplied identifier of the wave.
	WaveID string `json:"wave_id"`
	// ServiceName is the service name the wave was started with.
	ServiceName string `json:"service_name"`
	// StartedAt is when StartWave was called for this wave.
	StartedAt time.Time `json:"started_at"`
	// TargetReplicas is the replica count the wave aims for.
	TargetReplicas int `json:"target_replicas"`
	// CurrentReplicas is not written by the collector methods in this file.
	// NOTE(review): confirm whether anything else is expected to set it.
	CurrentReplicas int `json:"current_replicas"`
	// JoinAttempts grows with every RecordJoinAttempt call.
	JoinAttempts []JoinAttempt `json:"join_attempts"`
	// HealthChecks grows with every RecordHealthCheck call.
	HealthChecks []HealthCheckResult `json:"health_checks"`
	// BackoffLevel is left at zero by StartWave.
	// NOTE(review): confirm whether anything else is expected to set it.
	BackoffLevel int `json:"backoff_level"`
}
// HealthCheckResult is the outcome of one health gate check, as recorded by
// RecordHealthCheck.
type HealthCheckResult struct {
	// Timestamp is when the result was recorded.
	Timestamp time.Time `json:"timestamp"`
	// GateName identifies the health gate that was checked.
	GateName string `json:"gate_name"`
	// Healthy is the reported outcome of the check.
	Healthy bool `json:"healthy"`
	// Reason is the caller-supplied explanation; omitted when empty.
	Reason string `json:"reason,omitempty"`
	// Metrics carries caller-supplied values; omitted when nil or empty.
	Metrics map[string]any `json:"metrics,omitempty"`
	// CheckDuration is how long the check took, encoded as nanoseconds.
	CheckDuration time.Duration `json:"check_duration"`
}
// ScalingMetricsReport is the aggregate view produced by GenerateReport for
// the operations that started inside a time window.
type ScalingMetricsReport struct {
	// WindowStart is the start of the reporting window, as requested.
	WindowStart time.Time `json:"window_start"`
	// WindowEnd is the end of the reporting window, as requested.
	WindowEnd time.Time `json:"window_end"`
	// TotalOperations is the number of operations inside the window.
	TotalOperations int `json:"total_operations"`
	// SuccessfulOps counts the operations in the window that succeeded.
	SuccessfulOps int `json:"successful_operations"`
	// FailedOps counts the operations in the window that did not succeed.
	FailedOps int `json:"failed_operations"`
	// SuccessRate is SuccessfulOps divided by TotalOperations; it stays zero
	// when the window holds no operations.
	SuccessRate float64 `json:"success_rate"`
	// AverageWaveTime is the mean operation duration in the window.
	AverageWaveTime time.Duration `json:"average_wave_time"`
	// AverageJoinTime is the mean duration over every join attempt of every
	// operation in the window.
	AverageJoinTime time.Duration `json:"average_join_time"`
	// BackoffEvents counts operations whose backoff level was above zero.
	BackoffEvents int `json:"backoff_events"`
	// HealthGateFailures counts, per gate, the operations whose final result
	// for that gate was unhealthy.
	HealthGateFailures map[string]int `json:"health_gate_failures"`
	// ServiceMetrics holds the per-service breakdown keyed by service name.
	ServiceMetrics map[string]ServiceMetrics `json:"service_metrics"`
	// CurrentWave is the wave in progress when the report was generated;
	// omitted from JSON when nil.
	CurrentWave *WaveMetrics `json:"current_wave,omitempty"`
}
// ServiceMetrics is the per-service slice of a ScalingMetricsReport.
type ServiceMetrics struct {
	// ServiceName is the service these figures belong to.
	ServiceName string `json:"service_name"`
	// TotalWaves counts the service's operations in the window.
	TotalWaves int `json:"total_waves"`
	// SuccessfulWaves counts the service's successful operations.
	SuccessfulWaves int `json:"successful_waves"`
	// AverageWaveTime is the mean duration of the service's operations.
	AverageWaveTime time.Duration `json:"average_wave_time"`
	// LastScaled is the latest completion time among those operations.
	LastScaled time.Time `json:"last_scaled"`
	// CurrentReplicas is the achieved replica count of the operation that
	// completed last.
	CurrentReplicas int `json:"current_replicas"`
}
// NewScalingMetricsCollector returns an empty collector that retains at most
// maxHistory completed operations.
//
// A maxHistory of zero or less selects the default of 1000. Negative values
// are rejected here because the history trim in CompleteWave slices with
// len(operations)-maxHistory, which would run past the end of the slice.
func NewScalingMetricsCollector(maxHistory int) *ScalingMetricsCollector {
	if maxHistory <= 0 {
		maxHistory = 1000 // Default to keeping 1000 operations
	}

	return &ScalingMetricsCollector{
		operations: make([]ScalingOperation, 0),
		maxHistory: maxHistory,
	}
}
// StartWave begins tracking a new scaling wave, replacing whatever wave the
// collector was tracking before.
//
// The new wave is stamped with the current time and starts with empty,
// non-nil join-attempt and health-check lists. The call runs inside the
// "scaling_metrics.start_wave" span and is logged at info level.
func (smc *ScalingMetricsCollector) StartWave(ctx context.Context, waveID, serviceName string, targetReplicas int) {
	// The derived context is not needed; only the span is used.
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.start_wave")
	defer span.End()

	span.SetAttributes(
		attribute.String("wave.id", waveID),
		attribute.String("wave.service", serviceName),
		attribute.Int("wave.target_replicas", targetReplicas),
	)

	smc.mu.Lock()
	defer smc.mu.Unlock()

	wave := new(WaveMetrics)
	wave.WaveID = waveID
	wave.ServiceName = serviceName
	wave.StartedAt = time.Now()
	wave.TargetReplicas = targetReplicas
	wave.JoinAttempts = []JoinAttempt{}
	wave.HealthChecks = []HealthCheckResult{}
	smc.currentWave = wave

	log.Info().
		Str("wave_id", waveID).
		Str("service_name", serviceName).
		Int("target_replicas", targetReplicas).
		Msg("Started tracking scaling wave")
}
// RecordJoinAttempt appends one replica join attempt to the active wave.
//
// The attempt is treated as having just finished: CompletedAt is the current
// time and AttemptedAt is CompletedAt minus duration. When no wave is active
// the attempt is dropped and a warning is logged.
//
// bootstrapPeers is copied, so the caller may reuse or modify its slice after
// the call without affecting the recorded attempt.
func (smc *ScalingMetricsCollector) RecordJoinAttempt(replicaID string, bootstrapPeers []string, success bool, duration time.Duration, failureReason string) {
	smc.mu.Lock()
	defer smc.mu.Unlock()

	if smc.currentWave == nil {
		log.Warn().Str("replica_id", replicaID).Msg("No active wave to record join attempt")
		return
	}

	// Read the clock once so that CompletedAt - AttemptedAt is exactly
	// duration; two separate reads would drift apart by the time between them.
	completedAt := time.Now()

	// Detach from the caller's backing array while keeping nil as nil, so the
	// JSON encoding (null vs. []) is the same as for the slice passed in.
	var peers []string
	if bootstrapPeers != nil {
		peers = make([]string, len(bootstrapPeers))
		copy(peers, bootstrapPeers)
	}

	attempt := JoinAttempt{
		ReplicaID:      replicaID,
		AttemptedAt:    completedAt.Add(-duration),
		CompletedAt:    completedAt,
		Duration:       duration,
		Success:        success,
		FailureReason:  failureReason,
		BootstrapPeers: peers,
	}
	smc.currentWave.JoinAttempts = append(smc.currentWave.JoinAttempts, attempt)

	log.Debug().
		Str("wave_id", smc.currentWave.WaveID).
		Str("replica_id", replicaID).
		Bool("success", success).
		Dur("duration", duration).
		Msg("Recorded join attempt")
}
// RecordHealthCheck appends one health gate check result to the active wave,
// stamped with the current time. When no wave is active the result is dropped
// and a warning is logged.
//
// metrics is copied one level deep, so the caller may reuse or modify its map
// after the call without affecting the recorded result. Values stored in the
// map are not cloned.
func (smc *ScalingMetricsCollector) RecordHealthCheck(gateName string, healthy bool, reason string, metrics map[string]interface{}, duration time.Duration) {
	smc.mu.Lock()
	defer smc.mu.Unlock()

	if smc.currentWave == nil {
		log.Warn().Str("gate_name", gateName).Msg("No active wave to record health check")
		return
	}

	// Keep our own map: the recorded result is later read under the
	// collector's lock, which cannot protect a map the caller still owns.
	// A nil map stays nil.
	var recorded map[string]interface{}
	if metrics != nil {
		recorded = make(map[string]interface{}, len(metrics))
		for key, value := range metrics {
			recorded[key] = value
		}
	}

	result := HealthCheckResult{
		Timestamp:     time.Now(),
		GateName:      gateName,
		Healthy:       healthy,
		Reason:        reason,
		Metrics:       recorded,
		CheckDuration: duration,
	}
	smc.currentWave.HealthChecks = append(smc.currentWave.HealthChecks, result)

	log.Debug().
		Str("wave_id", smc.currentWave.WaveID).
		Str("gate_name", gateName).
		Bool("healthy", healthy).
		Dur("duration", duration).
		Msg("Recorded health check")
}
// CompleteWave finishes the active wave: it converts it into a
// ScalingOperation, appends that to the history, trims the history to
// maxHistory entries (dropping the oldest) and clears the active wave.
//
// success, achievedReplicas, failureReason and backoffLevel are stored on the
// archived operation as given. When no wave is active the call only logs a
// warning. The call runs inside the "scaling_metrics.complete_wave" span.
func (smc *ScalingMetricsCollector) CompleteWave(ctx context.Context, success bool, achievedReplicas int, failureReason string, backoffLevel int) {
	// The derived context is not needed; only the span is used.
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.complete_wave")
	defer span.End()

	smc.mu.Lock()
	defer smc.mu.Unlock()

	wave := smc.currentWave
	if wave == nil {
		log.Warn().Msg("No active wave to complete")
		return
	}

	// Continue numbering from the newest archived entry. Numbering from the
	// history length would stop increasing once the history is full, because
	// trimming keeps the length pinned at maxHistory.
	waveNumber := 1
	if n := len(smc.operations); n > 0 {
		waveNumber = smc.operations[n-1].WaveNumber + 1
	}

	now := time.Now()
	operation := ScalingOperation{
		ID:                wave.WaveID,
		ServiceName:       wave.ServiceName,
		WaveNumber:        waveNumber,
		StartedAt:         wave.StartedAt,
		CompletedAt:       now,
		Duration:          now.Sub(wave.StartedAt),
		TargetReplicas:    wave.TargetReplicas,
		AchievedReplicas:  achievedReplicas,
		Success:           success,
		FailureReason:     failureReason,
		JoinAttempts:      wave.JoinAttempts,
		HealthGateResults: smc.extractHealthGateResults(),
		BackoffLevel:      backoffLevel,
	}

	// Add to operations history
	smc.operations = append(smc.operations, operation)

	// Trim history if needed. A negative cap is treated as zero so the slice
	// expression below can never start past the end of the history.
	limit := smc.maxHistory
	if limit < 0 {
		limit = 0
	}
	if excess := len(smc.operations) - limit; excess > 0 {
		smc.operations = smc.operations[excess:]
	}

	span.SetAttributes(
		attribute.String("wave.id", operation.ID),
		attribute.String("wave.service", operation.ServiceName),
		attribute.Bool("wave.success", success),
		attribute.Int("wave.achieved_replicas", achievedReplicas),
		attribute.Int("wave.backoff_level", backoffLevel),
		attribute.String("wave.duration", operation.Duration.String()),
	)

	log.Info().
		Str("wave_id", operation.ID).
		Str("service_name", operation.ServiceName).
		Bool("success", success).
		Int("achieved_replicas", achievedReplicas).
		Dur("duration", operation.Duration).
		Msg("Completed scaling wave")

	// Clear current wave
	smc.currentWave = nil
}
// extractHealthGateResults collapses the active wave's health checks into one
// verdict per gate. Checks are visited in recorded order, so the most recent
// result for a gate overwrites earlier ones.
//
// It dereferences smc.currentWave without a nil check and takes no lock; the
// caller must already hold smc.mu and have verified that a wave is active.
func (smc *ScalingMetricsCollector) extractHealthGateResults() map[string]bool {
	checks := smc.currentWave.HealthChecks

	latest := make(map[string]bool)
	for i := range checks {
		latest[checks[i].GateName] = checks[i].Healthy
	}
	return latest
}
// GenerateReport aggregates the archived operations whose StartedAt lies in
// the half-open window [windowStart, windowEnd) into a ScalingMetricsReport.
//
// The window includes its start and excludes its end, so back-to-back windows
// cover every operation exactly once. The report always has non-nil
// HealthGateFailures and ServiceMetrics maps; when the window holds no
// operations all counters, rates and averages are left at zero.
//
// CurrentWave is a snapshot of the active wave (nil when none is active) with
// its own JoinAttempts and HealthChecks slices, so the caller can read it
// after this method has released the collector's lock.
func (smc *ScalingMetricsCollector) GenerateReport(ctx context.Context, windowStart, windowEnd time.Time) *ScalingMetricsReport {
	// The derived context is not needed; only the span is used.
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.generate_report")
	defer span.End()

	smc.mu.RLock()
	defer smc.mu.RUnlock()

	// Snapshot the active wave rather than handing out the internal pointer,
	// which other methods keep mutating under the lock.
	var current *WaveMetrics
	if wave := smc.currentWave; wave != nil {
		snapshot := *wave
		snapshot.JoinAttempts = append(make([]JoinAttempt, 0, len(wave.JoinAttempts)), wave.JoinAttempts...)
		snapshot.HealthChecks = append(make([]HealthCheckResult, 0, len(wave.HealthChecks)), wave.HealthChecks...)
		current = &snapshot
	}

	report := &ScalingMetricsReport{
		WindowStart:        windowStart,
		WindowEnd:          windowEnd,
		HealthGateFailures: make(map[string]int),
		ServiceMetrics:     make(map[string]ServiceMetrics),
		CurrentWave:        current,
	}

	var (
		totalDuration     time.Duration
		totalJoinDuration time.Duration
		totalJoinAttempts int
	)
	serviceStats := make(map[string]*ServiceMetrics)

	// Single pass: filter by window and accumulate overall and per-service
	// figures together.
	for i := range smc.operations {
		op := &smc.operations[i]
		if op.StartedAt.Before(windowStart) || !op.StartedAt.Before(windowEnd) {
			continue
		}

		// Overall stats
		report.TotalOperations++
		if op.Success {
			report.SuccessfulOps++
		} else {
			report.FailedOps++
		}
		totalDuration += op.Duration

		// Backoff tracking
		if op.BackoffLevel > 0 {
			report.BackoffEvents++
		}

		// Health gate failures
		for gate, healthy := range op.HealthGateResults {
			if !healthy {
				report.HealthGateFailures[gate]++
			}
		}

		// Join attempt metrics
		for j := range op.JoinAttempts {
			totalJoinDuration += op.JoinAttempts[j].Duration
		}
		totalJoinAttempts += len(op.JoinAttempts)

		// Service-specific metrics
		svc, seen := serviceStats[op.ServiceName]
		if !seen {
			svc = &ServiceMetrics{ServiceName: op.ServiceName}
			serviceStats[op.ServiceName] = svc
		}
		svc.TotalWaves++
		if op.Success {
			svc.SuccessfulWaves++
		}
		// Running sum for now; turned into a mean once all waves are counted.
		svc.AverageWaveTime += op.Duration
		if op.CompletedAt.After(svc.LastScaled) {
			svc.LastScaled = op.CompletedAt
			svc.CurrentReplicas = op.AchievedReplicas
		}
	}

	if report.TotalOperations == 0 {
		return report
	}

	// Calculate rates and averages
	report.SuccessRate = float64(report.SuccessfulOps) / float64(report.TotalOperations)
	report.AverageWaveTime = totalDuration / time.Duration(report.TotalOperations)
	if totalJoinAttempts > 0 {
		report.AverageJoinTime = totalJoinDuration / time.Duration(totalJoinAttempts)
	}

	// Finalize service metrics. Every entry was created by at least one
	// operation, so TotalWaves is never zero here.
	for name, svc := range serviceStats {
		svc.AverageWaveTime /= time.Duration(svc.TotalWaves)
		report.ServiceMetrics[name] = *svc
	}

	span.SetAttributes(
		attribute.Int("report.total_operations", report.TotalOperations),
		attribute.Int("report.successful_operations", report.SuccessfulOps),
		attribute.Float64("report.success_rate", report.SuccessRate),
		attribute.String("report.window_duration", windowEnd.Sub(windowStart).String()),
	)

	return report
}
// GetCurrentWave returns a snapshot of the wave in progress, or nil when no
// wave is active.
//
// The snapshot has its own JoinAttempts and HealthChecks slices, both non-nil,
// so later recordings do not show up in it. The elements themselves are
// copied by value, which means slice and map fields inside them are still
// shared with the collector.
func (smc *ScalingMetricsCollector) GetCurrentWave() *WaveMetrics {
	smc.mu.RLock()
	defer smc.mu.RUnlock()

	active := smc.currentWave
	if active == nil {
		return nil
	}

	snapshot := *active
	snapshot.JoinAttempts = append(make([]JoinAttempt, 0, len(active.JoinAttempts)), active.JoinAttempts...)
	snapshot.HealthChecks = append(make([]HealthCheckResult, 0, len(active.HealthChecks)), active.HealthChecks...)
	return &snapshot
}
// GetRecentOperations returns a copy of the newest archived operations,
// oldest first. At most limit entries are returned; a limit that is zero,
// negative or larger than the history returns the whole history. The result
// is never nil.
func (smc *ScalingMetricsCollector) GetRecentOperations(limit int) []ScalingOperation {
	smc.mu.RLock()
	defer smc.mu.RUnlock()

	total := len(smc.operations)
	count := limit
	if count <= 0 || count > total {
		count = total
	}

	// The newest entries sit at the tail of the history.
	recent := make([]ScalingOperation, count)
	copy(recent, smc.operations[total-count:])
	return recent
}
// ExportMetrics serialises the collector's state as indented JSON: the full
// operation history under "operations", the active wave under "current_wave"
// (left out when no wave is active) and the export time under "exported_at".
//
// Marshalling happens while the read lock is held. A marshalling failure is
// returned wrapped, with no data. The call runs inside the
// "scaling_metrics.export" span.
func (smc *ScalingMetricsCollector) ExportMetrics(ctx context.Context) ([]byte, error) {
	// The derived context is not needed; only the span is used.
	_, span := tracing.Tracer.Start(ctx, "scaling_metrics.export")
	defer span.End()

	smc.mu.RLock()
	defer smc.mu.RUnlock()

	type metricsExport struct {
		Operations  []ScalingOperation `json:"operations"`
		CurrentWave *WaveMetrics       `json:"current_wave,omitempty"`
		ExportedAt  time.Time          `json:"exported_at"`
	}

	var payload metricsExport
	payload.Operations = smc.operations
	payload.CurrentWave = smc.currentWave
	payload.ExportedAt = time.Now()

	encoded, err := json.MarshalIndent(payload, "", " ")
	if err != nil {
		return nil, fmt.Errorf("failed to marshal metrics: %w", err)
	}

	span.SetAttributes(
		attribute.Int("export.operation_count", len(smc.operations)),
		attribute.Bool("export.has_current_wave", smc.currentWave != nil),
	)

	return encoded, nil
}