Implement wave-based scaling system for CHORUS Docker Swarm orchestration

- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:51:34 +10:00
parent 55dd5951ea
commit 564852dc91
9 changed files with 3381 additions and 87 deletions
--- a/internal/orchestrator/scaling_api.go
+++ b/internal/orchestrator/scaling_api.go
@@ -0,0 +1,513 @@
+package orchestrator
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strconv"
+	"time"
+
+	"github.com/gorilla/mux"
+	"github.com/rs/zerolog/log"
+	"go.opentelemetry.io/otel/attribute"
+
+	"github.com/chorus-services/whoosh/internal/tracing"
+)
+
+// ScalingAPI provides HTTP endpoints for scaling operations.
+//
+// Its handlers are thin adapters: they decode the request, delegate to the
+// scaling controller or the metrics collector, and encode the result as JSON.
+type ScalingAPI struct {
+	// controller starts and stops scaling operations and gives the handlers
+	// access to its swarm manager, health gates, assignment broker and
+	// bootstrap manager.
+	controller *ScalingController
+	// metrics supplies the current wave, recent operations, reports and
+	// exports served by the status and metrics endpoints.
+	metrics    *ScalingMetricsCollector
+}
+
+// ScaleRequest represents a scaling request, i.e. the JSON body accepted by
+// the ScaleService handler.
+type ScaleRequest struct {
+	// ServiceName identifies the service to scale; ScaleService rejects an
+	// empty value.
+	ServiceName    string            `json:"service_name"`
+	// TargetReplicas is the desired replica count; ScaleService rejects
+	// negative values.
+	TargetReplicas int               `json:"target_replicas"`
+	// WaveSize is used as the wave size when positive; otherwise ScaleService
+	// picks a default based on the scaling direction.
+	WaveSize       int               `json:"wave_size,omitempty"`
+	// Template is forwarded unchanged to the controller's StartScaling call.
+	Template       string            `json:"template,omitempty"`
+	// Environment is decoded from the request but not read by the handlers in
+	// this file.
+	Environment    map[string]string `json:"environment,omitempty"`
+	// ForceScale makes ScaleService start a scaling operation even when the
+	// service is already at the target replica count.
+	ForceScale     bool              `json:"force_scale,omitempty"`
+}
+
+// ScaleResponse is the JSON payload returned by the ScaleService handler,
+// both when a scaling operation is started and when no action is needed.
+type ScaleResponse struct {
+	// WaveID is the identifier returned by the controller when scaling starts;
+	// it is left empty when no operation was started.
+	WaveID string `json:"wave_id"`
+	// ServiceName echoes the service named in the request.
+	ServiceName string `json:"service_name"`
+	// TargetReplicas echoes the requested replica count.
+	TargetReplicas int `json:"target_replicas"`
+	// CurrentReplicas is the replica count read before any scaling began.
+	CurrentReplicas int `json:"current_replicas"`
+	// Status is "scaling_started" or "no_action_needed".
+	Status string `json:"status"`
+	// StartedAt is the time at which the response was produced.
+	StartedAt time.Time `json:"started_at"`
+	// Message is a human-readable summary of the outcome.
+	Message string `json:"message,omitempty"`
+}
+
+// HealthResponse is the JSON payload returned by the GetHealthGates handler.
+// Each field is copied from the status reported by the health gates check.
+type HealthResponse struct {
+	// Healthy is the overall verdict of the health check.
+	Healthy bool `json:"healthy"`
+	// Timestamp is the time reported by the health check.
+	Timestamp time.Time `json:"timestamp"`
+	// Gates holds the per-gate status, keyed by gate name.
+	Gates map[string]GateStatus `json:"gates"`
+	// OverallReason carries the explanation reported alongside the verdict,
+	// when there is one.
+	OverallReason string `json:"overall_reason,omitempty"`
+}
+
+// NewScalingAPI builds a ScalingAPI backed by the given scaling controller
+// and metrics collector. Neither argument is validated here.
+func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
+	api := new(ScalingAPI)
+	api.controller = controller
+	api.metrics = metrics
+	return api
+}
+
+// RegisterRoutes attaches every scaling API endpoint to the given router.
+//
+// Routes are registered in the order listed in the table below, each bound to
+// a single HTTP method.
+func (api *ScalingAPI) RegisterRoutes(router *mux.Router) {
+	endpoints := []struct {
+		method  string
+		path    string
+		handler func(http.ResponseWriter, *http.Request)
+	}{
+		// Scaling operations
+		{"POST", "/api/v1/scale", api.ScaleService},
+		{"GET", "/api/v1/scale/status", api.GetScalingStatus},
+		{"POST", "/api/v1/scale/stop", api.StopScaling},
+
+		// Health gates
+		{"GET", "/api/v1/health/gates", api.GetHealthGates},
+		{"GET", "/api/v1/health/thresholds", api.GetHealthThresholds},
+		{"PUT", "/api/v1/health/thresholds", api.UpdateHealthThresholds},
+
+		// Metrics and monitoring
+		{"GET", "/api/v1/metrics/scaling", api.GetScalingMetrics},
+		{"GET", "/api/v1/metrics/operations", api.GetRecentOperations},
+		{"GET", "/api/v1/metrics/export", api.ExportMetrics},
+
+		// Service management
+		{"GET", "/api/v1/services/{serviceName}/status", api.GetServiceStatus},
+		{"GET", "/api/v1/services/{serviceName}/replicas", api.GetServiceReplicas},
+
+		// Assignment management
+		{"GET", "/api/v1/assignments/templates", api.GetAssignmentTemplates},
+		{"POST", "/api/v1/assignments", api.CreateAssignment},
+
+		// Bootstrap peer management
+		{"GET", "/api/v1/bootstrap/peers", api.GetBootstrapPeers},
+		{"GET", "/api/v1/bootstrap/stats", api.GetBootstrapStats},
+	}
+
+	for _, ep := range endpoints {
+		router.HandleFunc(ep.path, ep.handler).Methods(ep.method)
+	}
+}
+
+// ScaleService handles POST requests that ask for a service to be scaled.
+//
+// The body is decoded into a ScaleRequest and validated, the current replica
+// count is looked up, and unless the service is already at the target (and
+// force_scale is not set) a scaling operation is started through the
+// controller. Responses:
+//   - 400 for an undecodable body, a missing service name or a negative target
+//   - 404 when the current replica count cannot be read
+//   - 200 with status "no_action_needed" when nothing has to change
+//   - 500 when the controller fails to start scaling
+//   - 202 with status "scaling_started" otherwise
+func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
+	defer span.End()
+
+	var req ScaleRequest
+	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
+		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
+		return
+	}
+
+	// Reject requests that cannot describe a valid scaling target.
+	switch {
+	case req.ServiceName == "":
+		api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
+		return
+	case req.TargetReplicas < 0:
+		api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
+		return
+	}
+
+	span.SetAttributes(
+		attribute.String("request.service_name", req.ServiceName),
+		attribute.Int("request.target_replicas", req.TargetReplicas),
+		attribute.Bool("request.force_scale", req.ForceScale),
+	)
+
+	// Any failure to read the replica count is reported as "not found".
+	currentReplicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
+	if err != nil {
+		api.writeError(w, http.StatusNotFound, "Service not found", err)
+		return
+	}
+
+	// Already at the target: nothing to do unless the caller forces it.
+	if !req.ForceScale && currentReplicas == req.TargetReplicas {
+		api.writeJSON(w, http.StatusOK, ScaleResponse{
+			ServiceName:     req.ServiceName,
+			TargetReplicas:  req.TargetReplicas,
+			CurrentReplicas: currentReplicas,
+			Status:          "no_action_needed",
+			StartedAt:       time.Now(),
+			Message:         "Service already at target replica count",
+		})
+		return
+	}
+
+	// An explicit positive wave size wins; otherwise scale up in smaller
+	// waves (3) and scale down in larger waves (5).
+	waveSize := req.WaveSize
+	if waveSize <= 0 {
+		waveSize = 5
+		if req.TargetReplicas > currentReplicas {
+			waveSize = 3
+		}
+	}
+
+	waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
+	if err != nil {
+		api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
+		return
+	}
+
+	accepted := ScaleResponse{
+		WaveID:          waveID,
+		ServiceName:     req.ServiceName,
+		TargetReplicas:  req.TargetReplicas,
+		CurrentReplicas: currentReplicas,
+		Status:          "scaling_started",
+		StartedAt:       time.Now(),
+		Message:         fmt.Sprintf("Started scaling %s from %d to %d replicas", req.ServiceName, currentReplicas, req.TargetReplicas),
+	}
+
+	log.Info().
+		Str("wave_id", waveID).
+		Str("service_name", req.ServiceName).
+		Int("current_replicas", currentReplicas).
+		Int("target_replicas", req.TargetReplicas).
+		Int("wave_size", waveSize).
+		Msg("Started scaling operation via API")
+
+	api.writeJSON(w, http.StatusAccepted, accepted)
+}
+
+// GetScalingStatus returns the current scaling status
+func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
+	defer span.End()
+
+	currentWave := api.metrics.GetCurrentWave()
+	if currentWave == nil {
+		api.writeJSON(w, http.StatusOK, map[string]interface{}{
+			"status":  "idle",
+			"message": "No scaling operation in progress",
+		})
+		return
+	}
+
+	// Calculate progress
+	progress := float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
+	if progress > 100 {
+		progress = 100
+	}
+
+	response := map[string]interface{}{
+		"status":           "scaling",
+		"wave_id":          currentWave.WaveID,
+		"service_name":     currentWave.ServiceName,
+		"started_at":       currentWave.StartedAt,
+		"target_replicas":  currentWave.TargetReplicas,
+		"current_replicas": currentWave.CurrentReplicas,
+		"progress_percent": progress,
+		"join_attempts":    len(currentWave.JoinAttempts),
+		"health_checks":    len(currentWave.HealthChecks),
+		"backoff_level":    currentWave.BackoffLevel,
+		"duration":         time.Since(currentWave.StartedAt).String(),
+	}
+
+	api.writeJSON(w, http.StatusOK, response)
+}
+
+// StopScaling asks the controller to stop the scaling operation in progress.
+//
+// It answers 400 when the metrics collector reports no current wave, and 200
+// with the stopped wave's ID otherwise.
+func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
+	defer span.End()
+
+	wave := api.metrics.GetCurrentWave()
+	if wave == nil {
+		api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
+		return
+	}
+
+	api.controller.StopScaling(ctx)
+
+	stopped := map[string]interface{}{
+		"status":     "stopped",
+		"wave_id":    wave.WaveID,
+		"message":    "Scaling operation stopped",
+		"stopped_at": time.Now(),
+	}
+
+	log.Info().
+		Str("wave_id", wave.WaveID).
+		Str("service_name", wave.ServiceName).
+		Msg("Stopped scaling operation via API")
+
+	api.writeJSON(w, http.StatusOK, stopped)
+}
+
+// GetHealthGates runs the controller's health gate check and returns the
+// result as a HealthResponse. A failing check call yields a 500.
+func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
+	defer span.End()
+
+	gateStatus, err := api.controller.healthGates.CheckHealth(ctx, nil)
+	if err != nil {
+		api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", err)
+		return
+	}
+
+	api.writeJSON(w, http.StatusOK, HealthResponse{
+		Healthy:       gateStatus.Healthy,
+		Timestamp:     gateStatus.Timestamp,
+		Gates:         gateStatus.Gates,
+		OverallReason: gateStatus.OverallReason,
+	})
+}
+
+// GetHealthThresholds responds with the thresholds currently held by the
+// controller's health gates.
+func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
+	defer span.End()
+
+	api.writeJSON(w, http.StatusOK, api.controller.healthGates.GetThresholds())
+}
+
+// UpdateHealthThresholds replaces the health gate thresholds with the values
+// decoded from the request body. An undecodable body yields a 400; the
+// decoded values are applied as-is, without further validation here.
+func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
+	defer span.End()
+
+	var updated HealthThresholds
+	decodeErr := json.NewDecoder(r.Body).Decode(&updated)
+	if decodeErr != nil {
+		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
+		return
+	}
+
+	api.controller.healthGates.SetThresholds(updated)
+
+	log.Info().
+		Interface("thresholds", updated).
+		Msg("Updated health thresholds via API")
+
+	ack := map[string]string{
+		"status":  "updated",
+		"message": "Health thresholds updated successfully",
+	}
+	api.writeJSON(w, http.StatusOK, ack)
+}
+
+// GetScalingMetrics responds with the metrics report for the time window
+// described by the request's query parameters (see parseTimeWindow).
+func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
+	defer span.End()
+
+	from, to := api.parseTimeWindow(r)
+	api.writeJSON(w, http.StatusOK, api.metrics.GenerateReport(ctx, from, to))
+}
+
+// GetRecentOperations returns recent scaling operations
+func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
+	defer span.End()
+
+	// Parse limit parameter
+	limit := 50 // Default limit
+	if limitStr := r.URL.Query().Get("limit"); limitStr != "" {
+		if parsedLimit, err := strconv.Atoi(limitStr); err == nil && parsedLimit > 0 {
+			limit = parsedLimit
+		}
+	}
+
+	operations := api.metrics.GetRecentOperations(limit)
+	api.writeJSON(w, http.StatusOK, map[string]interface{}{
+		"operations": operations,
+		"count":      len(operations),
+	})
+}
+
+// ExportMetrics exports all metrics data.
+//
+// The bytes produced by the metrics collector are sent as a JSON attachment
+// whose file name carries the current timestamp. A failing export yields a
+// 500 error response.
+func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
+	defer span.End()
+
+	data, err := api.metrics.ExportMetrics(ctx)
+	if err != nil {
+		api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=scaling-metrics-%s.json",
+		time.Now().Format("2006-01-02-15-04-05")))
+
+	// The first Write sends the implicit 200 status, so a failure here can no
+	// longer be reported to the client; record it instead of dropping it.
+	if _, err := w.Write(data); err != nil {
+		log.Warn().Err(err).Msg("Failed to write metrics export response")
+	}
+}
+
+// GetServiceStatus responds with the swarm manager's status for the service
+// named in the URL path. Any lookup failure is reported as a 404.
+func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
+	defer span.End()
+
+	name := mux.Vars(r)["serviceName"]
+
+	svcStatus, err := api.controller.swarmManager.GetServiceStatus(ctx, name)
+	if err != nil {
+		api.writeError(w, http.StatusNotFound, "Service not found", err)
+		return
+	}
+
+	// The service name is recorded on the span only for successful lookups.
+	span.SetAttributes(attribute.String("service.name", name))
+	api.writeJSON(w, http.StatusOK, svcStatus)
+}
+
+// GetServiceReplicas responds with the desired and running replica counts of
+// the service named in the URL path.
+//
+// Failing to read the desired count yields a 404. Failing to read the running
+// count is tolerated: it is logged as a warning and reported as 0.
+func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
+	defer span.End()
+
+	name := mux.Vars(r)["serviceName"]
+	swarm := api.controller.swarmManager
+
+	desired, err := swarm.GetServiceReplicas(ctx, name)
+	if err != nil {
+		api.writeError(w, http.StatusNotFound, "Service not found", err)
+		return
+	}
+
+	running, runErr := swarm.GetRunningReplicas(ctx, name)
+	if runErr != nil {
+		log.Warn().Err(runErr).Str("service_name", name).Msg("Failed to get running replica count")
+		running = 0
+	}
+
+	payload := map[string]interface{}{
+		"service_name":     name,
+		"desired_replicas": desired,
+		"running_replicas": running,
+		"timestamp":        time.Now(),
+	}
+
+	span.SetAttributes(
+		attribute.String("service.name", name),
+		attribute.Int("service.desired_replicas", desired),
+		attribute.Int("service.running_replicas", running),
+	)
+
+	api.writeJSON(w, http.StatusOK, payload)
+}
+
+// GetAssignmentTemplates responds with the templates offered by the
+// assignment broker, together with their count.
+func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
+	defer span.End()
+
+	available := api.controller.assignmentBroker.GetAvailableTemplates()
+	payload := map[string]interface{}{
+		"templates": available,
+		"count":     len(available),
+	}
+	api.writeJSON(w, http.StatusOK, payload)
+}
+
+// CreateAssignment decodes an AssignmentRequest from the body and asks the
+// assignment broker to create it. Both an undecodable body and a broker
+// failure are reported as 400; success yields 201 with the new assignment.
+func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
+	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
+	defer span.End()
+
+	var req AssignmentRequest
+	decodeErr := json.NewDecoder(r.Body).Decode(&req)
+	if decodeErr != nil {
+		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
+		return
+	}
+
+	created, err := api.controller.assignmentBroker.CreateAssignment(ctx, req)
+	if err != nil {
+		api.writeError(w, http.StatusBadRequest, "Failed to create assignment", err)
+		return
+	}
+
+	span.SetAttributes(
+		attribute.String("assignment.id", created.ID),
+		attribute.String("assignment.template", req.Template),
+	)
+
+	api.writeJSON(w, http.StatusCreated, created)
+}
+
+// GetBootstrapPeers responds with every peer known to the bootstrap manager,
+// together with their count.
+func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
+	defer span.End()
+
+	known := api.controller.bootstrapManager.GetAllPeers()
+	payload := map[string]interface{}{
+		"peers": known,
+		"count": len(known),
+	}
+	api.writeJSON(w, http.StatusOK, payload)
+}
+
+// GetBootstrapStats responds with the statistics reported by the bootstrap
+// manager.
+func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
+	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
+	defer span.End()
+
+	api.writeJSON(w, http.StatusOK, api.controller.bootstrapManager.GetStats())
+}
+
+// Helper functions
+
+// parseTimeWindow derives a (start, end) time window from the request's
+// query parameters.
+//
+// The window defaults to the 24 hours ending now. "start" and "end" replace
+// the respective bound when they parse as RFC 3339 timestamps, and "duration"
+// (a Go duration string), when valid, sets start to end minus that duration,
+// taking precedence over "start". Values that fail to parse are ignored.
+func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
+	params := r.URL.Query()
+
+	// Default to last 24 hours
+	until := time.Now()
+	since := until.Add(-24 * time.Hour)
+
+	if raw := params.Get("start"); raw != "" {
+		if parsed, err := time.Parse(time.RFC3339, raw); err == nil {
+			since = parsed
+		}
+	}
+
+	if raw := params.Get("end"); raw != "" {
+		if parsed, err := time.Parse(time.RFC3339, raw); err == nil {
+			until = parsed
+		}
+	}
+
+	// A duration is measured back from the (possibly custom) end bound and
+	// overrides any explicit start.
+	if raw := params.Get("duration"); raw != "" {
+		if span, err := time.ParseDuration(raw); err == nil {
+			since = until.Add(-span)
+		}
+	}
+
+	return since, until
+}
+
+// writeJSON writes a JSON response with the given HTTP status code.
+//
+// The status line and headers are sent before the body is encoded, so an
+// encoding failure cannot be turned into an error response any more; it is
+// logged so that truncated or empty bodies can be diagnosed.
+func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	if err := json.NewEncoder(w).Encode(data); err != nil {
+		log.Error().Err(err).Int("status", status).Msg("Failed to encode JSON response")
+	}
+}
+
+// writeError sends a JSON error payload with the given status code.
+//
+// The payload always carries the message and a timestamp. When err is
+// non-nil its text is added under "details" and the error is logged.
+func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
+	payload := map[string]interface{}{
+		"error":     message,
+		"timestamp": time.Now(),
+	}
+
+	if err != nil {
+		log.Error().Err(err).Str("error_message", message).Msg("API error")
+		payload["details"] = err.Error()
+	}
+
+	api.writeJSON(w, status, payload)
+}