Files
WHOOSH/internal/orchestrator/scaling_api.go
Claude Code 564852dc91 Implement wave-based scaling system for CHORUS Docker Swarm orchestration
- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:51:34 +10:00

513 lines
16 KiB
Go

package orchestrator
import (
"context"
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
"github.com/gorilla/mux"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel/attribute"
"github.com/chorus-services/whoosh/internal/tracing"
)
// ScalingAPI exposes the scaling subsystem over HTTP. Its handlers delegate
// work to the scaling controller and read wave state from the metrics
// collector.
type ScalingAPI struct {
	// controller is what the handlers call to start and stop scaling and to
	// reach the swarm manager, health gates, assignment broker and bootstrap
	// manager.
	controller *ScalingController
	// metrics supplies the current wave, recent operations, reports and exports.
	metrics *ScalingMetricsCollector
}
// ScaleRequest is the JSON body accepted by the scale endpoint.
type ScaleRequest struct {
	// ServiceName identifies the service to scale. ScaleService rejects an
	// empty value.
	ServiceName string `json:"service_name"`
	// TargetReplicas is the desired replica count. ScaleService rejects
	// negative values.
	TargetReplicas int `json:"target_replicas"`
	// WaveSize overrides the default per-wave batch size when it is positive.
	WaveSize int `json:"wave_size,omitempty"`
	// Template is passed through to the controller when scaling starts.
	Template string `json:"template,omitempty"`
	// Environment is decoded from the request.
	// NOTE(review): ScaleService in this file does not read it; confirm
	// whether another consumer exists.
	Environment map[string]string `json:"environment,omitempty"`
	// ForceScale makes ScaleService start a scaling operation even when the
	// service is already at the target replica count.
	ForceScale bool `json:"force_scale,omitempty"`
}
// ScaleResponse is the JSON body returned by the scale endpoint.
type ScaleResponse struct {
	// WaveID is the identifier returned by the controller when scaling starts.
	// It is left empty when no scaling operation was started.
	WaveID string `json:"wave_id"`
	// ServiceName echoes the service named in the request.
	ServiceName string `json:"service_name"`
	// TargetReplicas echoes the requested replica count.
	TargetReplicas int `json:"target_replicas"`
	// CurrentReplicas is the replica count observed when the request was handled.
	CurrentReplicas int `json:"current_replicas"`
	// Status is "scaling_started" or "no_action_needed".
	Status string `json:"status"`
	// StartedAt is the time the response was produced.
	StartedAt time.Time `json:"started_at"`
	// Message is a human-readable summary; omitted from the JSON when empty.
	Message string `json:"message,omitempty"`
}
// HealthResponse is the JSON body returned by the health gates endpoint. Every
// field is copied from the status returned by the health gates check.
type HealthResponse struct {
	// Healthy is the overall verdict of the health check.
	Healthy bool `json:"healthy"`
	// Timestamp is the time reported by the health check.
	Timestamp time.Time `json:"timestamp"`
	// Gates holds the per-gate status, keyed by a string taken from the check
	// result (presumably the gate name; confirm against the health gates code).
	Gates map[string]GateStatus `json:"gates"`
	// OverallReason carries the reason reported with the overall verdict;
	// omitted from the JSON when empty.
	OverallReason string `json:"overall_reason,omitempty"`
}
// NewScalingAPI builds a ScalingAPI whose handlers use the given controller
// and metrics collector. Both pointers are stored as-is; no validation is
// performed here.
func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
	api := new(ScalingAPI)
	api.controller = controller
	api.metrics = metrics
	return api
}
// RegisterRoutes attaches every scaling API endpoint to router. Routes are
// registered in the order listed in the table below, each restricted to a
// single HTTP method.
func (api *ScalingAPI) RegisterRoutes(router *mux.Router) {
	endpoints := []struct {
		path    string
		method  string
		handler func(http.ResponseWriter, *http.Request)
	}{
		// Scaling operations
		{"/api/v1/scale", "POST", api.ScaleService},
		{"/api/v1/scale/status", "GET", api.GetScalingStatus},
		{"/api/v1/scale/stop", "POST", api.StopScaling},

		// Health gates
		{"/api/v1/health/gates", "GET", api.GetHealthGates},
		{"/api/v1/health/thresholds", "GET", api.GetHealthThresholds},
		{"/api/v1/health/thresholds", "PUT", api.UpdateHealthThresholds},

		// Metrics and monitoring
		{"/api/v1/metrics/scaling", "GET", api.GetScalingMetrics},
		{"/api/v1/metrics/operations", "GET", api.GetRecentOperations},
		{"/api/v1/metrics/export", "GET", api.ExportMetrics},

		// Service management
		{"/api/v1/services/{serviceName}/status", "GET", api.GetServiceStatus},
		{"/api/v1/services/{serviceName}/replicas", "GET", api.GetServiceReplicas},

		// Assignment management
		{"/api/v1/assignments/templates", "GET", api.GetAssignmentTemplates},
		{"/api/v1/assignments", "POST", api.CreateAssignment},

		// Bootstrap peer management
		{"/api/v1/bootstrap/peers", "GET", api.GetBootstrapPeers},
		{"/api/v1/bootstrap/stats", "GET", api.GetBootstrapStats},
	}

	for _, ep := range endpoints {
		router.HandleFunc(ep.path, ep.handler).Methods(ep.method)
	}
}
// ScaleService handles POST requests that ask for a service to be scaled.
//
// The JSON body is decoded into a ScaleRequest. Responses:
//   - 400 when the body cannot be decoded, the service name is empty, or the
//     target replica count is negative;
//   - 404 when the current replica count cannot be read from the swarm manager;
//   - 200 with status "no_action_needed" when the service is already at the
//     target and force_scale is not set;
//   - 500 when the controller fails to start scaling;
//   - 202 with status "scaling_started" and the wave ID otherwise.
func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
	defer span.End()

	var req ScaleRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	}

	// Reject requests that cannot describe a valid scaling target.
	switch {
	case req.ServiceName == "":
		api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
		return
	case req.TargetReplicas < 0:
		api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
		return
	}

	span.SetAttributes(
		attribute.String("request.service_name", req.ServiceName),
		attribute.Int("request.target_replicas", req.TargetReplicas),
		attribute.Bool("request.force_scale", req.ForceScale),
	)

	// Any failure to read the replica count is reported as "not found".
	current, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	// Nothing to do when already at the target, unless the caller forces it.
	if !req.ForceScale && current == req.TargetReplicas {
		api.writeJSON(w, http.StatusOK, ScaleResponse{
			ServiceName:     req.ServiceName,
			TargetReplicas:  req.TargetReplicas,
			CurrentReplicas: current,
			Status:          "no_action_needed",
			StartedAt:       time.Now(),
			Message:         "Service already at target replica count",
		})
		return
	}

	// An explicit positive wave size wins; otherwise scale up in waves of 3
	// and scale down (or force at the same count) in waves of 5.
	waveSize := req.WaveSize
	if waveSize <= 0 {
		waveSize = 5
		if req.TargetReplicas > current {
			waveSize = 3
		}
	}

	waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
		return
	}

	startedAt := time.Now()
	summary := fmt.Sprintf("Started scaling %s from %d to %d replicas", req.ServiceName, current, req.TargetReplicas)

	log.Info().
		Str("wave_id", waveID).
		Str("service_name", req.ServiceName).
		Int("current_replicas", current).
		Int("target_replicas", req.TargetReplicas).
		Int("wave_size", waveSize).
		Msg("Started scaling operation via API")

	api.writeJSON(w, http.StatusAccepted, ScaleResponse{
		WaveID:          waveID,
		ServiceName:     req.ServiceName,
		TargetReplicas:  req.TargetReplicas,
		CurrentReplicas: current,
		Status:          "scaling_started",
		StartedAt:       startedAt,
		Message:         summary,
	})
}
// GetScalingStatus reports the wave currently tracked by the metrics
// collector.
//
// It always responds 200 with a JSON object: {"status": "idle", ...} when no
// wave is in progress, otherwise {"status": "scaling", ...} with the wave's
// identifiers, replica counts, progress percentage, attempt/check counts,
// backoff level and elapsed duration.
func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
	// Nothing below takes a context, so the one derived from the span is
	// discarded; binding it to a name would be an unused-variable compile error.
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
	defer span.End()

	currentWave := api.metrics.GetCurrentWave()
	if currentWave == nil {
		api.writeJSON(w, http.StatusOK, map[string]interface{}{
			"status":  "idle",
			"message": "No scaling operation in progress",
		})
		return
	}

	// Progress is current/target as a percentage, capped at 100. A target of
	// zero has no meaningful ratio: 0/0 is NaN, which encoding/json refuses to
	// encode and would leave the client with an empty body. Report 100 in that
	// case, which is also what the capped +Inf of n/0 evaluates to.
	progress := 100.0
	if currentWave.TargetReplicas > 0 {
		progress = float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
		if progress > 100 {
			progress = 100
		}
	}

	response := map[string]interface{}{
		"status":           "scaling",
		"wave_id":          currentWave.WaveID,
		"service_name":     currentWave.ServiceName,
		"started_at":       currentWave.StartedAt,
		"target_replicas":  currentWave.TargetReplicas,
		"current_replicas": currentWave.CurrentReplicas,
		"progress_percent": progress,
		"join_attempts":    len(currentWave.JoinAttempts),
		"health_checks":    len(currentWave.HealthChecks),
		"backoff_level":    currentWave.BackoffLevel,
		"duration":         time.Since(currentWave.StartedAt).String(),
	}
	api.writeJSON(w, http.StatusOK, response)
}
// StopScaling asks the controller to stop the scaling operation in progress.
//
// Responds 400 when the metrics collector reports no current wave; otherwise
// it calls the controller and responds 200 with the stopped wave's ID and the
// stop time.
func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
	defer span.End()

	wave := api.metrics.GetCurrentWave()
	if wave == nil {
		api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
		return
	}

	api.controller.StopScaling(ctx)
	stoppedAt := time.Now()

	log.Info().
		Str("wave_id", wave.WaveID).
		Str("service_name", wave.ServiceName).
		Msg("Stopped scaling operation via API")

	api.writeJSON(w, http.StatusOK, map[string]interface{}{
		"status":     "stopped",
		"wave_id":    wave.WaveID,
		"message":    "Scaling operation stopped",
		"stopped_at": stoppedAt,
	})
}
// GetHealthGates runs the controller's health gate check and returns the
// result as a HealthResponse.
//
// Responds 500 when the check itself returns an error, 200 otherwise (even if
// the reported verdict is unhealthy). The check is invoked with a nil second
// argument.
func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
	defer span.End()

	result, err := api.controller.healthGates.CheckHealth(ctx, nil)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", err)
		return
	}

	api.writeJSON(w, http.StatusOK, HealthResponse{
		Healthy:       result.Healthy,
		Timestamp:     result.Timestamp,
		Gates:         result.Gates,
		OverallReason: result.OverallReason,
	})
}
// GetHealthThresholds responds 200 with the thresholds currently held by the
// controller's health gates, encoded as JSON.
func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.healthGates.GetThresholds())
}
// UpdateHealthThresholds replaces the health gate thresholds with the
// HealthThresholds decoded from the request body.
//
// Responds 400 when the body cannot be decoded, 200 with an acknowledgement
// otherwise. The decoded value is applied as-is; no field validation happens
// in this handler.
func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
	defer span.End()

	var updated HealthThresholds
	decodeErr := json.NewDecoder(r.Body).Decode(&updated)
	if decodeErr != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	}

	api.controller.healthGates.SetThresholds(updated)

	log.Info().
		Interface("thresholds", updated).
		Msg("Updated health thresholds via API")

	ack := map[string]string{
		"status":  "updated",
		"message": "Health thresholds updated successfully",
	}
	api.writeJSON(w, http.StatusOK, ack)
}
// GetScalingMetrics responds 200 with the metrics report for a time window.
// The window comes from the start, end and duration query parameters as
// interpreted by parseTimeWindow.
func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
	defer span.End()

	from, to := api.parseTimeWindow(r)
	api.writeJSON(w, http.StatusOK, api.metrics.GenerateReport(ctx, from, to))
}
// GetRecentOperations responds 200 with the most recent scaling operations
// and their count.
//
// The optional limit query parameter caps how many operations are requested
// from the metrics collector. It defaults to 50, and a missing, non-numeric or
// non-positive value falls back to that default.
func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
	defer span.End()

	limit := 50
	// Atoi rejects the empty string, so an absent parameter keeps the default.
	if n, err := strconv.Atoi(r.URL.Query().Get("limit")); err == nil && n > 0 {
		limit = n
	}

	recent := api.metrics.GetRecentOperations(limit)
	payload := map[string]interface{}{
		"operations": recent,
		"count":      len(recent),
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// ExportMetrics sends the metrics collector's export as a downloadable JSON
// attachment.
//
// Responds 500 when the export fails. Otherwise it writes the exported bytes
// with a Content-Disposition filename stamped with the current local time.
func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
	defer span.End()

	payload, err := api.metrics.ExportMetrics(ctx)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
		return
	}

	stamp := time.Now().Format("2006-01-02-15-04-05")
	disposition := fmt.Sprintf("attachment; filename=scaling-metrics-%s.json", stamp)

	headers := w.Header()
	headers.Set("Content-Type", "application/json")
	headers.Set("Content-Disposition", disposition)
	w.Write(payload)
}
// GetServiceStatus responds with the swarm manager's status for the service
// named by the {serviceName} path variable.
//
// Responds 404 when the swarm manager returns an error for the lookup, 200
// with the status encoded as JSON otherwise.
func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
	defer span.End()

	serviceName := mux.Vars(r)["serviceName"]
	// Tag the span before the lookup so that failed requests can also be
	// traced back to the service they asked about.
	span.SetAttributes(attribute.String("service.name", serviceName))

	status, err := api.controller.swarmManager.GetServiceStatus(ctx, serviceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	api.writeJSON(w, http.StatusOK, status)
}
// GetServiceReplicas responds with the desired and running replica counts for
// the service named by the {serviceName} path variable.
//
// Responds 404 when the desired replica count cannot be read. A failure to
// read the running count is not fatal: it is logged as a warning and the
// running count is reported as 0.
func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
	defer span.End()

	serviceName := mux.Vars(r)["serviceName"]
	// Tag the span before the lookups so that failed requests can also be
	// traced back to the service they asked about.
	span.SetAttributes(attribute.String("service.name", serviceName))

	replicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, serviceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	runningReplicas, err := api.controller.swarmManager.GetRunningReplicas(ctx, serviceName)
	if err != nil {
		// Best effort: still answer with the desired count.
		log.Warn().Err(err).Str("service_name", serviceName).Msg("Failed to get running replica count")
		runningReplicas = 0
	}

	span.SetAttributes(
		attribute.Int("service.desired_replicas", replicas),
		attribute.Int("service.running_replicas", runningReplicas),
	)

	response := map[string]interface{}{
		"service_name":     serviceName,
		"desired_replicas": replicas,
		"running_replicas": runningReplicas,
		"timestamp":        time.Now(),
	}
	api.writeJSON(w, http.StatusOK, response)
}
// GetAssignmentTemplates responds 200 with the templates reported by the
// assignment broker and their count.
func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
	defer span.End()

	available := api.controller.assignmentBroker.GetAvailableTemplates()
	payload := map[string]interface{}{
		"templates": available,
		"count":     len(available),
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// CreateAssignment asks the assignment broker to create an assignment from
// the AssignmentRequest decoded from the request body.
//
// Responds 400 when the body cannot be decoded or when the broker returns an
// error, 201 with the created assignment otherwise.
func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
	defer span.End()

	var req AssignmentRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	}

	created, err := api.controller.assignmentBroker.CreateAssignment(ctx, req)
	if err != nil {
		api.writeError(w, http.StatusBadRequest, "Failed to create assignment", err)
		return
	}

	span.SetAttributes(
		attribute.String("assignment.id", created.ID),
		attribute.String("assignment.template", req.Template),
	)
	api.writeJSON(w, http.StatusCreated, created)
}
// GetBootstrapPeers responds 200 with every peer reported by the bootstrap
// manager and the number of peers.
func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
	defer span.End()

	all := api.controller.bootstrapManager.GetAllPeers()
	payload := map[string]interface{}{
		"peers": all,
		"count": len(all),
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// GetBootstrapStats responds 200 with the statistics reported by the
// bootstrap manager, encoded as JSON.
func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.bootstrapManager.GetStats())
}
// Helper functions
// parseTimeWindow derives a (start, end) time window from the request's query
// string.
//
// The window defaults to the 24 hours ending now. "start" and "end" replace
// the respective bound when they parse as RFC 3339. "duration", when it parses
// as a Go duration, is applied last and sets start to end minus that duration,
// overriding any "start". Values that fail to parse are ignored.
func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
	params := r.URL.Query()

	end := time.Now()
	start := end.Add(-24 * time.Hour)

	// Both parsers reject the empty string, so absent parameters leave the
	// defaults in place without a separate presence check.
	if t, err := time.Parse(time.RFC3339, params.Get("start")); err == nil {
		start = t
	}
	if t, err := time.Parse(time.RFC3339, params.Get("end")); err == nil {
		end = t
	}
	if span, err := time.ParseDuration(params.Get("duration")); err == nil {
		start = end.Add(-span)
	}

	return start, end
}
// writeJSON sends data as a JSON response with the given status code.
//
// The payload is encoded before any header is written. If encoding fails (for
// example on a NaN float or an unsupported type), the response becomes a 500
// with a small JSON error body instead of the requested status followed by an
// empty body.
func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
	payload, err := json.Marshal(data)
	if err != nil {
		status = http.StatusInternalServerError
		payload = []byte(`{"error":"Failed to encode response"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	// Keep the trailing newline that json.Encoder emits after each value. A
	// failed write means the client is gone; headers are already sent, so
	// there is nothing further to report to it.
	_, _ = w.Write(append(payload, '\n'))
}
// writeError sends a JSON error response with the given status code.
//
// The body always carries "error" (the message) and "timestamp". When err is
// non-nil its text is added under "details" and the error is logged.
func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
	body := make(map[string]interface{}, 3)
	body["error"] = message
	body["timestamp"] = time.Now()

	if err == nil {
		api.writeJSON(w, status, body)
		return
	}

	body["details"] = err.Error()
	log.Error().Err(err).Str("error_message", message).Msg("API error")
	api.writeJSON(w, status, body)
}