Implement wave-based scaling system for CHORUS Docker Swarm orchestration
- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
513
internal/orchestrator/scaling_api.go
Normal file
513
internal/orchestrator/scaling_api.go
Normal file
@@ -0,0 +1,513 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/rs/zerolog/log"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
|
||||
"github.com/chorus-services/whoosh/internal/tracing"
|
||||
)
|
||||
|
||||
// ScalingAPI provides HTTP endpoints for scaling operations
|
||||
type ScalingAPI struct {
|
||||
controller *ScalingController
|
||||
metrics *ScalingMetricsCollector
|
||||
}
|
||||
|
||||
// ScaleRequest is the JSON payload decoded by the scale endpoint.
type ScaleRequest struct {
	// ServiceName names the service to scale. ScaleService rejects an
	// empty value.
	ServiceName string `json:"service_name"`

	// TargetReplicas is the requested replica count. ScaleService rejects
	// negative values.
	TargetReplicas int `json:"target_replicas"`

	// WaveSize is forwarded to the controller when positive; otherwise
	// ScaleService substitutes a default.
	WaveSize int `json:"wave_size,omitempty"`

	// Template is forwarded to the controller unchanged.
	Template string `json:"template,omitempty"`

	// Environment is decoded from the request body.
	// NOTE(review): no handler in this file reads it — confirm whether it
	// should be passed on to the controller.
	Environment map[string]string `json:"environment,omitempty"`

	// ForceScale makes ScaleService start a scaling operation even when
	// the service already reports the target replica count.
	ForceScale bool `json:"force_scale,omitempty"`
}
|
||||
|
||||
// ScaleResponse is the JSON body written by the scale endpoint.
type ScaleResponse struct {
	// WaveID is the identifier returned by the controller when a scaling
	// operation is started; it is left empty when no action is taken.
	WaveID string `json:"wave_id"`

	// ServiceName echoes the service named in the request.
	ServiceName string `json:"service_name"`

	// TargetReplicas echoes the requested replica count.
	TargetReplicas int `json:"target_replicas"`

	// CurrentReplicas is the replica count read from the swarm manager
	// before any scaling was started.
	CurrentReplicas int `json:"current_replicas"`

	// Status is "no_action_needed" or "scaling_started" in the responses
	// produced by ScaleService.
	Status string `json:"status"`

	// StartedAt is the time at which the response was built.
	StartedAt time.Time `json:"started_at"`

	// Message is a human-readable summary of the outcome.
	Message string `json:"message,omitempty"`
}
|
||||
|
||||
// HealthResponse represents health check response
|
||||
type HealthResponse struct {
|
||||
Healthy bool `json:"healthy"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Gates map[string]GateStatus `json:"gates"`
|
||||
OverallReason string `json:"overall_reason,omitempty"`
|
||||
}
|
||||
|
||||
// NewScalingAPI creates a new scaling API instance
|
||||
func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
|
||||
return &ScalingAPI{
|
||||
controller: controller,
|
||||
metrics: metrics,
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRoutes registers HTTP routes for the scaling API
|
||||
func (api *ScalingAPI) RegisterRoutes(router *mux.Router) {
|
||||
// Scaling operations
|
||||
router.HandleFunc("/api/v1/scale", api.ScaleService).Methods("POST")
|
||||
router.HandleFunc("/api/v1/scale/status", api.GetScalingStatus).Methods("GET")
|
||||
router.HandleFunc("/api/v1/scale/stop", api.StopScaling).Methods("POST")
|
||||
|
||||
// Health gates
|
||||
router.HandleFunc("/api/v1/health/gates", api.GetHealthGates).Methods("GET")
|
||||
router.HandleFunc("/api/v1/health/thresholds", api.GetHealthThresholds).Methods("GET")
|
||||
router.HandleFunc("/api/v1/health/thresholds", api.UpdateHealthThresholds).Methods("PUT")
|
||||
|
||||
// Metrics and monitoring
|
||||
router.HandleFunc("/api/v1/metrics/scaling", api.GetScalingMetrics).Methods("GET")
|
||||
router.HandleFunc("/api/v1/metrics/operations", api.GetRecentOperations).Methods("GET")
|
||||
router.HandleFunc("/api/v1/metrics/export", api.ExportMetrics).Methods("GET")
|
||||
|
||||
// Service management
|
||||
router.HandleFunc("/api/v1/services/{serviceName}/status", api.GetServiceStatus).Methods("GET")
|
||||
router.HandleFunc("/api/v1/services/{serviceName}/replicas", api.GetServiceReplicas).Methods("GET")
|
||||
|
||||
// Assignment management
|
||||
router.HandleFunc("/api/v1/assignments/templates", api.GetAssignmentTemplates).Methods("GET")
|
||||
router.HandleFunc("/api/v1/assignments", api.CreateAssignment).Methods("POST")
|
||||
|
||||
// Bootstrap peer management
|
||||
router.HandleFunc("/api/v1/bootstrap/peers", api.GetBootstrapPeers).Methods("GET")
|
||||
router.HandleFunc("/api/v1/bootstrap/stats", api.GetBootstrapStats).Methods("GET")
|
||||
}
|
||||
|
||||
// ScaleService handles scaling requests
|
||||
func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
|
||||
defer span.End()
|
||||
|
||||
var req ScaleRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate request
|
||||
if req.ServiceName == "" {
|
||||
api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
|
||||
return
|
||||
}
|
||||
if req.TargetReplicas < 0 {
|
||||
api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
|
||||
return
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.String("request.service_name", req.ServiceName),
|
||||
attribute.Int("request.target_replicas", req.TargetReplicas),
|
||||
attribute.Bool("request.force_scale", req.ForceScale),
|
||||
)
|
||||
|
||||
// Get current replica count
|
||||
currentReplicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Check if scaling is needed
|
||||
if currentReplicas == req.TargetReplicas && !req.ForceScale {
|
||||
response := ScaleResponse{
|
||||
ServiceName: req.ServiceName,
|
||||
TargetReplicas: req.TargetReplicas,
|
||||
CurrentReplicas: currentReplicas,
|
||||
Status: "no_action_needed",
|
||||
StartedAt: time.Now(),
|
||||
Message: "Service already at target replica count",
|
||||
}
|
||||
api.writeJSON(w, http.StatusOK, response)
|
||||
return
|
||||
}
|
||||
|
||||
// Determine scaling direction and wave size
|
||||
var waveSize int
|
||||
if req.WaveSize > 0 {
|
||||
waveSize = req.WaveSize
|
||||
} else {
|
||||
// Default wave size based on scaling direction
|
||||
if req.TargetReplicas > currentReplicas {
|
||||
waveSize = 3 // Scale up in smaller waves
|
||||
} else {
|
||||
waveSize = 5 // Scale down in larger waves
|
||||
}
|
||||
}
|
||||
|
||||
// Start scaling operation
|
||||
waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
|
||||
return
|
||||
}
|
||||
|
||||
response := ScaleResponse{
|
||||
WaveID: waveID,
|
||||
ServiceName: req.ServiceName,
|
||||
TargetReplicas: req.TargetReplicas,
|
||||
CurrentReplicas: currentReplicas,
|
||||
Status: "scaling_started",
|
||||
StartedAt: time.Now(),
|
||||
Message: fmt.Sprintf("Started scaling %s from %d to %d replicas", req.ServiceName, currentReplicas, req.TargetReplicas),
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Str("wave_id", waveID).
|
||||
Str("service_name", req.ServiceName).
|
||||
Int("current_replicas", currentReplicas).
|
||||
Int("target_replicas", req.TargetReplicas).
|
||||
Int("wave_size", waveSize).
|
||||
Msg("Started scaling operation via API")
|
||||
|
||||
api.writeJSON(w, http.StatusAccepted, response)
|
||||
}
|
||||
|
||||
// GetScalingStatus returns the current scaling status
|
||||
func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
|
||||
defer span.End()
|
||||
|
||||
currentWave := api.metrics.GetCurrentWave()
|
||||
if currentWave == nil {
|
||||
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"status": "idle",
|
||||
"message": "No scaling operation in progress",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Calculate progress
|
||||
progress := float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
|
||||
if progress > 100 {
|
||||
progress = 100
|
||||
}
|
||||
|
||||
response := map[string]interface{}{
|
||||
"status": "scaling",
|
||||
"wave_id": currentWave.WaveID,
|
||||
"service_name": currentWave.ServiceName,
|
||||
"started_at": currentWave.StartedAt,
|
||||
"target_replicas": currentWave.TargetReplicas,
|
||||
"current_replicas": currentWave.CurrentReplicas,
|
||||
"progress_percent": progress,
|
||||
"join_attempts": len(currentWave.JoinAttempts),
|
||||
"health_checks": len(currentWave.HealthChecks),
|
||||
"backoff_level": currentWave.BackoffLevel,
|
||||
"duration": time.Since(currentWave.StartedAt).String(),
|
||||
}
|
||||
|
||||
api.writeJSON(w, http.StatusOK, response)
|
||||
}
|
||||
|
||||
// StopScaling stops the current scaling operation
|
||||
func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
|
||||
defer span.End()
|
||||
|
||||
currentWave := api.metrics.GetCurrentWave()
|
||||
if currentWave == nil {
|
||||
api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
|
||||
return
|
||||
}
|
||||
|
||||
// Stop the scaling operation
|
||||
api.controller.StopScaling(ctx)
|
||||
|
||||
response := map[string]interface{}{
|
||||
"status": "stopped",
|
||||
"wave_id": currentWave.WaveID,
|
||||
"message": "Scaling operation stopped",
|
||||
"stopped_at": time.Now(),
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Str("wave_id", currentWave.WaveID).
|
||||
Str("service_name", currentWave.ServiceName).
|
||||
Msg("Stopped scaling operation via API")
|
||||
|
||||
api.writeJSON(w, http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetHealthGates returns the current health gate status
|
||||
func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
|
||||
defer span.End()
|
||||
|
||||
status, err := api.controller.healthGates.CheckHealth(ctx, nil)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", err)
|
||||
return
|
||||
}
|
||||
|
||||
response := HealthResponse{
|
||||
Healthy: status.Healthy,
|
||||
Timestamp: status.Timestamp,
|
||||
Gates: status.Gates,
|
||||
OverallReason: status.OverallReason,
|
||||
}
|
||||
|
||||
api.writeJSON(w, http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetHealthThresholds returns the current health thresholds
|
||||
func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
|
||||
defer span.End()
|
||||
|
||||
thresholds := api.controller.healthGates.GetThresholds()
|
||||
api.writeJSON(w, http.StatusOK, thresholds)
|
||||
}
|
||||
|
||||
// UpdateHealthThresholds updates the health thresholds
|
||||
func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
|
||||
defer span.End()
|
||||
|
||||
var thresholds HealthThresholds
|
||||
if err := json.NewDecoder(r.Body).Decode(&thresholds); err != nil {
|
||||
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
||||
return
|
||||
}
|
||||
|
||||
api.controller.healthGates.SetThresholds(thresholds)
|
||||
|
||||
log.Info().
|
||||
Interface("thresholds", thresholds).
|
||||
Msg("Updated health thresholds via API")
|
||||
|
||||
api.writeJSON(w, http.StatusOK, map[string]string{
|
||||
"status": "updated",
|
||||
"message": "Health thresholds updated successfully",
|
||||
})
|
||||
}
|
||||
|
||||
// GetScalingMetrics returns scaling metrics for a time window
|
||||
func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
|
||||
defer span.End()
|
||||
|
||||
// Parse query parameters for time window
|
||||
windowStart, windowEnd := api.parseTimeWindow(r)
|
||||
|
||||
report := api.metrics.GenerateReport(ctx, windowStart, windowEnd)
|
||||
api.writeJSON(w, http.StatusOK, report)
|
||||
}
|
||||
|
||||
// GetRecentOperations returns recent scaling operations
|
||||
func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
|
||||
defer span.End()
|
||||
|
||||
// Parse limit parameter
|
||||
limit := 50 // Default limit
|
||||
if limitStr := r.URL.Query().Get("limit"); limitStr != "" {
|
||||
if parsedLimit, err := strconv.Atoi(limitStr); err == nil && parsedLimit > 0 {
|
||||
limit = parsedLimit
|
||||
}
|
||||
}
|
||||
|
||||
operations := api.metrics.GetRecentOperations(limit)
|
||||
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"operations": operations,
|
||||
"count": len(operations),
|
||||
})
|
||||
}
|
||||
|
||||
// ExportMetrics exports all metrics data
|
||||
func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
|
||||
defer span.End()
|
||||
|
||||
data, err := api.metrics.ExportMetrics(ctx)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=scaling-metrics-%s.json",
|
||||
time.Now().Format("2006-01-02-15-04-05")))
|
||||
w.Write(data)
|
||||
}
|
||||
|
||||
// GetServiceStatus returns detailed status for a specific service
|
||||
func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
|
||||
defer span.End()
|
||||
|
||||
vars := mux.Vars(r)
|
||||
serviceName := vars["serviceName"]
|
||||
|
||||
status, err := api.controller.swarmManager.GetServiceStatus(ctx, serviceName)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
||||
return
|
||||
}
|
||||
|
||||
span.SetAttributes(attribute.String("service.name", serviceName))
|
||||
api.writeJSON(w, http.StatusOK, status)
|
||||
}
|
||||
|
||||
// GetServiceReplicas returns the current replica count for a service
|
||||
func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
|
||||
defer span.End()
|
||||
|
||||
vars := mux.Vars(r)
|
||||
serviceName := vars["serviceName"]
|
||||
|
||||
replicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, serviceName)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
||||
return
|
||||
}
|
||||
|
||||
runningReplicas, err := api.controller.swarmManager.GetRunningReplicas(ctx, serviceName)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Str("service_name", serviceName).Msg("Failed to get running replica count")
|
||||
runningReplicas = 0
|
||||
}
|
||||
|
||||
response := map[string]interface{}{
|
||||
"service_name": serviceName,
|
||||
"desired_replicas": replicas,
|
||||
"running_replicas": runningReplicas,
|
||||
"timestamp": time.Now(),
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.String("service.name", serviceName),
|
||||
attribute.Int("service.desired_replicas", replicas),
|
||||
attribute.Int("service.running_replicas", runningReplicas),
|
||||
)
|
||||
|
||||
api.writeJSON(w, http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetAssignmentTemplates returns available assignment templates
|
||||
func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
|
||||
defer span.End()
|
||||
|
||||
templates := api.controller.assignmentBroker.GetAvailableTemplates()
|
||||
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"templates": templates,
|
||||
"count": len(templates),
|
||||
})
|
||||
}
|
||||
|
||||
// CreateAssignment creates a new assignment
|
||||
func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
|
||||
defer span.End()
|
||||
|
||||
var req AssignmentRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
||||
return
|
||||
}
|
||||
|
||||
assignment, err := api.controller.assignmentBroker.CreateAssignment(ctx, req)
|
||||
if err != nil {
|
||||
api.writeError(w, http.StatusBadRequest, "Failed to create assignment", err)
|
||||
return
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.String("assignment.id", assignment.ID),
|
||||
attribute.String("assignment.template", req.Template),
|
||||
)
|
||||
|
||||
api.writeJSON(w, http.StatusCreated, assignment)
|
||||
}
|
||||
|
||||
// GetBootstrapPeers returns available bootstrap peers
|
||||
func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
|
||||
defer span.End()
|
||||
|
||||
peers := api.controller.bootstrapManager.GetAllPeers()
|
||||
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"peers": peers,
|
||||
"count": len(peers),
|
||||
})
|
||||
}
|
||||
|
||||
// GetBootstrapStats returns bootstrap pool statistics
|
||||
func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
|
||||
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
|
||||
defer span.End()
|
||||
|
||||
stats := api.controller.bootstrapManager.GetStats()
|
||||
api.writeJSON(w, http.StatusOK, stats)
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// parseTimeWindow parses start and end time parameters from request
|
||||
func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
|
||||
now := time.Now()
|
||||
|
||||
// Default to last 24 hours
|
||||
windowEnd := now
|
||||
windowStart := now.Add(-24 * time.Hour)
|
||||
|
||||
// Parse custom window if provided
|
||||
if startStr := r.URL.Query().Get("start"); startStr != "" {
|
||||
if start, err := time.Parse(time.RFC3339, startStr); err == nil {
|
||||
windowStart = start
|
||||
}
|
||||
}
|
||||
|
||||
if endStr := r.URL.Query().Get("end"); endStr != "" {
|
||||
if end, err := time.Parse(time.RFC3339, endStr); err == nil {
|
||||
windowEnd = end
|
||||
}
|
||||
}
|
||||
|
||||
// Parse duration if provided (overrides start)
|
||||
if durationStr := r.URL.Query().Get("duration"); durationStr != "" {
|
||||
if duration, err := time.ParseDuration(durationStr); err == nil {
|
||||
windowStart = windowEnd.Add(-duration)
|
||||
}
|
||||
}
|
||||
|
||||
return windowStart, windowEnd
|
||||
}
|
||||
|
||||
// writeJSON writes a JSON response
|
||||
func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(status)
|
||||
json.NewEncoder(w).Encode(data)
|
||||
}
|
||||
|
||||
// writeError writes an error response
|
||||
func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
|
||||
response := map[string]interface{}{
|
||||
"error": message,
|
||||
"timestamp": time.Now(),
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
response["details"] = err.Error()
|
||||
log.Error().Err(err).Str("error_message", message).Msg("API error")
|
||||
}
|
||||
|
||||
api.writeJSON(w, status, response)
|
||||
}
|
||||
Reference in New Issue
Block a user