Files
WHOOSH/internal/orchestrator/scaling_api.go
Claude Code 28f02b61d1 Integrate wave-based scaling system with WHOOSH server
- Add scaling system components to server initialization
- Register scaling API and assignment broker routes
- Start bootstrap pool manager in server lifecycle
- Add graceful shutdown for scaling controller
- Update API routing to use chi.Router instead of gorilla/mux
- Fix Docker API compatibility issues
- Configure health gates with placeholder URLs for KACHING and BACKBEAT

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:59:01 +10:00

510 lines
16 KiB
Go

package orchestrator
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
"github.com/go-chi/chi/v5"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel/attribute"
"github.com/chorus-services/whoosh/internal/tracing"
)
// ScalingAPI provides HTTP endpoints for scaling operations.
// It bundles the two collaborators every handler in this file works through.
// Both are stored as given by NewScalingAPI; no handler nil-checks them.
type ScalingAPI struct {
// controller is used by the handlers to start and stop scaling and to reach
// its swarmManager, healthGates, assignmentBroker and bootstrapManager fields.
controller *ScalingController
// metrics supplies the current wave, recent operations, reports and exports.
metrics *ScalingMetricsCollector
}
// ScaleRequest represents a scaling request: the JSON body that the
// ScaleService handler decodes.
type ScaleRequest struct {
// ServiceName identifies the service to scale; ScaleService rejects an empty value.
ServiceName string `json:"service_name"`
// TargetReplicas is the desired replica count; ScaleService rejects negative values.
TargetReplicas int `json:"target_replicas"`
// WaveSize is used as given when positive; otherwise ScaleService picks a
// default (3 when scaling up, 5 otherwise).
WaveSize int `json:"wave_size,omitempty"`
// Template is forwarded unchanged to the controller's StartScaling call.
Template string `json:"template,omitempty"`
// Environment is decoded but not read by any handler visible in this file.
// NOTE(review): confirm whether it is meant to be passed on to the controller.
Environment map[string]string `json:"environment,omitempty"`
// ForceScale makes ScaleService start an operation even when the service is
// already at TargetReplicas.
ForceScale bool `json:"force_scale,omitempty"`
}
// ScaleResponse represents a scaling response: the JSON body written by the
// ScaleService handler.
type ScaleResponse struct {
// WaveID is the identifier returned by the controller's StartScaling call; it
// is left empty when no scaling was started.
WaveID string `json:"wave_id"`
// ServiceName echoes the service named in the request.
ServiceName string `json:"service_name"`
// TargetReplicas echoes the requested replica count.
TargetReplicas int `json:"target_replicas"`
// CurrentReplicas is the replica count read from the swarm manager before any
// scaling was started.
CurrentReplicas int `json:"current_replicas"`
// Status is "scaling_started" or "no_action_needed" as set by ScaleService.
Status string `json:"status"`
// StartedAt is the time at which the handler built this response.
StartedAt time.Time `json:"started_at"`
// Message is a human-readable summary of the outcome.
Message string `json:"message,omitempty"`
}
// HealthResponse represents health check response: the JSON body written by
// the GetHealthGates handler. Every field is copied from the status value
// returned by the health gates' CheckHealth call.
type HealthResponse struct {
// Healthy is the overall verdict reported by CheckHealth.
Healthy bool `json:"healthy"`
// Timestamp is the timestamp carried by the CheckHealth status.
Timestamp time.Time `json:"timestamp"`
// Gates holds one GateStatus per map key.
// NOTE(review): keys are presumably gate names — confirm against CheckHealth.
Gates map[string]GateStatus `json:"gates"`
// OverallReason is the explanation carried by the status; omitted when empty.
OverallReason string `json:"overall_reason,omitempty"`
}
// NewScalingAPI builds a ScalingAPI whose handlers operate on the given
// scaling controller and metrics collector. Neither argument is validated
// here; they are stored exactly as passed.
func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
	api := new(ScalingAPI)
	api.controller = controller
	api.metrics = metrics
	return api
}
// RegisterRoutes mounts every scaling API endpoint on the supplied router.
// Patterns are registered relative to wherever the caller mounts the router,
// in the order listed in the table below.
func (api *ScalingAPI) RegisterRoutes(router chi.Router) {
	type route struct {
		register func(pattern string, handlerFn http.HandlerFunc)
		pattern  string
		handler  http.HandlerFunc
	}

	// Bind the router's verb helpers once so the table reads like a route list.
	get, post, put := router.Get, router.Post, router.Put

	routes := []route{
		// Scaling operations
		{post, "/scale", api.ScaleService},
		{get, "/scale/status", api.GetScalingStatus},
		{post, "/scale/stop", api.StopScaling},

		// Health gates
		{get, "/health/gates", api.GetHealthGates},
		{get, "/health/thresholds", api.GetHealthThresholds},
		{put, "/health/thresholds", api.UpdateHealthThresholds},

		// Metrics and monitoring
		{get, "/metrics/scaling", api.GetScalingMetrics},
		{get, "/metrics/operations", api.GetRecentOperations},
		{get, "/metrics/export", api.ExportMetrics},

		// Service management
		{get, "/services/{serviceName}/status", api.GetServiceStatus},
		{get, "/services/{serviceName}/replicas", api.GetServiceReplicas},

		// Assignment management
		{get, "/assignments/templates", api.GetAssignmentTemplates},
		{post, "/assignments", api.CreateAssignment},

		// Bootstrap peer management
		{get, "/bootstrap/peers", api.GetBootstrapPeers},
		{get, "/bootstrap/stats", api.GetBootstrapStats},
	}

	for _, rt := range routes {
		rt.register(rt.pattern, rt.handler)
	}
}
// ScaleService handles POST scaling requests.
//
// It decodes a ScaleRequest from the body, validates it, reads the service's
// current replica count from the swarm manager and, unless the service is
// already at the target (and ForceScale is false), asks the controller to start
// a scaling operation.
//
// Responses:
//   - 400 for an undecodable body, an empty service name or negative replicas
//   - 404 when the current replica count cannot be read
//   - 200 with status "no_action_needed" when nothing has to change
//   - 500 when the controller fails to start scaling
//   - 202 with status "scaling_started" otherwise
func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
	defer span.End()

	// Decode and validate; the first failing check wins.
	var req ScaleRequest
	decodeErr := json.NewDecoder(r.Body).Decode(&req)
	switch {
	case decodeErr != nil:
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	case req.ServiceName == "":
		api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
		return
	case req.TargetReplicas < 0:
		api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
		return
	}

	span.SetAttributes(
		attribute.String("request.service_name", req.ServiceName),
		attribute.Int("request.target_replicas", req.TargetReplicas),
		attribute.Bool("request.force_scale", req.ForceScale),
	)

	// Any failure to read the replica count is reported to the client as 404.
	currentReplicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	// Nothing to do unless the caller explicitly forces a scaling run.
	if !req.ForceScale && currentReplicas == req.TargetReplicas {
		api.writeJSON(w, http.StatusOK, ScaleResponse{
			ServiceName:     req.ServiceName,
			TargetReplicas:  req.TargetReplicas,
			CurrentReplicas: currentReplicas,
			Status:          "no_action_needed",
			StartedAt:       time.Now(),
			Message:         "Service already at target replica count",
		})
		return
	}

	// A caller-supplied positive wave size wins; otherwise scale up in smaller
	// waves (3) and everything else in larger waves (5).
	waveSize := req.WaveSize
	if waveSize <= 0 {
		waveSize = 5
		if req.TargetReplicas > currentReplicas {
			waveSize = 3
		}
	}

	waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
		return
	}

	accepted := ScaleResponse{
		WaveID:          waveID,
		ServiceName:     req.ServiceName,
		TargetReplicas:  req.TargetReplicas,
		CurrentReplicas: currentReplicas,
		Status:          "scaling_started",
		StartedAt:       time.Now(),
		Message:         fmt.Sprintf("Started scaling %s from %d to %d replicas", req.ServiceName, currentReplicas, req.TargetReplicas),
	}

	log.Info().
		Str("wave_id", waveID).
		Str("service_name", req.ServiceName).
		Int("current_replicas", currentReplicas).
		Int("target_replicas", req.TargetReplicas).
		Int("wave_size", waveSize).
		Msg("Started scaling operation via API")

	api.writeJSON(w, http.StatusAccepted, accepted)
}
// GetScalingStatus returns the current scaling status.
//
// When the metrics collector reports no current wave the handler answers 200
// with status "idle". Otherwise it answers 200 with status "scaling" plus the
// wave's identifiers, replica counts, progress percentage, attempt/check
// counts, backoff level and elapsed duration.
func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
	defer span.End()

	currentWave := api.metrics.GetCurrentWave()
	if currentWave == nil {
		api.writeJSON(w, http.StatusOK, map[string]interface{}{
			"status":  "idle",
			"message": "No scaling operation in progress",
		})
		return
	}

	// Progress is current/target capped at 100. A target of zero (scale-to-zero
	// is accepted by ScaleService) must not be divided by: 0/0 yields NaN, which
	// encoding/json refuses to encode, and n/0 yields +Inf. A wave whose target
	// is zero is reported as 100%, matching the existing "at or above target"
	// clamp.
	progress := 100.0
	if currentWave.TargetReplicas > 0 {
		progress = float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
		if progress > 100 {
			progress = 100
		}
	}

	response := map[string]interface{}{
		"status":           "scaling",
		"wave_id":          currentWave.WaveID,
		"service_name":     currentWave.ServiceName,
		"started_at":       currentWave.StartedAt,
		"target_replicas":  currentWave.TargetReplicas,
		"current_replicas": currentWave.CurrentReplicas,
		"progress_percent": progress,
		"join_attempts":    len(currentWave.JoinAttempts),
		"health_checks":    len(currentWave.HealthChecks),
		"backoff_level":    currentWave.BackoffLevel,
		"duration":         time.Since(currentWave.StartedAt).String(),
	}
	api.writeJSON(w, http.StatusOK, response)
}
// StopScaling asks the controller to stop the scaling operation in progress.
//
// It answers 400 when the metrics collector reports no current wave; otherwise
// it calls the controller's StopScaling and answers 200 with the id of the wave
// that was current when the request arrived.
func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
	defer span.End()

	wave := api.metrics.GetCurrentWave()
	if wave == nil {
		api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
		return
	}

	api.controller.StopScaling(ctx)

	payload := map[string]interface{}{
		"status":     "stopped",
		"wave_id":    wave.WaveID,
		"message":    "Scaling operation stopped",
		"stopped_at": time.Now(),
	}

	stopLog := log.Info().
		Str("wave_id", wave.WaveID).
		Str("service_name", wave.ServiceName)
	stopLog.Msg("Stopped scaling operation via API")

	api.writeJSON(w, http.StatusOK, payload)
}
// GetHealthGates runs the controller's health gates and reports the outcome.
//
// It answers 500 when CheckHealth returns an error, otherwise 200 with a
// HealthResponse copied from the returned status.
func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
	defer span.End()

	// The second argument is passed as nil, exactly as the gates are always
	// invoked from this endpoint.
	gateStatus, checkErr := api.controller.healthGates.CheckHealth(ctx, nil)
	if checkErr != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", checkErr)
		return
	}

	api.writeJSON(w, http.StatusOK, HealthResponse{
		Healthy:       gateStatus.Healthy,
		Timestamp:     gateStatus.Timestamp,
		Gates:         gateStatus.Gates,
		OverallReason: gateStatus.OverallReason,
	})
}
// GetHealthThresholds answers 200 with whatever the health gates currently
// report from GetThresholds.
func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.healthGates.GetThresholds())
}
// UpdateHealthThresholds replaces the health gate thresholds with the
// HealthThresholds value decoded from the request body.
//
// It answers 400 when the body cannot be decoded; otherwise the decoded value
// is handed to SetThresholds as-is (no field validation happens here) and the
// handler answers 200.
func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
	defer span.End()

	var updated HealthThresholds
	decoder := json.NewDecoder(r.Body)
	if decodeErr := decoder.Decode(&updated); decodeErr != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	}

	api.controller.healthGates.SetThresholds(updated)

	log.Info().Interface("thresholds", updated).Msg("Updated health thresholds via API")

	ack := map[string]string{
		"status":  "updated",
		"message": "Health thresholds updated successfully",
	}
	api.writeJSON(w, http.StatusOK, ack)
}
// GetScalingMetrics answers 200 with the metrics collector's report for the
// time window described by the request's query parameters (see
// parseTimeWindow for how the window is derived).
func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
	defer span.End()

	from, to := api.parseTimeWindow(r)
	api.writeJSON(w, http.StatusOK, api.metrics.GenerateReport(ctx, from, to))
}
// GetRecentOperations answers 200 with the most recent scaling operations and
// their count.
//
// The optional "limit" query parameter caps how many operations are requested
// from the metrics collector; a missing, non-numeric or non-positive value
// falls back to 50.
func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
	defer span.End()

	const defaultLimit = 50

	// Atoi rejects the empty string, so an absent parameter keeps the default.
	limit := defaultLimit
	if requested, err := strconv.Atoi(r.URL.Query().Get("limit")); err == nil && requested > 0 {
		limit = requested
	}

	recent := api.metrics.GetRecentOperations(limit)

	payload := map[string]interface{}{
		"operations": recent,
		"count":      len(recent),
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// ExportMetrics exports all metrics data as a downloadable JSON attachment.
//
// It answers 500 when the metrics collector fails to produce the export.
// Otherwise it writes the exported bytes with a Content-Disposition header
// whose filename carries the current local timestamp; the status is the
// implicit 200 set by the first write.
func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
	defer span.End()

	data, err := api.metrics.ExportMetrics(ctx)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=scaling-metrics-%s.json",
		time.Now().Format("2006-01-02-15-04-05")))

	// Headers are already committed at this point, so a failed write cannot be
	// turned into an error response; record it instead of dropping it silently.
	if _, err := w.Write(data); err != nil {
		log.Warn().Err(err).Msg("Failed to write metrics export response")
	}
}
// GetServiceStatus returns detailed status for the service named by the
// {serviceName} URL parameter.
//
// It answers 404 when the swarm manager returns an error for the service and
// 200 with the swarm manager's status value otherwise.
func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
	defer span.End()

	serviceName := chi.URLParam(r, "serviceName")

	// Tag the span before the lookup so traces of failed lookups also carry
	// the service name.
	span.SetAttributes(attribute.String("service.name", serviceName))

	status, err := api.controller.swarmManager.GetServiceStatus(ctx, serviceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	api.writeJSON(w, http.StatusOK, status)
}
// GetServiceReplicas reports the desired and running replica counts for the
// service named by the {serviceName} URL parameter.
//
// A failure to read the desired count answers 404. A failure to read the
// running count is only logged as a warning and the running count is reported
// as 0, so the endpoint still answers 200.
func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
	defer span.End()

	name := chi.URLParam(r, "serviceName")

	desired, err := api.controller.swarmManager.GetServiceReplicas(ctx, name)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	// Best effort: the running count is informational only.
	running, runErr := api.controller.swarmManager.GetRunningReplicas(ctx, name)
	if runErr != nil {
		log.Warn().Err(runErr).Str("service_name", name).Msg("Failed to get running replica count")
		running = 0
	}

	payload := map[string]interface{}{
		"service_name":     name,
		"desired_replicas": desired,
		"running_replicas": running,
		"timestamp":        time.Now(),
	}

	span.SetAttributes(
		attribute.String("service.name", name),
		attribute.Int("service.desired_replicas", desired),
		attribute.Int("service.running_replicas", running),
	)

	api.writeJSON(w, http.StatusOK, payload)
}
// GetAssignmentTemplates is a placeholder endpoint: it always answers 200 with
// an empty "templates" list and a "count" of 0.
func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
	defer span.End()

	// No template source is wired in yet, so the list is intentionally empty
	// (an empty slice rather than nil, so it encodes as [] and not null).
	noTemplates := []interface{}{}
	payload := map[string]interface{}{
		"templates": noTemplates,
		"count":     0,
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// CreateAssignment decodes an AssignmentRequest from the body and asks the
// controller's assignment broker to create it.
//
// Both an undecodable body and a broker error answer 400; success answers 201
// with the assignment returned by the broker.
func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
	defer span.End()

	var req AssignmentRequest
	decodeErr := json.NewDecoder(r.Body).Decode(&req)
	if decodeErr != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", decodeErr)
		return
	}

	created, createErr := api.controller.assignmentBroker.CreateAssignment(ctx, req)
	if createErr != nil {
		api.writeError(w, http.StatusBadRequest, "Failed to create assignment", createErr)
		return
	}

	span.SetAttributes(
		attribute.String("assignment.id", created.ID),
		attribute.String("assignment.template", req.Template),
	)

	api.writeJSON(w, http.StatusCreated, created)
}
// GetBootstrapPeers answers 200 with every peer the bootstrap manager returns
// from GetAllPeers, together with how many there are.
func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
	defer span.End()

	allPeers := api.controller.bootstrapManager.GetAllPeers()

	payload := map[string]interface{}{
		"peers": allPeers,
		"count": len(allPeers),
	}
	api.writeJSON(w, http.StatusOK, payload)
}
// GetBootstrapStats answers 200 with the value the bootstrap manager returns
// from GetStats.
func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.bootstrapManager.GetStats())
}
// Helper functions
// parseTimeWindow derives a [start, end] time window from the request's query
// parameters and always returns start <= end.
//
// Recognised parameters (all optional; unparsable values are ignored):
//   - start, end: RFC 3339 timestamps
//   - duration: a Go duration string (e.g. "90m"); when valid and non-negative
//     it overrides start with end-duration. Negative durations are ignored
//     because they would place the start after the end.
//
// With no parameters the window is the last 24 hours. If the resulting start
// would fall after the end, the window is normalised: an explicitly requested
// start/end pair is swapped, and a defaulted start is re-anchored to 24 hours
// before the requested end.
func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
	const defaultWindow = 24 * time.Hour

	query := r.URL.Query()

	// Default to the last 24 hours.
	windowEnd := time.Now()
	windowStart := windowEnd.Add(-defaultWindow)
	startExplicit := false

	if raw := query.Get("start"); raw != "" {
		if start, err := time.Parse(time.RFC3339, raw); err == nil {
			windowStart = start
			startExplicit = true
		}
	}

	if raw := query.Get("end"); raw != "" {
		if end, err := time.Parse(time.RFC3339, raw); err == nil {
			windowEnd = end
		}
	}

	// A duration, when provided, overrides start relative to the final end.
	if raw := query.Get("duration"); raw != "" {
		if d, err := time.ParseDuration(raw); err == nil && d >= 0 {
			windowStart = windowEnd.Add(-d)
			startExplicit = true
		}
	}

	// Never hand an inverted window to the caller.
	if windowStart.After(windowEnd) {
		if startExplicit {
			windowStart, windowEnd = windowEnd, windowStart
		} else {
			windowStart = windowEnd.Add(-defaultWindow)
		}
	}

	return windowStart, windowEnd
}
// writeJSON writes data as a JSON response with the given status code.
//
// The payload is marshalled before any header is sent, so a value that cannot
// be encoded results in a 500 response carrying a JSON error body instead of
// the requested status with an empty body. Successful bodies end with a
// newline, matching json.Encoder output.
func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
	payload, err := json.Marshal(data)
	if err != nil {
		status = http.StatusInternalServerError
		// A map of plain strings always marshals, so this error is safe to drop.
		payload, _ = json.Marshal(map[string]string{
			"error":   "Failed to encode response",
			"details": err.Error(),
		})
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// The status line is already committed; a failed write means the client
	// has gone away and there is nothing further to report to it.
	_, _ = w.Write(append(payload, '\n'))
}
// writeError sends a JSON error body with the given status code.
//
// The body always carries "error" (the caller's message) and "timestamp".
// When err is non-nil its text is added under "details" and the error is also
// logged at error level.
func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
	body := make(map[string]interface{}, 3)
	body["error"] = message
	body["timestamp"] = time.Now()

	if err != nil {
		body["details"] = err.Error()
		log.Error().Err(err).Str("error_message", message).Msg("API error")
	}

	api.writeJSON(w, status, body)
}