- Add scaling system components to server initialization - Register scaling API and assignment broker routes - Start bootstrap pool manager in server lifecycle - Add graceful shutdown for scaling controller - Update API routing to use chi.Router instead of gorilla/mux - Fix Docker API compatibility issues - Configure health gates with placeholder URLs for KACHING and BACKBEAT 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
510 lines
16 KiB
Go
510 lines
16 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/go-chi/chi/v5"
|
|
"github.com/rs/zerolog/log"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
|
|
"github.com/chorus-services/whoosh/internal/tracing"
|
|
)
|
|
|
|
// ScalingAPI provides HTTP endpoints for scaling operations
|
|
type ScalingAPI struct {
|
|
controller *ScalingController
|
|
metrics *ScalingMetricsCollector
|
|
}
|
|
|
|
// ScaleRequest is the JSON body accepted by the scale endpoint.
type ScaleRequest struct {
	// ServiceName identifies the service to scale; ScaleService rejects an empty value.
	ServiceName string `json:"service_name"`
	// TargetReplicas is the desired replica count; ScaleService rejects negative values.
	TargetReplicas int `json:"target_replicas"`
	// WaveSize overrides the default wave size when it is greater than zero.
	WaveSize int `json:"wave_size,omitempty"`
	// Template is forwarded to the controller when scaling starts.
	Template string `json:"template,omitempty"`
	// Environment carries optional key/value pairs supplied by the caller.
	Environment map[string]string `json:"environment,omitempty"`
	// ForceScale starts a scaling operation even when the service is already
	// at the target replica count.
	ForceScale bool `json:"force_scale,omitempty"`
}
|
|
|
|
// ScaleResponse is the JSON body returned by the scale endpoint.
type ScaleResponse struct {
	// WaveID is the identifier returned by the controller for the started wave.
	WaveID string `json:"wave_id"`
	// ServiceName echoes the service named in the request.
	ServiceName string `json:"service_name"`
	// TargetReplicas echoes the requested replica count.
	TargetReplicas int `json:"target_replicas"`
	// CurrentReplicas is the replica count observed when the request was handled.
	CurrentReplicas int `json:"current_replicas"`
	// Status is "scaling_started" or "no_action_needed".
	Status string `json:"status"`
	// StartedAt is the time the response was produced.
	StartedAt time.Time `json:"started_at"`
	// Message is an optional human-readable summary.
	Message string `json:"message,omitempty"`
}
|
|
|
|
// HealthResponse represents health check response
|
|
type HealthResponse struct {
|
|
Healthy bool `json:"healthy"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Gates map[string]GateStatus `json:"gates"`
|
|
OverallReason string `json:"overall_reason,omitempty"`
|
|
}
|
|
|
|
// NewScalingAPI creates a new scaling API instance
|
|
func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
|
|
return &ScalingAPI{
|
|
controller: controller,
|
|
metrics: metrics,
|
|
}
|
|
}
|
|
|
|
// RegisterRoutes registers HTTP routes for the scaling API
|
|
func (api *ScalingAPI) RegisterRoutes(router chi.Router) {
|
|
// Scaling operations
|
|
router.Post("/scale", api.ScaleService)
|
|
router.Get("/scale/status", api.GetScalingStatus)
|
|
router.Post("/scale/stop", api.StopScaling)
|
|
|
|
// Health gates
|
|
router.Get("/health/gates", api.GetHealthGates)
|
|
router.Get("/health/thresholds", api.GetHealthThresholds)
|
|
router.Put("/health/thresholds", api.UpdateHealthThresholds)
|
|
|
|
// Metrics and monitoring
|
|
router.Get("/metrics/scaling", api.GetScalingMetrics)
|
|
router.Get("/metrics/operations", api.GetRecentOperations)
|
|
router.Get("/metrics/export", api.ExportMetrics)
|
|
|
|
// Service management
|
|
router.Get("/services/{serviceName}/status", api.GetServiceStatus)
|
|
router.Get("/services/{serviceName}/replicas", api.GetServiceReplicas)
|
|
|
|
// Assignment management
|
|
router.Get("/assignments/templates", api.GetAssignmentTemplates)
|
|
router.Post("/assignments", api.CreateAssignment)
|
|
|
|
// Bootstrap peer management
|
|
router.Get("/bootstrap/peers", api.GetBootstrapPeers)
|
|
router.Get("/bootstrap/stats", api.GetBootstrapStats)
|
|
}
|
|
|
|
// ScaleService handles scaling requests
|
|
func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
|
|
defer span.End()
|
|
|
|
var req ScaleRequest
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
|
return
|
|
}
|
|
|
|
// Validate request
|
|
if req.ServiceName == "" {
|
|
api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
|
|
return
|
|
}
|
|
if req.TargetReplicas < 0 {
|
|
api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
|
|
return
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("request.service_name", req.ServiceName),
|
|
attribute.Int("request.target_replicas", req.TargetReplicas),
|
|
attribute.Bool("request.force_scale", req.ForceScale),
|
|
)
|
|
|
|
// Get current replica count
|
|
currentReplicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
|
return
|
|
}
|
|
|
|
// Check if scaling is needed
|
|
if currentReplicas == req.TargetReplicas && !req.ForceScale {
|
|
response := ScaleResponse{
|
|
ServiceName: req.ServiceName,
|
|
TargetReplicas: req.TargetReplicas,
|
|
CurrentReplicas: currentReplicas,
|
|
Status: "no_action_needed",
|
|
StartedAt: time.Now(),
|
|
Message: "Service already at target replica count",
|
|
}
|
|
api.writeJSON(w, http.StatusOK, response)
|
|
return
|
|
}
|
|
|
|
// Determine scaling direction and wave size
|
|
var waveSize int
|
|
if req.WaveSize > 0 {
|
|
waveSize = req.WaveSize
|
|
} else {
|
|
// Default wave size based on scaling direction
|
|
if req.TargetReplicas > currentReplicas {
|
|
waveSize = 3 // Scale up in smaller waves
|
|
} else {
|
|
waveSize = 5 // Scale down in larger waves
|
|
}
|
|
}
|
|
|
|
// Start scaling operation
|
|
waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
|
|
return
|
|
}
|
|
|
|
response := ScaleResponse{
|
|
WaveID: waveID,
|
|
ServiceName: req.ServiceName,
|
|
TargetReplicas: req.TargetReplicas,
|
|
CurrentReplicas: currentReplicas,
|
|
Status: "scaling_started",
|
|
StartedAt: time.Now(),
|
|
Message: fmt.Sprintf("Started scaling %s from %d to %d replicas", req.ServiceName, currentReplicas, req.TargetReplicas),
|
|
}
|
|
|
|
log.Info().
|
|
Str("wave_id", waveID).
|
|
Str("service_name", req.ServiceName).
|
|
Int("current_replicas", currentReplicas).
|
|
Int("target_replicas", req.TargetReplicas).
|
|
Int("wave_size", waveSize).
|
|
Msg("Started scaling operation via API")
|
|
|
|
api.writeJSON(w, http.StatusAccepted, response)
|
|
}
|
|
|
|
// GetScalingStatus returns the current scaling status
|
|
func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
|
|
defer span.End()
|
|
|
|
currentWave := api.metrics.GetCurrentWave()
|
|
if currentWave == nil {
|
|
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
|
"status": "idle",
|
|
"message": "No scaling operation in progress",
|
|
})
|
|
return
|
|
}
|
|
|
|
// Calculate progress
|
|
progress := float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
|
|
if progress > 100 {
|
|
progress = 100
|
|
}
|
|
|
|
response := map[string]interface{}{
|
|
"status": "scaling",
|
|
"wave_id": currentWave.WaveID,
|
|
"service_name": currentWave.ServiceName,
|
|
"started_at": currentWave.StartedAt,
|
|
"target_replicas": currentWave.TargetReplicas,
|
|
"current_replicas": currentWave.CurrentReplicas,
|
|
"progress_percent": progress,
|
|
"join_attempts": len(currentWave.JoinAttempts),
|
|
"health_checks": len(currentWave.HealthChecks),
|
|
"backoff_level": currentWave.BackoffLevel,
|
|
"duration": time.Since(currentWave.StartedAt).String(),
|
|
}
|
|
|
|
api.writeJSON(w, http.StatusOK, response)
|
|
}
|
|
|
|
// StopScaling stops the current scaling operation
|
|
func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
|
|
defer span.End()
|
|
|
|
currentWave := api.metrics.GetCurrentWave()
|
|
if currentWave == nil {
|
|
api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
|
|
return
|
|
}
|
|
|
|
// Stop the scaling operation
|
|
api.controller.StopScaling(ctx)
|
|
|
|
response := map[string]interface{}{
|
|
"status": "stopped",
|
|
"wave_id": currentWave.WaveID,
|
|
"message": "Scaling operation stopped",
|
|
"stopped_at": time.Now(),
|
|
}
|
|
|
|
log.Info().
|
|
Str("wave_id", currentWave.WaveID).
|
|
Str("service_name", currentWave.ServiceName).
|
|
Msg("Stopped scaling operation via API")
|
|
|
|
api.writeJSON(w, http.StatusOK, response)
|
|
}
|
|
|
|
// GetHealthGates returns the current health gate status
|
|
func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
|
|
defer span.End()
|
|
|
|
status, err := api.controller.healthGates.CheckHealth(ctx, nil)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", err)
|
|
return
|
|
}
|
|
|
|
response := HealthResponse{
|
|
Healthy: status.Healthy,
|
|
Timestamp: status.Timestamp,
|
|
Gates: status.Gates,
|
|
OverallReason: status.OverallReason,
|
|
}
|
|
|
|
api.writeJSON(w, http.StatusOK, response)
|
|
}
|
|
|
|
// GetHealthThresholds returns the current health thresholds
|
|
func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
|
|
defer span.End()
|
|
|
|
thresholds := api.controller.healthGates.GetThresholds()
|
|
api.writeJSON(w, http.StatusOK, thresholds)
|
|
}
|
|
|
|
// UpdateHealthThresholds updates the health thresholds
|
|
func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
|
|
defer span.End()
|
|
|
|
var thresholds HealthThresholds
|
|
if err := json.NewDecoder(r.Body).Decode(&thresholds); err != nil {
|
|
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
|
return
|
|
}
|
|
|
|
api.controller.healthGates.SetThresholds(thresholds)
|
|
|
|
log.Info().
|
|
Interface("thresholds", thresholds).
|
|
Msg("Updated health thresholds via API")
|
|
|
|
api.writeJSON(w, http.StatusOK, map[string]string{
|
|
"status": "updated",
|
|
"message": "Health thresholds updated successfully",
|
|
})
|
|
}
|
|
|
|
// GetScalingMetrics returns scaling metrics for a time window
|
|
func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
|
|
defer span.End()
|
|
|
|
// Parse query parameters for time window
|
|
windowStart, windowEnd := api.parseTimeWindow(r)
|
|
|
|
report := api.metrics.GenerateReport(ctx, windowStart, windowEnd)
|
|
api.writeJSON(w, http.StatusOK, report)
|
|
}
|
|
|
|
// GetRecentOperations returns recent scaling operations
|
|
func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
|
|
defer span.End()
|
|
|
|
// Parse limit parameter
|
|
limit := 50 // Default limit
|
|
if limitStr := r.URL.Query().Get("limit"); limitStr != "" {
|
|
if parsedLimit, err := strconv.Atoi(limitStr); err == nil && parsedLimit > 0 {
|
|
limit = parsedLimit
|
|
}
|
|
}
|
|
|
|
operations := api.metrics.GetRecentOperations(limit)
|
|
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
|
"operations": operations,
|
|
"count": len(operations),
|
|
})
|
|
}
|
|
|
|
// ExportMetrics exports all metrics data
|
|
func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
|
|
defer span.End()
|
|
|
|
data, err := api.metrics.ExportMetrics(ctx)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=scaling-metrics-%s.json",
|
|
time.Now().Format("2006-01-02-15-04-05")))
|
|
w.Write(data)
|
|
}
|
|
|
|
// GetServiceStatus returns detailed status for a specific service
|
|
func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
|
|
defer span.End()
|
|
|
|
serviceName := chi.URLParam(r, "serviceName")
|
|
|
|
status, err := api.controller.swarmManager.GetServiceStatus(ctx, serviceName)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
|
return
|
|
}
|
|
|
|
span.SetAttributes(attribute.String("service.name", serviceName))
|
|
api.writeJSON(w, http.StatusOK, status)
|
|
}
|
|
|
|
// GetServiceReplicas returns the current replica count for a service
|
|
func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
|
|
defer span.End()
|
|
|
|
serviceName := chi.URLParam(r, "serviceName")
|
|
|
|
replicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, serviceName)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusNotFound, "Service not found", err)
|
|
return
|
|
}
|
|
|
|
runningReplicas, err := api.controller.swarmManager.GetRunningReplicas(ctx, serviceName)
|
|
if err != nil {
|
|
log.Warn().Err(err).Str("service_name", serviceName).Msg("Failed to get running replica count")
|
|
runningReplicas = 0
|
|
}
|
|
|
|
response := map[string]interface{}{
|
|
"service_name": serviceName,
|
|
"desired_replicas": replicas,
|
|
"running_replicas": runningReplicas,
|
|
"timestamp": time.Now(),
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("service.name", serviceName),
|
|
attribute.Int("service.desired_replicas", replicas),
|
|
attribute.Int("service.running_replicas", runningReplicas),
|
|
)
|
|
|
|
api.writeJSON(w, http.StatusOK, response)
|
|
}
|
|
|
|
// GetAssignmentTemplates returns available assignment templates
|
|
func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
|
|
defer span.End()
|
|
|
|
// Return empty templates for now - can be implemented later
|
|
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
|
"templates": []interface{}{},
|
|
"count": 0,
|
|
})
|
|
}
|
|
|
|
// CreateAssignment creates a new assignment
|
|
func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
|
|
ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
|
|
defer span.End()
|
|
|
|
var req AssignmentRequest
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
|
|
return
|
|
}
|
|
|
|
assignment, err := api.controller.assignmentBroker.CreateAssignment(ctx, req)
|
|
if err != nil {
|
|
api.writeError(w, http.StatusBadRequest, "Failed to create assignment", err)
|
|
return
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("assignment.id", assignment.ID),
|
|
attribute.String("assignment.template", req.Template),
|
|
)
|
|
|
|
api.writeJSON(w, http.StatusCreated, assignment)
|
|
}
|
|
|
|
// GetBootstrapPeers returns available bootstrap peers
|
|
func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
|
|
defer span.End()
|
|
|
|
peers := api.controller.bootstrapManager.GetAllPeers()
|
|
api.writeJSON(w, http.StatusOK, map[string]interface{}{
|
|
"peers": peers,
|
|
"count": len(peers),
|
|
})
|
|
}
|
|
|
|
// GetBootstrapStats returns bootstrap pool statistics
|
|
func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
|
|
_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
|
|
defer span.End()
|
|
|
|
stats := api.controller.bootstrapManager.GetStats()
|
|
api.writeJSON(w, http.StatusOK, stats)
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// parseTimeWindow parses start and end time parameters from request
|
|
func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
|
|
now := time.Now()
|
|
|
|
// Default to last 24 hours
|
|
windowEnd := now
|
|
windowStart := now.Add(-24 * time.Hour)
|
|
|
|
// Parse custom window if provided
|
|
if startStr := r.URL.Query().Get("start"); startStr != "" {
|
|
if start, err := time.Parse(time.RFC3339, startStr); err == nil {
|
|
windowStart = start
|
|
}
|
|
}
|
|
|
|
if endStr := r.URL.Query().Get("end"); endStr != "" {
|
|
if end, err := time.Parse(time.RFC3339, endStr); err == nil {
|
|
windowEnd = end
|
|
}
|
|
}
|
|
|
|
// Parse duration if provided (overrides start)
|
|
if durationStr := r.URL.Query().Get("duration"); durationStr != "" {
|
|
if duration, err := time.ParseDuration(durationStr); err == nil {
|
|
windowStart = windowEnd.Add(-duration)
|
|
}
|
|
}
|
|
|
|
return windowStart, windowEnd
|
|
}
|
|
|
|
// writeJSON writes a JSON response
|
|
func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(status)
|
|
json.NewEncoder(w).Encode(data)
|
|
}
|
|
|
|
// writeError writes an error response
|
|
func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
|
|
response := map[string]interface{}{
|
|
"error": message,
|
|
"timestamp": time.Now(),
|
|
}
|
|
|
|
if err != nil {
|
|
response["details"] = err.Error()
|
|
log.Error().Err(err).Str("error_message", message).Msg("API error")
|
|
}
|
|
|
|
api.writeJSON(w, status, response)
|
|
} |