package orchestrator

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strconv"
	"time"

	"github.com/chorus-services/whoosh/internal/tracing"
	"github.com/go-chi/chi/v5"
	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"
)

// ScalingAPI provides HTTP endpoints for scaling operations. It is a thin
// HTTP layer over a ScalingController and a ScalingMetricsCollector.
type ScalingAPI struct {
	controller *ScalingController
	metrics    *ScalingMetricsCollector
}

// ScaleRequest represents a scaling request.
//
// ServiceName is required and TargetReplicas must be non-negative. A WaveSize
// that is zero or negative selects a default based on the scaling direction.
// ForceScale starts a scaling operation even when the service is already at
// the target replica count. Environment is decoded but is not forwarded to
// the controller by ScaleService.
type ScaleRequest struct {
	ServiceName    string            `json:"service_name"`
	TargetReplicas int               `json:"target_replicas"`
	WaveSize       int               `json:"wave_size,omitempty"`
	Template       string            `json:"template,omitempty"`
	Environment    map[string]string `json:"environment,omitempty"`
	ForceScale     bool              `json:"force_scale,omitempty"`
}

// ScaleResponse represents a scaling response. Status is either
// "no_action_needed" or "scaling_started"; WaveID is only populated in the
// latter case.
type ScaleResponse struct {
	WaveID          string    `json:"wave_id"`
	ServiceName     string    `json:"service_name"`
	TargetReplicas  int       `json:"target_replicas"`
	CurrentReplicas int       `json:"current_replicas"`
	Status          string    `json:"status"`
	StartedAt       time.Time `json:"started_at"`
	Message         string    `json:"message,omitempty"`
}

// HealthResponse represents the health check response returned by
// GetHealthGates.
type HealthResponse struct {
	Healthy       bool                  `json:"healthy"`
	Timestamp     time.Time             `json:"timestamp"`
	Gates         map[string]GateStatus `json:"gates"`
	OverallReason string                `json:"overall_reason,omitempty"`
}

// NewScalingAPI creates a new scaling API instance backed by the given
// controller and metrics collector.
func NewScalingAPI(controller *ScalingController, metrics *ScalingMetricsCollector) *ScalingAPI {
	return &ScalingAPI{
		controller: controller,
		metrics:    metrics,
	}
}

// RegisterRoutes registers HTTP routes for the scaling API on router.
func (api *ScalingAPI) RegisterRoutes(router chi.Router) {
	// Scaling operations
	router.Post("/scale", api.ScaleService)
	router.Get("/scale/status", api.GetScalingStatus)
	router.Post("/scale/stop", api.StopScaling)

	// Health gates
	router.Get("/health/gates", api.GetHealthGates)
	router.Get("/health/thresholds", api.GetHealthThresholds)
	router.Put("/health/thresholds", api.UpdateHealthThresholds)

	// Metrics and monitoring
	router.Get("/metrics/scaling", api.GetScalingMetrics)
	router.Get("/metrics/operations", api.GetRecentOperations)
	router.Get("/metrics/export", api.ExportMetrics)

	// Service management
	router.Get("/services/{serviceName}/status", api.GetServiceStatus)
	router.Get("/services/{serviceName}/replicas", api.GetServiceReplicas)

	// Assignment management
	router.Get("/assignments/templates", api.GetAssignmentTemplates)
	router.Post("/assignments", api.CreateAssignment)

	// Bootstrap peer management
	router.Get("/bootstrap/peers", api.GetBootstrapPeers)
	router.Get("/bootstrap/stats", api.GetBootstrapStats)
}

// ScaleService handles scaling requests.
//
// Responses:
//   - 400 when the body cannot be decoded, the service name is empty, or the
//     target replica count is negative;
//   - 404 when the current replica count cannot be obtained;
//   - 200 with status "no_action_needed" when the service is already at the
//     target and ForceScale is not set;
//   - 500 when the controller fails to start scaling;
//   - 202 with status "scaling_started" otherwise.
func (api *ScalingAPI) ScaleService(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.scale_service")
	defer span.End()

	var req ScaleRequest
	if err := api.decodeBody(w, r, &req); err != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
		return
	}

	// Validate request
	if req.ServiceName == "" {
		api.writeError(w, http.StatusBadRequest, "Service name is required", nil)
		return
	}
	if req.TargetReplicas < 0 {
		api.writeError(w, http.StatusBadRequest, "Target replicas must be non-negative", nil)
		return
	}

	span.SetAttributes(
		attribute.String("request.service_name", req.ServiceName),
		attribute.Int("request.target_replicas", req.TargetReplicas),
		attribute.Bool("request.force_scale", req.ForceScale),
	)

	// Get current replica count
	currentReplicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, req.ServiceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	// Nothing to do unless the caller explicitly forces a scaling run.
	if currentReplicas == req.TargetReplicas && !req.ForceScale {
		api.writeJSON(w, http.StatusOK, ScaleResponse{
			ServiceName:     req.ServiceName,
			TargetReplicas:  req.TargetReplicas,
			CurrentReplicas: currentReplicas,
			Status:          "no_action_needed",
			StartedAt:       time.Now(),
			Message:         "Service already at target replica count",
		})
		return
	}

	// Determine wave size: an explicit positive value wins, otherwise pick a
	// default based on the scaling direction.
	var waveSize int
	switch {
	case req.WaveSize > 0:
		waveSize = req.WaveSize
	case req.TargetReplicas > currentReplicas:
		waveSize = 3 // Scale up in smaller waves
	default:
		waveSize = 5 // Scale down in larger waves
	}

	// Start scaling operation
	waveID, err := api.controller.StartScaling(ctx, req.ServiceName, req.TargetReplicas, waveSize, req.Template)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to start scaling", err)
		return
	}

	response := ScaleResponse{
		WaveID:          waveID,
		ServiceName:     req.ServiceName,
		TargetReplicas:  req.TargetReplicas,
		CurrentReplicas: currentReplicas,
		Status:          "scaling_started",
		StartedAt:       time.Now(),
		Message: fmt.Sprintf("Started scaling %s from %d to %d replicas",
			req.ServiceName, currentReplicas, req.TargetReplicas),
	}

	log.Info().
		Str("wave_id", waveID).
		Str("service_name", req.ServiceName).
		Int("current_replicas", currentReplicas).
		Int("target_replicas", req.TargetReplicas).
		Int("wave_size", waveSize).
		Msg("Started scaling operation via API")

	api.writeJSON(w, http.StatusAccepted, response)
}

// GetScalingStatus returns the current scaling status: an "idle" payload when
// the metrics collector reports no current wave, otherwise a "scaling"
// payload describing that wave.
func (api *ScalingAPI) GetScalingStatus(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_status")
	defer span.End()

	currentWave := api.metrics.GetCurrentWave()
	if currentWave == nil {
		api.writeJSON(w, http.StatusOK, map[string]interface{}{
			"status":  "idle",
			"message": "No scaling operation in progress",
		})
		return
	}

	// Progress is current/target as a percentage, clamped to 100 (so it also
	// reads 100 whenever current exceeds target, e.g. while scaling down).
	// A non-positive target is reported as 100 rather than dividing by zero:
	// 0/0 would yield NaN, which encoding/json cannot encode.
	progress := 100.0
	if currentWave.TargetReplicas > 0 {
		progress = float64(currentWave.CurrentReplicas) / float64(currentWave.TargetReplicas) * 100
		if progress > 100 {
			progress = 100
		}
	}

	response := map[string]interface{}{
		"status":           "scaling",
		"wave_id":          currentWave.WaveID,
		"service_name":     currentWave.ServiceName,
		"started_at":       currentWave.StartedAt,
		"target_replicas":  currentWave.TargetReplicas,
		"current_replicas": currentWave.CurrentReplicas,
		"progress_percent": progress,
		"join_attempts":    len(currentWave.JoinAttempts),
		"health_checks":    len(currentWave.HealthChecks),
		"backoff_level":    currentWave.BackoffLevel,
		"duration":         time.Since(currentWave.StartedAt).String(),
	}

	api.writeJSON(w, http.StatusOK, response)
}

// StopScaling stops the current scaling operation. It responds with 400 when
// the metrics collector reports no wave in progress.
func (api *ScalingAPI) StopScaling(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.stop_scaling")
	defer span.End()

	currentWave := api.metrics.GetCurrentWave()
	if currentWave == nil {
		api.writeError(w, http.StatusBadRequest, "No scaling operation in progress", nil)
		return
	}

	// Stop the scaling operation
	api.controller.StopScaling(ctx)

	response := map[string]interface{}{
		"status":     "stopped",
		"wave_id":    currentWave.WaveID,
		"message":    "Scaling operation stopped",
		"stopped_at": time.Now(),
	}

	log.Info().
		Str("wave_id", currentWave.WaveID).
		Str("service_name", currentWave.ServiceName).
		Msg("Stopped scaling operation via API")

	api.writeJSON(w, http.StatusOK, response)
}

// GetHealthGates returns the current health gate status, or 500 when the
// health check itself fails.
func (api *ScalingAPI) GetHealthGates(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_gates")
	defer span.End()

	status, err := api.controller.healthGates.CheckHealth(ctx, nil)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to check health gates", err)
		return
	}

	api.writeJSON(w, http.StatusOK, HealthResponse{
		Healthy:       status.Healthy,
		Timestamp:     status.Timestamp,
		Gates:         status.Gates,
		OverallReason: status.OverallReason,
	})
}

// GetHealthThresholds returns the current health thresholds.
func (api *ScalingAPI) GetHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_health_thresholds")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.healthGates.GetThresholds())
}

// UpdateHealthThresholds replaces the health thresholds with the values
// decoded from the request body. It responds with 400 when the body cannot be
// decoded; the decoded values are not validated here.
func (api *ScalingAPI) UpdateHealthThresholds(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.update_health_thresholds")
	defer span.End()

	var thresholds HealthThresholds
	if err := api.decodeBody(w, r, &thresholds); err != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
		return
	}

	api.controller.healthGates.SetThresholds(thresholds)

	log.Info().
		Interface("thresholds", thresholds).
		Msg("Updated health thresholds via API")

	api.writeJSON(w, http.StatusOK, map[string]string{
		"status":  "updated",
		"message": "Health thresholds updated successfully",
	})
}

// GetScalingMetrics returns scaling metrics for a time window. The window is
// taken from the "start", "end" and "duration" query parameters (see
// parseTimeWindow).
func (api *ScalingAPI) GetScalingMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_scaling_metrics")
	defer span.End()

	windowStart, windowEnd := api.parseTimeWindow(r)
	report := api.metrics.GenerateReport(ctx, windowStart, windowEnd)

	api.writeJSON(w, http.StatusOK, report)
}

// GetRecentOperations returns recent scaling operations. The "limit" query
// parameter controls how many are requested from the metrics collector; it
// defaults to 50 and is ignored unless it parses as a positive integer.
func (api *ScalingAPI) GetRecentOperations(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_recent_operations")
	defer span.End()

	limit := 50 // Default limit
	if raw := r.URL.Query().Get("limit"); raw != "" {
		if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
			limit = parsed
		}
	}

	operations := api.metrics.GetRecentOperations(limit)

	api.writeJSON(w, http.StatusOK, map[string]interface{}{
		"operations": operations,
		"count":      len(operations),
	})
}

// ExportMetrics exports all metrics data as a JSON attachment whose filename
// carries the current timestamp. It responds with 500 when the export fails.
func (api *ScalingAPI) ExportMetrics(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.export_metrics")
	defer span.End()

	data, err := api.metrics.ExportMetrics(ctx)
	if err != nil {
		api.writeError(w, http.StatusInternalServerError, "Failed to export metrics", err)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.Header().Set("Content-Disposition",
		fmt.Sprintf("attachment; filename=scaling-metrics-%s.json", time.Now().Format("2006-01-02-15-04-05")))

	// The status line is already committed once Write starts, so a failure
	// here can only be logged.
	if _, err := w.Write(data); err != nil {
		log.Warn().Err(err).Msg("Failed to write metrics export response")
	}
}

// GetServiceStatus returns detailed status for a specific service, identified
// by the {serviceName} URL parameter. Any lookup error is reported as 404.
func (api *ScalingAPI) GetServiceStatus(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_status")
	defer span.End()

	serviceName := chi.URLParam(r, "serviceName")
	// Tag the span before the lookup so failed lookups are attributable too.
	span.SetAttributes(attribute.String("service.name", serviceName))

	status, err := api.controller.swarmManager.GetServiceStatus(ctx, serviceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	api.writeJSON(w, http.StatusOK, status)
}

// GetServiceReplicas returns the current replica count for a service,
// identified by the {serviceName} URL parameter. A failure to obtain the
// desired count is reported as 404; a failure to obtain the running count is
// only logged and the running count is reported as 0.
func (api *ScalingAPI) GetServiceReplicas(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_service_replicas")
	defer span.End()

	serviceName := chi.URLParam(r, "serviceName")
	// Tag the span before the lookup so failed lookups are attributable too.
	span.SetAttributes(attribute.String("service.name", serviceName))

	replicas, err := api.controller.swarmManager.GetServiceReplicas(ctx, serviceName)
	if err != nil {
		api.writeError(w, http.StatusNotFound, "Service not found", err)
		return
	}

	// Best effort: the desired count is still useful without the running one.
	runningReplicas, err := api.controller.swarmManager.GetRunningReplicas(ctx, serviceName)
	if err != nil {
		log.Warn().Err(err).Str("service_name", serviceName).Msg("Failed to get running replica count")
		runningReplicas = 0
	}

	span.SetAttributes(
		attribute.Int("service.desired_replicas", replicas),
		attribute.Int("service.running_replicas", runningReplicas),
	)

	api.writeJSON(w, http.StatusOK, map[string]interface{}{
		"service_name":     serviceName,
		"desired_replicas": replicas,
		"running_replicas": runningReplicas,
		"timestamp":        time.Now(),
	})
}

// GetAssignmentTemplates returns available assignment templates. It currently
// always returns an empty list.
func (api *ScalingAPI) GetAssignmentTemplates(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_assignment_templates")
	defer span.End()

	// Return empty templates for now - can be implemented later
	api.writeJSON(w, http.StatusOK, map[string]interface{}{
		"templates": []interface{}{},
		"count":     0,
	})
}

// CreateAssignment creates a new assignment from the request body. Both an
// undecodable body and a broker failure are reported as 400; success is 201
// with the created assignment.
func (api *ScalingAPI) CreateAssignment(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracing.Tracer.Start(r.Context(), "scaling_api.create_assignment")
	defer span.End()

	var req AssignmentRequest
	if err := api.decodeBody(w, r, &req); err != nil {
		api.writeError(w, http.StatusBadRequest, "Invalid request body", err)
		return
	}

	assignment, err := api.controller.assignmentBroker.CreateAssignment(ctx, req)
	if err != nil {
		api.writeError(w, http.StatusBadRequest, "Failed to create assignment", err)
		return
	}

	span.SetAttributes(
		attribute.String("assignment.id", assignment.ID),
		attribute.String("assignment.template", req.Template),
	)

	api.writeJSON(w, http.StatusCreated, assignment)
}

// GetBootstrapPeers returns available bootstrap peers together with their
// count.
func (api *ScalingAPI) GetBootstrapPeers(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_peers")
	defer span.End()

	peers := api.controller.bootstrapManager.GetAllPeers()

	api.writeJSON(w, http.StatusOK, map[string]interface{}{
		"peers": peers,
		"count": len(peers),
	})
}

// GetBootstrapStats returns bootstrap pool statistics.
func (api *ScalingAPI) GetBootstrapStats(w http.ResponseWriter, r *http.Request) {
	_, span := tracing.Tracer.Start(r.Context(), "scaling_api.get_bootstrap_stats")
	defer span.End()

	api.writeJSON(w, http.StatusOK, api.controller.bootstrapManager.GetStats())
}

// Helper functions

// decodeBody decodes the JSON request body into dst. The body is capped at
// 1 MiB so a client cannot make the server buffer an arbitrarily large
// payload; exceeding the cap surfaces as a decode error.
func (api *ScalingAPI) decodeBody(w http.ResponseWriter, r *http.Request, dst interface{}) error {
	const maxBodyBytes = 1 << 20 // 1 MiB

	r.Body = http.MaxBytesReader(w, r.Body, maxBodyBytes)
	return json.NewDecoder(r.Body).Decode(dst)
}

// parseTimeWindow parses start and end time parameters from the request.
//
// The window defaults to the last 24 hours. "start" and "end" are RFC 3339
// timestamps; "duration" is a Go duration string and, when present, overrides
// the start as end minus duration. Values that fail to parse, and negative
// durations (which would place the start after the end), are ignored.
func (api *ScalingAPI) parseTimeWindow(r *http.Request) (time.Time, time.Time) {
	query := r.URL.Query()

	// Default to last 24 hours
	windowEnd := time.Now()
	windowStart := windowEnd.Add(-24 * time.Hour)

	if raw := query.Get("start"); raw != "" {
		if start, err := time.Parse(time.RFC3339, raw); err == nil {
			windowStart = start
		}
	}

	if raw := query.Get("end"); raw != "" {
		if end, err := time.Parse(time.RFC3339, raw); err == nil {
			windowEnd = end
		}
	}

	// Duration, if provided, overrides start.
	if raw := query.Get("duration"); raw != "" {
		if duration, err := time.ParseDuration(raw); err == nil && duration >= 0 {
			windowStart = windowEnd.Add(-duration)
		}
	}

	return windowStart, windowEnd
}

// writeJSON writes data as a JSON response with the given status code.
//
// The payload is marshaled before any header is sent so that an encoding
// failure results in a logged 500 instead of the requested status with an
// empty body. On success the bytes written match json.Encoder output (the
// marshaled value followed by a newline).
func (api *ScalingAPI) writeJSON(w http.ResponseWriter, status int, data interface{}) {
	payload, err := json.Marshal(data)
	if err != nil {
		log.Error().Err(err).Int("status", status).Msg("Failed to encode API response")
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusInternalServerError)
		// Best effort: nothing more can be done if this write fails too.
		_, _ = w.Write([]byte("{\"error\":\"Failed to encode response\"}\n"))
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if _, err := w.Write(append(payload, '\n')); err != nil {
		log.Warn().Err(err).Int("status", status).Msg("Failed to write API response")
	}
}

// writeError writes an error response containing message and a timestamp.
// When err is non-nil its text is included as "details" and the error is
// logged.
func (api *ScalingAPI) writeError(w http.ResponseWriter, status int, message string, err error) {
	response := map[string]interface{}{
		"error":     message,
		"timestamp": time.Now(),
	}

	if err != nil {
		response["details"] = err.Error()
		log.Error().Err(err).Str("error_message", message).Msg("API error")
	}

	api.writeJSON(w, status, response)
}