- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers) - Assignment broker API for per-replica configuration management - Bootstrap pool management with weighted peer selection and health monitoring - Wave-based scaling algorithm with exponential backoff and failure recovery - Enhanced SwarmManager with Docker service scaling capabilities - Comprehensive scaling metrics collection and reporting system - RESTful HTTP API for external scaling operations and monitoring - Integration with CHORUS P2P networking and assignment systems 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
849 lines
25 KiB
Go
849 lines
25 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"time"
|
|
|
|
"github.com/docker/docker/api/types"
|
|
"github.com/docker/docker/api/types/container"
|
|
"github.com/docker/docker/api/types/filters"
|
|
"github.com/docker/docker/api/types/mount"
|
|
"github.com/docker/docker/api/types/swarm"
|
|
"github.com/docker/docker/client"
|
|
"github.com/rs/zerolog/log"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
|
|
"github.com/chorus-services/whoosh/internal/tracing"
|
|
)
|
|
|
|
// SwarmManager manages Docker Swarm services for agent deployment
|
|
type SwarmManager struct {
|
|
client *client.Client
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
registry string // Docker registry for agent images
|
|
}
|
|
|
|
// NewSwarmManager creates a new Docker Swarm manager
|
|
func NewSwarmManager(dockerHost, registry string) (*SwarmManager, error) {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
// Create Docker client
|
|
var dockerClient *client.Client
|
|
var err error
|
|
|
|
if dockerHost != "" {
|
|
dockerClient, err = client.NewClientWithOpts(
|
|
client.WithHost(dockerHost),
|
|
client.WithAPIVersionNegotiation(),
|
|
)
|
|
} else {
|
|
dockerClient, err = client.NewClientWithOpts(
|
|
client.FromEnv,
|
|
client.WithAPIVersionNegotiation(),
|
|
)
|
|
}
|
|
|
|
if err != nil {
|
|
cancel()
|
|
return nil, fmt.Errorf("failed to create Docker client: %w", err)
|
|
}
|
|
|
|
// Test connection
|
|
_, err = dockerClient.Ping(ctx)
|
|
if err != nil {
|
|
cancel()
|
|
return nil, fmt.Errorf("failed to connect to Docker daemon: %w", err)
|
|
}
|
|
|
|
if registry == "" {
|
|
registry = "registry.home.deepblack.cloud" // Default private registry
|
|
}
|
|
|
|
return &SwarmManager{
|
|
client: dockerClient,
|
|
ctx: ctx,
|
|
cancel: cancel,
|
|
registry: registry,
|
|
}, nil
|
|
}
|
|
|
|
// Close closes the Docker client and cancels context
|
|
func (sm *SwarmManager) Close() error {
|
|
sm.cancel()
|
|
return sm.client.Close()
|
|
}
|
|
|
|
// ScaleService scales a Docker Swarm service to the specified replica count
|
|
func (sm *SwarmManager) ScaleService(ctx context.Context, serviceName string, replicas int) error {
|
|
ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.scale_service")
|
|
defer span.End()
|
|
|
|
// Get the service
|
|
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
|
}
|
|
|
|
// Update replica count
|
|
serviceSpec := service.Spec
|
|
if serviceSpec.Mode.Replicated == nil {
|
|
return fmt.Errorf("service %s is not in replicated mode", serviceName)
|
|
}
|
|
|
|
currentReplicas := *serviceSpec.Mode.Replicated.Replicas
|
|
serviceSpec.Mode.Replicated.Replicas = uint64Ptr(uint64(replicas))
|
|
|
|
// Update the service
|
|
updateResponse, err := sm.client.ServiceUpdate(
|
|
ctx,
|
|
service.ID,
|
|
service.Version,
|
|
serviceSpec,
|
|
types.ServiceUpdateOptions{},
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to update service %s: %w", serviceName, err)
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("service.name", serviceName),
|
|
attribute.String("service.id", service.ID),
|
|
attribute.Int("scaling.current_replicas", int(currentReplicas)),
|
|
attribute.Int("scaling.target_replicas", replicas),
|
|
)
|
|
|
|
log.Info().
|
|
Str("service_name", serviceName).
|
|
Str("service_id", service.ID).
|
|
Uint64("current_replicas", currentReplicas).
|
|
Int("target_replicas", replicas).
|
|
Str("update_id", updateResponse.ID).
|
|
Msg("Scaled service")
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetServiceReplicas returns the current replica count for a service
|
|
func (sm *SwarmManager) GetServiceReplicas(ctx context.Context, serviceName string) (int, error) {
|
|
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
|
}
|
|
|
|
if service.Spec.Mode.Replicated == nil {
|
|
return 0, fmt.Errorf("service %s is not in replicated mode", serviceName)
|
|
}
|
|
|
|
return int(*service.Spec.Mode.Replicated.Replicas), nil
|
|
}
|
|
|
|
// GetRunningReplicas returns the number of currently running replicas for a service
|
|
func (sm *SwarmManager) GetRunningReplicas(ctx context.Context, serviceName string) (int, error) {
|
|
// Get service to get its ID
|
|
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
|
}
|
|
|
|
// List tasks for this service
|
|
taskFilters := filters.NewArgs()
|
|
taskFilters.Add("service", service.ID)
|
|
|
|
tasks, err := sm.client.TaskList(ctx, types.TaskListOptions{
|
|
Filters: taskFilters,
|
|
})
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
|
|
}
|
|
|
|
// Count running tasks
|
|
runningCount := 0
|
|
for _, task := range tasks {
|
|
if task.Status.State == swarm.TaskStateRunning {
|
|
runningCount++
|
|
}
|
|
}
|
|
|
|
return runningCount, nil
|
|
}
|
|
|
|
// GetServiceStatus returns detailed status information for a service
|
|
func (sm *SwarmManager) GetServiceStatus(ctx context.Context, serviceName string) (*ServiceStatus, error) {
|
|
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
|
}
|
|
|
|
// Get tasks for detailed status
|
|
taskFilters := filters.NewArgs()
|
|
taskFilters.Add("service", service.ID)
|
|
|
|
tasks, err := sm.client.TaskList(ctx, types.TaskListOptions{
|
|
Filters: taskFilters,
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
|
|
}
|
|
|
|
status := &ServiceStatus{
|
|
ServiceID: service.ID,
|
|
ServiceName: serviceName,
|
|
Image: service.Spec.TaskTemplate.ContainerSpec.Image,
|
|
CreatedAt: service.CreatedAt,
|
|
UpdatedAt: service.UpdatedAt,
|
|
Tasks: make([]TaskStatus, 0, len(tasks)),
|
|
}
|
|
|
|
if service.Spec.Mode.Replicated != nil {
|
|
status.DesiredReplicas = int(*service.Spec.Mode.Replicated.Replicas)
|
|
}
|
|
|
|
// Process tasks
|
|
runningCount := 0
|
|
for _, task := range tasks {
|
|
taskStatus := TaskStatus{
|
|
TaskID: task.ID,
|
|
NodeID: task.NodeID,
|
|
State: string(task.Status.State),
|
|
Message: task.Status.Message,
|
|
CreatedAt: task.CreatedAt,
|
|
UpdatedAt: task.UpdatedAt,
|
|
}
|
|
|
|
if task.Status.Timestamp != nil {
|
|
taskStatus.StatusTimestamp = *task.Status.Timestamp
|
|
}
|
|
|
|
status.Tasks = append(status.Tasks, taskStatus)
|
|
|
|
if task.Status.State == swarm.TaskStateRunning {
|
|
runningCount++
|
|
}
|
|
}
|
|
|
|
status.RunningReplicas = runningCount
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// CreateCHORUSService creates a new CHORUS service with the specified configuration
|
|
func (sm *SwarmManager) CreateCHORUSService(ctx context.Context, config *CHORUSServiceConfig) (*swarm.Service, error) {
|
|
ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.create_chorus_service")
|
|
defer span.End()
|
|
|
|
// Build service specification
|
|
serviceSpec := swarm.ServiceSpec{
|
|
Annotations: swarm.Annotations{
|
|
Name: config.ServiceName,
|
|
Labels: config.Labels,
|
|
},
|
|
TaskTemplate: swarm.TaskSpec{
|
|
ContainerSpec: &swarm.ContainerSpec{
|
|
Image: config.Image,
|
|
Env: buildEnvironmentList(config.Environment),
|
|
},
|
|
Resources: &swarm.ResourceRequirements{
|
|
Limits: &swarm.Resources{
|
|
NanoCPUs: config.Resources.CPULimit,
|
|
MemoryBytes: config.Resources.MemoryLimit,
|
|
},
|
|
Reservations: &swarm.Resources{
|
|
NanoCPUs: config.Resources.CPURequest,
|
|
MemoryBytes: config.Resources.MemoryRequest,
|
|
},
|
|
},
|
|
Placement: &swarm.Placement{
|
|
Constraints: config.Placement.Constraints,
|
|
},
|
|
},
|
|
Mode: swarm.ServiceMode{
|
|
Replicated: &swarm.ReplicatedService{
|
|
Replicas: uint64Ptr(uint64(config.InitialReplicas)),
|
|
},
|
|
},
|
|
Networks: buildNetworkAttachments(config.Networks),
|
|
UpdateConfig: &swarm.UpdateConfig{
|
|
Parallelism: 1,
|
|
Delay: 15 * time.Second,
|
|
Order: swarm.UpdateOrderStartFirst,
|
|
},
|
|
}
|
|
|
|
// Add volumes if specified
|
|
if len(config.Volumes) > 0 {
|
|
serviceSpec.TaskTemplate.ContainerSpec.Mounts = buildMounts(config.Volumes)
|
|
}
|
|
|
|
// Create the service
|
|
response, err := sm.client.ServiceCreate(ctx, serviceSpec, types.ServiceCreateOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create service %s: %w", config.ServiceName, err)
|
|
}
|
|
|
|
// Get the created service
|
|
service, _, err := sm.client.ServiceInspectWithRaw(ctx, response.ID, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to inspect created service: %w", err)
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.String("service.name", config.ServiceName),
|
|
attribute.String("service.id", response.ID),
|
|
attribute.Int("service.initial_replicas", config.InitialReplicas),
|
|
attribute.String("service.image", config.Image),
|
|
)
|
|
|
|
log.Info().
|
|
Str("service_name", config.ServiceName).
|
|
Str("service_id", response.ID).
|
|
Int("initial_replicas", config.InitialReplicas).
|
|
Str("image", config.Image).
|
|
Msg("Created CHORUS service")
|
|
|
|
return &service, nil
|
|
}
|
|
|
|
// AgentDeploymentConfig defines configuration for deploying an agent
|
|
type AgentDeploymentConfig struct {
|
|
TeamID string `json:"team_id"`
|
|
TaskID string `json:"task_id"`
|
|
AgentRole string `json:"agent_role"` // executor, coordinator, reviewer
|
|
AgentType string `json:"agent_type"` // general, specialized
|
|
Image string `json:"image"` // Docker image to use
|
|
Replicas uint64 `json:"replicas"` // Number of instances
|
|
Resources ResourceLimits `json:"resources"` // CPU/Memory limits
|
|
Environment map[string]string `json:"environment"` // Environment variables
|
|
TaskContext TaskContext `json:"task_context"` // Task-specific context
|
|
Networks []string `json:"networks"` // Docker networks to join
|
|
Volumes []VolumeMount `json:"volumes"` // Volume mounts
|
|
Placement PlacementConfig `json:"placement"` // Node placement constraints
|
|
GoalID string `json:"goal_id,omitempty"`
|
|
PulseID string `json:"pulse_id,omitempty"`
|
|
}
|
|
|
|
// ResourceLimits holds the CPU and memory limits and requests for a
// container. CPU values are in nano CPUs (1e9 = 1 CPU); memory values are in
// bytes.
type ResourceLimits struct {
	// Upper bounds.
	CPULimit    int64 `json:"cpu_limit"`    // CPU limit in nano CPUs (1e9 = 1 CPU)
	MemoryLimit int64 `json:"memory_limit"` // Memory limit in bytes

	// Requested (reserved) amounts.
	CPURequest    int64 `json:"cpu_request"`    // CPU request in nano CPUs
	MemoryRequest int64 `json:"memory_request"` // Memory request in bytes
}
|
|
|
|
// TaskContext carries the task-specific information handed to an agent:
// a description of the issue being worked on plus free-form metadata.
type TaskContext struct {
	// Issue description.
	IssueTitle       string `json:"issue_title"`
	IssueDescription string `json:"issue_description"`
	Repository       string `json:"repository"`

	// Lists describing the work.
	TechStack    []string `json:"tech_stack"`
	Requirements []string `json:"requirements"`

	// Remaining attributes.
	Priority    string                 `json:"priority"`
	ExternalURL string                 `json:"external_url"`
	Metadata    map[string]interface{} `json:"metadata"`
}
|
|
|
|
// VolumeMount describes a single mount to attach to a container.
type VolumeMount struct {
	// Source is the host path or volume name.
	Source string `json:"source"`
	// Target is the path inside the container.
	Target string `json:"target"`
	// ReadOnly marks the mount as read-only.
	ReadOnly bool `json:"readonly"`
	// Type is one of: bind, volume, tmpfs.
	Type string `json:"type"`
}
|
|
|
|
// PlacementConfig defines where containers should be placed
|
|
type PlacementConfig struct {
|
|
Constraints []string `json:"constraints"` // Node constraints
|
|
Preferences []PlacementPref `json:"preferences"` // Placement preferences
|
|
Platforms []Platform `json:"platforms"` // Target platforms
|
|
}
|
|
|
|
// PlacementPref is a single placement preference.
type PlacementPref struct {
	// Spread is the descriptor tasks are spread over (spread across nodes).
	Spread string `json:"spread"`
}
|
|
|
|
// Platform identifies a target platform for containers.
type Platform struct {
	// Architecture is the CPU architecture: amd64, arm64, etc.
	Architecture string `json:"architecture"`
	// OS is the operating system: linux, windows.
	OS string `json:"os"`
}
|
|
|
|
// DeployAgent deploys an agent service to Docker Swarm
|
|
func (sm *SwarmManager) DeployAgent(config *AgentDeploymentConfig) (*swarm.Service, error) {
|
|
ctx, span := tracing.StartDeploymentSpan(sm.ctx, "deploy_agent", config.AgentRole)
|
|
defer span.End()
|
|
|
|
// Add tracing attributes
|
|
span.SetAttributes(
|
|
attribute.String("agent.team_id", config.TeamID),
|
|
attribute.String("agent.task_id", config.TaskID),
|
|
attribute.String("agent.role", config.AgentRole),
|
|
attribute.String("agent.type", config.AgentType),
|
|
attribute.String("agent.image", config.Image),
|
|
)
|
|
|
|
// Add goal.id and pulse.id if available in config
|
|
if config.GoalID != "" {
|
|
span.SetAttributes(attribute.String("goal.id", config.GoalID))
|
|
}
|
|
if config.PulseID != "" {
|
|
span.SetAttributes(attribute.String("pulse.id", config.PulseID))
|
|
}
|
|
|
|
log.Info().
|
|
Str("team_id", config.TeamID).
|
|
Str("task_id", config.TaskID).
|
|
Str("agent_role", config.AgentRole).
|
|
Str("image", config.Image).
|
|
Msg("🚀 Deploying agent to Docker Swarm")
|
|
|
|
// Generate unique service name
|
|
serviceName := fmt.Sprintf("whoosh-agent-%s-%s-%s",
|
|
config.TeamID[:8],
|
|
config.TaskID[:8],
|
|
config.AgentRole,
|
|
)
|
|
|
|
// Build environment variables
|
|
env := sm.buildEnvironment(config)
|
|
|
|
// Build volume mounts
|
|
mounts := sm.buildMounts(config.Volumes)
|
|
|
|
// Build resource specifications
|
|
resources := sm.buildResources(config.Resources)
|
|
|
|
// Build placement constraints
|
|
placement := sm.buildPlacement(config.Placement)
|
|
|
|
// Create service specification
|
|
serviceSpec := swarm.ServiceSpec{
|
|
Annotations: swarm.Annotations{
|
|
Name: serviceName,
|
|
Labels: map[string]string{
|
|
"whoosh.team_id": config.TeamID,
|
|
"whoosh.task_id": config.TaskID,
|
|
"whoosh.agent_role": config.AgentRole,
|
|
"whoosh.agent_type": config.AgentType,
|
|
"whoosh.managed_by": "whoosh",
|
|
"whoosh.created_at": time.Now().Format(time.RFC3339),
|
|
},
|
|
},
|
|
TaskTemplate: swarm.TaskSpec{
|
|
ContainerSpec: &swarm.ContainerSpec{
|
|
Image: config.Image,
|
|
Env: env,
|
|
Mounts: mounts,
|
|
Labels: map[string]string{
|
|
"whoosh.team_id": config.TeamID,
|
|
"whoosh.task_id": config.TaskID,
|
|
"whoosh.agent_role": config.AgentRole,
|
|
},
|
|
// Add healthcheck
|
|
Healthcheck: &container.HealthConfig{
|
|
Test: []string{"CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"},
|
|
Interval: 30 * time.Second,
|
|
Timeout: 10 * time.Second,
|
|
Retries: 3,
|
|
},
|
|
},
|
|
Resources: resources,
|
|
Placement: placement,
|
|
Networks: sm.buildNetworks(config.Networks),
|
|
},
|
|
Mode: swarm.ServiceMode{
|
|
Replicated: &swarm.ReplicatedService{
|
|
Replicas: &config.Replicas,
|
|
},
|
|
},
|
|
UpdateConfig: &swarm.UpdateConfig{
|
|
Parallelism: 1,
|
|
Order: "start-first",
|
|
},
|
|
// RollbackConfig removed for compatibility
|
|
}
|
|
|
|
// Create the service
|
|
response, err := sm.client.ServiceCreate(ctx, serviceSpec, types.ServiceCreateOptions{})
|
|
if err != nil {
|
|
tracing.SetSpanError(span, err)
|
|
span.SetAttributes(
|
|
attribute.String("deployment.status", "failed"),
|
|
attribute.String("deployment.service_name", serviceName),
|
|
)
|
|
return nil, fmt.Errorf("failed to create agent service: %w", err)
|
|
}
|
|
|
|
// Add success metrics to span
|
|
span.SetAttributes(
|
|
attribute.String("deployment.status", "success"),
|
|
attribute.String("deployment.service_id", response.ID),
|
|
attribute.String("deployment.service_name", serviceName),
|
|
attribute.Int64("deployment.replicas", int64(config.Replicas)),
|
|
)
|
|
|
|
log.Info().
|
|
Str("service_id", response.ID).
|
|
Str("service_name", serviceName).
|
|
Msg("✅ Agent service created successfully")
|
|
|
|
// Wait for service to be created and return service info
|
|
service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, response.ID, types.ServiceInspectOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to inspect created service: %w", err)
|
|
}
|
|
|
|
return &service, nil
|
|
}
|
|
|
|
// buildEnvironment constructs environment variables for the container
|
|
func (sm *SwarmManager) buildEnvironment(config *AgentDeploymentConfig) []string {
|
|
env := []string{
|
|
fmt.Sprintf("WHOOSH_TEAM_ID=%s", config.TeamID),
|
|
fmt.Sprintf("WHOOSH_TASK_ID=%s", config.TaskID),
|
|
fmt.Sprintf("WHOOSH_AGENT_ROLE=%s", config.AgentRole),
|
|
fmt.Sprintf("WHOOSH_AGENT_TYPE=%s", config.AgentType),
|
|
}
|
|
|
|
// Add task context as environment variables
|
|
if config.TaskContext.IssueTitle != "" {
|
|
env = append(env, fmt.Sprintf("TASK_TITLE=%s", config.TaskContext.IssueTitle))
|
|
}
|
|
if config.TaskContext.Repository != "" {
|
|
env = append(env, fmt.Sprintf("TASK_REPOSITORY=%s", config.TaskContext.Repository))
|
|
}
|
|
if config.TaskContext.Priority != "" {
|
|
env = append(env, fmt.Sprintf("TASK_PRIORITY=%s", config.TaskContext.Priority))
|
|
}
|
|
if config.TaskContext.ExternalURL != "" {
|
|
env = append(env, fmt.Sprintf("TASK_EXTERNAL_URL=%s", config.TaskContext.ExternalURL))
|
|
}
|
|
|
|
// Add tech stack as JSON
|
|
if len(config.TaskContext.TechStack) > 0 {
|
|
techStackJSON, _ := json.Marshal(config.TaskContext.TechStack)
|
|
env = append(env, fmt.Sprintf("TASK_TECH_STACK=%s", string(techStackJSON)))
|
|
}
|
|
|
|
// Add requirements as JSON
|
|
if len(config.TaskContext.Requirements) > 0 {
|
|
requirementsJSON, _ := json.Marshal(config.TaskContext.Requirements)
|
|
env = append(env, fmt.Sprintf("TASK_REQUIREMENTS=%s", string(requirementsJSON)))
|
|
}
|
|
|
|
// Add custom environment variables
|
|
for key, value := range config.Environment {
|
|
env = append(env, fmt.Sprintf("%s=%s", key, value))
|
|
}
|
|
|
|
return env
|
|
}
|
|
|
|
// buildMounts constructs volume mounts for the container
|
|
func (sm *SwarmManager) buildMounts(volumes []VolumeMount) []mount.Mount {
|
|
mounts := make([]mount.Mount, len(volumes))
|
|
|
|
for i, vol := range volumes {
|
|
mountType := mount.TypeBind
|
|
switch vol.Type {
|
|
case "volume":
|
|
mountType = mount.TypeVolume
|
|
case "tmpfs":
|
|
mountType = mount.TypeTmpfs
|
|
}
|
|
|
|
mounts[i] = mount.Mount{
|
|
Type: mountType,
|
|
Source: vol.Source,
|
|
Target: vol.Target,
|
|
ReadOnly: vol.ReadOnly,
|
|
}
|
|
}
|
|
|
|
// Add default workspace volume
|
|
mounts = append(mounts, mount.Mount{
|
|
Type: mount.TypeVolume,
|
|
Source: fmt.Sprintf("whoosh-workspace"), // Shared workspace volume
|
|
Target: "/workspace",
|
|
ReadOnly: false,
|
|
})
|
|
|
|
return mounts
|
|
}
|
|
|
|
// buildResources constructs resource specifications
|
|
func (sm *SwarmManager) buildResources(limits ResourceLimits) *swarm.ResourceRequirements {
|
|
resources := &swarm.ResourceRequirements{}
|
|
|
|
// Set limits
|
|
if limits.CPULimit > 0 || limits.MemoryLimit > 0 {
|
|
resources.Limits = &swarm.Limit{}
|
|
if limits.CPULimit > 0 {
|
|
resources.Limits.NanoCPUs = limits.CPULimit
|
|
}
|
|
if limits.MemoryLimit > 0 {
|
|
resources.Limits.MemoryBytes = limits.MemoryLimit
|
|
}
|
|
}
|
|
|
|
// Set requests/reservations
|
|
if limits.CPURequest > 0 || limits.MemoryRequest > 0 {
|
|
resources.Reservations = &swarm.Resources{}
|
|
if limits.CPURequest > 0 {
|
|
resources.Reservations.NanoCPUs = limits.CPURequest
|
|
}
|
|
if limits.MemoryRequest > 0 {
|
|
resources.Reservations.MemoryBytes = limits.MemoryRequest
|
|
}
|
|
}
|
|
|
|
return resources
|
|
}
|
|
|
|
// buildPlacement constructs placement specifications
|
|
func (sm *SwarmManager) buildPlacement(config PlacementConfig) *swarm.Placement {
|
|
placement := &swarm.Placement{
|
|
Constraints: config.Constraints,
|
|
}
|
|
|
|
// Add preferences
|
|
for _, pref := range config.Preferences {
|
|
placement.Preferences = append(placement.Preferences, swarm.PlacementPreference{
|
|
Spread: &swarm.SpreadOver{
|
|
SpreadDescriptor: pref.Spread,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Add platforms
|
|
for _, platform := range config.Platforms {
|
|
placement.Platforms = append(placement.Platforms, swarm.Platform{
|
|
Architecture: platform.Architecture,
|
|
OS: platform.OS,
|
|
})
|
|
}
|
|
|
|
return placement
|
|
}
|
|
|
|
// buildNetworks constructs network specifications
|
|
func (sm *SwarmManager) buildNetworks(networks []string) []swarm.NetworkAttachmentConfig {
|
|
if len(networks) == 0 {
|
|
// Default to chorus_default network
|
|
networks = []string{"chorus_default"}
|
|
}
|
|
|
|
networkConfigs := make([]swarm.NetworkAttachmentConfig, len(networks))
|
|
for i, networkName := range networks {
|
|
networkConfigs[i] = swarm.NetworkAttachmentConfig{
|
|
Target: networkName,
|
|
}
|
|
}
|
|
|
|
return networkConfigs
|
|
}
|
|
|
|
// RemoveAgent removes an agent service from Docker Swarm
|
|
func (sm *SwarmManager) RemoveAgent(serviceID string) error {
|
|
log.Info().
|
|
Str("service_id", serviceID).
|
|
Msg("🗑️ Removing agent service from Docker Swarm")
|
|
|
|
err := sm.client.ServiceRemove(sm.ctx, serviceID)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to remove service: %w", err)
|
|
}
|
|
|
|
log.Info().
|
|
Str("service_id", serviceID).
|
|
Msg("✅ Agent service removed successfully")
|
|
|
|
return nil
|
|
}
|
|
|
|
// ListAgentServices lists all agent services managed by WHOOSH
|
|
func (sm *SwarmManager) ListAgentServices() ([]swarm.Service, error) {
|
|
services, err := sm.client.ServiceList(sm.ctx, types.ServiceListOptions{
|
|
Filters: filters.NewArgs(),
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to list services: %w", err)
|
|
}
|
|
|
|
// Filter for WHOOSH-managed services
|
|
var agentServices []swarm.Service
|
|
for _, service := range services {
|
|
if managed, exists := service.Spec.Labels["whoosh.managed_by"]; exists && managed == "whoosh" {
|
|
agentServices = append(agentServices, service)
|
|
}
|
|
}
|
|
|
|
return agentServices, nil
|
|
}
|
|
|
|
// GetServiceLogs retrieves logs for a service
|
|
func (sm *SwarmManager) GetServiceLogs(serviceID string, lines int) (string, error) {
|
|
// Create logs options struct inline to avoid import issues
|
|
options := struct {
|
|
ShowStdout bool
|
|
ShowStderr bool
|
|
Since string
|
|
Until string
|
|
Timestamps bool
|
|
Follow bool
|
|
Tail string
|
|
Details bool
|
|
}{
|
|
ShowStdout: true,
|
|
ShowStderr: true,
|
|
Tail: fmt.Sprintf("%d", lines),
|
|
Timestamps: true,
|
|
}
|
|
|
|
reader, err := sm.client.ServiceLogs(sm.ctx, serviceID, options)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to get service logs: %w", err)
|
|
}
|
|
defer reader.Close()
|
|
|
|
logs, err := io.ReadAll(reader)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read service logs: %w", err)
|
|
}
|
|
|
|
return string(logs), nil
|
|
}
|
|
|
|
|
|
|
|
// ServiceStatus represents the current status of a service with detailed task information
|
|
type ServiceStatus struct {
|
|
ServiceID string `json:"service_id"`
|
|
ServiceName string `json:"service_name"`
|
|
Image string `json:"image"`
|
|
DesiredReplicas int `json:"desired_replicas"`
|
|
RunningReplicas int `json:"running_replicas"`
|
|
Tasks []TaskStatus `json:"tasks"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at"`
|
|
}
|
|
|
|
// TaskStatus describes one task of a service: where it runs, its state and
// status message, and its timestamps.
type TaskStatus struct {
	// Identity and placement.
	TaskID string `json:"task_id"`
	NodeID string `json:"node_id"`

	// Reported state and accompanying message.
	State   string `json:"state"`
	Message string `json:"message"`

	// Timestamps; StatusTimestamp is the time attached to the status itself.
	CreatedAt       time.Time `json:"created_at"`
	UpdatedAt       time.Time `json:"updated_at"`
	StatusTimestamp time.Time `json:"status_timestamp"`
}
|
|
|
|
// CHORUSServiceConfig represents configuration for creating a CHORUS service
|
|
type CHORUSServiceConfig struct {
|
|
ServiceName string `json:"service_name"`
|
|
Image string `json:"image"`
|
|
InitialReplicas int `json:"initial_replicas"`
|
|
Environment map[string]string `json:"environment"`
|
|
Labels map[string]string `json:"labels"`
|
|
Networks []string `json:"networks"`
|
|
Volumes []VolumeMount `json:"volumes"`
|
|
Resources ResourceLimits `json:"resources"`
|
|
Placement PlacementConfig `json:"placement"`
|
|
}
|
|
|
|
// CleanupFailedServices removes failed services
|
|
func (sm *SwarmManager) CleanupFailedServices() error {
|
|
services, err := sm.ListAgentServices()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list services: %w", err)
|
|
}
|
|
|
|
for _, service := range services {
|
|
status, err := sm.GetServiceStatus(service.ID)
|
|
if err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("service_id", service.ID).
|
|
Msg("Failed to get service status")
|
|
continue
|
|
}
|
|
|
|
// Remove services with all failed tasks and no running tasks
|
|
if status.FailedTasks > 0 && status.RunningTasks == 0 {
|
|
log.Warn().
|
|
Str("service_id", service.ID).
|
|
Str("service_name", service.Spec.Name).
|
|
Uint64("failed_tasks", status.FailedTasks).
|
|
Msg("Removing failed service")
|
|
|
|
err = sm.RemoveAgent(service.ID)
|
|
if err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("service_id", service.ID).
|
|
Msg("Failed to remove failed service")
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Helper functions for SwarmManager
|
|
|
|
// uint64Ptr returns a pointer to a freshly allocated uint64 holding v. Each
// call yields a distinct pointer.
func uint64Ptr(v uint64) *uint64 {
	p := new(uint64)
	*p = v
	return p
}
|
|
|
|
// buildEnvironmentList flattens an environment map into "KEY=value" strings,
// one per entry, in map iteration order. A nil or empty map yields a nil
// slice.
func buildEnvironmentList(env map[string]string) []string {
	if len(env) == 0 {
		return nil
	}

	var pairs []string
	for name, val := range env {
		pair := fmt.Sprintf("%s=%s", name, val)
		pairs = append(pairs, pair)
	}
	return pairs
}
|
|
|
|
// buildNetworkAttachments converts network names to attachment configs
|
|
func buildNetworkAttachments(networks []string) []swarm.NetworkAttachmentConfig {
|
|
if len(networks) == 0 {
|
|
networks = []string{"chorus_default"}
|
|
}
|
|
|
|
var attachments []swarm.NetworkAttachmentConfig
|
|
for _, network := range networks {
|
|
attachments = append(attachments, swarm.NetworkAttachmentConfig{
|
|
Target: network,
|
|
})
|
|
}
|
|
return attachments
|
|
}
|
|
|
|
// buildMounts converts volume mounts to Docker mount specs
|
|
func buildMounts(volumes []VolumeMount) []mount.Mount {
|
|
var mounts []mount.Mount
|
|
|
|
for _, vol := range volumes {
|
|
mountType := mount.TypeBind
|
|
switch vol.Type {
|
|
case "volume":
|
|
mountType = mount.TypeVolume
|
|
case "tmpfs":
|
|
mountType = mount.TypeTmpfs
|
|
}
|
|
|
|
mounts = append(mounts, mount.Mount{
|
|
Type: mountType,
|
|
Source: vol.Source,
|
|
Target: vol.Target,
|
|
ReadOnly: vol.ReadOnly,
|
|
})
|
|
}
|
|
|
|
return mounts
|
|
} |