Implement wave-based scaling system for CHORUS Docker Swarm orchestration
- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -77,6 +77,236 @@ func (sm *SwarmManager) Close() error {
|
||||
return sm.client.Close()
|
||||
}
|
||||
|
||||
// ScaleService scales a Docker Swarm service to the specified replica count
|
||||
func (sm *SwarmManager) ScaleService(ctx context.Context, serviceName string, replicas int) error {
|
||||
ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.scale_service")
|
||||
defer span.End()
|
||||
|
||||
// Get the service
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
// Update replica count
|
||||
serviceSpec := service.Spec
|
||||
if serviceSpec.Mode.Replicated == nil {
|
||||
return fmt.Errorf("service %s is not in replicated mode", serviceName)
|
||||
}
|
||||
|
||||
currentReplicas := *serviceSpec.Mode.Replicated.Replicas
|
||||
serviceSpec.Mode.Replicated.Replicas = uint64Ptr(uint64(replicas))
|
||||
|
||||
// Update the service
|
||||
updateResponse, err := sm.client.ServiceUpdate(
|
||||
ctx,
|
||||
service.ID,
|
||||
service.Version,
|
||||
serviceSpec,
|
||||
types.ServiceUpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.String("service.name", serviceName),
|
||||
attribute.String("service.id", service.ID),
|
||||
attribute.Int("scaling.current_replicas", int(currentReplicas)),
|
||||
attribute.Int("scaling.target_replicas", replicas),
|
||||
)
|
||||
|
||||
log.Info().
|
||||
Str("service_name", serviceName).
|
||||
Str("service_id", service.ID).
|
||||
Uint64("current_replicas", currentReplicas).
|
||||
Int("target_replicas", replicas).
|
||||
Str("update_id", updateResponse.ID).
|
||||
Msg("Scaled service")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetServiceReplicas returns the current replica count for a service
|
||||
func (sm *SwarmManager) GetServiceReplicas(ctx context.Context, serviceName string) (int, error) {
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
if service.Spec.Mode.Replicated == nil {
|
||||
return 0, fmt.Errorf("service %s is not in replicated mode", serviceName)
|
||||
}
|
||||
|
||||
return int(*service.Spec.Mode.Replicated.Replicas), nil
|
||||
}
|
||||
|
||||
// GetRunningReplicas returns the number of currently running replicas for a service
|
||||
func (sm *SwarmManager) GetRunningReplicas(ctx context.Context, serviceName string) (int, error) {
|
||||
// Get service to get its ID
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
// List tasks for this service
|
||||
taskFilters := filters.NewArgs()
|
||||
taskFilters.Add("service", service.ID)
|
||||
|
||||
tasks, err := sm.client.TaskList(ctx, types.TaskListOptions{
|
||||
Filters: taskFilters,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
// Count running tasks
|
||||
runningCount := 0
|
||||
for _, task := range tasks {
|
||||
if task.Status.State == swarm.TaskStateRunning {
|
||||
runningCount++
|
||||
}
|
||||
}
|
||||
|
||||
return runningCount, nil
|
||||
}
|
||||
|
||||
// GetServiceStatus returns detailed status information for a service
|
||||
func (sm *SwarmManager) GetServiceStatus(ctx context.Context, serviceName string) (*ServiceStatus, error) {
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
// Get tasks for detailed status
|
||||
taskFilters := filters.NewArgs()
|
||||
taskFilters.Add("service", service.ID)
|
||||
|
||||
tasks, err := sm.client.TaskList(ctx, types.TaskListOptions{
|
||||
Filters: taskFilters,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
|
||||
}
|
||||
|
||||
status := &ServiceStatus{
|
||||
ServiceID: service.ID,
|
||||
ServiceName: serviceName,
|
||||
Image: service.Spec.TaskTemplate.ContainerSpec.Image,
|
||||
CreatedAt: service.CreatedAt,
|
||||
UpdatedAt: service.UpdatedAt,
|
||||
Tasks: make([]TaskStatus, 0, len(tasks)),
|
||||
}
|
||||
|
||||
if service.Spec.Mode.Replicated != nil {
|
||||
status.DesiredReplicas = int(*service.Spec.Mode.Replicated.Replicas)
|
||||
}
|
||||
|
||||
// Process tasks
|
||||
runningCount := 0
|
||||
for _, task := range tasks {
|
||||
taskStatus := TaskStatus{
|
||||
TaskID: task.ID,
|
||||
NodeID: task.NodeID,
|
||||
State: string(task.Status.State),
|
||||
Message: task.Status.Message,
|
||||
CreatedAt: task.CreatedAt,
|
||||
UpdatedAt: task.UpdatedAt,
|
||||
}
|
||||
|
||||
if task.Status.Timestamp != nil {
|
||||
taskStatus.StatusTimestamp = *task.Status.Timestamp
|
||||
}
|
||||
|
||||
status.Tasks = append(status.Tasks, taskStatus)
|
||||
|
||||
if task.Status.State == swarm.TaskStateRunning {
|
||||
runningCount++
|
||||
}
|
||||
}
|
||||
|
||||
status.RunningReplicas = runningCount
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// CreateCHORUSService creates a new CHORUS service with the specified configuration
|
||||
func (sm *SwarmManager) CreateCHORUSService(ctx context.Context, config *CHORUSServiceConfig) (*swarm.Service, error) {
|
||||
ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.create_chorus_service")
|
||||
defer span.End()
|
||||
|
||||
// Build service specification
|
||||
serviceSpec := swarm.ServiceSpec{
|
||||
Annotations: swarm.Annotations{
|
||||
Name: config.ServiceName,
|
||||
Labels: config.Labels,
|
||||
},
|
||||
TaskTemplate: swarm.TaskSpec{
|
||||
ContainerSpec: &swarm.ContainerSpec{
|
||||
Image: config.Image,
|
||||
Env: buildEnvironmentList(config.Environment),
|
||||
},
|
||||
Resources: &swarm.ResourceRequirements{
|
||||
Limits: &swarm.Resources{
|
||||
NanoCPUs: config.Resources.CPULimit,
|
||||
MemoryBytes: config.Resources.MemoryLimit,
|
||||
},
|
||||
Reservations: &swarm.Resources{
|
||||
NanoCPUs: config.Resources.CPURequest,
|
||||
MemoryBytes: config.Resources.MemoryRequest,
|
||||
},
|
||||
},
|
||||
Placement: &swarm.Placement{
|
||||
Constraints: config.Placement.Constraints,
|
||||
},
|
||||
},
|
||||
Mode: swarm.ServiceMode{
|
||||
Replicated: &swarm.ReplicatedService{
|
||||
Replicas: uint64Ptr(uint64(config.InitialReplicas)),
|
||||
},
|
||||
},
|
||||
Networks: buildNetworkAttachments(config.Networks),
|
||||
UpdateConfig: &swarm.UpdateConfig{
|
||||
Parallelism: 1,
|
||||
Delay: 15 * time.Second,
|
||||
Order: swarm.UpdateOrderStartFirst,
|
||||
},
|
||||
}
|
||||
|
||||
// Add volumes if specified
|
||||
if len(config.Volumes) > 0 {
|
||||
serviceSpec.TaskTemplate.ContainerSpec.Mounts = buildMounts(config.Volumes)
|
||||
}
|
||||
|
||||
// Create the service
|
||||
response, err := sm.client.ServiceCreate(ctx, serviceSpec, types.ServiceCreateOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create service %s: %w", config.ServiceName, err)
|
||||
}
|
||||
|
||||
// Get the created service
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(ctx, response.ID, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to inspect created service: %w", err)
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.String("service.name", config.ServiceName),
|
||||
attribute.String("service.id", response.ID),
|
||||
attribute.Int("service.initial_replicas", config.InitialReplicas),
|
||||
attribute.String("service.image", config.Image),
|
||||
)
|
||||
|
||||
log.Info().
|
||||
Str("service_name", config.ServiceName).
|
||||
Str("service_id", response.ID).
|
||||
Int("initial_replicas", config.InitialReplicas).
|
||||
Str("image", config.Image).
|
||||
Msg("Created CHORUS service")
|
||||
|
||||
return &service, nil
|
||||
}
|
||||
|
||||
// AgentDeploymentConfig defines configuration for deploying an agent
|
||||
type AgentDeploymentConfig struct {
|
||||
TeamID string `json:"team_id"`
|
||||
@@ -487,94 +717,42 @@ func (sm *SwarmManager) GetServiceLogs(serviceID string, lines int) (string, err
|
||||
return string(logs), nil
|
||||
}
|
||||
|
||||
// ScaleService scales a service to the specified number of replicas
|
||||
func (sm *SwarmManager) ScaleService(serviceID string, replicas uint64) error {
|
||||
log.Info().
|
||||
Str("service_id", serviceID).
|
||||
Uint64("replicas", replicas).
|
||||
Msg("📈 Scaling agent service")
|
||||
|
||||
// Get current service spec
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to inspect service: %w", err)
|
||||
}
|
||||
|
||||
// Update replicas
|
||||
service.Spec.Mode.Replicated.Replicas = &replicas
|
||||
|
||||
// Update the service
|
||||
_, err = sm.client.ServiceUpdate(sm.ctx, serviceID, service.Version, service.Spec, types.ServiceUpdateOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to scale service: %w", err)
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Str("service_id", serviceID).
|
||||
Uint64("replicas", replicas).
|
||||
Msg("✅ Service scaled successfully")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetServiceStatus returns the current status of a service
|
||||
func (sm *SwarmManager) GetServiceStatus(serviceID string) (*ServiceStatus, error) {
|
||||
service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to inspect service: %w", err)
|
||||
}
|
||||
|
||||
// Get task status
|
||||
tasks, err := sm.client.TaskList(sm.ctx, types.TaskListOptions{
|
||||
Filters: filters.NewArgs(filters.Arg("service", serviceID)),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list tasks: %w", err)
|
||||
}
|
||||
|
||||
status := &ServiceStatus{
|
||||
ServiceID: serviceID,
|
||||
ServiceName: service.Spec.Name,
|
||||
Image: service.Spec.TaskTemplate.ContainerSpec.Image,
|
||||
Replicas: 0,
|
||||
RunningTasks: 0,
|
||||
FailedTasks: 0,
|
||||
TaskStates: make(map[string]int),
|
||||
CreatedAt: service.CreatedAt,
|
||||
UpdatedAt: service.UpdatedAt,
|
||||
}
|
||||
|
||||
if service.Spec.Mode.Replicated != nil && service.Spec.Mode.Replicated.Replicas != nil {
|
||||
status.Replicas = *service.Spec.Mode.Replicated.Replicas
|
||||
}
|
||||
|
||||
// Count task states
|
||||
for _, task := range tasks {
|
||||
state := string(task.Status.State)
|
||||
status.TaskStates[state]++
|
||||
|
||||
switch task.Status.State {
|
||||
case swarm.TaskStateRunning:
|
||||
status.RunningTasks++
|
||||
case swarm.TaskStateFailed:
|
||||
status.FailedTasks++
|
||||
}
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// ServiceStatus represents the current status of a service
|
||||
// ServiceStatus represents the current status of a service with detailed task information
|
||||
type ServiceStatus struct {
|
||||
ServiceID string `json:"service_id"`
|
||||
ServiceName string `json:"service_name"`
|
||||
Image string `json:"image"`
|
||||
Replicas uint64 `json:"replicas"`
|
||||
RunningTasks uint64 `json:"running_tasks"`
|
||||
FailedTasks uint64 `json:"failed_tasks"`
|
||||
TaskStates map[string]int `json:"task_states"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
ServiceID string `json:"service_id"`
|
||||
ServiceName string `json:"service_name"`
|
||||
Image string `json:"image"`
|
||||
DesiredReplicas int `json:"desired_replicas"`
|
||||
RunningReplicas int `json:"running_replicas"`
|
||||
Tasks []TaskStatus `json:"tasks"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// TaskStatus describes a single task of a service: its identifiers, its
// reported state and message, and the associated timestamps.
type TaskStatus struct {
	// Identity of the task and the node it is assigned to.
	TaskID string `json:"task_id"`
	NodeID string `json:"node_id"`

	// Reported state and the accompanying status message.
	State   string `json:"state"`
	Message string `json:"message"`

	// Creation and update times of the task, plus the time of the status report.
	CreatedAt       time.Time `json:"created_at"`
	UpdatedAt       time.Time `json:"updated_at"`
	StatusTimestamp time.Time `json:"status_timestamp"`
}
|
||||
|
||||
// CHORUSServiceConfig represents configuration for creating a CHORUS service
|
||||
type CHORUSServiceConfig struct {
|
||||
ServiceName string `json:"service_name"`
|
||||
Image string `json:"image"`
|
||||
InitialReplicas int `json:"initial_replicas"`
|
||||
Environment map[string]string `json:"environment"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Networks []string `json:"networks"`
|
||||
Volumes []VolumeMount `json:"volumes"`
|
||||
Resources ResourceLimits `json:"resources"`
|
||||
Placement PlacementConfig `json:"placement"`
|
||||
}
|
||||
|
||||
// CleanupFailedServices removes failed services
|
||||
@@ -611,6 +789,61 @@ func (sm *SwarmManager) CleanupFailedServices() error {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helper functions for SwarmManager
|
||||
|
||||
// uint64Ptr allocates a fresh uint64 holding v and returns its address.
func uint64Ptr(v uint64) *uint64 {
	p := new(uint64)
	*p = v
	return p
}
|
||||
|
||||
// buildEnvironmentList renders each map entry as a "KEY=value" string.
//
// A nil or empty map yields a nil slice. Entries follow Go's map iteration
// order, so the order of the result is not stable between calls.
func buildEnvironmentList(env map[string]string) []string {
	if len(env) == 0 {
		return nil
	}

	pairs := make([]string, 0, len(env))
	for name := range env {
		pairs = append(pairs, fmt.Sprintf("%s=%s", name, env[name]))
	}
	return pairs
}
|
||||
|
||||
// buildNetworkAttachments converts network names to attachment configs
|
||||
func buildNetworkAttachments(networks []string) []swarm.NetworkAttachmentConfig {
|
||||
if len(networks) == 0 {
|
||||
networks = []string{"chorus_default"}
|
||||
}
|
||||
|
||||
var attachments []swarm.NetworkAttachmentConfig
|
||||
for _, network := range networks {
|
||||
attachments = append(attachments, swarm.NetworkAttachmentConfig{
|
||||
Target: network,
|
||||
})
|
||||
}
|
||||
return attachments
|
||||
}
|
||||
|
||||
// buildMounts converts volume mounts to Docker mount specs
|
||||
func buildMounts(volumes []VolumeMount) []mount.Mount {
|
||||
var mounts []mount.Mount
|
||||
|
||||
for _, vol := range volumes {
|
||||
mountType := mount.TypeBind
|
||||
switch vol.Type {
|
||||
case "volume":
|
||||
mountType = mount.TypeVolume
|
||||
case "tmpfs":
|
||||
mountType = mount.TypeTmpfs
|
||||
}
|
||||
|
||||
mounts = append(mounts, mount.Mount{
|
||||
Type: mountType,
|
||||
Source: vol.Source,
|
||||
Target: vol.Target,
|
||||
ReadOnly: vol.ReadOnly,
|
||||
})
|
||||
}
|
||||
|
||||
return mounts
|
||||
}
|
||||
Reference in New Issue
Block a user