Implement wave-based scaling system for CHORUS Docker Swarm orchestration

- Health gates system for pre-scaling validation (KACHING, BACKBEAT, bootstrap peers)
- Assignment broker API for per-replica configuration management
- Bootstrap pool management with weighted peer selection and health monitoring
- Wave-based scaling algorithm with exponential backoff and failure recovery
- Enhanced SwarmManager with Docker service scaling capabilities
- Comprehensive scaling metrics collection and reporting system
- RESTful HTTP API for external scaling operations and monitoring
- Integration with CHORUS P2P networking and assignment systems

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Claude Code
2025-09-22 13:51:34 +10:00
parent 55dd5951ea
commit 564852dc91
9 changed files with 3381 additions and 87 deletions

View File

@@ -77,6 +77,236 @@ func (sm *SwarmManager) Close() error {
return sm.client.Close()
}
// ScaleService sets the desired replica count of a replicated Docker Swarm
// service.
//
// serviceName is handed to ServiceInspectWithRaw as the service reference and
// replicas is the new desired task count. An error is returned when replicas
// is negative, when the service cannot be inspected, when it is not running
// in replicated mode, or when the update call fails.
func (sm *SwarmManager) ScaleService(ctx context.Context, serviceName string, replicas int) error {
	ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.scale_service")
	defer span.End()

	// A negative count would wrap around to a huge value in the uint64
	// conversion below, so reject it before talking to the daemon.
	if replicas < 0 {
		return fmt.Errorf("invalid replica count %d for service %s", replicas, serviceName)
	}

	// Fetch the current spec; the update call needs the service ID, the
	// version index and the full spec.
	service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
	if err != nil {
		return fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
	}

	serviceSpec := service.Spec
	if serviceSpec.Mode.Replicated == nil {
		return fmt.Errorf("service %s is not in replicated mode", serviceName)
	}

	// Replicas is a pointer and may be unset; report that as zero instead of
	// dereferencing nil.
	var currentReplicas uint64
	if serviceSpec.Mode.Replicated.Replicas != nil {
		currentReplicas = *serviceSpec.Mode.Replicated.Replicas
	}
	serviceSpec.Mode.Replicated.Replicas = uint64Ptr(uint64(replicas))

	// Tag the span before the update so a failed attempt is still traceable.
	span.SetAttributes(
		attribute.String("service.name", serviceName),
		attribute.String("service.id", service.ID),
		attribute.Int("scaling.current_replicas", int(currentReplicas)),
		attribute.Int("scaling.target_replicas", replicas),
	)

	updateResponse, err := sm.client.ServiceUpdate(
		ctx,
		service.ID,
		service.Version,
		serviceSpec,
		types.ServiceUpdateOptions{},
	)
	if err != nil {
		return fmt.Errorf("failed to update service %s: %w", serviceName, err)
	}

	// The update response carries only optional warnings (it has no ID
	// field), so surface those in the log entry.
	log.Info().
		Str("service_name", serviceName).
		Str("service_id", service.ID).
		Uint64("current_replicas", currentReplicas).
		Int("target_replicas", replicas).
		Strs("warnings", updateResponse.Warnings).
		Msg("Scaled service")

	return nil
}
// GetServiceReplicas returns the desired replica count recorded in the spec
// of a replicated service.
//
// An error is returned when the service cannot be inspected, when it is not
// in replicated mode, or when its spec carries no replica count.
func (sm *SwarmManager) GetServiceReplicas(ctx context.Context, serviceName string) (int, error) {
	service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
	if err != nil {
		return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
	}

	replicated := service.Spec.Mode.Replicated
	if replicated == nil {
		return 0, fmt.Errorf("service %s is not in replicated mode", serviceName)
	}
	// Replicas is a pointer; guard against an unset value rather than
	// panicking on a nil dereference.
	if replicated.Replicas == nil {
		return 0, fmt.Errorf("service %s has no replica count set", serviceName)
	}

	return int(*replicated.Replicas), nil
}
// GetRunningReplicas reports how many tasks of the named service are in the
// running state.
//
// The service is inspected first to resolve its ID, then its tasks are listed
// with a "service" filter and those whose status state is running are counted.
func (sm *SwarmManager) GetRunningReplicas(ctx context.Context, serviceName string) (int, error) {
	svc, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
	if err != nil {
		return 0, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
	}

	// Restrict the task listing to this service by ID.
	opts := types.TaskListOptions{
		Filters: filters.NewArgs(filters.Arg("service", svc.ID)),
	}
	taskList, err := sm.client.TaskList(ctx, opts)
	if err != nil {
		return 0, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
	}

	running := 0
	for i := range taskList {
		if taskList[i].Status.State != swarm.TaskStateRunning {
			continue
		}
		running++
	}
	return running, nil
}
// GetServiceStatus returns a snapshot of a service: its ID, image, creation
// and update times, desired replica count, and one TaskStatus entry per task
// together with the number of tasks currently running.
//
// An error is returned when the service cannot be inspected or its tasks
// cannot be listed.
func (sm *SwarmManager) GetServiceStatus(ctx context.Context, serviceName string) (*ServiceStatus, error) {
	service, _, err := sm.client.ServiceInspectWithRaw(ctx, serviceName, types.ServiceInspectOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to inspect service %s: %w", serviceName, err)
	}

	// List only the tasks that belong to this service.
	taskFilters := filters.NewArgs()
	taskFilters.Add("service", service.ID)
	tasks, err := sm.client.TaskList(ctx, types.TaskListOptions{
		Filters: taskFilters,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list tasks for service %s: %w", serviceName, err)
	}

	status := &ServiceStatus{
		ServiceID:   service.ID,
		ServiceName: serviceName,
		CreatedAt:   service.CreatedAt,
		UpdatedAt:   service.UpdatedAt,
		Tasks:       make([]TaskStatus, 0, len(tasks)),
	}

	// ContainerSpec is a pointer and may be absent; leave Image empty in
	// that case instead of dereferencing nil.
	if containerSpec := service.Spec.TaskTemplate.ContainerSpec; containerSpec != nil {
		status.Image = containerSpec.Image
	}

	// DesiredReplicas stays zero for non-replicated services or when the
	// replica count is unset.
	if replicated := service.Spec.Mode.Replicated; replicated != nil && replicated.Replicas != nil {
		status.DesiredReplicas = int(*replicated.Replicas)
	}

	runningCount := 0
	for _, task := range tasks {
		status.Tasks = append(status.Tasks, TaskStatus{
			TaskID:    task.ID,
			NodeID:    task.NodeID,
			State:     string(task.Status.State),
			Message:   task.Status.Message,
			CreatedAt: task.CreatedAt,
			UpdatedAt: task.UpdatedAt,
			// The task status timestamp is a plain time.Time value, so it
			// is copied directly rather than nil-checked and dereferenced.
			StatusTimestamp: task.Status.Timestamp,
		})

		if task.Status.State == swarm.TaskStateRunning {
			runningCount++
		}
	}
	status.RunningReplicas = runningCount

	return status, nil
}
// CreateCHORUSService creates a replicated Swarm service from config and
// returns the service as reported by a follow-up inspect call.
//
// The spec is built from the config's name, labels, image, environment,
// resource limits and reservations, placement constraints, networks and
// (when present) volumes. Updates are configured to roll one task at a time
// with a 15 second delay, starting the new task before stopping the old one.
//
// An error is returned when config is nil, when InitialReplicas is negative,
// or when the create or inspect call fails.
func (sm *SwarmManager) CreateCHORUSService(ctx context.Context, config *CHORUSServiceConfig) (*swarm.Service, error) {
	ctx, span := tracing.Tracer.Start(ctx, "swarm_manager.create_chorus_service")
	defer span.End()

	if config == nil {
		return nil, fmt.Errorf("service config must not be nil")
	}
	// A negative count would wrap around to a huge value in the uint64
	// conversion below.
	if config.InitialReplicas < 0 {
		return nil, fmt.Errorf("invalid initial replica count %d for service %s", config.InitialReplicas, config.ServiceName)
	}

	// Build service specification.
	serviceSpec := swarm.ServiceSpec{
		Annotations: swarm.Annotations{
			Name:   config.ServiceName,
			Labels: config.Labels,
		},
		TaskTemplate: swarm.TaskSpec{
			ContainerSpec: &swarm.ContainerSpec{
				Image: config.Image,
				Env:   buildEnvironmentList(config.Environment),
			},
			Resources: &swarm.ResourceRequirements{
				Limits: &swarm.Resources{
					NanoCPUs:    config.Resources.CPULimit,
					MemoryBytes: config.Resources.MemoryLimit,
				},
				Reservations: &swarm.Resources{
					NanoCPUs:    config.Resources.CPURequest,
					MemoryBytes: config.Resources.MemoryRequest,
				},
			},
			Placement: &swarm.Placement{
				Constraints: config.Placement.Constraints,
			},
		},
		Mode: swarm.ServiceMode{
			Replicated: &swarm.ReplicatedService{
				Replicas: uint64Ptr(uint64(config.InitialReplicas)),
			},
		},
		Networks: buildNetworkAttachments(config.Networks),
		UpdateConfig: &swarm.UpdateConfig{
			Parallelism: 1,
			Delay:       15 * time.Second,
			Order:       swarm.UpdateOrderStartFirst,
		},
	}

	// Add volumes if specified.
	if len(config.Volumes) > 0 {
		serviceSpec.TaskTemplate.ContainerSpec.Mounts = buildMounts(config.Volumes)
	}

	response, err := sm.client.ServiceCreate(ctx, serviceSpec, types.ServiceCreateOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to create service %s: %w", config.ServiceName, err)
	}

	// Inspect the service by the ID from the create response so the caller
	// gets the stored object rather than just the ID.
	service, _, err := sm.client.ServiceInspectWithRaw(ctx, response.ID, types.ServiceInspectOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to inspect created service: %w", err)
	}

	span.SetAttributes(
		attribute.String("service.name", config.ServiceName),
		attribute.String("service.id", response.ID),
		attribute.Int("service.initial_replicas", config.InitialReplicas),
		attribute.String("service.image", config.Image),
	)

	log.Info().
		Str("service_name", config.ServiceName).
		Str("service_id", response.ID).
		Int("initial_replicas", config.InitialReplicas).
		Str("image", config.Image).
		Msg("Created CHORUS service")

	return &service, nil
}
// AgentDeploymentConfig defines configuration for deploying an agent
type AgentDeploymentConfig struct {
TeamID string `json:"team_id"`
@@ -487,94 +717,42 @@ func (sm *SwarmManager) GetServiceLogs(serviceID string, lines int) (string, err
return string(logs), nil
}
// ScaleService scales a service to the specified number of replicas.
//
// The service is inspected using the manager's stored context, its replicated
// replica count is overwritten with replicas, and the modified spec is sent
// back with the inspected version. An error is returned when the inspect or
// update call fails, or when the service is not in replicated mode.
func (sm *SwarmManager) ScaleService(serviceID string, replicas uint64) error {
	log.Info().
		Str("service_id", serviceID).
		Uint64("replicas", replicas).
		Msg("📈 Scaling agent service")

	// Get current service spec.
	service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
	if err != nil {
		return fmt.Errorf("failed to inspect service: %w", err)
	}

	// Replicated is a pointer that is nil for services not running in
	// replicated mode; writing through it would panic.
	if service.Spec.Mode.Replicated == nil {
		return fmt.Errorf("service %s is not in replicated mode", serviceID)
	}
	service.Spec.Mode.Replicated.Replicas = &replicas

	// Update the service.
	_, err = sm.client.ServiceUpdate(sm.ctx, serviceID, service.Version, service.Spec, types.ServiceUpdateOptions{})
	if err != nil {
		return fmt.Errorf("failed to scale service: %w", err)
	}

	log.Info().
		Str("service_id", serviceID).
		Uint64("replicas", replicas).
		Msg("✅ Service scaled successfully")

	return nil
}
// GetServiceStatus returns the current status of a service: its name, image,
// timestamps, configured replica count, and per-state task counts (with
// running and failed tasks also tallied separately).
//
// An error is returned when the service cannot be inspected or its tasks
// cannot be listed.
func (sm *SwarmManager) GetServiceStatus(serviceID string) (*ServiceStatus, error) {
	service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to inspect service: %w", err)
	}

	// Get task status, restricted to this service.
	tasks, err := sm.client.TaskList(sm.ctx, types.TaskListOptions{
		Filters: filters.NewArgs(filters.Arg("service", serviceID)),
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list tasks: %w", err)
	}

	status := &ServiceStatus{
		ServiceID:    serviceID,
		ServiceName:  service.Spec.Name,
		Replicas:     0,
		RunningTasks: 0,
		FailedTasks:  0,
		TaskStates:   make(map[string]int),
		CreatedAt:    service.CreatedAt,
		UpdatedAt:    service.UpdatedAt,
	}

	// ContainerSpec is a pointer and may be absent; leave Image empty in
	// that case instead of dereferencing nil.
	if containerSpec := service.Spec.TaskTemplate.ContainerSpec; containerSpec != nil {
		status.Image = containerSpec.Image
	}

	if service.Spec.Mode.Replicated != nil && service.Spec.Mode.Replicated.Replicas != nil {
		status.Replicas = *service.Spec.Mode.Replicated.Replicas
	}

	// Count task states.
	for _, task := range tasks {
		state := string(task.Status.State)
		status.TaskStates[state]++

		switch task.Status.State {
		case swarm.TaskStateRunning:
			status.RunningTasks++
		case swarm.TaskStateFailed:
			status.FailedTasks++
		}
	}

	return status, nil
}
// ServiceStatus represents the current status of a service
// ServiceStatus represents the current status of a service with detailed task information
//
// NOTE(review): this declaration lists ServiceID/ServiceName/Image/CreatedAt/
// UpdatedAt twice. It looks like the pre-change field set (Replicas,
// RunningTasks, FailedTasks, TaskStates) and the post-change field set
// (DesiredReplicas, RunningReplicas, Tasks) rendered together by a diff view;
// confirm against the real file before relying on either group.
type ServiceStatus struct {
// First field group: populated by the GetServiceStatus(serviceID) variant.
ServiceID string `json:"service_id"`
ServiceName string `json:"service_name"`
Image string `json:"image"`
// Replicas is copied from the replicated-mode replica count when it is set.
Replicas uint64 `json:"replicas"`
// RunningTasks and FailedTasks count tasks in the running and failed states.
RunningTasks uint64 `json:"running_tasks"`
FailedTasks uint64 `json:"failed_tasks"`
// TaskStates maps each task state string to the number of tasks in it.
TaskStates map[string]int `json:"task_states"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
// Second field group: populated by the GetServiceStatus(ctx, serviceName) variant.
ServiceID string `json:"service_id"`
ServiceName string `json:"service_name"`
Image string `json:"image"`
// DesiredReplicas is the replica count from the service spec (replicated mode only).
DesiredReplicas int `json:"desired_replicas"`
// RunningReplicas is the number of listed tasks in the running state.
RunningReplicas int `json:"running_replicas"`
// Tasks holds one entry per task listed for the service.
Tasks []TaskStatus `json:"tasks"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// TaskStatus represents the status of an individual task as reported inside
// ServiceStatus.Tasks.
type TaskStatus struct {
// TaskID and NodeID are copied from the task's ID and NodeID.
TaskID string `json:"task_id"`
NodeID string `json:"node_id"`
// State is the task's status state converted to a string.
State string `json:"state"`
// Message is the task's status message.
Message string `json:"message"`
// CreatedAt and UpdatedAt are the task's own creation and update times.
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
// StatusTimestamp is the timestamp attached to the task's status.
StatusTimestamp time.Time `json:"status_timestamp"`
}
// CHORUSServiceConfig represents configuration for creating a CHORUS service.
// It is the input to CreateCHORUSService, which maps these fields onto a
// Swarm service spec.
type CHORUSServiceConfig struct {
// ServiceName and Labels become the service's annotations.
ServiceName string `json:"service_name"`
// Image is the container image for the service's tasks.
Image string `json:"image"`
// InitialReplicas is the replica count the service is created with.
InitialReplicas int `json:"initial_replicas"`
// Environment is turned into KEY=VALUE entries for the container.
Environment map[string]string `json:"environment"`
Labels map[string]string `json:"labels"`
// Networks lists network names to attach; buildNetworkAttachments falls
// back to "chorus_default" when this is empty.
Networks []string `json:"networks"`
// Volumes are converted to container mounts when non-empty.
Volumes []VolumeMount `json:"volumes"`
// Resources supplies the CPU/memory limits and reservations.
Resources ResourceLimits `json:"resources"`
// Placement supplies the placement constraints.
Placement PlacementConfig `json:"placement"`
}
// CleanupFailedServices removes failed services
@@ -611,6 +789,61 @@ func (sm *SwarmManager) CleanupFailedServices() error {
}
}
}
return nil
}
// Helper functions for SwarmManager
// uint64Ptr allocates a fresh uint64 holding v and returns its address.
// Each call yields a distinct pointer.
func uint64Ptr(v uint64) *uint64 {
	p := new(uint64)
	*p = v
	return p
}
// buildEnvironmentList flattens env into "KEY=VALUE" strings, one per map
// entry. The result is nil when env has no entries, and its order follows
// Go's map iteration order, which is not stable between calls.
func buildEnvironmentList(env map[string]string) []string {
	if len(env) == 0 {
		return nil
	}

	pairs := make([]string, 0, len(env))
	for name, val := range env {
		entry := fmt.Sprintf("%s=%s", name, val)
		pairs = append(pairs, entry)
	}
	return pairs
}
// buildNetworkAttachments produces one attachment config per network name,
// using the name as the attachment target. When no names are given, a single
// attachment to "chorus_default" is returned.
func buildNetworkAttachments(networks []string) []swarm.NetworkAttachmentConfig {
	names := networks
	if len(names) == 0 {
		names = []string{"chorus_default"}
	}

	configs := make([]swarm.NetworkAttachmentConfig, len(names))
	for i, name := range names {
		configs[i] = swarm.NetworkAttachmentConfig{Target: name}
	}
	return configs
}
// buildMounts translates volume mounts into Docker mount specs, copying the
// source, target and read-only flag of each entry. A Type of "volume" or
// "tmpfs" selects the matching mount type; any other value is treated as a
// bind mount. The result is nil when volumes is empty.
func buildMounts(volumes []VolumeMount) []mount.Mount {
	var specs []mount.Mount
	for i := range volumes {
		v := volumes[i]

		// Start from the bind default and override for the known types.
		spec := mount.Mount{
			Type:     mount.TypeBind,
			Source:   v.Source,
			Target:   v.Target,
			ReadOnly: v.ReadOnly,
		}
		if v.Type == "volume" {
			spec.Type = mount.TypeVolume
		} else if v.Type == "tmpfs" {
			spec.Type = mount.TypeTmpfs
		}

		specs = append(specs, spec)
	}
	return specs
}