Implement initial scan logic and council formation for WHOOSH project kickoffs
- Replace incremental sync with full scan for new repositories
- Add initial_scan status to bypass Since parameter filtering
- Implement council formation detection for Design Brief issues
- Add version display to WHOOSH UI header for debugging
- Fix Docker token authentication with trailing newline removal
- Add comprehensive council orchestration with Docker Swarm integration
- Include BACKBEAT prototype integration for distributed timing
- Support council-specific agent roles and deployment strategies
- Transition repositories to active status after content discovery

Key architectural improvements:
- Full scan approach for new project detection vs incremental sync
- Council formation triggered by chorus-entrypoint labeled Design Briefs
- Proper token handling and authentication for Gitea API calls
- Support for both initial discovery and ongoing task monitoring

This enables autonomous project kickoff workflows where Design Brief issues
automatically trigger formation of specialized agent councils for new projects.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
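For context on the "trailing newline removal" fix above: tokens read from Docker secret files typically end with a newline, which breaks the Authorization header sent to the Gitea API. A minimal sketch of the usual fix (illustrative only, not code from this diff; the helper name and secret path are assumptions, and it relies on the standard fmt, os, and strings packages):

// loadGiteaToken reads an API token from a Docker secret file and strips
// the trailing newline that would otherwise break Gitea authentication.
func loadGiteaToken(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("read gitea token: %w", err)
	}
	return strings.TrimSpace(string(raw)), nil
}

A caller would then set the header with the cleaned value, e.g. req.Header.Set("Authorization", "token "+token).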
internal/orchestrator/agent_deployer.go (new file, 591 lines)
@@ -0,0 +1,591 @@
package orchestrator

import (
	"context"
	"fmt"
	"time"

	"github.com/chorus-services/whoosh/internal/composer"
	"github.com/chorus-services/whoosh/internal/council"
	"github.com/docker/docker/api/types/swarm"
	"github.com/google/uuid"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rs/zerolog/log"
)

// AgentDeployer manages deployment of agent containers for teams
type AgentDeployer struct {
	swarmManager *SwarmManager
	db           *pgxpool.Pool
	registry     string
	ctx          context.Context
	cancel       context.CancelFunc
}

// NewAgentDeployer creates a new agent deployer
func NewAgentDeployer(swarmManager *SwarmManager, db *pgxpool.Pool, registry string) *AgentDeployer {
	ctx, cancel := context.WithCancel(context.Background())

	if registry == "" {
		registry = "registry.home.deepblack.cloud"
	}

	return &AgentDeployer{
		swarmManager: swarmManager,
		db:           db,
		registry:     registry,
		ctx:          ctx,
		cancel:       cancel,
	}
}

// Close shuts down the agent deployer
func (ad *AgentDeployer) Close() error {
	ad.cancel()
	return nil
}

// DeploymentRequest represents a request to deploy agents for a team
type DeploymentRequest struct {
	TeamID          uuid.UUID                 `json:"team_id"`
	TaskID          uuid.UUID                 `json:"task_id"`
	TeamComposition *composer.TeamComposition `json:"team_composition"`
	TaskContext     *TaskContext              `json:"task_context"`
	DeploymentMode  string                    `json:"deployment_mode"` // immediate, scheduled, manual
}

// DeploymentResult represents the result of a deployment operation
type DeploymentResult struct {
	TeamID           uuid.UUID         `json:"team_id"`
	TaskID           uuid.UUID         `json:"task_id"`
	DeployedServices []DeployedService `json:"deployed_services"`
	Status           string            `json:"status"` // success, partial, failed
	Message          string            `json:"message"`
	DeployedAt       time.Time         `json:"deployed_at"`
	Errors           []string          `json:"errors,omitempty"`
}

// DeployedService represents a successfully deployed service
type DeployedService struct {
	ServiceID   string `json:"service_id"`
	ServiceName string `json:"service_name"`
	AgentRole   string `json:"agent_role"`
	AgentID     string `json:"agent_id"`
	Image       string `json:"image"`
	Status      string `json:"status"`
}

// CouncilDeploymentRequest represents a request to deploy council agents
type CouncilDeploymentRequest struct {
	CouncilID          uuid.UUID                   `json:"council_id"`
	ProjectName        string                      `json:"project_name"`
	CouncilComposition *council.CouncilComposition `json:"council_composition"`
	ProjectContext     *CouncilProjectContext      `json:"project_context"`
	DeploymentMode     string                      `json:"deployment_mode"` // immediate, scheduled, manual
}

// CouncilProjectContext contains the project information for council agents
type CouncilProjectContext struct {
	ProjectName     string `json:"project_name"`
	Repository      string `json:"repository"`
	ProjectBrief    string `json:"project_brief"`
	Constraints     string `json:"constraints,omitempty"`
	TechLimits      string `json:"tech_limits,omitempty"`
	ComplianceNotes string `json:"compliance_notes,omitempty"`
	Targets         string `json:"targets,omitempty"`
	ExternalURL     string `json:"external_url,omitempty"`
}

// DeployTeamAgents deploys all agents for a team
func (ad *AgentDeployer) DeployTeamAgents(request *DeploymentRequest) (*DeploymentResult, error) {
	log.Info().
		Str("team_id", request.TeamID.String()).
		Str("task_id", request.TaskID.String()).
		Int("agent_matches", len(request.TeamComposition.AgentMatches)).
		Msg("🚀 Starting team agent deployment")

	result := &DeploymentResult{
		TeamID:           request.TeamID,
		TaskID:           request.TaskID,
		DeployedServices: []DeployedService{},
		DeployedAt:       time.Now(),
		Errors:           []string{},
	}

	// Deploy each agent in the team composition
	for _, agentMatch := range request.TeamComposition.AgentMatches {
		service, err := ad.deploySingleAgent(request, agentMatch)
		if err != nil {
			errorMsg := fmt.Sprintf("Failed to deploy agent %s for role %s: %v",
				agentMatch.Agent.Name, agentMatch.Role.Name, err)
			result.Errors = append(result.Errors, errorMsg)
			log.Error().
				Err(err).
				Str("agent_id", agentMatch.Agent.ID.String()).
				Str("role", agentMatch.Role.Name).
				Msg("Failed to deploy agent")
			continue
		}

		deployedService := DeployedService{
			ServiceID:   service.ID,
			ServiceName: service.Spec.Name,
			AgentRole:   agentMatch.Role.Name,
			AgentID:     agentMatch.Agent.ID.String(),
			Image:       service.Spec.TaskTemplate.ContainerSpec.Image,
			Status:      "deploying",
		}

		result.DeployedServices = append(result.DeployedServices, deployedService)

		// Update database with deployment info
		err = ad.recordDeployment(request.TeamID, request.TaskID, agentMatch, service.ID)
		if err != nil {
			log.Error().
				Err(err).
				Str("service_id", service.ID).
				Msg("Failed to record deployment in database")
		}
	}

	// Determine overall deployment status
	if len(result.Errors) == 0 {
		result.Status = "success"
		result.Message = fmt.Sprintf("Successfully deployed %d agents", len(result.DeployedServices))
	} else if len(result.DeployedServices) > 0 {
		result.Status = "partial"
		result.Message = fmt.Sprintf("Deployed %d/%d agents with %d errors",
			len(result.DeployedServices),
			len(request.TeamComposition.AgentMatches),
			len(result.Errors))
	} else {
		result.Status = "failed"
		result.Message = "Failed to deploy any agents"
	}

	// Update team deployment status in database
	err := ad.updateTeamDeploymentStatus(request.TeamID, result.Status, result.Message)
	if err != nil {
		log.Error().
			Err(err).
			Str("team_id", request.TeamID.String()).
			Msg("Failed to update team deployment status")
	}

	log.Info().
		Str("team_id", request.TeamID.String()).
		Str("status", result.Status).
		Int("deployed", len(result.DeployedServices)).
		Int("errors", len(result.Errors)).
		Msg("✅ Team agent deployment completed")

	return result, nil
}
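// Illustrative call site for DeployTeamAgents (not part of this diff; the
// variable names are assumptions):
//
//	result, err := deployer.DeployTeamAgents(&DeploymentRequest{
//		TeamID:          teamID,
//		TaskID:          taskID,
//		TeamComposition: composition,
//		TaskContext:     &taskCtx,
//		DeploymentMode:  "immediate",
//	})
//
// result.Status is "success", "partial", or "failed"; on "partial",
// result.Errors lists the agents that could not be deployed.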
// selectAgentImage determines the appropriate CHORUS image for the agent role
func (ad *AgentDeployer) selectAgentImage(roleName string, agent *composer.Agent) string {
	// All agents use the same CHORUS image, but with different configurations.
	// The image handles role specialization internally based on environment variables.
	return "docker.io/anthonyrawlins/chorus:backbeat-v2.0.1"
}

// buildAgentEnvironment creates environment variables for CHORUS agent configuration
func (ad *AgentDeployer) buildAgentEnvironment(request *DeploymentRequest, agentMatch *composer.AgentMatch) map[string]string {
	env := map[string]string{
		// Core CHORUS configuration - just pass the agent name from human-roles.yaml.
		// CHORUS will handle its own prompt composition and system behavior.
		"CHORUS_AGENT_NAME": agentMatch.Role.Name, // This maps to the human-roles.yaml agent definition
		"CHORUS_TEAM_ID":    request.TeamID.String(),
		"CHORUS_TASK_ID":    request.TaskID.String(),

		// Essential task context
		"CHORUS_PROJECT":      request.TaskContext.Repository,
		"CHORUS_TASK_TITLE":   request.TaskContext.IssueTitle,
		"CHORUS_TASK_DESC":    request.TaskContext.IssueDescription,
		"CHORUS_PRIORITY":     request.TaskContext.Priority,
		"CHORUS_EXTERNAL_URL": request.TaskContext.ExternalURL,

		// WHOOSH coordination
		"WHOOSH_COORDINATOR": "true",
		"WHOOSH_ENDPOINT":    "http://whoosh:8080",

		// Docker access for CHORUS sandbox management
		"DOCKER_HOST": "unix:///var/run/docker.sock",
	}

	return env
}

// Note: CHORUS handles its own prompt composition from human-roles.yaml.
// We just need to pass the agent name and essential task context.

// determineAgentType maps role to agent type for resource allocation
func (ad *AgentDeployer) determineAgentType(agentMatch *composer.AgentMatch) string {
	// Simple mapping for now - could be enhanced based on role complexity
	return "standard"
}

// calculateResources determines resource requirements for the agent
func (ad *AgentDeployer) calculateResources(agentMatch *composer.AgentMatch) ResourceLimits {
	// Standard resource allocation for CHORUS agents.
	// CHORUS handles its own resource management internally.
	return ResourceLimits{
		CPULimit:      1000000000, // 1 CPU core
		MemoryLimit:   1073741824, // 1GB RAM
		CPURequest:    500000000,  // 0.5 CPU core
		MemoryRequest: 536870912,  // 512MB RAM
	}
}

// buildAgentVolumes creates volume mounts for CHORUS agents
func (ad *AgentDeployer) buildAgentVolumes(request *DeploymentRequest) []VolumeMount {
	return []VolumeMount{
		{
			Type:     "bind",
			Source:   "/var/run/docker.sock",
			Target:   "/var/run/docker.sock",
			ReadOnly: false, // CHORUS needs Docker access for sandboxing
		},
		{
			Type:     "volume",
			Source:   fmt.Sprintf("whoosh-workspace-%s", request.TeamID.String()),
			Target:   "/workspace",
			ReadOnly: false,
		},
	}
}

// buildAgentPlacement creates placement constraints for agents
func (ad *AgentDeployer) buildAgentPlacement(agentMatch *composer.AgentMatch) PlacementConfig {
	return PlacementConfig{
		Constraints: []string{
			"node.role==worker", // Prefer worker nodes for agent containers
		},
		// Note: Placement preferences removed for compilation compatibility
	}
}

// deploySingleAgent deploys a single agent for a specific role
func (ad *AgentDeployer) deploySingleAgent(request *DeploymentRequest, agentMatch *composer.AgentMatch) (*swarm.Service, error) {
	// Determine agent image based on role
	image := ad.selectAgentImage(agentMatch.Role.Name, agentMatch.Agent)

	// Build deployment configuration
	config := &AgentDeploymentConfig{
		TeamID:      request.TeamID.String(),
		TaskID:      request.TaskID.String(),
		AgentRole:   agentMatch.Role.Name,
		AgentType:   ad.determineAgentType(agentMatch),
		Image:       image,
		Replicas:    1, // Start with a single replica per agent
		Resources:   ad.calculateResources(agentMatch),
		Environment: ad.buildAgentEnvironment(request, agentMatch),
		TaskContext: *request.TaskContext,
		Networks:    []string{"chorus_default"},
		Volumes:     ad.buildAgentVolumes(request),
		Placement:   ad.buildAgentPlacement(agentMatch),
	}

	// Deploy the service
	service, err := ad.swarmManager.DeployAgent(config)
	if err != nil {
		return nil, fmt.Errorf("failed to deploy agent service: %w", err)
	}

	return service, nil
}

// recordDeployment records agent deployment information in the database
func (ad *AgentDeployer) recordDeployment(teamID uuid.UUID, taskID uuid.UUID, agentMatch *composer.AgentMatch, serviceID string) error {
	query := `
		INSERT INTO agent_deployments (team_id, task_id, agent_id, role_id, service_id, status, deployed_at)
		VALUES ($1, $2, $3, $4, $5, $6, NOW())
	`

	_, err := ad.db.Exec(ad.ctx, query, teamID, taskID, agentMatch.Agent.ID, agentMatch.Role.ID, serviceID, "deployed")
	return err
}

// updateTeamDeploymentStatus updates the team deployment status in the database
func (ad *AgentDeployer) updateTeamDeploymentStatus(teamID uuid.UUID, status, message string) error {
	query := `
		UPDATE teams
		SET deployment_status = $1, deployment_message = $2, updated_at = NOW()
		WHERE id = $3
	`

	_, err := ad.db.Exec(ad.ctx, query, status, message, teamID)
	return err
}

// DeployCouncilAgents deploys all agents for a project kickoff council
func (ad *AgentDeployer) DeployCouncilAgents(request *CouncilDeploymentRequest) (*council.CouncilDeploymentResult, error) {
	log.Info().
		Str("council_id", request.CouncilID.String()).
		Str("project_name", request.ProjectName).
		Int("core_agents", len(request.CouncilComposition.CoreAgents)).
		Int("optional_agents", len(request.CouncilComposition.OptionalAgents)).
		Msg("🎭 Starting council agent deployment")

	result := &council.CouncilDeploymentResult{
		CouncilID:      request.CouncilID,
		ProjectName:    request.ProjectName,
		DeployedAgents: []council.DeployedCouncilAgent{},
		DeployedAt:     time.Now(),
		Errors:         []string{},
	}

	// Deploy core agents (required)
	for _, agent := range request.CouncilComposition.CoreAgents {
		deployedAgent, err := ad.deploySingleCouncilAgent(request, agent)
		if err != nil {
			errorMsg := fmt.Sprintf("Failed to deploy core agent %s (%s): %v",
				agent.AgentName, agent.RoleName, err)
			result.Errors = append(result.Errors, errorMsg)
			log.Error().
				Err(err).
				Str("agent_id", agent.AgentID).
				Str("role", agent.RoleName).
				Msg("Failed to deploy core council agent")
			continue
		}

		result.DeployedAgents = append(result.DeployedAgents, *deployedAgent)

		// Update database with deployment info
		err = ad.recordCouncilAgentDeployment(request.CouncilID, agent, deployedAgent.ServiceID)
		if err != nil {
			log.Error().
				Err(err).
				Str("service_id", deployedAgent.ServiceID).
				Msg("Failed to record council agent deployment in database")
		}
	}

	// Deploy optional agents (best effort)
	for _, agent := range request.CouncilComposition.OptionalAgents {
		deployedAgent, err := ad.deploySingleCouncilAgent(request, agent)
		if err != nil {
			// Optional agents failing is not critical
			log.Warn().
				Err(err).
				Str("agent_id", agent.AgentID).
				Str("role", agent.RoleName).
				Msg("Failed to deploy optional council agent (non-critical)")
			continue
		}

		result.DeployedAgents = append(result.DeployedAgents, *deployedAgent)

		// Update database with deployment info
		err = ad.recordCouncilAgentDeployment(request.CouncilID, agent, deployedAgent.ServiceID)
		if err != nil {
			log.Error().
				Err(err).
				Str("service_id", deployedAgent.ServiceID).
				Msg("Failed to record council agent deployment in database")
		}
	}

	// Determine overall deployment status
	coreAgentsCount := len(request.CouncilComposition.CoreAgents)
	deployedCoreAgents := 0

	for _, deployedAgent := range result.DeployedAgents {
		// Check if this deployed agent is a core agent
		for _, coreAgent := range request.CouncilComposition.CoreAgents {
			if coreAgent.RoleName == deployedAgent.RoleName {
				deployedCoreAgents++
				break
			}
		}
	}

	if deployedCoreAgents == coreAgentsCount {
		result.Status = "success"
		result.Message = fmt.Sprintf("Successfully deployed %d agents (%d core, %d optional)",
			len(result.DeployedAgents), deployedCoreAgents, len(result.DeployedAgents)-deployedCoreAgents)
	} else if deployedCoreAgents > 0 {
		result.Status = "partial"
		result.Message = fmt.Sprintf("Deployed %d/%d core agents with %d errors",
			deployedCoreAgents, coreAgentsCount, len(result.Errors))
	} else {
		result.Status = "failed"
		result.Message = "Failed to deploy any core council agents"
	}

	// Update council deployment status in database
	err := ad.updateCouncilDeploymentStatus(request.CouncilID, result.Status, result.Message)
	if err != nil {
		log.Error().
			Err(err).
			Str("council_id", request.CouncilID.String()).
			Msg("Failed to update council deployment status")
	}

	log.Info().
		Str("council_id", request.CouncilID.String()).
		Str("status", result.Status).
		Int("deployed", len(result.DeployedAgents)).
		Int("errors", len(result.Errors)).
		Msg("✅ Council agent deployment completed")

	return result, nil
}
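// Illustrative call site for DeployCouncilAgents (not part of this diff;
// the variable names are assumptions). Unlike team deployment, the result
// is "success" only when every core agent deployed; optional agents are
// best effort:
//
//	result, err := deployer.DeployCouncilAgents(&CouncilDeploymentRequest{
//		CouncilID:          councilID,
//		ProjectName:        projectName,
//		CouncilComposition: composition,
//		ProjectContext:     &projectCtx,
//		DeploymentMode:     "immediate",
//	})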
// deploySingleCouncilAgent deploys a single council agent
func (ad *AgentDeployer) deploySingleCouncilAgent(request *CouncilDeploymentRequest, agent council.CouncilAgent) (*council.DeployedCouncilAgent, error) {
	// Use the CHORUS image for all council agents
	image := "docker.io/anthonyrawlins/chorus:backbeat-v2.0.1"

	// Build council-specific deployment configuration
	config := &AgentDeploymentConfig{
		TeamID:      request.CouncilID.String(), // Use council ID as team ID
		TaskID:      request.CouncilID.String(), // Use council ID as task ID
		AgentRole:   agent.RoleName,
		AgentType:   "council",
		Image:       image,
		Replicas:    1, // Single replica per council agent
		Resources:   ad.calculateCouncilResources(agent),
		Environment: ad.buildCouncilAgentEnvironment(request, agent),
		TaskContext: TaskContext{
			Repository:       request.ProjectContext.Repository,
			IssueTitle:       request.ProjectContext.ProjectName,
			IssueDescription: request.ProjectContext.ProjectBrief,
			Priority:         "high", // Council formation is always high priority
			ExternalURL:      request.ProjectContext.ExternalURL,
		},
		Networks:  []string{"chorus_default"}, // Connect to CHORUS network
		Volumes:   ad.buildCouncilAgentVolumes(request),
		Placement: ad.buildCouncilAgentPlacement(agent),
	}

	// Deploy the service
	service, err := ad.swarmManager.DeployAgent(config)
	if err != nil {
		return nil, fmt.Errorf("failed to deploy council agent service: %w", err)
	}

	// Create deployed agent result
	deployedAgent := &council.DeployedCouncilAgent{
		ServiceID:   service.ID,
		ServiceName: service.Spec.Name,
		RoleName:    agent.RoleName,
		AgentID:     agent.AgentID,
		Image:       image,
		Status:      "deploying",
		DeployedAt:  time.Now(),
	}

	return deployedAgent, nil
}

// buildCouncilAgentEnvironment creates environment variables for council agent configuration
func (ad *AgentDeployer) buildCouncilAgentEnvironment(request *CouncilDeploymentRequest, agent council.CouncilAgent) map[string]string {
	env := map[string]string{
		// Core CHORUS configuration for council mode
		"CHORUS_AGENT_NAME":   agent.RoleName, // Maps to human-roles.yaml agent definition
		"CHORUS_COUNCIL_MODE": "true",         // Enable council mode
		"CHORUS_COUNCIL_ID":   request.CouncilID.String(),
		"CHORUS_PROJECT_NAME": request.ProjectContext.ProjectName,

		// Council prompt and context
		"CHORUS_COUNCIL_PROMPT":   "/app/prompts/council.md",
		"CHORUS_PROJECT_BRIEF":    request.ProjectContext.ProjectBrief,
		"CHORUS_CONSTRAINTS":      request.ProjectContext.Constraints,
		"CHORUS_TECH_LIMITS":      request.ProjectContext.TechLimits,
		"CHORUS_COMPLIANCE_NOTES": request.ProjectContext.ComplianceNotes,
		"CHORUS_TARGETS":          request.ProjectContext.Targets,

		// Essential project context
		"CHORUS_PROJECT":      request.ProjectContext.Repository,
		"CHORUS_EXTERNAL_URL": request.ProjectContext.ExternalURL,
		"CHORUS_PRIORITY":     "high",

		// WHOOSH coordination
		"WHOOSH_COORDINATOR": "true",
		"WHOOSH_ENDPOINT":    "http://whoosh:8080",

		// Docker access for CHORUS sandbox management
		"DOCKER_HOST": "unix:///var/run/docker.sock",
	}

	return env
}

// calculateCouncilResources determines resource requirements for council agents
func (ad *AgentDeployer) calculateCouncilResources(agent council.CouncilAgent) ResourceLimits {
	// Council agents get slightly more resources since they handle complex analysis
	return ResourceLimits{
		CPULimit:      1500000000, // 1.5 CPU cores
		MemoryLimit:   2147483648, // 2GB RAM
		CPURequest:    750000000,  // 0.75 CPU core
		MemoryRequest: 1073741824, // 1GB RAM
	}
}

// buildCouncilAgentVolumes creates volume mounts for council agents
func (ad *AgentDeployer) buildCouncilAgentVolumes(request *CouncilDeploymentRequest) []VolumeMount {
	return []VolumeMount{
		{
			Type:     "bind",
			Source:   "/var/run/docker.sock",
			Target:   "/var/run/docker.sock",
			ReadOnly: false, // Council agents need Docker access for complex setup
		},
		{
			Type:     "volume",
			Source:   fmt.Sprintf("whoosh-council-%s", request.CouncilID.String()),
			Target:   "/workspace",
			ReadOnly: false,
		},
		{
			Type:     "bind",
			Source:   "/rust/containers/WHOOSH/prompts",
			Target:   "/app/prompts",
			ReadOnly: true, // Mount council prompts
		},
	}
}

// buildCouncilAgentPlacement creates placement constraints for council agents
func (ad *AgentDeployer) buildCouncilAgentPlacement(agent council.CouncilAgent) PlacementConfig {
	return PlacementConfig{
		Constraints: []string{
			"node.role==worker", // Prefer worker nodes for council containers
		},
	}
}

// recordCouncilAgentDeployment records council agent deployment information in the database
func (ad *AgentDeployer) recordCouncilAgentDeployment(councilID uuid.UUID, agent council.CouncilAgent, serviceID string) error {
	query := `
		UPDATE council_agents
		SET deployed = true, status = 'active', service_id = $1, deployed_at = NOW(), updated_at = NOW()
		WHERE council_id = $2 AND agent_id = $3
	`

	_, err := ad.db.Exec(ad.ctx, query, serviceID, councilID, agent.AgentID)
	return err
}

// updateCouncilDeploymentStatus updates the council deployment status in the database
func (ad *AgentDeployer) updateCouncilDeploymentStatus(councilID uuid.UUID, status, message string) error {
	query := `
		UPDATE councils
		SET status = $1, updated_at = NOW()
		WHERE id = $2
	`

	// Map deployment status to council status
	councilStatus := "active"
	if status == "failed" {
		councilStatus = "failed"
	} else if status == "partial" {
		councilStatus = "active" // Partial deployment still allows the council to function
	}

	_, err := ad.db.Exec(ad.ctx, query, councilStatus, councilID)
	return err
}
internal/orchestrator/swarm_manager.go (new file, 568 lines)
@@ -0,0 +1,568 @@
package orchestrator

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/mount"
	"github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/client"
	"github.com/rs/zerolog/log"
)

// SwarmManager manages Docker Swarm services for agent deployment
type SwarmManager struct {
	client   *client.Client
	ctx      context.Context
	cancel   context.CancelFunc
	registry string // Docker registry for agent images
}

// NewSwarmManager creates a new Docker Swarm manager
func NewSwarmManager(dockerHost, registry string) (*SwarmManager, error) {
	ctx, cancel := context.WithCancel(context.Background())

	// Create Docker client
	var dockerClient *client.Client
	var err error

	if dockerHost != "" {
		dockerClient, err = client.NewClientWithOpts(
			client.WithHost(dockerHost),
			client.WithAPIVersionNegotiation(),
		)
	} else {
		dockerClient, err = client.NewClientWithOpts(
			client.FromEnv,
			client.WithAPIVersionNegotiation(),
		)
	}

	if err != nil {
		cancel()
		return nil, fmt.Errorf("failed to create Docker client: %w", err)
	}

	// Test connection
	_, err = dockerClient.Ping(ctx)
	if err != nil {
		cancel()
		return nil, fmt.Errorf("failed to connect to Docker daemon: %w", err)
	}

	if registry == "" {
		registry = "registry.home.deepblack.cloud" // Default private registry
	}

	return &SwarmManager{
		client:   dockerClient,
		ctx:      ctx,
		cancel:   cancel,
		registry: registry,
	}, nil
}
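// Illustrative construction (not part of this diff): an empty dockerHost
// falls back to the environment (DOCKER_HOST or the local socket), and an
// empty registry to the default private registry.
//
//	sm, err := NewSwarmManager("", "")
//	if err != nil {
//		return err
//	}
//	defer sm.Close()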
// Close closes the Docker client and cancels context
func (sm *SwarmManager) Close() error {
	sm.cancel()
	return sm.client.Close()
}

// AgentDeploymentConfig defines configuration for deploying an agent
type AgentDeploymentConfig struct {
	TeamID      string            `json:"team_id"`
	TaskID      string            `json:"task_id"`
	AgentRole   string            `json:"agent_role"`   // executor, coordinator, reviewer
	AgentType   string            `json:"agent_type"`   // general, specialized
	Image       string            `json:"image"`        // Docker image to use
	Replicas    uint64            `json:"replicas"`     // Number of instances
	Resources   ResourceLimits    `json:"resources"`    // CPU/Memory limits
	Environment map[string]string `json:"environment"`  // Environment variables
	TaskContext TaskContext       `json:"task_context"` // Task-specific context
	Networks    []string          `json:"networks"`     // Docker networks to join
	Volumes     []VolumeMount     `json:"volumes"`      // Volume mounts
	Placement   PlacementConfig   `json:"placement"`    // Node placement constraints
}

// ResourceLimits defines CPU and memory limits for containers
type ResourceLimits struct {
	CPULimit      int64 `json:"cpu_limit"`      // CPU limit in nano CPUs (1e9 = 1 CPU)
	MemoryLimit   int64 `json:"memory_limit"`   // Memory limit in bytes
	CPURequest    int64 `json:"cpu_request"`    // CPU request in nano CPUs
	MemoryRequest int64 `json:"memory_request"` // Memory request in bytes
}
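// Units, for reference: Swarm counts CPU in nanoCPUs (1,000,000,000 per
// core) and memory in bytes, which is why the standard agent allocation in
// agent_deployer.go (1 core / 1GB limit, 0.5 core / 512MB request) appears
// as 1000000000 / 1073741824 and 500000000 / 536870912.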
// TaskContext provides task-specific information to agents
type TaskContext struct {
	IssueTitle       string                 `json:"issue_title"`
	IssueDescription string                 `json:"issue_description"`
	Repository       string                 `json:"repository"`
	TechStack        []string               `json:"tech_stack"`
	Requirements     []string               `json:"requirements"`
	Priority         string                 `json:"priority"`
	ExternalURL      string                 `json:"external_url"`
	Metadata         map[string]interface{} `json:"metadata"`
}

// VolumeMount defines a volume mount for containers
type VolumeMount struct {
	Source   string `json:"source"`   // Host path or volume name
	Target   string `json:"target"`   // Container path
	ReadOnly bool   `json:"readonly"` // Read-only mount
	Type     string `json:"type"`     // bind, volume, tmpfs
}

// PlacementConfig defines where containers should be placed
type PlacementConfig struct {
	Constraints []string        `json:"constraints"` // Node constraints
	Preferences []PlacementPref `json:"preferences"` // Placement preferences
	Platforms   []Platform      `json:"platforms"`   // Target platforms
}

// PlacementPref defines placement preferences
type PlacementPref struct {
	Spread string `json:"spread"` // Spread across nodes
}

// Platform defines target platform for containers
type Platform struct {
	Architecture string `json:"architecture"` // amd64, arm64, etc.
	OS           string `json:"os"`           // linux, windows
}

// DeployAgent deploys an agent service to Docker Swarm
func (sm *SwarmManager) DeployAgent(config *AgentDeploymentConfig) (*swarm.Service, error) {
	log.Info().
		Str("team_id", config.TeamID).
		Str("task_id", config.TaskID).
		Str("agent_role", config.AgentRole).
		Str("image", config.Image).
		Msg("🚀 Deploying agent to Docker Swarm")

	// Generate unique service name
	serviceName := fmt.Sprintf("whoosh-agent-%s-%s-%s",
		config.TeamID[:8],
		config.TaskID[:8],
		config.AgentRole,
	)

	// Build environment variables
	env := sm.buildEnvironment(config)

	// Build volume mounts
	mounts := sm.buildMounts(config.Volumes)

	// Build resource specifications
	resources := sm.buildResources(config.Resources)

	// Build placement constraints
	placement := sm.buildPlacement(config.Placement)

	// Create service specification
	serviceSpec := swarm.ServiceSpec{
		Annotations: swarm.Annotations{
			Name: serviceName,
			Labels: map[string]string{
				"whoosh.team_id":    config.TeamID,
				"whoosh.task_id":    config.TaskID,
				"whoosh.agent_role": config.AgentRole,
				"whoosh.agent_type": config.AgentType,
				"whoosh.managed_by": "whoosh",
				"whoosh.created_at": time.Now().Format(time.RFC3339),
			},
		},
		TaskTemplate: swarm.TaskSpec{
			ContainerSpec: &swarm.ContainerSpec{
				Image:  config.Image,
				Env:    env,
				Mounts: mounts,
				Labels: map[string]string{
					"whoosh.team_id":    config.TeamID,
					"whoosh.task_id":    config.TaskID,
					"whoosh.agent_role": config.AgentRole,
				},
				// Add healthcheck
				Healthcheck: &container.HealthConfig{
					Test:     []string{"CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"},
					Interval: 30 * time.Second,
					Timeout:  10 * time.Second,
					Retries:  3,
				},
			},
			Resources: resources,
			Placement: placement,
			Networks:  sm.buildNetworks(config.Networks),
		},
		Mode: swarm.ServiceMode{
			Replicated: &swarm.ReplicatedService{
				Replicas: &config.Replicas,
			},
		},
		UpdateConfig: &swarm.UpdateConfig{
			Parallelism: 1,
			Order:       "start-first",
		},
		// RollbackConfig removed for compatibility
	}

	// Create the service
	response, err := sm.client.ServiceCreate(sm.ctx, serviceSpec, types.ServiceCreateOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to create agent service: %w", err)
	}

	log.Info().
		Str("service_id", response.ID).
		Str("service_name", serviceName).
		Msg("✅ Agent service created successfully")

	// Wait for service to be created and return service info
	service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, response.ID, types.ServiceInspectOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to inspect created service: %w", err)
	}

	return &service, nil
}
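// Naming example (illustrative IDs): a "backend" role on team a1b2c3d4-...
// and task e5f6a7b8-... yields the service name
// "whoosh-agent-a1b2c3d4-e5f6a7b8-backend". This assumes config.AgentRole
// contains only characters Swarm allows in service names (no spaces).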
// buildEnvironment constructs environment variables for the container
func (sm *SwarmManager) buildEnvironment(config *AgentDeploymentConfig) []string {
	env := []string{
		fmt.Sprintf("WHOOSH_TEAM_ID=%s", config.TeamID),
		fmt.Sprintf("WHOOSH_TASK_ID=%s", config.TaskID),
		fmt.Sprintf("WHOOSH_AGENT_ROLE=%s", config.AgentRole),
		fmt.Sprintf("WHOOSH_AGENT_TYPE=%s", config.AgentType),
	}

	// Add task context as environment variables
	if config.TaskContext.IssueTitle != "" {
		env = append(env, fmt.Sprintf("TASK_TITLE=%s", config.TaskContext.IssueTitle))
	}
	if config.TaskContext.Repository != "" {
		env = append(env, fmt.Sprintf("TASK_REPOSITORY=%s", config.TaskContext.Repository))
	}
	if config.TaskContext.Priority != "" {
		env = append(env, fmt.Sprintf("TASK_PRIORITY=%s", config.TaskContext.Priority))
	}
	if config.TaskContext.ExternalURL != "" {
		env = append(env, fmt.Sprintf("TASK_EXTERNAL_URL=%s", config.TaskContext.ExternalURL))
	}

	// Add tech stack as JSON
	if len(config.TaskContext.TechStack) > 0 {
		techStackJSON, _ := json.Marshal(config.TaskContext.TechStack)
		env = append(env, fmt.Sprintf("TASK_TECH_STACK=%s", string(techStackJSON)))
	}

	// Add requirements as JSON
	if len(config.TaskContext.Requirements) > 0 {
		requirementsJSON, _ := json.Marshal(config.TaskContext.Requirements)
		env = append(env, fmt.Sprintf("TASK_REQUIREMENTS=%s", string(requirementsJSON)))
	}

	// Add custom environment variables
	for key, value := range config.Environment {
		env = append(env, fmt.Sprintf("%s=%s", key, value))
	}

	return env
}
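// Sample of the resulting slice (illustrative values):
//
//	WHOOSH_TEAM_ID=3f2a9c1e-...
//	WHOOSH_AGENT_ROLE=backend-developer
//	TASK_TECH_STACK=["go","postgres"]
//	CHORUS_AGENT_NAME=backend-developer   (merged in from config.Environment)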
// buildMounts constructs volume mounts for the container
func (sm *SwarmManager) buildMounts(volumes []VolumeMount) []mount.Mount {
	mounts := make([]mount.Mount, len(volumes))

	for i, vol := range volumes {
		mountType := mount.TypeBind
		switch vol.Type {
		case "volume":
			mountType = mount.TypeVolume
		case "tmpfs":
			mountType = mount.TypeTmpfs
		}

		mounts[i] = mount.Mount{
			Type:     mountType,
			Source:   vol.Source,
			Target:   vol.Target,
			ReadOnly: vol.ReadOnly,
		}
	}

	// Add the default shared workspace volume, unless the caller already
	// mounted something at /workspace (agent and council deployments both
	// do); Docker rejects duplicate mount targets.
	hasWorkspace := false
	for _, m := range mounts {
		if m.Target == "/workspace" {
			hasWorkspace = true
			break
		}
	}
	if !hasWorkspace {
		mounts = append(mounts, mount.Mount{
			Type:     mount.TypeVolume,
			Source:   "whoosh-workspace", // Shared workspace volume
			Target:   "/workspace",
			ReadOnly: false,
		})
	}

	return mounts
}

// buildResources constructs resource specifications
func (sm *SwarmManager) buildResources(limits ResourceLimits) *swarm.ResourceRequirements {
	resources := &swarm.ResourceRequirements{}

	// Set limits
	if limits.CPULimit > 0 || limits.MemoryLimit > 0 {
		resources.Limits = &swarm.Limit{}
		if limits.CPULimit > 0 {
			resources.Limits.NanoCPUs = limits.CPULimit
		}
		if limits.MemoryLimit > 0 {
			resources.Limits.MemoryBytes = limits.MemoryLimit
		}
	}

	// Set requests/reservations
	if limits.CPURequest > 0 || limits.MemoryRequest > 0 {
		resources.Reservations = &swarm.Resources{}
		if limits.CPURequest > 0 {
			resources.Reservations.NanoCPUs = limits.CPURequest
		}
		if limits.MemoryRequest > 0 {
			resources.Reservations.MemoryBytes = limits.MemoryRequest
		}
	}

	return resources
}

// buildPlacement constructs placement specifications
func (sm *SwarmManager) buildPlacement(config PlacementConfig) *swarm.Placement {
	placement := &swarm.Placement{
		Constraints: config.Constraints,
	}

	// Add preferences
	for _, pref := range config.Preferences {
		placement.Preferences = append(placement.Preferences, swarm.PlacementPreference{
			Spread: &swarm.SpreadOver{
				SpreadDescriptor: pref.Spread,
			},
		})
	}

	// Add platforms
	for _, platform := range config.Platforms {
		placement.Platforms = append(placement.Platforms, swarm.Platform{
			Architecture: platform.Architecture,
			OS:           platform.OS,
		})
	}

	return placement
}

// buildNetworks constructs network specifications
func (sm *SwarmManager) buildNetworks(networks []string) []swarm.NetworkAttachmentConfig {
	if len(networks) == 0 {
		// Default to the chorus_default network
		networks = []string{"chorus_default"}
	}

	networkConfigs := make([]swarm.NetworkAttachmentConfig, len(networks))
	for i, networkName := range networks {
		networkConfigs[i] = swarm.NetworkAttachmentConfig{
			Target: networkName,
		}
	}

	return networkConfigs
}

// RemoveAgent removes an agent service from Docker Swarm
func (sm *SwarmManager) RemoveAgent(serviceID string) error {
	log.Info().
		Str("service_id", serviceID).
		Msg("🗑️ Removing agent service from Docker Swarm")

	err := sm.client.ServiceRemove(sm.ctx, serviceID)
	if err != nil {
		return fmt.Errorf("failed to remove service: %w", err)
	}

	log.Info().
		Str("service_id", serviceID).
		Msg("✅ Agent service removed successfully")

	return nil
}

// ListAgentServices lists all agent services managed by WHOOSH
func (sm *SwarmManager) ListAgentServices() ([]swarm.Service, error) {
	services, err := sm.client.ServiceList(sm.ctx, types.ServiceListOptions{
		Filters: filters.NewArgs(),
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list services: %w", err)
	}

	// Filter for WHOOSH-managed services
	var agentServices []swarm.Service
	for _, service := range services {
		if managed, exists := service.Spec.Labels["whoosh.managed_by"]; exists && managed == "whoosh" {
			agentServices = append(agentServices, service)
		}
	}

	return agentServices, nil
}

// GetServiceLogs retrieves logs for a service
func (sm *SwarmManager) GetServiceLogs(serviceID string, lines int) (string, error) {
	options := types.ContainerLogsOptions{
		ShowStdout: true,
		ShowStderr: true,
		Tail:       fmt.Sprintf("%d", lines),
		Timestamps: true,
	}

	reader, err := sm.client.ServiceLogs(sm.ctx, serviceID, options)
	if err != nil {
		return "", fmt.Errorf("failed to get service logs: %w", err)
	}
	defer reader.Close()

	logs, err := io.ReadAll(reader)
	if err != nil {
		return "", fmt.Errorf("failed to read service logs: %w", err)
	}

	return string(logs), nil
}

// ScaleService scales a service to the specified number of replicas
func (sm *SwarmManager) ScaleService(serviceID string, replicas uint64) error {
	log.Info().
		Str("service_id", serviceID).
		Uint64("replicas", replicas).
		Msg("📈 Scaling agent service")

	// Get current service spec
	service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
	if err != nil {
		return fmt.Errorf("failed to inspect service: %w", err)
	}

	// Update replicas; guard against non-replicated services, which have no
	// replica count to update
	if service.Spec.Mode.Replicated == nil {
		return fmt.Errorf("service %s is not in replicated mode", serviceID)
	}
	service.Spec.Mode.Replicated.Replicas = &replicas

	// Update the service
	_, err = sm.client.ServiceUpdate(sm.ctx, serviceID, service.Version, service.Spec, types.ServiceUpdateOptions{})
	if err != nil {
		return fmt.Errorf("failed to scale service: %w", err)
	}

	log.Info().
		Str("service_id", serviceID).
		Uint64("replicas", replicas).
		Msg("✅ Service scaled successfully")

	return nil
}

// GetServiceStatus returns the current status of a service
func (sm *SwarmManager) GetServiceStatus(serviceID string) (*ServiceStatus, error) {
	service, _, err := sm.client.ServiceInspectWithRaw(sm.ctx, serviceID, types.ServiceInspectOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to inspect service: %w", err)
	}

	// Get task status
	tasks, err := sm.client.TaskList(sm.ctx, types.TaskListOptions{
		Filters: filters.NewArgs(filters.Arg("service", serviceID)),
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list tasks: %w", err)
	}

	status := &ServiceStatus{
		ServiceID:    serviceID,
		ServiceName:  service.Spec.Name,
		Image:        service.Spec.TaskTemplate.ContainerSpec.Image,
		Replicas:     0,
		RunningTasks: 0,
		FailedTasks:  0,
		TaskStates:   make(map[string]int),
		CreatedAt:    service.CreatedAt,
		UpdatedAt:    service.UpdatedAt,
	}

	if service.Spec.Mode.Replicated != nil && service.Spec.Mode.Replicated.Replicas != nil {
		status.Replicas = *service.Spec.Mode.Replicated.Replicas
	}

	// Count task states
	for _, task := range tasks {
		state := string(task.Status.State)
		status.TaskStates[state]++

		switch task.Status.State {
		case swarm.TaskStateRunning:
			status.RunningTasks++
		case swarm.TaskStateFailed:
			status.FailedTasks++
		}
	}

	return status, nil
}

// ServiceStatus represents the current status of a service
type ServiceStatus struct {
	ServiceID    string         `json:"service_id"`
	ServiceName  string         `json:"service_name"`
	Image        string         `json:"image"`
	Replicas     uint64         `json:"replicas"`
	RunningTasks uint64         `json:"running_tasks"`
	FailedTasks  uint64         `json:"failed_tasks"`
	TaskStates   map[string]int `json:"task_states"`
	CreatedAt    time.Time      `json:"created_at"`
	UpdatedAt    time.Time      `json:"updated_at"`
}

// CleanupFailedServices removes failed services
func (sm *SwarmManager) CleanupFailedServices() error {
	services, err := sm.ListAgentServices()
	if err != nil {
		return fmt.Errorf("failed to list services: %w", err)
	}

	for _, service := range services {
		status, err := sm.GetServiceStatus(service.ID)
		if err != nil {
			log.Error().
				Err(err).
				Str("service_id", service.ID).
				Msg("Failed to get service status")
			continue
		}

		// Remove services that have failed tasks and no running tasks
		if status.FailedTasks > 0 && status.RunningTasks == 0 {
			log.Warn().
				Str("service_id", service.ID).
				Str("service_name", service.Spec.Name).
				Uint64("failed_tasks", status.FailedTasks).
				Msg("Removing failed service")

			err = sm.RemoveAgent(service.ID)
			if err != nil {
				log.Error().
					Err(err).
					Str("service_id", service.ID).
					Msg("Failed to remove failed service")
			}
		}
	}

	return nil
}