feat: Production readiness improvements for WHOOSH council formation

Major security, observability, and configuration improvements:

## Security Hardening
- Implemented configurable CORS (no more wildcards)
- Added comprehensive auth middleware for admin endpoints
- Enhanced webhook HMAC validation
- Added input validation and rate limiting
- Added security headers and a Content Security Policy (CSP)

## Configuration Management
- Made N8N webhook URL configurable (WHOOSH_N8N_BASE_URL)
- Replaced all hardcoded endpoints with environment variables
- Added feature flags for LLM vs heuristic composition
- Hardened Gitea fetching with EAGER_FILTER and FULL_RESCAN options

## API Completeness
- Implemented GetCouncilComposition function
- Added GET /api/v1/councils/{id} endpoint
- Council artifacts API (POST/GET /api/v1/councils/{id}/artifacts)
- /admin/health/details endpoint with component status
- Database lookup for repository URLs (no hardcoded fallbacks)

## Observability & Performance
- Added OpenTelemetry distributed tracing with goal/pulse correlation
- Added database indexes for performance optimization
- Comprehensive health monitoring
- Enhanced logging and error handling

## Infrastructure
- Production-ready P2P discovery (replaces mock implementation)
- Removed unused Redis configuration
- Enhanced Docker Swarm integration
- Added migration files for performance indexes

## Code Quality
- Comprehensive input validation
- Graceful error handling and failsafe fallbacks
- Maintained backwards compatibility
- Followed security best practices

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Claude Code
2025-09-12 20:34:17 +10:00
parent 56ea52b743
commit 131868bdca
1740 changed files with 575904 additions and 171 deletions

View File

@@ -14,6 +14,9 @@ import (
"github.com/docker/docker/api/types/swarm"
"github.com/docker/docker/client"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel/attribute"
"github.com/chorus-services/whoosh/internal/tracing"
)
// SwarmManager manages Docker Swarm services for agent deployment
@@ -88,6 +91,8 @@ type AgentDeploymentConfig struct {
Networks []string `json:"networks"` // Docker networks to join
Volumes []VolumeMount `json:"volumes"` // Volume mounts
Placement PlacementConfig `json:"placement"` // Node placement constraints
GoalID string `json:"goal_id,omitempty"`
PulseID string `json:"pulse_id,omitempty"`
}
// ResourceLimits defines CPU and memory limits for containers
@@ -138,6 +143,26 @@ type Platform struct {
// DeployAgent deploys an agent service to Docker Swarm
func (sm *SwarmManager) DeployAgent(config *AgentDeploymentConfig) (*swarm.Service, error) {
ctx, span := tracing.StartDeploymentSpan(sm.ctx, "deploy_agent", config.AgentRole)
defer span.End()
// Add tracing attributes
span.SetAttributes(
attribute.String("agent.team_id", config.TeamID),
attribute.String("agent.task_id", config.TaskID),
attribute.String("agent.role", config.AgentRole),
attribute.String("agent.type", config.AgentType),
attribute.String("agent.image", config.Image),
)
// Add goal.id and pulse.id if available in config
if config.GoalID != "" {
span.SetAttributes(attribute.String("goal.id", config.GoalID))
}
if config.PulseID != "" {
span.SetAttributes(attribute.String("pulse.id", config.PulseID))
}
log.Info().
Str("team_id", config.TeamID).
Str("task_id", config.TaskID).
@@ -212,11 +237,24 @@ func (sm *SwarmManager) DeployAgent(config *AgentDeploymentConfig) (*swarm.Servi
}
// Create the service
response, err := sm.client.ServiceCreate(sm.ctx, serviceSpec, types.ServiceCreateOptions{})
response, err := sm.client.ServiceCreate(ctx, serviceSpec, types.ServiceCreateOptions{})
if err != nil {
tracing.SetSpanError(span, err)
span.SetAttributes(
attribute.String("deployment.status", "failed"),
attribute.String("deployment.service_name", serviceName),
)
return nil, fmt.Errorf("failed to create agent service: %w", err)
}
// Add success metrics to span
span.SetAttributes(
attribute.String("deployment.status", "success"),
attribute.String("deployment.service_id", response.ID),
attribute.String("deployment.service_name", serviceName),
attribute.Int64("deployment.replicas", int64(config.Replicas)),
)
log.Info().
Str("service_id", response.ID).
Str("service_name", serviceName).