feat: Production readiness improvements for WHOOSH council formation
Major security, observability, and configuration improvements:
## Security Hardening
- Implemented configurable CORS (no more wildcards)
- Added comprehensive auth middleware for admin endpoints
- Enhanced webhook HMAC validation
- Added input validation and rate limiting
- Security headers and CSP policies
## Configuration Management
- Made N8N webhook URL configurable (WHOOSH_N8N_BASE_URL)
- Replaced all hardcoded endpoints with environment variables
- Added feature flags for LLM vs heuristic composition
- Gitea fetch hardening with EAGER_FILTER and FULL_RESCAN options
## API Completeness
- Implemented GetCouncilComposition function
- Added GET /api/v1/councils/{id} endpoint
- Council artifacts API (POST/GET /api/v1/councils/{id}/artifacts)
- /admin/health/details endpoint with component status
- Database lookup for repository URLs (no hardcoded fallbacks)
## Observability & Performance
- Added OpenTelemetry distributed tracing with goal/pulse correlation
- Performance optimization database indexes
- Comprehensive health monitoring
- Enhanced logging and error handling
## Infrastructure
- Production-ready P2P discovery (replaces mock implementation)
- Removed unused Redis configuration
- Enhanced Docker Swarm integration
- Added migration files for performance indexes
## Code Quality
- Comprehensive input validation
- Graceful error handling and failsafe fallbacks
- Backwards compatibility maintained
- Following security best practices
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
152
internal/tracing/tracing.go
Normal file
152
internal/tracing/tracing.go
Normal file
@@ -0,0 +1,152 @@
|
||||
package tracing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/exporters/jaeger"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
tracesdk "go.opentelemetry.io/otel/sdk/trace"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/chorus-services/whoosh/internal/config"
|
||||
)
|
||||
|
||||
// Tracer is the global tracer for WHOOSH
|
||||
var Tracer trace.Tracer
|
||||
|
||||
// Initialize sets up OpenTelemetry tracing
|
||||
func Initialize(cfg config.OpenTelemetryConfig) (func(), error) {
|
||||
if !cfg.Enabled {
|
||||
// Set up no-op tracer
|
||||
Tracer = otel.Tracer("whoosh")
|
||||
return func() {}, nil
|
||||
}
|
||||
|
||||
// Create Jaeger exporter
|
||||
exp, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(cfg.JaegerEndpoint)))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create jaeger exporter: %w", err)
|
||||
}
|
||||
|
||||
// Create resource with service information
|
||||
res, err := resource.Merge(
|
||||
resource.Default(),
|
||||
resource.NewWithAttributes(
|
||||
semconv.SchemaURL,
|
||||
semconv.ServiceName(cfg.ServiceName),
|
||||
semconv.ServiceVersion(cfg.ServiceVersion),
|
||||
semconv.DeploymentEnvironment(cfg.Environment),
|
||||
),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create resource: %w", err)
|
||||
}
|
||||
|
||||
// Create trace provider
|
||||
tp := tracesdk.NewTracerProvider(
|
||||
tracesdk.WithBatcher(exp),
|
||||
tracesdk.WithResource(res),
|
||||
tracesdk.WithSampler(tracesdk.TraceIDRatioBased(cfg.SampleRate)),
|
||||
)
|
||||
|
||||
// Set global trace provider
|
||||
otel.SetTracerProvider(tp)
|
||||
|
||||
// Set global propagator
|
||||
otel.SetTextMapPropagator(propagation.TraceContext{})
|
||||
|
||||
// Create tracer
|
||||
Tracer = otel.Tracer("whoosh")
|
||||
|
||||
// Return cleanup function
|
||||
cleanup := func() {
|
||||
if err := tp.Shutdown(context.Background()); err != nil {
|
||||
// Log error but don't return it since this is cleanup
|
||||
fmt.Printf("Error shutting down tracer provider: %v\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
return cleanup, nil
|
||||
}
|
||||
|
||||
// StartSpan creates a new span with the given name and attributes
|
||||
func StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
|
||||
return Tracer.Start(ctx, name, opts...)
|
||||
}
|
||||
|
||||
// AddAttributes adds attributes to the current span
|
||||
func AddAttributes(span trace.Span, attributes ...attribute.KeyValue) {
|
||||
span.SetAttributes(attributes...)
|
||||
}
|
||||
|
||||
// SetSpanError records an error in the span and sets the status
|
||||
func SetSpanError(span trace.Span, err error) {
|
||||
if err != nil {
|
||||
span.RecordError(err)
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Common attribute keys for WHOOSH tracing
|
||||
var (
|
||||
// Goal and Pulse correlation attributes
|
||||
AttrGoalIDKey = attribute.Key("goal.id")
|
||||
AttrPulseIDKey = attribute.Key("pulse.id")
|
||||
|
||||
// Component attributes
|
||||
AttrComponentKey = attribute.Key("whoosh.component")
|
||||
AttrOperationKey = attribute.Key("whoosh.operation")
|
||||
|
||||
// Resource attributes
|
||||
AttrTaskIDKey = attribute.Key("task.id")
|
||||
AttrCouncilIDKey = attribute.Key("council.id")
|
||||
AttrAgentIDKey = attribute.Key("agent.id")
|
||||
AttrRepositoryKey = attribute.Key("repository.name")
|
||||
)
|
||||
|
||||
// Convenience functions for creating common spans
|
||||
func StartMonitorSpan(ctx context.Context, operation string, repository string) (context.Context, trace.Span) {
|
||||
return StartSpan(ctx, fmt.Sprintf("monitor.%s", operation),
|
||||
trace.WithAttributes(
|
||||
attribute.String("whoosh.component", "monitor"),
|
||||
attribute.String("whoosh.operation", operation),
|
||||
attribute.String("repository.name", repository),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
func StartCouncilSpan(ctx context.Context, operation string, councilID string) (context.Context, trace.Span) {
|
||||
return StartSpan(ctx, fmt.Sprintf("council.%s", operation),
|
||||
trace.WithAttributes(
|
||||
attribute.String("whoosh.component", "council"),
|
||||
attribute.String("whoosh.operation", operation),
|
||||
attribute.String("council.id", councilID),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
func StartDeploymentSpan(ctx context.Context, operation string, serviceName string) (context.Context, trace.Span) {
|
||||
return StartSpan(ctx, fmt.Sprintf("deployment.%s", operation),
|
||||
trace.WithAttributes(
|
||||
attribute.String("whoosh.component", "deployment"),
|
||||
attribute.String("whoosh.operation", operation),
|
||||
attribute.String("service.name", serviceName),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
func StartWebhookSpan(ctx context.Context, operation string, source string) (context.Context, trace.Span) {
|
||||
return StartSpan(ctx, fmt.Sprintf("webhook.%s", operation),
|
||||
trace.WithAttributes(
|
||||
attribute.String("whoosh.component", "webhook"),
|
||||
attribute.String("whoosh.operation", operation),
|
||||
attribute.String("webhook.source", source),
|
||||
),
|
||||
)
|
||||
}
|
||||
Reference in New Issue
Block a user