chore: align slurp config and scaffolding

2025-09-27 21:03:12 +10:00
parent acc4361463
commit 4a77862289
47 changed files with 5133 additions and 4274 deletions
--- a/pkg/slurp/storage/monitoring.go
+++ b/pkg/slurp/storage/monitoring.go
@@ -14,77 +14,77 @@ import (

 // MonitoringSystem provides comprehensive monitoring for the storage system
 type MonitoringSystem struct {
-	mu                sync.RWMutex
-	nodeID            string
-	metrics           *StorageMetrics
-	alerts            *AlertManager
-	healthChecker     *HealthChecker
+	mu                  sync.RWMutex
+	nodeID              string
+	metrics             *StorageMetrics
+	alerts              *AlertManager
+	healthChecker       *HealthChecker
 	performanceProfiler *PerformanceProfiler
-	logger            *StructuredLogger
-	notifications     chan *MonitoringEvent
-	stopCh            chan struct{}
+	logger              *StructuredLogger
+	notifications       chan *MonitoringEvent
+	stopCh              chan struct{}
 }

 // StorageMetrics contains all Prometheus metrics for storage operations
 type StorageMetrics struct {
 	// Operation counters
-	StoreOperations     prometheus.Counter
-	RetrieveOperations  prometheus.Counter
-	DeleteOperations    prometheus.Counter
-	UpdateOperations    prometheus.Counter
-	SearchOperations    prometheus.Counter
-	BatchOperations     prometheus.Counter
+	StoreOperations    prometheus.Counter
+	RetrieveOperations prometheus.Counter
+	DeleteOperations   prometheus.Counter
+	UpdateOperations   prometheus.Counter
+	SearchOperations   prometheus.Counter
+	BatchOperations    prometheus.Counter

 	// Error counters
-	StoreErrors         prometheus.Counter
-	RetrieveErrors      prometheus.Counter
-	EncryptionErrors    prometheus.Counter
-	DecryptionErrors    prometheus.Counter
-	ReplicationErrors   prometheus.Counter
-	CacheErrors         prometheus.Counter
-	IndexErrors         prometheus.Counter
+	StoreErrors       prometheus.Counter
+	RetrieveErrors    prometheus.Counter
+	EncryptionErrors  prometheus.Counter
+	DecryptionErrors  prometheus.Counter
+	ReplicationErrors prometheus.Counter
+	CacheErrors       prometheus.Counter
+	IndexErrors       prometheus.Counter

 	// Latency histograms
-	StoreLatency        prometheus.Histogram
-	RetrieveLatency     prometheus.Histogram
-	EncryptionLatency   prometheus.Histogram
-	DecryptionLatency   prometheus.Histogram
-	ReplicationLatency  prometheus.Histogram
-	SearchLatency       prometheus.Histogram
+	StoreLatency       prometheus.Histogram
+	RetrieveLatency    prometheus.Histogram
+	EncryptionLatency  prometheus.Histogram
+	DecryptionLatency  prometheus.Histogram
+	ReplicationLatency prometheus.Histogram
+	SearchLatency      prometheus.Histogram

 	// Cache metrics
-	CacheHits           prometheus.Counter
-	CacheMisses         prometheus.Counter
-	CacheEvictions      prometheus.Counter
-	CacheSize           prometheus.Gauge
+	CacheHits      prometheus.Counter
+	CacheMisses    prometheus.Counter
+	CacheEvictions prometheus.Counter
+	CacheSize      prometheus.Gauge

 	// Storage size metrics
-	LocalStorageSize    prometheus.Gauge
+	LocalStorageSize       prometheus.Gauge
 	DistributedStorageSize prometheus.Gauge
 	CompressedStorageSize  prometheus.Gauge
 	IndexStorageSize       prometheus.Gauge

 	// Replication metrics
-	ReplicationFactor   prometheus.Gauge
-	HealthyReplicas     prometheus.Gauge
-	UnderReplicated     prometheus.Gauge
-	ReplicationLag      prometheus.Histogram
+	ReplicationFactor prometheus.Gauge
+	HealthyReplicas   prometheus.Gauge
+	UnderReplicated   prometheus.Gauge
+	ReplicationLag    prometheus.Histogram

 	// Encryption metrics
-	EncryptedContexts   prometheus.Gauge
-	KeyRotations        prometheus.Counter
-	AccessDenials       prometheus.Counter
-	ActiveKeys          prometheus.Gauge
+	EncryptedContexts prometheus.Gauge
+	KeyRotations      prometheus.Counter
+	AccessDenials     prometheus.Counter
+	ActiveKeys        prometheus.Gauge

 	// Performance metrics
-	Throughput          prometheus.Gauge
+	Throughput           prometheus.Gauge
 	ConcurrentOperations prometheus.Gauge
-	QueueDepth          prometheus.Gauge
+	QueueDepth           prometheus.Gauge

 	// Health metrics
-	StorageHealth       prometheus.Gauge
-	NodeConnectivity    prometheus.Gauge
-	SyncLatency         prometheus.Histogram
+	StorageHealth    prometheus.Gauge
+	NodeConnectivity prometheus.Gauge
+	SyncLatency      prometheus.Histogram
 }

 // AlertManager handles storage-related alerts and notifications
@@ -97,18 +97,96 @@ type AlertManager struct {
 	maxHistory   int
 }

+// severityRank weights a severity for sorting: info=1, warning=2, error=3, critical=4.
+func (am *AlertManager) severityRank(severity AlertSeverity) int {
+	switch severity {
+	case SeverityInfo:
+		return 1
+	case SeverityWarning:
+		return 2
+	case SeverityError:
+		return 3
+	case SeverityCritical:
+		return 4
+	}
+	return 0 // any severity outside the four known levels ranks lowest
+}
+
+// GetActiveAlerts returns the active alerts ordered by severity rank (highest first),
+// then newest StartTime, then ID; nil when none are active (SEC-SLURP-1.1 monitoring path).
+func (am *AlertManager) GetActiveAlerts() []*Alert {
+	am.mu.RLock()
+	defer am.mu.RUnlock()
+	if len(am.activealerts) == 0 {
+		return nil
+	}
+	alerts := make([]*Alert, 0, len(am.activealerts))
+	for _, alert := range am.activealerts {
+		alerts = append(alerts, alert)
+	}
+	// Map order is random and sort.Slice is unstable: end on ID for determinism.
+	sort.Slice(alerts, func(i, j int) bool {
+		a, b := alerts[i], alerts[j]
+		if ra, rb := am.severityRank(a.Severity), am.severityRank(b.Severity); ra != rb {
+			return ra > rb
+		}
+		if !a.StartTime.Equal(b.StartTime) {
+			return a.StartTime.After(b.StartTime)
+		}
+		return a.ID < b.ID
+	})
+	return alerts
+}
+
+// Snapshot renders the node ID, a UTC generation time and the active alerts as indented
+// JSON for UCXL persistence (SEC-SLURP-1.1a telemetry). It fails when no alert manager is
+// attached or when encoding fails; ctx is accepted but not consulted.
+func (ms *MonitoringSystem) Snapshot(ctx context.Context) (string, error) {
+	ms.mu.RLock()
+	defer ms.mu.RUnlock()
+	if ms.alerts == nil {
+		return "", fmt.Errorf("alert manager not initialised")
+	}
+
+	// Allocate a non-nil slice so an empty alert list encodes as [] rather than null.
+	current := ms.alerts.GetActiveAlerts()
+	entries := make([]map[string]interface{}, len(current))
+	for idx, a := range current {
+		entries[idx] = map[string]interface{}{
+			"id":         a.ID,
+			"name":       a.Name,
+			"severity":   a.Severity,
+			"message":    fmt.Sprintf("%s (threshold %.2f)", a.Description, a.Threshold),
+			"labels":     a.Labels,
+			"started_at": a.StartTime,
+		}
+	}
+
+	doc := map[string]interface{}{
+		"node_id":      ms.nodeID,
+		"generated_at": time.Now().UTC(),
+		"alert_count":  len(current),
+		"alerts":       entries,
+	}
+	raw, err := json.MarshalIndent(doc, "", "  ")
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal monitoring snapshot: %w", err)
+	}
+	return string(raw), nil
+}
+
 // AlertRule defines conditions for triggering alerts
 type AlertRule struct {
-	ID          string             `json:"id"`
-	Name        string             `json:"name"`
-	Description string             `json:"description"`
-	Metric      string             `json:"metric"`
-	Condition   string             `json:"condition"` // >, <, ==, !=, etc.
-	Threshold   float64            `json:"threshold"`
-	Duration    time.Duration      `json:"duration"`
-	Severity    AlertSeverity      `json:"severity"`
-	Labels      map[string]string  `json:"labels"`
-	Enabled     bool               `json:"enabled"`
+	ID          string            `json:"id"`
+	Name        string            `json:"name"`
+	Description string            `json:"description"`
+	Metric      string            `json:"metric"`
+	Condition   string            `json:"condition"` // >, <, ==, !=, etc.
+	Threshold   float64           `json:"threshold"`
+	Duration    time.Duration     `json:"duration"`
+	Severity    AlertSeverity     `json:"severity"`
+	Labels      map[string]string `json:"labels"`
+	Enabled     bool              `json:"enabled"`
 }

 // Alert represents an active or resolved alert
@@ -163,30 +241,30 @@ type HealthChecker struct {

 // HealthCheck defines a single health check
 type HealthCheck struct {
-	Name        string                                `json:"name"`
-	Description string                                `json:"description"`
+	Name        string                                 `json:"name"`
+	Description string                                 `json:"description"`
 	Checker     func(ctx context.Context) HealthResult `json:"-"`
-	Interval    time.Duration                        `json:"interval"`
-	Timeout     time.Duration                        `json:"timeout"`
-	Enabled     bool                                 `json:"enabled"`
+	Interval    time.Duration                          `json:"interval"`
+	Timeout     time.Duration                          `json:"timeout"`
+	Enabled     bool                                   `json:"enabled"`
 }

 // HealthResult represents the result of a health check
 type HealthResult struct {
-	Healthy   bool              `json:"healthy"`
-	Message   string            `json:"message"`
-	Latency   time.Duration     `json:"latency"`
+	Healthy   bool                   `json:"healthy"`
+	Message   string                 `json:"message"`
+	Latency   time.Duration          `json:"latency"`
 	Metadata  map[string]interface{} `json:"metadata"`
-	Timestamp time.Time         `json:"timestamp"`
+	Timestamp time.Time              `json:"timestamp"`
 }

 // SystemHealth represents the overall health of the storage system
 type SystemHealth struct {
-	OverallStatus  HealthStatus           `json:"overall_status"`
-	Components     map[string]HealthResult `json:"components"`
-	LastUpdate     time.Time              `json:"last_update"`
-	Uptime         time.Duration          `json:"uptime"`
-	StartTime      time.Time              `json:"start_time"`
+	OverallStatus HealthStatus            `json:"overall_status"`
+	Components    map[string]HealthResult `json:"components"`
+	LastUpdate    time.Time               `json:"last_update"`
+	Uptime        time.Duration           `json:"uptime"`
+	StartTime     time.Time               `json:"start_time"`
 }

 // HealthStatus represents system health status
@@ -200,82 +278,82 @@ const (

 // PerformanceProfiler analyzes storage performance patterns
 type PerformanceProfiler struct {
-	mu               sync.RWMutex
+	mu                sync.RWMutex
 	operationProfiles map[string]*OperationProfile
-	resourceUsage    *ResourceUsage
-	bottlenecks      []*Bottleneck
-	recommendations  []*PerformanceRecommendation
+	resourceUsage     *ResourceUsage
+	bottlenecks       []*Bottleneck
+	recommendations   []*PerformanceRecommendation
 }

 // OperationProfile contains performance analysis for a specific operation type
 type OperationProfile struct {
-	Operation       string            `json:"operation"`
-	TotalOperations int64             `json:"total_operations"`
-	AverageLatency  time.Duration     `json:"average_latency"`
-	P50Latency      time.Duration     `json:"p50_latency"`
-	P95Latency      time.Duration     `json:"p95_latency"`
-	P99Latency      time.Duration     `json:"p99_latency"`
-	Throughput      float64           `json:"throughput"`
-	ErrorRate       float64           `json:"error_rate"`
-	LatencyHistory  []time.Duration   `json:"-"`
-	LastUpdated     time.Time         `json:"last_updated"`
+	Operation       string          `json:"operation"`
+	TotalOperations int64           `json:"total_operations"`
+	AverageLatency  time.Duration   `json:"average_latency"`
+	P50Latency      time.Duration   `json:"p50_latency"`
+	P95Latency      time.Duration   `json:"p95_latency"`
+	P99Latency      time.Duration   `json:"p99_latency"`
+	Throughput      float64         `json:"throughput"`
+	ErrorRate       float64         `json:"error_rate"`
+	LatencyHistory  []time.Duration `json:"-"`
+	LastUpdated     time.Time       `json:"last_updated"`
 }

 // ResourceUsage tracks resource consumption
 type ResourceUsage struct {
-	CPUUsage       float64   `json:"cpu_usage"`
-	MemoryUsage    int64     `json:"memory_usage"`
-	DiskUsage      int64     `json:"disk_usage"`
-	NetworkIn      int64     `json:"network_in"`
-	NetworkOut     int64     `json:"network_out"`
-	OpenFiles      int       `json:"open_files"`
-	Goroutines     int       `json:"goroutines"`
-	LastUpdated    time.Time `json:"last_updated"`
+	CPUUsage    float64   `json:"cpu_usage"`
+	MemoryUsage int64     `json:"memory_usage"`
+	DiskUsage   int64     `json:"disk_usage"`
+	NetworkIn   int64     `json:"network_in"`
+	NetworkOut  int64     `json:"network_out"`
+	OpenFiles   int       `json:"open_files"`
+	Goroutines  int       `json:"goroutines"`
+	LastUpdated time.Time `json:"last_updated"`
 }

 // Bottleneck represents a performance bottleneck
 type Bottleneck struct {
-	ID          string            `json:"id"`
-	Type        string            `json:"type"` // cpu, memory, disk, network, etc.
-	Component   string            `json:"component"`
-	Description string            `json:"description"`
-	Severity    AlertSeverity     `json:"severity"`
-	Impact      float64           `json:"impact"`
-	DetectedAt  time.Time         `json:"detected_at"`
+	ID          string                 `json:"id"`
+	Type        string                 `json:"type"` // cpu, memory, disk, network, etc.
+	Component   string                 `json:"component"`
+	Description string                 `json:"description"`
+	Severity    AlertSeverity          `json:"severity"`
+	Impact      float64                `json:"impact"`
+	DetectedAt  time.Time              `json:"detected_at"`
 	Metadata    map[string]interface{} `json:"metadata"`
 }

 // PerformanceRecommendation suggests optimizations
 type PerformanceRecommendation struct {
-	ID          string            `json:"id"`
-	Type        string            `json:"type"`
-	Title       string            `json:"title"`
-	Description string            `json:"description"`
-	Priority    int               `json:"priority"`
-	Impact      string            `json:"impact"`
-	Effort      string            `json:"effort"`
-	GeneratedAt time.Time         `json:"generated_at"`
+	ID          string                 `json:"id"`
+	Type        string                 `json:"type"`
+	Title       string                 `json:"title"`
+	Description string                 `json:"description"`
+	Priority    int                    `json:"priority"`
+	Impact      string                 `json:"impact"`
+	Effort      string                 `json:"effort"`
+	GeneratedAt time.Time              `json:"generated_at"`
 	Metadata    map[string]interface{} `json:"metadata"`
 }

 // MonitoringEvent represents a monitoring system event
 type MonitoringEvent struct {
-	Type        string            `json:"type"`
-	Level       string            `json:"level"`
-	Message     string            `json:"message"`
-	Component   string            `json:"component"`
-	NodeID      string            `json:"node_id"`
-	Timestamp   time.Time         `json:"timestamp"`
-	Metadata    map[string]interface{} `json:"metadata"`
+	Type      string                 `json:"type"`
+	Level     string                 `json:"level"`
+	Message   string                 `json:"message"`
+	Component string                 `json:"component"`
+	NodeID    string                 `json:"node_id"`
+	Timestamp time.Time              `json:"timestamp"`
+	Metadata  map[string]interface{} `json:"metadata"`
 }

 // StructuredLogger provides structured logging for storage operations
 type StructuredLogger struct {
-	mu       sync.RWMutex
-	level    LogLevel
-	output   LogOutput
+	mu        sync.RWMutex
+	level     LogLevel
+	output    LogOutput
 	formatter LogFormatter
-	buffer   []*LogEntry
+	buffer    []*LogEntry
 	maxBuffer int
 }

@@ -303,27 +381,27 @@ type LogFormatter interface {

 // LogEntry represents a single log entry
 type LogEntry struct {
-	Level     LogLevel          `json:"level"`
-	Message   string            `json:"message"`
-	Component string            `json:"component"`
-	Operation string            `json:"operation"`
-	NodeID    string            `json:"node_id"`
-	Timestamp time.Time         `json:"timestamp"`
+	Level     LogLevel               `json:"level"`
+	Message   string                 `json:"message"`
+	Component string                 `json:"component"`
+	Operation string                 `json:"operation"`
+	NodeID    string                 `json:"node_id"`
+	Timestamp time.Time              `json:"timestamp"`
 	Fields    map[string]interface{} `json:"fields"`
-	Error     error             `json:"error,omitempty"`
+	Error     error                  `json:"error,omitempty"`
 }

 // NewMonitoringSystem creates a new monitoring system
 func NewMonitoringSystem(nodeID string) *MonitoringSystem {
 	ms := &MonitoringSystem{
-		nodeID:        nodeID,
-		metrics:       initializeMetrics(nodeID),
-		alerts:        newAlertManager(),
-		healthChecker: newHealthChecker(),
+		nodeID:              nodeID,
+		metrics:             initializeMetrics(nodeID),
+		alerts:              newAlertManager(),
+		healthChecker:       newHealthChecker(),
 		performanceProfiler: newPerformanceProfiler(),
-		logger:        newStructuredLogger(),
-		notifications: make(chan *MonitoringEvent, 1000),
-		stopCh:        make(chan struct{}),
+		logger:              newStructuredLogger(),
+		notifications:       make(chan *MonitoringEvent, 1000),
+		stopCh:              make(chan struct{}),
 	}

 	// Start monitoring goroutines
@@ -571,7 +649,7 @@ func (ms *MonitoringSystem) executeHealthCheck(check HealthCheck) {
 	defer cancel()

 	result := check.Checker(ctx)
-	
+
 	ms.healthChecker.mu.Lock()
 	ms.healthChecker.status.Components[check.Name] = result
 	ms.healthChecker.mu.Unlock()
@@ -592,21 +670,21 @@ func (ms *MonitoringSystem) analyzePerformance() {

 func newAlertManager() *AlertManager {
 	return &AlertManager{
-		rules:       make([]*AlertRule, 0),
+		rules:        make([]*AlertRule, 0),
 		activealerts: make(map[string]*Alert),
 		notifiers:    make([]AlertNotifier, 0),
-		history:     make([]*Alert, 0),
-		maxHistory:  1000,
+		history:      make([]*Alert, 0),
+		maxHistory:   1000,
 	}
 }

 func newHealthChecker() *HealthChecker {
 	return &HealthChecker{
-		checks:        make(map[string]HealthCheck),
-		status:        &SystemHealth{
+		checks: make(map[string]HealthCheck),
+		status: &SystemHealth{
 			OverallStatus: HealthHealthy,
-			Components:   make(map[string]HealthResult),
-			StartTime:    time.Now(),
+			Components:    make(map[string]HealthResult),
+			StartTime:     time.Now(),
 		},
 		checkInterval: 1 * time.Minute,
 		timeout:       30 * time.Second,
@@ -664,8 +742,8 @@ func (ms *MonitoringSystem) GetMonitoringStats() (*MonitoringStats, error) {
 	defer ms.mu.RUnlock()

 	stats := &MonitoringStats{
-		NodeID:      ms.nodeID,
-		Timestamp:   time.Now(),
+		NodeID:       ms.nodeID,
+		Timestamp:    time.Now(),
 		HealthStatus: ms.healthChecker.status.OverallStatus,
 		ActiveAlerts: len(ms.alerts.activealerts),
 		Bottlenecks:  len(ms.performanceProfiler.bottlenecks),