chore: align slurp config and scaffolding

This commit is contained in:
anthonyrawlins
2025-09-27 21:03:12 +10:00
parent acc4361463
commit 4a77862289
47 changed files with 5133 additions and 4274 deletions

View File

@@ -14,77 +14,77 @@ import (
// MonitoringSystem provides comprehensive monitoring for the storage system.
//
// It groups the per-node monitoring components: Prometheus metrics, the alert
// manager, the health checker, the performance profiler and the structured
// logger, plus a channel of monitoring events and a stop channel.
//
// mu is read-locked by Snapshot and GetMonitoringStats while they read the
// fields below. Each field is declared exactly once; Go rejects a struct that
// repeats a field name.
type MonitoringSystem struct {
	mu                  sync.RWMutex
	nodeID              string
	metrics             *StorageMetrics
	alerts              *AlertManager
	healthChecker       *HealthChecker
	performanceProfiler *PerformanceProfiler
	logger              *StructuredLogger
	notifications       chan *MonitoringEvent
	stopCh              chan struct{}
}
// StorageMetrics contains all Prometheus metrics for storage operations.
//
// Fields are grouped by concern (operations, errors, latency, cache, storage
// size, replication, encryption, performance, health). Each metric is declared
// exactly once; Go rejects a struct that repeats a field name.
type StorageMetrics struct {
	// Operation counters
	StoreOperations    prometheus.Counter
	RetrieveOperations prometheus.Counter
	DeleteOperations   prometheus.Counter
	UpdateOperations   prometheus.Counter
	SearchOperations   prometheus.Counter
	BatchOperations    prometheus.Counter

	// Error counters
	StoreErrors       prometheus.Counter
	RetrieveErrors    prometheus.Counter
	EncryptionErrors  prometheus.Counter
	DecryptionErrors  prometheus.Counter
	ReplicationErrors prometheus.Counter
	CacheErrors       prometheus.Counter
	IndexErrors       prometheus.Counter

	// Latency histograms
	StoreLatency       prometheus.Histogram
	RetrieveLatency    prometheus.Histogram
	EncryptionLatency  prometheus.Histogram
	DecryptionLatency  prometheus.Histogram
	ReplicationLatency prometheus.Histogram
	SearchLatency      prometheus.Histogram

	// Cache metrics
	CacheHits      prometheus.Counter
	CacheMisses    prometheus.Counter
	CacheEvictions prometheus.Counter
	CacheSize      prometheus.Gauge

	// Storage size metrics
	LocalStorageSize       prometheus.Gauge
	DistributedStorageSize prometheus.Gauge
	CompressedStorageSize  prometheus.Gauge
	IndexStorageSize       prometheus.Gauge

	// Replication metrics
	ReplicationFactor prometheus.Gauge
	HealthyReplicas   prometheus.Gauge
	UnderReplicated   prometheus.Gauge
	ReplicationLag    prometheus.Histogram

	// Encryption metrics
	EncryptedContexts prometheus.Gauge
	KeyRotations      prometheus.Counter
	AccessDenials     prometheus.Counter
	ActiveKeys        prometheus.Gauge

	// Performance metrics
	Throughput           prometheus.Gauge
	ConcurrentOperations prometheus.Gauge
	QueueDepth           prometheus.Gauge

	// Health metrics
	StorageHealth    prometheus.Gauge
	NodeConnectivity prometheus.Gauge
	SyncLatency      prometheus.Histogram
}
// AlertManager handles storage-related alerts and notifications
@@ -97,18 +97,96 @@ type AlertManager struct {
maxHistory int
}
// severityRank converts an alert severity into a numeric priority used for
// ordering: info is 1, warning is 2, error is 3 and critical is 4. Any other
// severity value ranks as 0.
func (am *AlertManager) severityRank(severity AlertSeverity) int {
	rank := 0 // unrecognised severities keep the lowest rank
	switch severity {
	case SeverityInfo:
		rank = 1
	case SeverityWarning:
		rank = 2
	case SeverityError:
		rank = 3
	case SeverityCritical:
		rank = 4
	}
	return rank
}
// GetActiveAlerts returns sorted active alerts (SEC-SLURP-1.1 monitoring path).
//
// The manager's read lock is held for the duration of the call. The result is
// nil when there are no active alerts. Otherwise it is a newly allocated slice
// holding the same *Alert pointers the manager stores, ordered by descending
// severity rank and, within one rank, by most recent StartTime first.
func (am *AlertManager) GetActiveAlerts() []*Alert {
	am.mu.RLock()
	defer am.mu.RUnlock()

	total := len(am.activealerts)
	if total == 0 {
		return nil
	}

	ordered := make([]*Alert, 0, total)
	for _, entry := range am.activealerts {
		ordered = append(ordered, entry)
	}

	sort.Slice(ordered, func(x, y int) bool {
		left, right := ordered[x], ordered[y]
		leftRank := am.severityRank(left.Severity)
		rightRank := am.severityRank(right.Severity)
		if leftRank != rightRank {
			// Higher-ranked severities sort to the front.
			return leftRank > rightRank
		}
		// Same severity: the alert that started later comes first.
		return left.StartTime.After(right.StartTime)
	})

	return ordered
}
// Snapshot marshals monitoring state for UCXL persistence (SEC-SLURP-1.1a telemetry).
//
// It holds the monitoring system's read lock, collects the active alerts from
// the alert manager and returns an indented JSON document with the keys
// "node_id", "generated_at" (current UTC time), "alert_count" and "alerts".
// Each alert entry carries "id", "name", "severity", "message" (description
// plus threshold), "labels" and "started_at".
//
// An error is returned when the alert manager is nil or when JSON encoding
// fails. ctx is accepted but not consulted by the current implementation.
func (ms *MonitoringSystem) Snapshot(ctx context.Context) (string, error) {
	ms.mu.RLock()
	defer ms.mu.RUnlock()

	manager := ms.alerts
	if manager == nil {
		return "", fmt.Errorf("alert manager not initialised")
	}

	current := manager.GetActiveAlerts()

	// Sized by length so an empty alert list still encodes as [] rather than null.
	entries := make([]map[string]interface{}, len(current))
	for idx, item := range current {
		entry := make(map[string]interface{}, 6)
		entry["id"] = item.ID
		entry["name"] = item.Name
		entry["severity"] = item.Severity
		entry["message"] = fmt.Sprintf("%s (threshold %.2f)", item.Description, item.Threshold)
		entry["labels"] = item.Labels
		entry["started_at"] = item.StartTime
		entries[idx] = entry
	}

	document := make(map[string]interface{}, 4)
	document["node_id"] = ms.nodeID
	document["generated_at"] = time.Now().UTC()
	document["alert_count"] = len(current)
	document["alerts"] = entries

	raw, err := json.MarshalIndent(document, "", " ")
	if err != nil {
		return "", fmt.Errorf("failed to marshal monitoring snapshot: %w", err)
	}

	return string(raw), nil
}
// AlertRule defines conditions for triggering alerts.
//
// A rule names a Metric, a comparison Condition and a Threshold, together with
// a Duration, a Severity, free-form Labels and an Enabled flag. All fields are
// serialised to JSON under the snake_case names in their tags. Each field is
// declared exactly once; Go rejects a struct that repeats a field name.
type AlertRule struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Description string            `json:"description"`
	Metric      string            `json:"metric"`
	Condition   string            `json:"condition"` // >, <, ==, !=, etc.
	Threshold   float64           `json:"threshold"`
	Duration    time.Duration     `json:"duration"`
	Severity    AlertSeverity     `json:"severity"`
	Labels      map[string]string `json:"labels"`
	Enabled     bool              `json:"enabled"`
}
// Alert represents an active or resolved alert
@@ -163,30 +241,30 @@ type HealthChecker struct {
// HealthCheck defines a single health check
type HealthCheck struct {
Name string `json:"name"`
Description string `json:"description"`
Name string `json:"name"`
Description string `json:"description"`
Checker func(ctx context.Context) HealthResult `json:"-"`
Interval time.Duration `json:"interval"`
Timeout time.Duration `json:"timeout"`
Enabled bool `json:"enabled"`
Interval time.Duration `json:"interval"`
Timeout time.Duration `json:"timeout"`
Enabled bool `json:"enabled"`
}
// HealthResult represents the result of a health check.
//
// It records whether the check was Healthy, a human-readable Message, the
// measured Latency, arbitrary Metadata and a Timestamp. All fields are
// serialised to JSON under the names in their tags. Each field is declared
// exactly once; Go rejects a struct that repeats a field name.
type HealthResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Latency   time.Duration          `json:"latency"`
	Metadata  map[string]interface{} `json:"metadata"`
	Timestamp time.Time              `json:"timestamp"`
}
// SystemHealth represents the overall health of the storage system.
//
// Components maps a name to its most recent HealthResult; executeHealthCheck
// stores each check's result there under the check's Name. All fields are
// serialised to JSON under the names in their tags. Each field is declared
// exactly once; Go rejects a struct that repeats a field name.
type SystemHealth struct {
	OverallStatus HealthStatus            `json:"overall_status"`
	Components    map[string]HealthResult `json:"components"`
	LastUpdate    time.Time               `json:"last_update"`
	Uptime        time.Duration           `json:"uptime"`
	StartTime     time.Time               `json:"start_time"`
}
// HealthStatus represents system health status
@@ -200,82 +278,82 @@ const (
// PerformanceProfiler analyzes storage performance patterns
type PerformanceProfiler struct {
mu sync.RWMutex
mu sync.RWMutex
operationProfiles map[string]*OperationProfile
resourceUsage *ResourceUsage
bottlenecks []*Bottleneck
recommendations []*PerformanceRecommendation
resourceUsage *ResourceUsage
bottlenecks []*Bottleneck
recommendations []*PerformanceRecommendation
}
// OperationProfile contains performance analysis for a specific operation type.
//
// It carries the operation name, a total count, average and percentile
// latencies (P50/P95/P99), throughput, error rate and the time of the last
// update. LatencyHistory is excluded from JSON output (tag "-"); every other
// field is serialised under the name in its tag. Each field is declared
// exactly once; Go rejects a struct that repeats a field name.
type OperationProfile struct {
	Operation       string          `json:"operation"`
	TotalOperations int64           `json:"total_operations"`
	AverageLatency  time.Duration   `json:"average_latency"`
	P50Latency      time.Duration   `json:"p50_latency"`
	P95Latency      time.Duration   `json:"p95_latency"`
	P99Latency      time.Duration   `json:"p99_latency"`
	Throughput      float64         `json:"throughput"`
	ErrorRate       float64         `json:"error_rate"`
	LatencyHistory  []time.Duration `json:"-"`
	LastUpdated     time.Time       `json:"last_updated"`
}
// ResourceUsage tracks resource consumption.
//
// It records CPU, memory, disk and network figures together with open file
// and goroutine counts and the time of the last update. Units are not fixed
// by this type. All fields are serialised to JSON under the names in their
// tags. Each field is declared exactly once; Go rejects a struct that repeats
// a field name.
type ResourceUsage struct {
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage int64     `json:"memory_usage"`
	DiskUsage   int64     `json:"disk_usage"`
	NetworkIn   int64     `json:"network_in"`
	NetworkOut  int64     `json:"network_out"`
	OpenFiles   int       `json:"open_files"`
	Goroutines  int       `json:"goroutines"`
	LastUpdated time.Time `json:"last_updated"`
}
// Bottleneck represents a performance bottleneck.
//
// It identifies the affected Component and the bottleneck Type, and carries a
// description, a severity, a numeric impact, the detection time and arbitrary
// metadata. All fields are serialised to JSON under the names in their tags.
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type Bottleneck struct {
	ID          string                 `json:"id"`
	Type        string                 `json:"type"` // cpu, memory, disk, network, etc.
	Component   string                 `json:"component"`
	Description string                 `json:"description"`
	Severity    AlertSeverity          `json:"severity"`
	Impact      float64                `json:"impact"`
	DetectedAt  time.Time              `json:"detected_at"`
	Metadata    map[string]interface{} `json:"metadata"`
}
// PerformanceRecommendation suggests optimizations.
//
// It carries an identifier, a type, a title and description, an integer
// priority, textual impact and effort estimates, the generation time and
// arbitrary metadata. All fields are serialised to JSON under the names in
// their tags. Each field is declared exactly once; Go rejects a struct that
// repeats a field name.
type PerformanceRecommendation struct {
	ID          string                 `json:"id"`
	Type        string                 `json:"type"`
	Title       string                 `json:"title"`
	Description string                 `json:"description"`
	Priority    int                    `json:"priority"`
	Impact      string                 `json:"impact"`
	Effort      string                 `json:"effort"`
	GeneratedAt time.Time              `json:"generated_at"`
	Metadata    map[string]interface{} `json:"metadata"`
}
// MonitoringEvent represents a monitoring system event.
//
// It carries the event type, level and message, the originating component and
// node, a timestamp and arbitrary metadata. All fields are serialised to JSON
// under the names in their tags. Each field is declared exactly once; Go
// rejects a struct that repeats a field name.
type MonitoringEvent struct {
	Type      string                 `json:"type"`
	Level     string                 `json:"level"`
	Message   string                 `json:"message"`
	Component string                 `json:"component"`
	NodeID    string                 `json:"node_id"`
	Timestamp time.Time              `json:"timestamp"`
	Metadata  map[string]interface{} `json:"metadata"`
}
// StructuredLogger provides structured logging for storage operations.
//
// It combines a log level, an output, a formatter and a buffer of log entries
// with a maxBuffer setting.
// NOTE(review): mu presumably guards buffer and the other fields, and
// maxBuffer presumably caps len(buffer) — confirm against the logger methods.
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type StructuredLogger struct {
	mu        sync.RWMutex
	level     LogLevel
	output    LogOutput
	formatter LogFormatter
	buffer    []*LogEntry
	maxBuffer int
}
@@ -303,27 +381,27 @@ type LogFormatter interface {
// LogEntry represents a single log entry.
//
// It carries the level, message, originating component, operation and node,
// a timestamp, arbitrary structured Fields and an optional Error. All fields
// are serialised to JSON under the names in their tags; Error is omitted when
// nil.
// NOTE(review): encoding/json only encodes exported fields, so errors built
// with errors.New or fmt.Errorf are likely to serialise as {} rather than
// their message — confirm whether the formatter handles Error separately.
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type LogEntry struct {
	Level     LogLevel               `json:"level"`
	Message   string                 `json:"message"`
	Component string                 `json:"component"`
	Operation string                 `json:"operation"`
	NodeID    string                 `json:"node_id"`
	Timestamp time.Time              `json:"timestamp"`
	Fields    map[string]interface{} `json:"fields"`
	Error     error                  `json:"error,omitempty"`
}
// NewMonitoringSystem creates a new monitoring system
func NewMonitoringSystem(nodeID string) *MonitoringSystem {
ms := &MonitoringSystem{
nodeID: nodeID,
metrics: initializeMetrics(nodeID),
alerts: newAlertManager(),
healthChecker: newHealthChecker(),
nodeID: nodeID,
metrics: initializeMetrics(nodeID),
alerts: newAlertManager(),
healthChecker: newHealthChecker(),
performanceProfiler: newPerformanceProfiler(),
logger: newStructuredLogger(),
notifications: make(chan *MonitoringEvent, 1000),
stopCh: make(chan struct{}),
logger: newStructuredLogger(),
notifications: make(chan *MonitoringEvent, 1000),
stopCh: make(chan struct{}),
}
// Start monitoring goroutines
@@ -571,7 +649,7 @@ func (ms *MonitoringSystem) executeHealthCheck(check HealthCheck) {
defer cancel()
result := check.Checker(ctx)
ms.healthChecker.mu.Lock()
ms.healthChecker.status.Components[check.Name] = result
ms.healthChecker.mu.Unlock()
@@ -592,21 +670,21 @@ func (ms *MonitoringSystem) analyzePerformance() {
// newAlertManager returns an AlertManager with empty (non-nil) rule, notifier
// and history slices, an empty active-alert map, and maxHistory set to 1000.
// Each field is initialised exactly once; Go rejects a composite literal that
// repeats a field name.
func newAlertManager() *AlertManager {
	return &AlertManager{
		rules:        make([]*AlertRule, 0),
		activealerts: make(map[string]*Alert),
		notifiers:    make([]AlertNotifier, 0),
		history:      make([]*Alert, 0),
		maxHistory:   1000,
	}
}
func newHealthChecker() *HealthChecker {
return &HealthChecker{
checks: make(map[string]HealthCheck),
status: &SystemHealth{
checks: make(map[string]HealthCheck),
status: &SystemHealth{
OverallStatus: HealthHealthy,
Components: make(map[string]HealthResult),
StartTime: time.Now(),
Components: make(map[string]HealthResult),
StartTime: time.Now(),
},
checkInterval: 1 * time.Minute,
timeout: 30 * time.Second,
@@ -664,8 +742,8 @@ func (ms *MonitoringSystem) GetMonitoringStats() (*MonitoringStats, error) {
defer ms.mu.RUnlock()
stats := &MonitoringStats{
NodeID: ms.nodeID,
Timestamp: time.Now(),
NodeID: ms.nodeID,
Timestamp: time.Now(),
HealthStatus: ms.healthChecker.status.OverallStatus,
ActiveAlerts: len(ms.alerts.activealerts),
Bottlenecks: len(ms.performanceProfiler.bottlenecks),