chore: align slurp config and scaffolding
This commit is contained in:
@@ -14,77 +14,77 @@ import (
|
||||
|
||||
// MonitoringSystem provides comprehensive monitoring for the storage system
|
||||
type MonitoringSystem struct {
|
||||
mu sync.RWMutex
|
||||
nodeID string
|
||||
metrics *StorageMetrics
|
||||
alerts *AlertManager
|
||||
healthChecker *HealthChecker
|
||||
mu sync.RWMutex
|
||||
nodeID string
|
||||
metrics *StorageMetrics
|
||||
alerts *AlertManager
|
||||
healthChecker *HealthChecker
|
||||
performanceProfiler *PerformanceProfiler
|
||||
logger *StructuredLogger
|
||||
notifications chan *MonitoringEvent
|
||||
stopCh chan struct{}
|
||||
logger *StructuredLogger
|
||||
notifications chan *MonitoringEvent
|
||||
stopCh chan struct{}
|
||||
}
|
||||
|
||||
// StorageMetrics contains all Prometheus metrics for storage operations
|
||||
type StorageMetrics struct {
|
||||
// Operation counters
|
||||
StoreOperations prometheus.Counter
|
||||
RetrieveOperations prometheus.Counter
|
||||
DeleteOperations prometheus.Counter
|
||||
UpdateOperations prometheus.Counter
|
||||
SearchOperations prometheus.Counter
|
||||
BatchOperations prometheus.Counter
|
||||
StoreOperations prometheus.Counter
|
||||
RetrieveOperations prometheus.Counter
|
||||
DeleteOperations prometheus.Counter
|
||||
UpdateOperations prometheus.Counter
|
||||
SearchOperations prometheus.Counter
|
||||
BatchOperations prometheus.Counter
|
||||
|
||||
// Error counters
|
||||
StoreErrors prometheus.Counter
|
||||
RetrieveErrors prometheus.Counter
|
||||
EncryptionErrors prometheus.Counter
|
||||
DecryptionErrors prometheus.Counter
|
||||
ReplicationErrors prometheus.Counter
|
||||
CacheErrors prometheus.Counter
|
||||
IndexErrors prometheus.Counter
|
||||
StoreErrors prometheus.Counter
|
||||
RetrieveErrors prometheus.Counter
|
||||
EncryptionErrors prometheus.Counter
|
||||
DecryptionErrors prometheus.Counter
|
||||
ReplicationErrors prometheus.Counter
|
||||
CacheErrors prometheus.Counter
|
||||
IndexErrors prometheus.Counter
|
||||
|
||||
// Latency histograms
|
||||
StoreLatency prometheus.Histogram
|
||||
RetrieveLatency prometheus.Histogram
|
||||
EncryptionLatency prometheus.Histogram
|
||||
DecryptionLatency prometheus.Histogram
|
||||
ReplicationLatency prometheus.Histogram
|
||||
SearchLatency prometheus.Histogram
|
||||
StoreLatency prometheus.Histogram
|
||||
RetrieveLatency prometheus.Histogram
|
||||
EncryptionLatency prometheus.Histogram
|
||||
DecryptionLatency prometheus.Histogram
|
||||
ReplicationLatency prometheus.Histogram
|
||||
SearchLatency prometheus.Histogram
|
||||
|
||||
// Cache metrics
|
||||
CacheHits prometheus.Counter
|
||||
CacheMisses prometheus.Counter
|
||||
CacheEvictions prometheus.Counter
|
||||
CacheSize prometheus.Gauge
|
||||
CacheHits prometheus.Counter
|
||||
CacheMisses prometheus.Counter
|
||||
CacheEvictions prometheus.Counter
|
||||
CacheSize prometheus.Gauge
|
||||
|
||||
// Storage size metrics
|
||||
LocalStorageSize prometheus.Gauge
|
||||
LocalStorageSize prometheus.Gauge
|
||||
DistributedStorageSize prometheus.Gauge
|
||||
CompressedStorageSize prometheus.Gauge
|
||||
IndexStorageSize prometheus.Gauge
|
||||
|
||||
// Replication metrics
|
||||
ReplicationFactor prometheus.Gauge
|
||||
HealthyReplicas prometheus.Gauge
|
||||
UnderReplicated prometheus.Gauge
|
||||
ReplicationLag prometheus.Histogram
|
||||
ReplicationFactor prometheus.Gauge
|
||||
HealthyReplicas prometheus.Gauge
|
||||
UnderReplicated prometheus.Gauge
|
||||
ReplicationLag prometheus.Histogram
|
||||
|
||||
// Encryption metrics
|
||||
EncryptedContexts prometheus.Gauge
|
||||
KeyRotations prometheus.Counter
|
||||
AccessDenials prometheus.Counter
|
||||
ActiveKeys prometheus.Gauge
|
||||
EncryptedContexts prometheus.Gauge
|
||||
KeyRotations prometheus.Counter
|
||||
AccessDenials prometheus.Counter
|
||||
ActiveKeys prometheus.Gauge
|
||||
|
||||
// Performance metrics
|
||||
Throughput prometheus.Gauge
|
||||
Throughput prometheus.Gauge
|
||||
ConcurrentOperations prometheus.Gauge
|
||||
QueueDepth prometheus.Gauge
|
||||
QueueDepth prometheus.Gauge
|
||||
|
||||
// Health metrics
|
||||
StorageHealth prometheus.Gauge
|
||||
NodeConnectivity prometheus.Gauge
|
||||
SyncLatency prometheus.Histogram
|
||||
StorageHealth prometheus.Gauge
|
||||
NodeConnectivity prometheus.Gauge
|
||||
SyncLatency prometheus.Histogram
|
||||
}
|
||||
|
||||
// AlertManager handles storage-related alerts and notifications
|
||||
@@ -97,18 +97,96 @@ type AlertManager struct {
|
||||
maxHistory int
|
||||
}
|
||||
|
||||
func (am *AlertManager) severityRank(severity AlertSeverity) int {
|
||||
switch severity {
|
||||
case SeverityCritical:
|
||||
return 4
|
||||
case SeverityError:
|
||||
return 3
|
||||
case SeverityWarning:
|
||||
return 2
|
||||
case SeverityInfo:
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// GetActiveAlerts returns sorted active alerts (SEC-SLURP-1.1 monitoring path)
|
||||
func (am *AlertManager) GetActiveAlerts() []*Alert {
|
||||
am.mu.RLock()
|
||||
defer am.mu.RUnlock()
|
||||
|
||||
if len(am.activealerts) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
alerts := make([]*Alert, 0, len(am.activealerts))
|
||||
for _, alert := range am.activealerts {
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
sort.Slice(alerts, func(i, j int) bool {
|
||||
iRank := am.severityRank(alerts[i].Severity)
|
||||
jRank := am.severityRank(alerts[j].Severity)
|
||||
if iRank == jRank {
|
||||
return alerts[i].StartTime.After(alerts[j].StartTime)
|
||||
}
|
||||
return iRank > jRank
|
||||
})
|
||||
|
||||
return alerts
|
||||
}
|
||||
|
||||
// Snapshot marshals monitoring state for UCXL persistence (SEC-SLURP-1.1a telemetry)
|
||||
func (ms *MonitoringSystem) Snapshot(ctx context.Context) (string, error) {
|
||||
ms.mu.RLock()
|
||||
defer ms.mu.RUnlock()
|
||||
|
||||
if ms.alerts == nil {
|
||||
return "", fmt.Errorf("alert manager not initialised")
|
||||
}
|
||||
|
||||
active := ms.alerts.GetActiveAlerts()
|
||||
alertPayload := make([]map[string]interface{}, 0, len(active))
|
||||
for _, alert := range active {
|
||||
alertPayload = append(alertPayload, map[string]interface{}{
|
||||
"id": alert.ID,
|
||||
"name": alert.Name,
|
||||
"severity": alert.Severity,
|
||||
"message": fmt.Sprintf("%s (threshold %.2f)", alert.Description, alert.Threshold),
|
||||
"labels": alert.Labels,
|
||||
"started_at": alert.StartTime,
|
||||
})
|
||||
}
|
||||
|
||||
snapshot := map[string]interface{}{
|
||||
"node_id": ms.nodeID,
|
||||
"generated_at": time.Now().UTC(),
|
||||
"alert_count": len(active),
|
||||
"alerts": alertPayload,
|
||||
}
|
||||
|
||||
encoded, err := json.MarshalIndent(snapshot, "", " ")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to marshal monitoring snapshot: %w", err)
|
||||
}
|
||||
|
||||
return string(encoded), nil
|
||||
}
|
||||
|
||||
// AlertRule defines conditions for triggering alerts
|
||||
type AlertRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Metric string `json:"metric"`
|
||||
Condition string `json:"condition"` // >, <, ==, !=, etc.
|
||||
Threshold float64 `json:"threshold"`
|
||||
Duration time.Duration `json:"duration"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Enabled bool `json:"enabled"`
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Metric string `json:"metric"`
|
||||
Condition string `json:"condition"` // >, <, ==, !=, etc.
|
||||
Threshold float64 `json:"threshold"`
|
||||
Duration time.Duration `json:"duration"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Enabled bool `json:"enabled"`
|
||||
}
|
||||
|
||||
// Alert represents an active or resolved alert
|
||||
@@ -163,30 +241,30 @@ type HealthChecker struct {
|
||||
|
||||
// HealthCheck defines a single health check
|
||||
type HealthCheck struct {
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Checker func(ctx context.Context) HealthResult `json:"-"`
|
||||
Interval time.Duration `json:"interval"`
|
||||
Timeout time.Duration `json:"timeout"`
|
||||
Enabled bool `json:"enabled"`
|
||||
Interval time.Duration `json:"interval"`
|
||||
Timeout time.Duration `json:"timeout"`
|
||||
Enabled bool `json:"enabled"`
|
||||
}
|
||||
|
||||
// HealthResult represents the result of a health check
|
||||
type HealthResult struct {
|
||||
Healthy bool `json:"healthy"`
|
||||
Message string `json:"message"`
|
||||
Latency time.Duration `json:"latency"`
|
||||
Healthy bool `json:"healthy"`
|
||||
Message string `json:"message"`
|
||||
Latency time.Duration `json:"latency"`
|
||||
Metadata map[string]interface{} `json:"metadata"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
}
|
||||
|
||||
// SystemHealth represents the overall health of the storage system
|
||||
type SystemHealth struct {
|
||||
OverallStatus HealthStatus `json:"overall_status"`
|
||||
Components map[string]HealthResult `json:"components"`
|
||||
LastUpdate time.Time `json:"last_update"`
|
||||
Uptime time.Duration `json:"uptime"`
|
||||
StartTime time.Time `json:"start_time"`
|
||||
OverallStatus HealthStatus `json:"overall_status"`
|
||||
Components map[string]HealthResult `json:"components"`
|
||||
LastUpdate time.Time `json:"last_update"`
|
||||
Uptime time.Duration `json:"uptime"`
|
||||
StartTime time.Time `json:"start_time"`
|
||||
}
|
||||
|
||||
// HealthStatus represents system health status
|
||||
@@ -200,82 +278,82 @@ const (
|
||||
|
||||
// PerformanceProfiler analyzes storage performance patterns
|
||||
type PerformanceProfiler struct {
|
||||
mu sync.RWMutex
|
||||
mu sync.RWMutex
|
||||
operationProfiles map[string]*OperationProfile
|
||||
resourceUsage *ResourceUsage
|
||||
bottlenecks []*Bottleneck
|
||||
recommendations []*PerformanceRecommendation
|
||||
resourceUsage *ResourceUsage
|
||||
bottlenecks []*Bottleneck
|
||||
recommendations []*PerformanceRecommendation
|
||||
}
|
||||
|
||||
// OperationProfile contains performance analysis for a specific operation type
|
||||
type OperationProfile struct {
|
||||
Operation string `json:"operation"`
|
||||
TotalOperations int64 `json:"total_operations"`
|
||||
AverageLatency time.Duration `json:"average_latency"`
|
||||
P50Latency time.Duration `json:"p50_latency"`
|
||||
P95Latency time.Duration `json:"p95_latency"`
|
||||
P99Latency time.Duration `json:"p99_latency"`
|
||||
Throughput float64 `json:"throughput"`
|
||||
ErrorRate float64 `json:"error_rate"`
|
||||
LatencyHistory []time.Duration `json:"-"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
Operation string `json:"operation"`
|
||||
TotalOperations int64 `json:"total_operations"`
|
||||
AverageLatency time.Duration `json:"average_latency"`
|
||||
P50Latency time.Duration `json:"p50_latency"`
|
||||
P95Latency time.Duration `json:"p95_latency"`
|
||||
P99Latency time.Duration `json:"p99_latency"`
|
||||
Throughput float64 `json:"throughput"`
|
||||
ErrorRate float64 `json:"error_rate"`
|
||||
LatencyHistory []time.Duration `json:"-"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
}
|
||||
|
||||
// ResourceUsage tracks resource consumption
|
||||
type ResourceUsage struct {
|
||||
CPUUsage float64 `json:"cpu_usage"`
|
||||
MemoryUsage int64 `json:"memory_usage"`
|
||||
DiskUsage int64 `json:"disk_usage"`
|
||||
NetworkIn int64 `json:"network_in"`
|
||||
NetworkOut int64 `json:"network_out"`
|
||||
OpenFiles int `json:"open_files"`
|
||||
Goroutines int `json:"goroutines"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
CPUUsage float64 `json:"cpu_usage"`
|
||||
MemoryUsage int64 `json:"memory_usage"`
|
||||
DiskUsage int64 `json:"disk_usage"`
|
||||
NetworkIn int64 `json:"network_in"`
|
||||
NetworkOut int64 `json:"network_out"`
|
||||
OpenFiles int `json:"open_files"`
|
||||
Goroutines int `json:"goroutines"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
}
|
||||
|
||||
// Bottleneck represents a performance bottleneck
|
||||
type Bottleneck struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"` // cpu, memory, disk, network, etc.
|
||||
Component string `json:"component"`
|
||||
Description string `json:"description"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Impact float64 `json:"impact"`
|
||||
DetectedAt time.Time `json:"detected_at"`
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"` // cpu, memory, disk, network, etc.
|
||||
Component string `json:"component"`
|
||||
Description string `json:"description"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Impact float64 `json:"impact"`
|
||||
DetectedAt time.Time `json:"detected_at"`
|
||||
Metadata map[string]interface{} `json:"metadata"`
|
||||
}
|
||||
|
||||
// PerformanceRecommendation suggests optimizations
|
||||
type PerformanceRecommendation struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Priority int `json:"priority"`
|
||||
Impact string `json:"impact"`
|
||||
Effort string `json:"effort"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Priority int `json:"priority"`
|
||||
Impact string `json:"impact"`
|
||||
Effort string `json:"effort"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Metadata map[string]interface{} `json:"metadata"`
|
||||
}
|
||||
|
||||
// MonitoringEvent represents a monitoring system event
|
||||
type MonitoringEvent struct {
|
||||
Type string `json:"type"`
|
||||
Level string `json:"level"`
|
||||
Message string `json:"message"`
|
||||
Component string `json:"component"`
|
||||
NodeID string `json:"node_id"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Metadata map[string]interface{} `json:"metadata"`
|
||||
Type string `json:"type"`
|
||||
Level string `json:"level"`
|
||||
Message string `json:"message"`
|
||||
Component string `json:"component"`
|
||||
NodeID string `json:"node_id"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Metadata map[string]interface{} `json:"metadata"`
|
||||
}
|
||||
|
||||
// StructuredLogger provides structured logging for storage operations
|
||||
type StructuredLogger struct {
|
||||
mu sync.RWMutex
|
||||
level LogLevel
|
||||
output LogOutput
|
||||
mu sync.RWMutex
|
||||
level LogLevel
|
||||
output LogOutput
|
||||
formatter LogFormatter
|
||||
buffer []*LogEntry
|
||||
buffer []*LogEntry
|
||||
maxBuffer int
|
||||
}
|
||||
|
||||
@@ -303,27 +381,27 @@ type LogFormatter interface {
|
||||
|
||||
// LogEntry represents a single log entry
|
||||
type LogEntry struct {
|
||||
Level LogLevel `json:"level"`
|
||||
Message string `json:"message"`
|
||||
Component string `json:"component"`
|
||||
Operation string `json:"operation"`
|
||||
NodeID string `json:"node_id"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Level LogLevel `json:"level"`
|
||||
Message string `json:"message"`
|
||||
Component string `json:"component"`
|
||||
Operation string `json:"operation"`
|
||||
NodeID string `json:"node_id"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Fields map[string]interface{} `json:"fields"`
|
||||
Error error `json:"error,omitempty"`
|
||||
Error error `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// NewMonitoringSystem creates a new monitoring system
|
||||
func NewMonitoringSystem(nodeID string) *MonitoringSystem {
|
||||
ms := &MonitoringSystem{
|
||||
nodeID: nodeID,
|
||||
metrics: initializeMetrics(nodeID),
|
||||
alerts: newAlertManager(),
|
||||
healthChecker: newHealthChecker(),
|
||||
nodeID: nodeID,
|
||||
metrics: initializeMetrics(nodeID),
|
||||
alerts: newAlertManager(),
|
||||
healthChecker: newHealthChecker(),
|
||||
performanceProfiler: newPerformanceProfiler(),
|
||||
logger: newStructuredLogger(),
|
||||
notifications: make(chan *MonitoringEvent, 1000),
|
||||
stopCh: make(chan struct{}),
|
||||
logger: newStructuredLogger(),
|
||||
notifications: make(chan *MonitoringEvent, 1000),
|
||||
stopCh: make(chan struct{}),
|
||||
}
|
||||
|
||||
// Start monitoring goroutines
|
||||
@@ -571,7 +649,7 @@ func (ms *MonitoringSystem) executeHealthCheck(check HealthCheck) {
|
||||
defer cancel()
|
||||
|
||||
result := check.Checker(ctx)
|
||||
|
||||
|
||||
ms.healthChecker.mu.Lock()
|
||||
ms.healthChecker.status.Components[check.Name] = result
|
||||
ms.healthChecker.mu.Unlock()
|
||||
@@ -592,21 +670,21 @@ func (ms *MonitoringSystem) analyzePerformance() {
|
||||
|
||||
func newAlertManager() *AlertManager {
|
||||
return &AlertManager{
|
||||
rules: make([]*AlertRule, 0),
|
||||
rules: make([]*AlertRule, 0),
|
||||
activealerts: make(map[string]*Alert),
|
||||
notifiers: make([]AlertNotifier, 0),
|
||||
history: make([]*Alert, 0),
|
||||
maxHistory: 1000,
|
||||
history: make([]*Alert, 0),
|
||||
maxHistory: 1000,
|
||||
}
|
||||
}
|
||||
|
||||
func newHealthChecker() *HealthChecker {
|
||||
return &HealthChecker{
|
||||
checks: make(map[string]HealthCheck),
|
||||
status: &SystemHealth{
|
||||
checks: make(map[string]HealthCheck),
|
||||
status: &SystemHealth{
|
||||
OverallStatus: HealthHealthy,
|
||||
Components: make(map[string]HealthResult),
|
||||
StartTime: time.Now(),
|
||||
Components: make(map[string]HealthResult),
|
||||
StartTime: time.Now(),
|
||||
},
|
||||
checkInterval: 1 * time.Minute,
|
||||
timeout: 30 * time.Second,
|
||||
@@ -664,8 +742,8 @@ func (ms *MonitoringSystem) GetMonitoringStats() (*MonitoringStats, error) {
|
||||
defer ms.mu.RUnlock()
|
||||
|
||||
stats := &MonitoringStats{
|
||||
NodeID: ms.nodeID,
|
||||
Timestamp: time.Now(),
|
||||
NodeID: ms.nodeID,
|
||||
Timestamp: time.Now(),
|
||||
HealthStatus: ms.healthChecker.status.OverallStatus,
|
||||
ActiveAlerts: len(ms.alerts.activealerts),
|
||||
Bottlenecks: len(ms.performanceProfiler.bottlenecks),
|
||||
|
||||
Reference in New Issue
Block a user