package storage

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// MonitoringSystem provides comprehensive monitoring for the storage system
type MonitoringSystem struct {
	mu                  sync.RWMutex
	nodeID              string
	metrics             *StorageMetrics
	alerts              *AlertManager
	healthChecker       *HealthChecker
	performanceProfiler *PerformanceProfiler
	logger              *StructuredLogger
	notifications       chan *MonitoringEvent
	stopCh              chan struct{}
}

// StorageMetrics contains all Prometheus metrics for storage operations
type StorageMetrics struct {
	// Operation counters
	StoreOperations    prometheus.Counter
	RetrieveOperations prometheus.Counter
	DeleteOperations   prometheus.Counter
	UpdateOperations   prometheus.Counter
	SearchOperations   prometheus.Counter
	BatchOperations    prometheus.Counter

	// Error counters
	StoreErrors       prometheus.Counter
	RetrieveErrors    prometheus.Counter
	EncryptionErrors  prometheus.Counter
	DecryptionErrors  prometheus.Counter
	ReplicationErrors prometheus.Counter
	CacheErrors       prometheus.Counter
	IndexErrors       prometheus.Counter

	// Latency histograms
	StoreLatency       prometheus.Histogram
	RetrieveLatency    prometheus.Histogram
	EncryptionLatency  prometheus.Histogram
	DecryptionLatency  prometheus.Histogram
	ReplicationLatency prometheus.Histogram
	SearchLatency      prometheus.Histogram

	// Cache metrics
	CacheHits      prometheus.Counter
	CacheMisses    prometheus.Counter
	CacheEvictions prometheus.Counter
	CacheSize      prometheus.Gauge

	// Storage size metrics
	LocalStorageSize       prometheus.Gauge
	DistributedStorageSize prometheus.Gauge
	CompressedStorageSize  prometheus.Gauge
	IndexStorageSize       prometheus.Gauge

	// Replication metrics
	ReplicationFactor prometheus.Gauge
	HealthyReplicas   prometheus.Gauge
	UnderReplicated   prometheus.Gauge
	ReplicationLag    prometheus.Histogram

	// Encryption metrics
	EncryptedContexts prometheus.Gauge
	KeyRotations      prometheus.Counter
	AccessDenials     prometheus.Counter
	ActiveKeys        prometheus.Gauge

	// Performance metrics
	Throughput           prometheus.Gauge
	ConcurrentOperations prometheus.Gauge
	QueueDepth           prometheus.Gauge

	// Health metrics
	StorageHealth    prometheus.Gauge
	NodeConnectivity prometheus.Gauge
	SyncLatency      prometheus.Histogram
}

// AlertManager handles storage-related alerts and notifications
type AlertManager struct {
	mu           sync.RWMutex
	rules        []*AlertRule
	activeAlerts map[string]*Alert
	notifiers    []AlertNotifier
	history      []*Alert
	maxHistory   int
}

func (am *AlertManager) severityRank(severity AlertSeverity) int {
	switch severity {
	case SeverityCritical:
		return 4
	case SeverityError:
		return 3
	case SeverityWarning:
		return 2
	case SeverityInfo:
		return 1
	default:
		return 0
	}
}

// GetActiveAlerts returns the active alerts sorted by severity (highest
// first), then by most recent start time (SEC-SLURP-1.1 monitoring path)
func (am *AlertManager) GetActiveAlerts() []*Alert {
	am.mu.RLock()
	defer am.mu.RUnlock()

	if len(am.activeAlerts) == 0 {
		return nil
	}

	alerts := make([]*Alert, 0, len(am.activeAlerts))
	for _, alert := range am.activeAlerts {
		alerts = append(alerts, alert)
	}

	sort.Slice(alerts, func(i, j int) bool {
		iRank := am.severityRank(alerts[i].Severity)
		jRank := am.severityRank(alerts[j].Severity)
		if iRank == jRank {
			return alerts[i].StartTime.After(alerts[j].StartTime)
		}
		return iRank > jRank
	})

	return alerts
}

// Snapshot marshals monitoring state for UCXL persistence (SEC-SLURP-1.1a telemetry)
func (ms *MonitoringSystem) Snapshot(ctx context.Context) (string, error) {
	ms.mu.RLock()
	defer ms.mu.RUnlock()

	if ms.alerts == nil {
		return "", fmt.Errorf("alert manager not initialized")
	}

	active := ms.alerts.GetActiveAlerts()
	alertPayload := make([]map[string]interface{}, 0, len(active))
	for _, alert := range active {
		alertPayload = append(alertPayload, map[string]interface{}{
			"id":         alert.ID,
			"name":       alert.Name,
			"severity":   alert.Severity,
			"message":    fmt.Sprintf("%s (threshold %.2f)", alert.Description, alert.Threshold),
			"labels":     alert.Labels,
			"started_at": alert.StartTime,
		})
	}

	snapshot := map[string]interface{}{
		"node_id":      ms.nodeID,
		"generated_at": time.Now().UTC(),
		"alert_count":  len(active),
		"alerts":       alertPayload,
	}

	encoded, err := json.MarshalIndent(snapshot, "", "  ")
	if err != nil {
		return "", fmt.Errorf("failed to marshal monitoring snapshot: %w", err)
	}

	return string(encoded), nil
}

// AlertRule defines conditions for triggering alerts
type AlertRule struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Description string            `json:"description"`
	Metric      string            `json:"metric"`
	Condition   string            `json:"condition"` // >, <, ==, !=, etc.
	Threshold   float64           `json:"threshold"`
	Duration    time.Duration     `json:"duration"`
	Severity    AlertSeverity     `json:"severity"`
	Labels      map[string]string `json:"labels"`
	Enabled     bool              `json:"enabled"`
}

// Alert represents an active or resolved alert
type Alert struct {
	ID          string            `json:"id"`
	RuleID      string            `json:"rule_id"`
	Name        string            `json:"name"`
	Description string            `json:"description"`
	Severity    AlertSeverity     `json:"severity"`
	Status      AlertStatus       `json:"status"`
	Value       float64           `json:"value"`
	Threshold   float64           `json:"threshold"`
	Labels      map[string]string `json:"labels"`
	StartTime   time.Time         `json:"start_time"`
	EndTime     *time.Time        `json:"end_time,omitempty"`
	LastUpdate  time.Time         `json:"last_update"`
}

// AlertSeverity defines alert severity levels
type AlertSeverity string

const (
	SeverityInfo     AlertSeverity = "info"
	SeverityWarning  AlertSeverity = "warning"
	SeverityError    AlertSeverity = "error"
	SeverityCritical AlertSeverity = "critical"
)

// AlertStatus defines alert status
type AlertStatus string

const (
	StatusPending  AlertStatus = "pending"
	StatusFiring   AlertStatus = "firing"
	StatusResolved AlertStatus = "resolved"
)

// AlertNotifier is the interface for sending alert notifications
type AlertNotifier interface {
	Notify(alert *Alert) error
	GetType() string
}
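
// logNotifier is an illustrative AlertNotifier sketch, not part of the
// shipped notifier set: it records each alert on the structured logger.
// Real notifiers (webhook, email, chat) would implement the same pair of
// methods and be appended to AlertManager.notifiers.
type logNotifier struct {
	logger *StructuredLogger
}

func (n *logNotifier) Notify(alert *Alert) error {
	if n.logger == nil {
		return fmt.Errorf("logNotifier: no logger configured")
	}
	n.logger.LogEvent(&MonitoringEvent{
		Type:      "alert",
		Level:     string(alert.Severity),
		Message:   fmt.Sprintf("%s: %s (value %.2f, threshold %.2f)", alert.Name, alert.Description, alert.Value, alert.Threshold),
		Component: "alert_manager",
		Timestamp: time.Now(),
	})
	return nil
}

func (n *logNotifier) GetType() string { return "log" }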

// HealthChecker monitors the overall health of the storage system
type HealthChecker struct {
	mu            sync.RWMutex
	checks        map[string]HealthCheck
	status        *SystemHealth
	checkInterval time.Duration
	timeout       time.Duration
}

// HealthCheck defines a single health check
type HealthCheck struct {
	Name        string                                 `json:"name"`
	Description string                                 `json:"description"`
	Checker     func(ctx context.Context) HealthResult `json:"-"`
	Interval    time.Duration                          `json:"interval"`
	Timeout     time.Duration                          `json:"timeout"`
	Enabled     bool                                   `json:"enabled"`
}
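
// exampleHealthCheck is an illustrative sketch of a HealthCheck definition
// (the name and probe are hypothetical). Real checks would probe the local
// store, DHT connectivity, index integrity, and similar components.
func exampleHealthCheck() HealthCheck {
	return HealthCheck{
		Name:        "self",
		Description: "trivial liveness probe",
		Interval:    1 * time.Minute,
		Timeout:     5 * time.Second,
		Enabled:     true,
		Checker: func(ctx context.Context) HealthResult {
			start := time.Now()
			return HealthResult{
				Healthy:   true,
				Message:   "ok",
				Latency:   time.Since(start),
				Timestamp: time.Now(),
			}
		},
	}
}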

// HealthResult represents the result of a health check
type HealthResult struct {
	Healthy   bool                   `json:"healthy"`
	Message   string                 `json:"message"`
	Latency   time.Duration          `json:"latency"`
	Metadata  map[string]interface{} `json:"metadata"`
	Timestamp time.Time              `json:"timestamp"`
}

// SystemHealth represents the overall health of the storage system
type SystemHealth struct {
	OverallStatus HealthStatus            `json:"overall_status"`
	Components    map[string]HealthResult `json:"components"`
	LastUpdate    time.Time               `json:"last_update"`
	Uptime        time.Duration           `json:"uptime"`
	StartTime     time.Time               `json:"start_time"`
}

// HealthStatus represents system health status
type HealthStatus string

const (
	HealthHealthy   HealthStatus = "healthy"
	HealthDegraded  HealthStatus = "degraded"
	HealthUnhealthy HealthStatus = "unhealthy"
)

// PerformanceProfiler analyzes storage performance patterns
type PerformanceProfiler struct {
	mu                sync.RWMutex
	operationProfiles map[string]*OperationProfile
	resourceUsage     *ResourceUsage
	bottlenecks       []*Bottleneck
	recommendations   []*PerformanceRecommendation
}

// OperationProfile contains performance analysis for a specific operation type
type OperationProfile struct {
	Operation       string          `json:"operation"`
	TotalOperations int64           `json:"total_operations"`
	AverageLatency  time.Duration   `json:"average_latency"`
	P50Latency      time.Duration   `json:"p50_latency"`
	P95Latency      time.Duration   `json:"p95_latency"`
	P99Latency      time.Duration   `json:"p99_latency"`
	Throughput      float64         `json:"throughput"`
	ErrorRate       float64         `json:"error_rate"`
	LatencyHistory  []time.Duration `json:"-"`
	LastUpdated     time.Time       `json:"last_updated"`
}

// ResourceUsage tracks resource consumption
type ResourceUsage struct {
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage int64     `json:"memory_usage"`
	DiskUsage   int64     `json:"disk_usage"`
	NetworkIn   int64     `json:"network_in"`
	NetworkOut  int64     `json:"network_out"`
	OpenFiles   int       `json:"open_files"`
	Goroutines  int       `json:"goroutines"`
	LastUpdated time.Time `json:"last_updated"`
}

// Bottleneck represents a performance bottleneck
type Bottleneck struct {
	ID          string                 `json:"id"`
	Type        string                 `json:"type"` // cpu, memory, disk, network, etc.
	Component   string                 `json:"component"`
	Description string                 `json:"description"`
	Severity    AlertSeverity          `json:"severity"`
	Impact      float64                `json:"impact"`
	DetectedAt  time.Time              `json:"detected_at"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// PerformanceRecommendation suggests optimizations
type PerformanceRecommendation struct {
	ID          string                 `json:"id"`
	Type        string                 `json:"type"`
	Title       string                 `json:"title"`
	Description string                 `json:"description"`
	Priority    int                    `json:"priority"`
	Impact      string                 `json:"impact"`
	Effort      string                 `json:"effort"`
	GeneratedAt time.Time              `json:"generated_at"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// MonitoringEvent represents a monitoring system event
type MonitoringEvent struct {
	Type      string                 `json:"type"`
	Level     string                 `json:"level"`
	Message   string                 `json:"message"`
	Component string                 `json:"component"`
	NodeID    string                 `json:"node_id"`
	Timestamp time.Time              `json:"timestamp"`
	Metadata  map[string]interface{} `json:"metadata"`
}

// StructuredLogger provides structured logging for storage operations
type StructuredLogger struct {
	mu        sync.RWMutex
	level     LogLevel
	output    LogOutput
	formatter LogFormatter
	buffer    []*LogEntry
	maxBuffer int
}

// LogLevel defines logging levels
type LogLevel int

const (
	LogDebug LogLevel = iota
	LogInfo
	LogWarning
	LogError
	LogCritical
)

// LogOutput is the interface for log output destinations
type LogOutput interface {
	Write(entry *LogEntry) error
	Flush() error
}

// LogFormatter is the interface for log entry formats
type LogFormatter interface {
	Format(entry *LogEntry) ([]byte, error)
}

// LogEntry represents a single log entry
type LogEntry struct {
	Level     LogLevel               `json:"level"`
	Message   string                 `json:"message"`
	Component string                 `json:"component"`
	Operation string                 `json:"operation"`
	NodeID    string                 `json:"node_id"`
	Timestamp time.Time              `json:"timestamp"`
	Fields    map[string]interface{} `json:"fields"`
	// NOTE: most error implementations have no exported fields, so this
	// marshals as "{}"; convert to a string before persisting entries.
	Error error `json:"error,omitempty"`
}
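
// jsonFormatter is a minimal LogFormatter sketch (an assumption about how
// formatters plug in, not the shipped implementation). The error field is
// flattened to a string for the reason noted on LogEntry.Error.
type jsonFormatter struct{}

func (jsonFormatter) Format(entry *LogEntry) ([]byte, error) {
	out := map[string]interface{}{
		"level":     entry.Level,
		"message":   entry.Message,
		"component": entry.Component,
		"operation": entry.Operation,
		"node_id":   entry.NodeID,
		"timestamp": entry.Timestamp,
		"fields":    entry.Fields,
	}
	if entry.Error != nil {
		out["error"] = entry.Error.Error()
	}
	return json.Marshal(out)
}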

// NewMonitoringSystem creates a new monitoring system and starts its
// background monitoring goroutines; call Close to stop them.
func NewMonitoringSystem(nodeID string) *MonitoringSystem {
	ms := &MonitoringSystem{
		nodeID:              nodeID,
		metrics:             initializeMetrics(nodeID),
		alerts:              newAlertManager(),
		healthChecker:       newHealthChecker(),
		performanceProfiler: newPerformanceProfiler(),
		logger:              newStructuredLogger(),
		notifications:       make(chan *MonitoringEvent, 1000),
		stopCh:              make(chan struct{}),
	}

	// Start monitoring goroutines
	go ms.monitoringLoop()
	go ms.healthCheckLoop()
	go ms.alertEvaluationLoop()
	go ms.performanceAnalysisLoop()

	return ms
}
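
// exampleMonitoringUsage is an illustrative sketch (not called anywhere) of
// the intended lifecycle: one MonitoringSystem per node, closed on shutdown,
// with the caller timing operations and reporting the outcome. The store
// callback stands in for a real storage call.
func exampleMonitoringUsage(store func() error) {
	ms := NewMonitoringSystem("node-1") // placeholder node ID
	defer ms.Close()

	start := time.Now()
	err := store()
	ms.RecordStoreOperation(time.Since(start), err == nil)
}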

// initializeMetrics creates and registers the Prometheus metrics consumed by
// the recording methods below. Fields with no recording path yet are left
// nil and must be registered before use.
func initializeMetrics(nodeID string) *StorageMetrics {
	labels := prometheus.Labels{"node_id": nodeID}

	return &StorageMetrics{
		// Operation counters
		StoreOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_store_operations_total",
			Help:        "Total number of store operations",
			ConstLabels: labels,
		}),
		RetrieveOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_retrieve_operations_total",
			Help:        "Total number of retrieve operations",
			ConstLabels: labels,
		}),
		DeleteOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_delete_operations_total",
			Help:        "Total number of delete operations",
			ConstLabels: labels,
		}),
		UpdateOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_update_operations_total",
			Help:        "Total number of update operations",
			ConstLabels: labels,
		}),
		SearchOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_search_operations_total",
			Help:        "Total number of search operations",
			ConstLabels: labels,
		}),
		BatchOperations: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_batch_operations_total",
			Help:        "Total number of batch operations",
			ConstLabels: labels,
		}),

		// Error counters
		StoreErrors: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_store_errors_total",
			Help:        "Total number of store errors",
			ConstLabels: labels,
		}),
		RetrieveErrors: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_retrieve_errors_total",
			Help:        "Total number of retrieve errors",
			ConstLabels: labels,
		}),
		EncryptionErrors: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_encryption_errors_total",
			Help:        "Total number of encryption errors",
			ConstLabels: labels,
		}),

		// Latency histograms
		StoreLatency: promauto.NewHistogram(prometheus.HistogramOpts{
			Name:        "slurp_storage_store_latency_seconds",
			Help:        "Store operation latency in seconds",
			ConstLabels: labels,
			Buckets:     prometheus.DefBuckets,
		}),
		RetrieveLatency: promauto.NewHistogram(prometheus.HistogramOpts{
			Name:        "slurp_storage_retrieve_latency_seconds",
			Help:        "Retrieve operation latency in seconds",
			ConstLabels: labels,
			Buckets:     prometheus.DefBuckets,
		}),
		// Registered because RecordEncryptionOperation observes it.
		EncryptionLatency: promauto.NewHistogram(prometheus.HistogramOpts{
			Name:        "slurp_storage_encryption_latency_seconds",
			Help:        "Encryption operation latency in seconds",
			ConstLabels: labels,
			Buckets:     prometheus.DefBuckets,
		}),

		// Cache metrics
		CacheHits: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_cache_hits_total",
			Help:        "Total number of cache hits",
			ConstLabels: labels,
		}),
		CacheMisses: promauto.NewCounter(prometheus.CounterOpts{
			Name:        "slurp_storage_cache_misses_total",
			Help:        "Total number of cache misses",
			ConstLabels: labels,
		}),

		// Storage size gauges (all four are set by UpdateStorageSize)
		LocalStorageSize: promauto.NewGauge(prometheus.GaugeOpts{
			Name:        "slurp_storage_local_size_bytes",
			Help:        "Local storage size in bytes",
			ConstLabels: labels,
		}),
		DistributedStorageSize: promauto.NewGauge(prometheus.GaugeOpts{
			Name:        "slurp_storage_distributed_size_bytes",
			Help:        "Distributed storage size in bytes",
			ConstLabels: labels,
		}),
		CompressedStorageSize: promauto.NewGauge(prometheus.GaugeOpts{
			Name:        "slurp_storage_compressed_size_bytes",
			Help:        "Compressed storage size in bytes",
			ConstLabels: labels,
		}),
		IndexStorageSize: promauto.NewGauge(prometheus.GaugeOpts{
			Name:        "slurp_storage_index_size_bytes",
			Help:        "Index storage size in bytes",
			ConstLabels: labels,
		}),

		// Health metrics
		StorageHealth: promauto.NewGauge(prometheus.GaugeOpts{
			Name:        "slurp_storage_health_status",
			Help:        "Storage health status (1=healthy, 0=unhealthy)",
			ConstLabels: labels,
		}),
	}
}

// Recording methods for metrics

func (ms *MonitoringSystem) RecordStoreOperation(duration time.Duration, success bool) {
	ms.metrics.StoreOperations.Inc()
	ms.metrics.StoreLatency.Observe(duration.Seconds())
	if !success {
		ms.metrics.StoreErrors.Inc()
	}
}

func (ms *MonitoringSystem) RecordRetrieveOperation(duration time.Duration, success bool, cacheHit bool) {
	ms.metrics.RetrieveOperations.Inc()
	ms.metrics.RetrieveLatency.Observe(duration.Seconds())
	if !success {
		ms.metrics.RetrieveErrors.Inc()
	}
	if cacheHit {
		ms.metrics.CacheHits.Inc()
	} else {
		ms.metrics.CacheMisses.Inc()
	}
}

func (ms *MonitoringSystem) RecordEncryptionOperation(duration time.Duration, success bool) {
	ms.metrics.EncryptionLatency.Observe(duration.Seconds())
	if !success {
		ms.metrics.EncryptionErrors.Inc()
	}
}

func (ms *MonitoringSystem) UpdateStorageSize(local, distributed, compressed, index int64) {
	ms.metrics.LocalStorageSize.Set(float64(local))
	ms.metrics.DistributedStorageSize.Set(float64(distributed))
	ms.metrics.CompressedStorageSize.Set(float64(compressed))
	ms.metrics.IndexStorageSize.Set(float64(index))
}

func (ms *MonitoringSystem) UpdateHealthStatus(healthy bool) {
	if healthy {
		ms.metrics.StorageHealth.Set(1)
	} else {
		ms.metrics.StorageHealth.Set(0)
	}
}
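
// instrumentedRetrieve is an illustrative wrapper (assumed pattern, not part
// of the original API) showing how callers are expected to feed latency,
// success, and cache-hit information into RecordRetrieveOperation. The fetch
// callback stands in for a real cache-then-store lookup and reports whether
// the cache served the request.
func instrumentedRetrieve(ms *MonitoringSystem, fetch func() (data []byte, cacheHit bool, err error)) ([]byte, error) {
	start := time.Now()
	data, cacheHit, err := fetch()
	ms.RecordRetrieveOperation(time.Since(start), err == nil, cacheHit)
	return data, err
}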

// Main monitoring loops

func (ms *MonitoringSystem) monitoringLoop() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			ms.collectSystemMetrics()
		case event := <-ms.notifications:
			ms.processMonitoringEvent(event)
		case <-ms.stopCh:
			return
		}
	}
}

func (ms *MonitoringSystem) healthCheckLoop() {
	ticker := time.NewTicker(1 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			ms.performHealthChecks()
		case <-ms.stopCh:
			return
		}
	}
}

func (ms *MonitoringSystem) alertEvaluationLoop() {
	ticker := time.NewTicker(15 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			ms.evaluateAlertRules()
		case <-ms.stopCh:
			return
		}
	}
}

func (ms *MonitoringSystem) performanceAnalysisLoop() {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			ms.analyzePerformance()
		case <-ms.stopCh:
			return
		}
	}
}

// Implementation of monitoring functions (simplified)

func (ms *MonitoringSystem) collectSystemMetrics() {
	// Collect system-level metrics.
	// This would integrate with system monitoring tools.
}

func (ms *MonitoringSystem) processMonitoringEvent(event *MonitoringEvent) {
	// Process monitoring events
	ms.logger.LogEvent(event)
}

func (ms *MonitoringSystem) performHealthChecks() {
	// Copy the enabled checks out under the read lock so the map cannot be
	// mutated while the checks run.
	ms.healthChecker.mu.RLock()
	checks := make([]HealthCheck, 0, len(ms.healthChecker.checks))
	for _, check := range ms.healthChecker.checks {
		if check.Enabled {
			checks = append(checks, check)
		}
	}
	ms.healthChecker.mu.RUnlock()

	for _, check := range checks {
		go ms.executeHealthCheck(check)
	}
}

func (ms *MonitoringSystem) executeHealthCheck(check HealthCheck) {
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()

	result := check.Checker(ctx)

	ms.healthChecker.mu.Lock()
	ms.healthChecker.status.Components[check.Name] = result
	ms.healthChecker.mu.Unlock()
}

func (ms *MonitoringSystem) evaluateAlertRules() {
	// Evaluate alert rules against current metrics.
	// This would query Prometheus metrics and trigger alerts.
}
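
// conditionMet sketches how evaluateAlertRules could compare a sampled
// metric value against a rule, assuming the comparison-operator strings
// documented on AlertRule.Condition. Unknown operators evaluate to false.
func conditionMet(value float64, rule *AlertRule) bool {
	switch rule.Condition {
	case ">":
		return value > rule.Threshold
	case ">=":
		return value >= rule.Threshold
	case "<":
		return value < rule.Threshold
	case "<=":
		return value <= rule.Threshold
	case "==":
		return value == rule.Threshold
	case "!=":
		return value != rule.Threshold
	default:
		return false
	}
}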

func (ms *MonitoringSystem) analyzePerformance() {
	// Analyze performance patterns and generate recommendations
	ms.performanceProfiler.analyzeBottlenecks()
	ms.performanceProfiler.generateRecommendations()
}

// Helper functions and implementations

func newAlertManager() *AlertManager {
	return &AlertManager{
		rules:        make([]*AlertRule, 0),
		activeAlerts: make(map[string]*Alert),
		notifiers:    make([]AlertNotifier, 0),
		history:      make([]*Alert, 0),
		maxHistory:   1000,
	}
}

func newHealthChecker() *HealthChecker {
	return &HealthChecker{
		checks: make(map[string]HealthCheck),
		status: &SystemHealth{
			OverallStatus: HealthHealthy,
			Components:    make(map[string]HealthResult),
			StartTime:     time.Now(),
		},
		checkInterval: 1 * time.Minute,
		timeout:       30 * time.Second,
	}
}

func newPerformanceProfiler() *PerformanceProfiler {
	return &PerformanceProfiler{
		operationProfiles: make(map[string]*OperationProfile),
		resourceUsage:     &ResourceUsage{},
		bottlenecks:       make([]*Bottleneck, 0),
		recommendations:   make([]*PerformanceRecommendation, 0),
	}
}

func newStructuredLogger() *StructuredLogger {
	return &StructuredLogger{
		level:     LogInfo,
		buffer:    make([]*LogEntry, 0),
		maxBuffer: 10000,
	}
}

// LogEvent buffers a monitoring event as an info-level log entry; writing
// buffered entries to the configured output is deferred.
func (sl *StructuredLogger) LogEvent(event *MonitoringEvent) {
	entry := &LogEntry{
		Level:     LogInfo,
		Message:   event.Message,
		Component: event.Component,
		NodeID:    event.NodeID,
		Timestamp: event.Timestamp,
		Fields:    event.Metadata,
	}

	sl.mu.Lock()
	sl.buffer = append(sl.buffer, entry)
	if len(sl.buffer) > sl.maxBuffer {
		sl.buffer = sl.buffer[1:] // Drop the oldest entry
	}
	sl.mu.Unlock()
}
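
// Flush is a sketch of how the buffered entries and the configured output
// are assumed to combine; LogEvent above only buffers. Formatting is left
// to the LogOutput implementation, which receives the raw entry. This
// method is illustrative and not called by the monitoring loops.
func (sl *StructuredLogger) Flush() error {
	sl.mu.Lock()
	entries := sl.buffer
	sl.buffer = make([]*LogEntry, 0, len(entries))
	sl.mu.Unlock()

	if sl.output == nil || len(entries) == 0 {
		return nil
	}
	for _, entry := range entries {
		if err := sl.output.Write(entry); err != nil {
			return err
		}
	}
	return sl.output.Flush()
}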

func (pp *PerformanceProfiler) analyzeBottlenecks() {
	// Analyze performance data to identify bottlenecks.
	// This would examine latency patterns, error rates, etc.
}

func (pp *PerformanceProfiler) generateRecommendations() {
	// Generate performance improvement recommendations.
	// This would analyze patterns and suggest optimizations.
}

// GetMonitoringStats returns comprehensive monitoring statistics
func (ms *MonitoringSystem) GetMonitoringStats() (*MonitoringStats, error) {
	ms.mu.RLock()
	defer ms.mu.RUnlock()

	// Take each component's own lock; ms.mu does not guard their state.
	ms.healthChecker.mu.RLock()
	healthStatus := ms.healthChecker.status.OverallStatus
	ms.healthChecker.mu.RUnlock()

	ms.alerts.mu.RLock()
	activeAlerts := len(ms.alerts.activeAlerts)
	ms.alerts.mu.RUnlock()

	ms.performanceProfiler.mu.RLock()
	bottlenecks := len(ms.performanceProfiler.bottlenecks)
	ms.performanceProfiler.mu.RUnlock()

	stats := &MonitoringStats{
		NodeID:       ms.nodeID,
		Timestamp:    time.Now(),
		HealthStatus: healthStatus,
		ActiveAlerts: activeAlerts,
		Bottlenecks:  bottlenecks,
	}

	return stats, nil
}

// MonitoringStats contains monitoring system statistics
type MonitoringStats struct {
	NodeID       string       `json:"node_id"`
	Timestamp    time.Time    `json:"timestamp"`
	HealthStatus HealthStatus `json:"health_status"`
	ActiveAlerts int          `json:"active_alerts"`
	Bottlenecks  int          `json:"bottlenecks"`
}

// Close shuts down the monitoring system's background loops. It must be
// called at most once; a second call would panic on the closed channel.
func (ms *MonitoringSystem) Close() error {
	close(ms.stopCh)
	return nil
}