package metrics

import (
	"context"
	"log"
	"net/http"
	"runtime"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// CHORUSMetrics provides comprehensive Prometheus metrics for the CHORUS system
type CHORUSMetrics struct {
	registry   *prometheus.Registry
	httpServer *http.Server

	// System metrics
	systemInfo *prometheus.GaugeVec
	uptime     prometheus.Gauge
	buildInfo  *prometheus.GaugeVec

	// P2P metrics
	p2pConnectedPeers     prometheus.Gauge
	p2pMessagesSent       *prometheus.CounterVec
	p2pMessagesReceived   *prometheus.CounterVec
	p2pMessageLatency     *prometheus.HistogramVec
	p2pConnectionDuration *prometheus.HistogramVec
	p2pPeerScore          *prometheus.GaugeVec

	// DHT metrics
	dhtPutOperations     *prometheus.CounterVec
	dhtGetOperations     *prometheus.CounterVec
	dhtOperationLatency  *prometheus.HistogramVec
	dhtProviderRecords   prometheus.Gauge
	dhtReplicationFactor *prometheus.GaugeVec
	dhtContentKeys       prometheus.Gauge
	dhtCacheHits         *prometheus.CounterVec
	dhtCacheMisses       *prometheus.CounterVec

	// PubSub metrics
	pubsubTopics         prometheus.Gauge
	pubsubSubscribers    *prometheus.GaugeVec
	pubsubMessages       *prometheus.CounterVec
	pubsubMessageLatency *prometheus.HistogramVec
	pubsubMessageSize    *prometheus.HistogramVec

	// Election metrics
	electionTerm       prometheus.Gauge
	electionState      *prometheus.GaugeVec
	heartbeatsSent     prometheus.Counter
	heartbeatsReceived prometheus.Counter
	leadershipChanges  prometheus.Counter
	leaderUptime       prometheus.Gauge
	electionLatency    prometheus.Histogram

	// Health metrics
	healthChecksPassed   *prometheus.CounterVec
	healthChecksFailed   *prometheus.CounterVec
	healthCheckDuration  *prometheus.HistogramVec
	systemHealthScore    prometheus.Gauge
	componentHealthScore *prometheus.GaugeVec

	// Task metrics
	tasksActive       prometheus.Gauge
	tasksQueued       prometheus.Gauge
	tasksCompleted    *prometheus.CounterVec
	taskDuration      *prometheus.HistogramVec
	taskQueueWaitTime prometheus.Histogram

	// SLURP metrics (context generation)
	slurpGenerated        *prometheus.CounterVec
	slurpGenerationTime   prometheus.Histogram
	slurpQueueLength      prometheus.Gauge
	slurpActiveJobs       prometheus.Gauge
	slurpLeadershipEvents prometheus.Counter

	// SHHH sentinel metrics
	shhhFindings *prometheus.CounterVec

	// UCXI metrics (protocol resolution)
	ucxiRequests          *prometheus.CounterVec
	ucxiResolutionLatency prometheus.Histogram
	ucxiCacheHits         prometheus.Counter
	ucxiCacheMisses       prometheus.Counter
	ucxiContentSize       prometheus.Histogram

	// Resource metrics
	cpuUsage        prometheus.Gauge
	memoryUsage     prometheus.Gauge
	diskUsage       *prometheus.GaugeVec
	networkBytesIn  prometheus.Counter
	networkBytesOut prometheus.Counter
	goroutines      prometheus.Gauge

	// Error metrics
	errors *prometheus.CounterVec
	panics prometheus.Counter

	startTime time.Time
	mu        sync.RWMutex
}

// MetricsConfig configures the metrics system
type MetricsConfig struct {
	// HTTP server config
	ListenAddr  string
	MetricsPath string

	// Histogram buckets
	LatencyBuckets []float64 // seconds
	SizeBuckets    []float64 // bytes

	// Labels
	NodeID      string
	Version     string
	Environment string
	Cluster     string

	// Collection intervals
	SystemMetricsInterval   time.Duration
	ResourceMetricsInterval time.Duration
}

// DefaultMetricsConfig returns the default metrics configuration: latency
// buckets spanning 1 ms to 10 s and size buckets spanning 64 B to 16 MiB.
func DefaultMetricsConfig() *MetricsConfig {
	return &MetricsConfig{
		ListenAddr:  ":9090",
		MetricsPath: "/metrics",
		LatencyBuckets: []float64{
			0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
		},
		SizeBuckets: []float64{
			64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216,
		},
		SystemMetricsInterval:   30 * time.Second,
		ResourceMetricsInterval: 15 * time.Second,
	}
}

// NewCHORUSMetrics creates a new metrics collector backed by a private
// Prometheus registry.
func NewCHORUSMetrics(config *MetricsConfig) *CHORUSMetrics {
	if config == nil {
		config = DefaultMetricsConfig()
	}

	registry := prometheus.NewRegistry()

	metrics := &CHORUSMetrics{
		registry:  registry,
		startTime: time.Now(),
	}

	// Initialize all metrics; they are registered with the private
	// registry as they are created (see initializeMetrics).
	metrics.initializeMetrics(config)

	metrics.registerMetrics()

	return metrics
}

// initializeMetrics creates all Prometheus metrics and registers them with
// the collector's private registry.
func (m *CHORUSMetrics) initializeMetrics(config *MetricsConfig) {
	// Bind promauto to the private registry. Plain promauto.New* calls
	// would register with the global default registry instead, leaving the
	// /metrics endpoint (which serves m.registry) empty.
	factory := promauto.With(m.registry)

	// System metrics
	m.systemInfo = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_system_info",
			Help: "System information",
		},
		[]string{"node_id", "version", "go_version", "cluster", "environment"},
	)

	m.uptime = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_uptime_seconds",
			Help: "System uptime in seconds",
		},
	)

	// P2P metrics
	m.p2pConnectedPeers = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_p2p_connected_peers",
			Help: "Number of connected P2P peers",
		},
	)

	m.p2pMessagesSent = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_p2p_messages_sent_total",
			Help: "Total number of P2P messages sent",
		},
		[]string{"message_type", "peer_id"},
	)

	m.p2pMessagesReceived = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_p2p_messages_received_total",
			Help: "Total number of P2P messages received",
		},
		[]string{"message_type", "peer_id"},
	)

	m.p2pMessageLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_p2p_message_latency_seconds",
			Help:    "P2P message round-trip latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"message_type"},
	)

	// DHT metrics
	m.dhtPutOperations = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_dht_put_operations_total",
			Help: "Total number of DHT put operations",
		},
		[]string{"status"},
	)

	m.dhtGetOperations = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_dht_get_operations_total",
			Help: "Total number of DHT get operations",
		},
		[]string{"status"},
	)

	m.dhtOperationLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_dht_operation_latency_seconds",
			Help:    "DHT operation latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"operation", "status"},
	)

	m.dhtProviderRecords = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_dht_provider_records",
			Help: "Number of DHT provider records",
		},
	)

	m.dhtContentKeys = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_dht_content_keys",
			Help: "Number of DHT content keys",
		},
	)

	m.dhtReplicationFactor = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_dht_replication_factor",
			Help: "DHT replication factor by key",
		},
		[]string{"key_hash"},
	)

	// PubSub metrics
	m.pubsubTopics = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_pubsub_topics",
			Help: "Number of active PubSub topics",
		},
	)

	m.pubsubMessages = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_pubsub_messages_total",
			Help: "Total number of PubSub messages",
		},
		[]string{"topic", "direction", "message_type"},
	)

	m.pubsubMessageLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_pubsub_message_latency_seconds",
			Help:    "PubSub message latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"topic"},
	)

	// Election metrics
	m.electionTerm = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_election_term",
			Help: "Current election term",
		},
	)

	m.electionState = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_election_state",
			Help: "Current election state (1 for the active state)",
		},
		[]string{"state"},
	)

	m.heartbeatsSent = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_heartbeats_sent_total",
			Help: "Total number of heartbeats sent",
		},
	)

	m.heartbeatsReceived = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_heartbeats_received_total",
			Help: "Total number of heartbeats received",
		},
	)

	m.leadershipChanges = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_leadership_changes_total",
			Help: "Total number of leadership changes",
		},
	)

	// Health metrics
	m.healthChecksPassed = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_health_checks_passed_total",
			Help: "Total number of health checks passed",
		},
		[]string{"check_name"},
	)

	m.healthChecksFailed = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_health_checks_failed_total",
			Help: "Total number of health checks failed",
		},
		[]string{"check_name", "reason"},
	)

	m.systemHealthScore = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_system_health_score",
			Help: "Overall system health score (0-1)",
		},
	)

	m.componentHealthScore = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_component_health_score",
			Help: "Component health score (0-1)",
		},
		[]string{"component"},
	)

	// Task metrics
	m.tasksActive = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_tasks_active",
			Help: "Number of active tasks",
		},
	)

	m.tasksQueued = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_tasks_queued",
			Help: "Number of queued tasks",
		},
	)

	m.tasksCompleted = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_tasks_completed_total",
			Help: "Total number of completed tasks",
		},
		[]string{"status", "task_type"},
	)

	m.taskDuration = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_task_duration_seconds",
			Help:    "Task execution duration",
			Buckets: config.LatencyBuckets,
		},
		[]string{"task_type", "status"},
	)

	// SLURP metrics
	m.slurpGenerated = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_slurp_contexts_generated_total",
			Help: "Total number of contexts generated by SLURP",
		},
		[]string{"role", "status"},
	)

	m.slurpGenerationTime = factory.NewHistogram(
		prometheus.HistogramOpts{
			Name:    "chorus_slurp_generation_time_seconds",
			Help:    "SLURP context generation time",
			Buckets: []float64{0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0},
		},
	)

	m.slurpQueueLength = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_slurp_queue_length",
			Help: "Length of SLURP generation queue",
		},
	)

	// SHHH metrics
	m.shhhFindings = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_shhh_findings_total",
			Help: "Total number of SHHH redaction findings",
		},
		[]string{"rule", "severity"},
	)

	// UCXI metrics
	m.ucxiRequests = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_ucxi_requests_total",
			Help: "Total number of UCXI requests",
		},
		[]string{"method", "status"},
	)

	m.ucxiResolutionLatency = factory.NewHistogram(
		prometheus.HistogramOpts{
			Name:    "chorus_ucxi_resolution_latency_seconds",
			Help:    "UCXI address resolution latency",
			Buckets: config.LatencyBuckets,
		},
	)

	// Resource metrics
	m.cpuUsage = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_cpu_usage_ratio",
			Help: "CPU usage ratio (0-1)",
		},
	)

	m.memoryUsage = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_memory_usage_bytes",
			Help: "Memory usage in bytes",
		},
	)

	m.diskUsage = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_disk_usage_ratio",
			Help: "Disk usage ratio (0-1)",
		},
		[]string{"mount_point"},
	)

	m.goroutines = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_goroutines",
			Help: "Number of goroutines",
		},
	)

	// Error metrics
	m.errors = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_errors_total",
			Help: "Total number of errors",
		},
		[]string{"component", "error_type"},
	)

	m.panics = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_panics_total",
			Help: "Total number of panics",
		},
	)
}

// registerMetrics is intentionally a no-op: every metric is registered with
// m.registry at creation time by the promauto factory in initializeMetrics.
func (m *CHORUSMetrics) registerMetrics() {
	// Nothing to do here.
}

// StartServer starts the Prometheus metrics HTTP server.
func (m *CHORUSMetrics) StartServer(config *MetricsConfig) error {
	mux := http.NewServeMux()

	// Serve the private registry rather than the global default one
	handler := promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{
		EnableOpenMetrics: true,
	})
	mux.Handle(config.MetricsPath, handler)

	// Health endpoint
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("OK"))
	})

	m.httpServer = &http.Server{
		Addr:    config.ListenAddr,
		Handler: mux,
	}

	go func() {
		log.Printf("Starting metrics server on %s%s", config.ListenAddr, config.MetricsPath)
		if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Printf("Metrics server error: %v", err)
		}
	}()

	return nil
}

// StopServer gracefully stops the metrics HTTP server.
func (m *CHORUSMetrics) StopServer() error {
	if m.httpServer != nil {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		return m.httpServer.Shutdown(ctx)
	}
	return nil
}
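
// Typical wiring, as an illustrative sketch (assumed usage, not taken from
// the original file; the config values below are hypothetical):
//
//	cfg := DefaultMetricsConfig()
//	cfg.NodeID = "node-1"
//	cfg.Environment = "production"
//
//	m := NewCHORUSMetrics(cfg)
//	if err := m.StartServer(cfg); err != nil {
//		log.Fatalf("metrics server: %v", err)
//	}
//	defer m.StopServer()
//
//	m.CollectMetrics(cfg)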

// P2P Metrics Methods

func (m *CHORUSMetrics) SetConnectedPeers(count int) {
	m.p2pConnectedPeers.Set(float64(count))
}

func (m *CHORUSMetrics) IncrementMessagesSent(messageType, peerID string) {
	m.p2pMessagesSent.WithLabelValues(messageType, peerID).Inc()
}

func (m *CHORUSMetrics) IncrementMessagesReceived(messageType, peerID string) {
	m.p2pMessagesReceived.WithLabelValues(messageType, peerID).Inc()
}

func (m *CHORUSMetrics) ObserveMessageLatency(messageType string, latency time.Duration) {
	m.p2pMessageLatency.WithLabelValues(messageType).Observe(latency.Seconds())
}

// DHT Metrics Methods

func (m *CHORUSMetrics) IncrementDHTPutOperations(status string) {
	m.dhtPutOperations.WithLabelValues(status).Inc()
}

func (m *CHORUSMetrics) IncrementDHTGetOperations(status string) {
	m.dhtGetOperations.WithLabelValues(status).Inc()
}

func (m *CHORUSMetrics) ObserveDHTOperationLatency(operation, status string, latency time.Duration) {
	m.dhtOperationLatency.WithLabelValues(operation, status).Observe(latency.Seconds())
}

func (m *CHORUSMetrics) SetDHTProviderRecords(count int) {
	m.dhtProviderRecords.Set(float64(count))
}

func (m *CHORUSMetrics) SetDHTContentKeys(count int) {
	m.dhtContentKeys.Set(float64(count))
}

func (m *CHORUSMetrics) SetDHTReplicationFactor(keyHash string, factor float64) {
	m.dhtReplicationFactor.WithLabelValues(keyHash).Set(factor)
}
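
// instrumentDHTPut is a hypothetical convenience wrapper (a sketch, not part
// of the original API) showing how the put counter and the operation latency
// histogram are meant to be used together around a single operation. The
// "success"/"failure" status label values are assumptions.
func (m *CHORUSMetrics) instrumentDHTPut(put func() error) error {
	start := time.Now()
	err := put()
	status := "success"
	if err != nil {
		status = "failure"
	}
	m.IncrementDHTPutOperations(status)
	m.ObserveDHTOperationLatency("put", status, time.Since(start))
	return err
}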

// PubSub Metrics Methods

func (m *CHORUSMetrics) SetPubSubTopics(count int) {
	m.pubsubTopics.Set(float64(count))
}

func (m *CHORUSMetrics) IncrementPubSubMessages(topic, direction, messageType string) {
	m.pubsubMessages.WithLabelValues(topic, direction, messageType).Inc()
}

func (m *CHORUSMetrics) ObservePubSubMessageLatency(topic string, latency time.Duration) {
	m.pubsubMessageLatency.WithLabelValues(topic).Observe(latency.Seconds())
}

// Election Metrics Methods

func (m *CHORUSMetrics) SetElectionTerm(term int) {
	m.electionTerm.Set(float64(term))
}

// SetElectionState marks the given state as active (1) and clears all others.
func (m *CHORUSMetrics) SetElectionState(state string) {
	// Reset all state gauges
	states := []string{"idle", "discovering", "electing", "reconstructing", "complete"}
	for _, s := range states {
		m.electionState.WithLabelValues(s).Set(0)
	}
	// Set current state
	m.electionState.WithLabelValues(state).Set(1)
}

func (m *CHORUSMetrics) IncrementHeartbeatsSent() {
	m.heartbeatsSent.Inc()
}

func (m *CHORUSMetrics) IncrementHeartbeatsReceived() {
	m.heartbeatsReceived.Inc()
}

func (m *CHORUSMetrics) IncrementLeadershipChanges() {
	m.leadershipChanges.Inc()
}

// Health Metrics Methods

func (m *CHORUSMetrics) IncrementHealthCheckPassed(checkName string) {
	m.healthChecksPassed.WithLabelValues(checkName).Inc()
}

func (m *CHORUSMetrics) IncrementHealthCheckFailed(checkName, reason string) {
	m.healthChecksFailed.WithLabelValues(checkName, reason).Inc()
}

func (m *CHORUSMetrics) SetSystemHealthScore(score float64) {
	m.systemHealthScore.Set(score)
}

func (m *CHORUSMetrics) SetComponentHealthScore(component string, score float64) {
	m.componentHealthScore.WithLabelValues(component).Set(score)
}

// Task Metrics Methods

func (m *CHORUSMetrics) SetActiveTasks(count int) {
	m.tasksActive.Set(float64(count))
}

func (m *CHORUSMetrics) SetQueuedTasks(count int) {
	m.tasksQueued.Set(float64(count))
}

func (m *CHORUSMetrics) IncrementTasksCompleted(status, taskType string) {
	m.tasksCompleted.WithLabelValues(status, taskType).Inc()
}

func (m *CHORUSMetrics) ObserveTaskDuration(taskType, status string, duration time.Duration) {
	m.taskDuration.WithLabelValues(taskType, status).Observe(duration.Seconds())
}
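
// TrackTaskDuration is a hypothetical convenience helper (a sketch, not part
// of the original API): call it when a task starts, then invoke the returned
// function with the final status to record both the completion counter and
// the duration histogram. Example (task type and status are hypothetical):
//
//	done := m.TrackTaskDuration("indexing")
//	// ... run the task ...
//	done("success")
func (m *CHORUSMetrics) TrackTaskDuration(taskType string) func(status string) {
	start := time.Now()
	return func(status string) {
		m.IncrementTasksCompleted(status, taskType)
		m.ObserveTaskDuration(taskType, status, time.Since(start))
	}
}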

// SLURP Metrics Methods

func (m *CHORUSMetrics) IncrementSLURPGenerated(role, status string) {
	m.slurpGenerated.WithLabelValues(role, status).Inc()
}

func (m *CHORUSMetrics) ObserveSLURPGenerationTime(duration time.Duration) {
	m.slurpGenerationTime.Observe(duration.Seconds())
}

func (m *CHORUSMetrics) SetSLURPQueueLength(length int) {
	m.slurpQueueLength.Set(float64(length))
}

// SHHH Metrics Methods

// IncrementSHHHFindings guards against a nil collector and non-positive
// counts so callers can report findings unconditionally.
func (m *CHORUSMetrics) IncrementSHHHFindings(rule, severity string, count int) {
	if m == nil || m.shhhFindings == nil || count <= 0 {
		return
	}
	m.shhhFindings.WithLabelValues(rule, severity).Add(float64(count))
}

// UCXI Metrics Methods

func (m *CHORUSMetrics) IncrementUCXIRequests(method, status string) {
	m.ucxiRequests.WithLabelValues(method, status).Inc()
}

func (m *CHORUSMetrics) ObserveUCXIResolutionLatency(latency time.Duration) {
	m.ucxiResolutionLatency.Observe(latency.Seconds())
}

// Resource Metrics Methods

func (m *CHORUSMetrics) SetCPUUsage(usage float64) {
	m.cpuUsage.Set(usage)
}

func (m *CHORUSMetrics) SetMemoryUsage(usage float64) {
	m.memoryUsage.Set(usage)
}

func (m *CHORUSMetrics) SetDiskUsage(mountPoint string, usage float64) {
	m.diskUsage.WithLabelValues(mountPoint).Set(usage)
}

func (m *CHORUSMetrics) SetGoroutines(count int) {
	m.goroutines.Set(float64(count))
}

// Error Metrics Methods

func (m *CHORUSMetrics) IncrementErrors(component, errorType string) {
	m.errors.WithLabelValues(component, errorType).Inc()
}

func (m *CHORUSMetrics) IncrementPanics() {
	m.panics.Inc()
}
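
// CountPanic is a hypothetical helper (a sketch, not part of the original
// API) for wiring the panic counter into goroutines. Defer it at a goroutine
// entry point; it records the panic and then re-panics so crash behavior is
// unchanged. recover only takes effect here because CountPanic itself is the
// deferred function.
func (m *CHORUSMetrics) CountPanic(component string) {
	if r := recover(); r != nil {
		m.IncrementPanics()
		m.IncrementErrors(component, "panic")
		panic(r)
	}
}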

// System Metrics Methods

func (m *CHORUSMetrics) UpdateSystemInfo(nodeID, version, goVersion, cluster, environment string) {
	m.systemInfo.WithLabelValues(nodeID, version, goVersion, cluster, environment).Set(1)
}

func (m *CHORUSMetrics) UpdateUptime() {
	m.uptime.Set(time.Since(m.startTime).Seconds())
}

// CollectMetrics starts background metric collection. The goroutine runs for
// the lifetime of the process.
func (m *CHORUSMetrics) CollectMetrics(config *MetricsConfig) {
	systemTicker := time.NewTicker(config.SystemMetricsInterval)
	resourceTicker := time.NewTicker(config.ResourceMetricsInterval)

	go func() {
		defer systemTicker.Stop()
		defer resourceTicker.Stop()

		for {
			select {
			case <-systemTicker.C:
				m.UpdateUptime()
				// Collect other system metrics here as they are wired up

			case <-resourceTicker.C:
				// Resource collection would integrate with actual system
				// monitoring; see the collectResourceMetrics sketch below
				// m.collectResourceMetrics()
			}
		}
	}()
}
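
// collectResourceMetrics is a minimal illustrative sketch (assumed, not part
// of the original file) of feeding the resource gauges from the Go runtime.
// Heap allocation stands in for overall memory usage; CPU and disk figures
// would have to come from an external monitoring source.
func (m *CHORUSMetrics) collectResourceMetrics() {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	m.SetMemoryUsage(float64(ms.Alloc))
	m.SetGoroutines(runtime.NumGoroutine())
}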