// File: CHORUS/pkg/metrics/prometheus_metrics.go
// Timestamp: 2025-09-20 23:21:35 +10:00
// Size: 749 lines, 19 KiB
// Language: Go

package metrics
import (
	"context"
	"log"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// CHORUSMetrics bundles the Prometheus collectors used by the CHORUS system
// together with the registry and HTTP server used to expose them.
//
// Instances are built by NewCHORUSMetrics. The following fields are declared
// here but are not assigned by initializeMetrics as visible in this file, so
// they remain nil unless something else populates them: buildInfo,
// p2pConnectionDuration, p2pPeerScore, dhtCacheHits, dhtCacheMisses,
// pubsubSubscribers, pubsubMessageSize, leaderUptime, electionLatency,
// healthCheckDuration, taskQueueWaitTime, slurpActiveJobs,
// slurpLeadershipEvents, ucxiCacheHits, ucxiCacheMisses, ucxiContentSize,
// networkBytesIn and networkBytesOut.
type CHORUSMetrics struct {
// registry is created in NewCHORUSMetrics and is the gatherer that
// StartServer hands to the promhttp handler.
registry *prometheus.Registry
// httpServer is set by StartServer and shut down by StopServer.
httpServer *http.Server
// System metrics
systemInfo *prometheus.GaugeVec
uptime prometheus.Gauge
buildInfo *prometheus.GaugeVec
// P2P metrics
p2pConnectedPeers prometheus.Gauge
p2pMessagesSent *prometheus.CounterVec
p2pMessagesReceived *prometheus.CounterVec
p2pMessageLatency *prometheus.HistogramVec
p2pConnectionDuration *prometheus.HistogramVec
p2pPeerScore *prometheus.GaugeVec
// DHT metrics
dhtPutOperations *prometheus.CounterVec
dhtGetOperations *prometheus.CounterVec
dhtOperationLatency *prometheus.HistogramVec
dhtProviderRecords prometheus.Gauge
dhtReplicationFactor *prometheus.GaugeVec
dhtContentKeys prometheus.Gauge
dhtCacheHits *prometheus.CounterVec
dhtCacheMisses *prometheus.CounterVec
// PubSub metrics
pubsubTopics prometheus.Gauge
pubsubSubscribers *prometheus.GaugeVec
pubsubMessages *prometheus.CounterVec
pubsubMessageLatency *prometheus.HistogramVec
pubsubMessageSize *prometheus.HistogramVec
// Election metrics
electionTerm prometheus.Gauge
// electionState holds one series per state label; SetElectionState sets
// the active state to 1 and the known states to 0.
electionState *prometheus.GaugeVec
heartbeatsSent prometheus.Counter
heartbeatsReceived prometheus.Counter
leadershipChanges prometheus.Counter
leaderUptime prometheus.Gauge
electionLatency prometheus.Histogram
// Health metrics
healthChecksPassed *prometheus.CounterVec
healthChecksFailed *prometheus.CounterVec
healthCheckDuration *prometheus.HistogramVec
systemHealthScore prometheus.Gauge
componentHealthScore *prometheus.GaugeVec
// Task metrics
tasksActive prometheus.Gauge
tasksQueued prometheus.Gauge
tasksCompleted *prometheus.CounterVec
taskDuration *prometheus.HistogramVec
taskQueueWaitTime prometheus.Histogram
// SLURP metrics (context generation)
slurpGenerated *prometheus.CounterVec
slurpGenerationTime prometheus.Histogram
slurpQueueLength prometheus.Gauge
slurpActiveJobs prometheus.Gauge
slurpLeadershipEvents prometheus.Counter
// SHHH sentinel metrics
shhhFindings *prometheus.CounterVec
// UCXI metrics (protocol resolution)
ucxiRequests *prometheus.CounterVec
ucxiResolutionLatency prometheus.Histogram
ucxiCacheHits prometheus.Counter
ucxiCacheMisses prometheus.Counter
ucxiContentSize prometheus.Histogram
// Resource metrics
cpuUsage prometheus.Gauge
memoryUsage prometheus.Gauge
diskUsage *prometheus.GaugeVec
networkBytesIn prometheus.Counter
networkBytesOut prometheus.Counter
goroutines prometheus.Gauge
// Error metrics
errors *prometheus.CounterVec
panics prometheus.Counter
// startTime is captured in NewCHORUSMetrics and is the reference point
// UpdateUptime measures from.
startTime time.Time
// mu is not locked by any method visible in this file.
mu sync.RWMutex
}
// MetricsConfig carries the settings used when building, serving and
// periodically refreshing CHORUS metrics.
type MetricsConfig struct {
	// ListenAddr is the address the metrics HTTP server is bound to;
	// MetricsPath is the URL path the metrics handler is mounted on.
	ListenAddr, MetricsPath string

	// LatencyBuckets provides the bucket boundaries for the latency and
	// duration histograms. SizeBuckets is presumably meant for size
	// histograms; no code visible in this file reads it.
	LatencyBuckets, SizeBuckets []float64

	// Identification values. Nothing visible in this file reads them;
	// callers can pass equivalent values to UpdateSystemInfo.
	NodeID, Version, Environment, Cluster string

	// Ticker periods used by CollectMetrics for the system and resource
	// collection branches respectively.
	SystemMetricsInterval, ResourceMetricsInterval time.Duration
}
// DefaultMetricsConfig builds a fresh MetricsConfig holding the package
// defaults: listen on ":9090", serve at "/metrics", collect system metrics
// every 30 seconds and resource metrics every 15 seconds. The label fields
// (NodeID, Version, Environment, Cluster) are left empty.
func DefaultMetricsConfig() *MetricsConfig {
	cfg := new(MetricsConfig)

	cfg.ListenAddr = ":9090"
	cfg.MetricsPath = "/metrics"

	// Boundaries from 0.001 to 10; they feed histograms whose metric names
	// end in "_seconds".
	cfg.LatencyBuckets = []float64{
		0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
	}
	// Boundaries from 64 to 16777216, each four times the previous one.
	cfg.SizeBuckets = []float64{
		64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216,
	}

	cfg.SystemMetricsInterval = 30 * time.Second
	cfg.ResourceMetricsInterval = 15 * time.Second

	return cfg
}
// NewCHORUSMetrics constructs a CHORUSMetrics with its own Prometheus
// registry and the current time as its start time, then runs
// initializeMetrics and registerMetrics on it. A nil config is replaced by
// DefaultMetricsConfig.
func NewCHORUSMetrics(config *MetricsConfig) *CHORUSMetrics {
	cfg := config
	if cfg == nil {
		cfg = DefaultMetricsConfig()
	}

	m := new(CHORUSMetrics)
	m.registry = prometheus.NewRegistry()
	m.startTime = time.Now()

	// Build the collectors, then give registerMetrics its chance to run.
	m.initializeMetrics(cfg)
	m.registerMetrics()

	return m
}
// initializeMetrics creates every collector that the setter, incrementer and
// observer methods of CHORUSMetrics write to, and registers each of them with
// m.registry.
//
// Collectors are built through promauto.With(m.registry) rather than the
// package-level promauto constructors. The package-level constructors register
// with prometheus.DefaultRegisterer, which is not the registry StartServer
// serves, and they panic on duplicate registration when a second
// CHORUSMetrics is created in the same process.
//
// config.LatencyBuckets supplies the buckets for all latency and duration
// histograms except the SLURP generation-time histogram, which uses its own
// fixed buckets.
func (m *CHORUSMetrics) initializeMetrics(config *MetricsConfig) {
	// Never hand a typed-nil *Registry to promauto: fall back to the default
	// registerer when no registry has been set on the receiver.
	var registerer prometheus.Registerer = prometheus.DefaultRegisterer
	if m.registry != nil {
		registerer = m.registry
	}
	factory := promauto.With(registerer)

	// System metrics
	m.systemInfo = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_system_info",
			Help: "System information",
		},
		[]string{"node_id", "version", "go_version", "cluster", "environment"},
	)
	m.uptime = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_uptime_seconds",
			Help: "System uptime in seconds",
		},
	)

	// P2P metrics
	m.p2pConnectedPeers = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_p2p_connected_peers",
			Help: "Number of connected P2P peers",
		},
	)
	m.p2pMessagesSent = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_p2p_messages_sent_total",
			Help: "Total number of P2P messages sent",
		},
		[]string{"message_type", "peer_id"},
	)
	m.p2pMessagesReceived = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_p2p_messages_received_total",
			Help: "Total number of P2P messages received",
		},
		[]string{"message_type", "peer_id"},
	)
	m.p2pMessageLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_p2p_message_latency_seconds",
			Help:    "P2P message round-trip latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"message_type"},
	)

	// DHT metrics
	m.dhtPutOperations = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_dht_put_operations_total",
			Help: "Total number of DHT put operations",
		},
		[]string{"status"},
	)
	m.dhtGetOperations = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_dht_get_operations_total",
			Help: "Total number of DHT get operations",
		},
		[]string{"status"},
	)
	m.dhtOperationLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_dht_operation_latency_seconds",
			Help:    "DHT operation latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"operation", "status"},
	)
	m.dhtProviderRecords = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_dht_provider_records",
			Help: "Number of DHT provider records",
		},
	)
	m.dhtContentKeys = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_dht_content_keys",
			Help: "Number of DHT content keys",
		},
	)
	m.dhtReplicationFactor = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_dht_replication_factor",
			Help: "DHT replication factor by key",
		},
		[]string{"key_hash"},
	)

	// PubSub metrics
	m.pubsubTopics = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_pubsub_topics",
			Help: "Number of active PubSub topics",
		},
	)
	m.pubsubMessages = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_pubsub_messages_total",
			Help: "Total number of PubSub messages",
		},
		[]string{"topic", "direction", "message_type"},
	)
	m.pubsubMessageLatency = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_pubsub_message_latency_seconds",
			Help:    "PubSub message latency",
			Buckets: config.LatencyBuckets,
		},
		[]string{"topic"},
	)

	// Election metrics
	m.electionTerm = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_election_term",
			Help: "Current election term",
		},
	)
	m.electionState = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_election_state",
			Help: "Current election state (1 for active state)",
		},
		[]string{"state"},
	)
	m.heartbeatsSent = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_heartbeats_sent_total",
			Help: "Total number of heartbeats sent",
		},
	)
	m.heartbeatsReceived = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_heartbeats_received_total",
			Help: "Total number of heartbeats received",
		},
	)
	m.leadershipChanges = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_leadership_changes_total",
			Help: "Total number of leadership changes",
		},
	)

	// Health metrics
	m.healthChecksPassed = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_health_checks_passed_total",
			Help: "Total number of health checks passed",
		},
		[]string{"check_name"},
	)
	m.healthChecksFailed = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_health_checks_failed_total",
			Help: "Total number of health checks failed",
		},
		[]string{"check_name", "reason"},
	)
	m.systemHealthScore = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_system_health_score",
			Help: "Overall system health score (0-1)",
		},
	)
	m.componentHealthScore = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_component_health_score",
			Help: "Component health score (0-1)",
		},
		[]string{"component"},
	)

	// Task metrics
	m.tasksActive = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_tasks_active",
			Help: "Number of active tasks",
		},
	)
	m.tasksQueued = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_tasks_queued",
			Help: "Number of queued tasks",
		},
	)
	m.tasksCompleted = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_tasks_completed_total",
			Help: "Total number of completed tasks",
		},
		[]string{"status", "task_type"},
	)
	m.taskDuration = factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "chorus_task_duration_seconds",
			Help:    "Task execution duration",
			Buckets: config.LatencyBuckets,
		},
		[]string{"task_type", "status"},
	)

	// SLURP metrics
	m.slurpGenerated = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_slurp_contexts_generated_total",
			Help: "Total number of contexts generated by SLURP",
		},
		[]string{"role", "status"},
	)
	m.slurpGenerationTime = factory.NewHistogram(
		prometheus.HistogramOpts{
			Name:    "chorus_slurp_generation_time_seconds",
			Help:    "SLURP context generation time",
			Buckets: []float64{0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0},
		},
	)
	m.slurpQueueLength = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_slurp_queue_length",
			Help: "Length of SLURP generation queue",
		},
	)

	// SHHH metrics
	m.shhhFindings = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_shhh_findings_total",
			Help: "Total number of SHHH redaction findings",
		},
		[]string{"rule", "severity"},
	)

	// UCXI metrics
	m.ucxiRequests = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_ucxi_requests_total",
			Help: "Total number of UCXI requests",
		},
		[]string{"method", "status"},
	)
	m.ucxiResolutionLatency = factory.NewHistogram(
		prometheus.HistogramOpts{
			Name:    "chorus_ucxi_resolution_latency_seconds",
			Help:    "UCXI address resolution latency",
			Buckets: config.LatencyBuckets,
		},
	)

	// Resource metrics
	m.cpuUsage = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_cpu_usage_ratio",
			Help: "CPU usage ratio (0-1)",
		},
	)
	m.memoryUsage = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_memory_usage_bytes",
			Help: "Memory usage in bytes",
		},
	)
	m.diskUsage = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "chorus_disk_usage_ratio",
			Help: "Disk usage ratio (0-1)",
		},
		[]string{"mount_point"},
	)
	m.goroutines = factory.NewGauge(
		prometheus.GaugeOpts{
			Name: "chorus_goroutines",
			Help: "Number of goroutines",
		},
	)

	// Error metrics
	m.errors = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "chorus_errors_total",
			Help: "Total number of errors",
		},
		[]string{"component", "error_type"},
	)
	m.panics = factory.NewCounter(
		prometheus.CounterOpts{
			Name: "chorus_panics_total",
			Help: "Total number of panics",
		},
	)
}
// registerMetrics is called by NewCHORUSMetrics after initializeMetrics.
// Its body is intentionally empty.
func (m *CHORUSMetrics) registerMetrics() {
// The promauto constructors used in initializeMetrics register each
// collector at the moment it is created, so no explicit registration
// is performed here.
}
// StartServer binds the metrics listener and serves it in a background
// goroutine.
//
// The promhttp handler for m.registry is mounted at config.MetricsPath and a
// plain "OK" response at /health. A nil config is replaced by
// DefaultMetricsConfig, and an empty MetricsPath by the default path, because
// http.ServeMux panics on an empty pattern.
//
// The listener is opened synchronously so that bind failures (for example an
// address already in use) are returned to the caller instead of only being
// logged from the goroutine. Errors that occur while serving are logged.
// The started server is stored in m.httpServer for StopServer.
func (m *CHORUSMetrics) StartServer(config *MetricsConfig) error {
	if config == nil {
		config = DefaultMetricsConfig()
	}
	metricsPath := config.MetricsPath
	if metricsPath == "" {
		metricsPath = DefaultMetricsConfig().MetricsPath
	}
	listenAddr := config.ListenAddr
	if listenAddr == "" {
		// Same default that http.Server.ListenAndServe applies.
		listenAddr = ":http"
	}

	mux := http.NewServeMux()

	// Use custom registry
	mux.Handle(metricsPath, promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{
		EnableOpenMetrics: true,
	}))

	// Health endpoint
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		// A failed write only means the client went away; there is nothing
		// useful to do about it here.
		_, _ = w.Write([]byte("OK"))
	})

	listener, err := net.Listen("tcp", listenAddr)
	if err != nil {
		return err
	}

	srv := &http.Server{
		Addr:    config.ListenAddr,
		Handler: mux,
		// Bound the time a client may take to send request headers so idle
		// connections cannot be held open indefinitely.
		ReadHeaderTimeout: 10 * time.Second,
	}
	m.httpServer = srv

	// The goroutine uses the local srv rather than m.httpServer so it always
	// serves the instance created by this call.
	go func() {
		log.Printf("Starting metrics server on %s%s", config.ListenAddr, metricsPath)
		if err := srv.Serve(listener); err != nil && err != http.ErrServerClosed {
			log.Printf("Metrics server error: %v", err)
		}
	}()

	return nil
}
// StopServer shuts down the HTTP server stored by StartServer, giving the
// shutdown up to five seconds to complete. It returns nil when no server has
// been stored, otherwise whatever http.Server.Shutdown returns.
func (m *CHORUSMetrics) StopServer() error {
	srv := m.httpServer
	if srv == nil {
		return nil
	}

	shutdownCtx, release := context.WithTimeout(context.Background(), 5*time.Second)
	defer release()

	return srv.Shutdown(shutdownCtx)
}
// P2P Metrics Methods

// SetConnectedPeers sets the connected-peers gauge to count.
func (m *CHORUSMetrics) SetConnectedPeers(count int) {
	peers := float64(count)
	m.p2pConnectedPeers.Set(peers)
}
// IncrementMessagesSent adds one to the sent-messages counter for the given
// message type and peer ID labels.
func (m *CHORUSMetrics) IncrementMessagesSent(messageType, peerID string) {
	sent := m.p2pMessagesSent.WithLabelValues(messageType, peerID)
	sent.Inc()
}
// IncrementMessagesReceived adds one to the received-messages counter for
// the given message type and peer ID labels.
func (m *CHORUSMetrics) IncrementMessagesReceived(messageType, peerID string) {
	received := m.p2pMessagesReceived.WithLabelValues(messageType, peerID)
	received.Inc()
}
// ObserveMessageLatency records latency, converted to seconds, in the P2P
// message latency histogram for the given message type.
func (m *CHORUSMetrics) ObserveMessageLatency(messageType string, latency time.Duration) {
	seconds := latency.Seconds()
	m.p2pMessageLatency.WithLabelValues(messageType).Observe(seconds)
}
// DHT Metrics Methods

// IncrementDHTPutOperations adds one to the DHT put counter for the given
// status label.
func (m *CHORUSMetrics) IncrementDHTPutOperations(status string) {
	puts := m.dhtPutOperations.WithLabelValues(status)
	puts.Inc()
}
// IncrementDHTGetOperations adds one to the DHT get counter for the given
// status label.
func (m *CHORUSMetrics) IncrementDHTGetOperations(status string) {
	gets := m.dhtGetOperations.WithLabelValues(status)
	gets.Inc()
}
// ObserveDHTOperationLatency records latency, converted to seconds, in the
// DHT operation latency histogram for the given operation and status labels.
func (m *CHORUSMetrics) ObserveDHTOperationLatency(operation, status string, latency time.Duration) {
	seconds := latency.Seconds()
	m.dhtOperationLatency.WithLabelValues(operation, status).Observe(seconds)
}
// SetDHTProviderRecords sets the DHT provider-records gauge to count.
func (m *CHORUSMetrics) SetDHTProviderRecords(count int) {
	records := float64(count)
	m.dhtProviderRecords.Set(records)
}
// SetDHTContentKeys sets the DHT content-keys gauge to count.
func (m *CHORUSMetrics) SetDHTContentKeys(count int) {
	keys := float64(count)
	m.dhtContentKeys.Set(keys)
}
// SetDHTReplicationFactor sets the replication-factor gauge for the series
// labelled with keyHash to factor.
func (m *CHORUSMetrics) SetDHTReplicationFactor(keyHash string, factor float64) {
	gauge := m.dhtReplicationFactor.WithLabelValues(keyHash)
	gauge.Set(factor)
}
// PubSub Metrics Methods

// SetPubSubTopics sets the active PubSub topics gauge to count.
func (m *CHORUSMetrics) SetPubSubTopics(count int) {
	topics := float64(count)
	m.pubsubTopics.Set(topics)
}
// IncrementPubSubMessages adds one to the PubSub message counter for the
// given topic, direction and message type labels.
func (m *CHORUSMetrics) IncrementPubSubMessages(topic, direction, messageType string) {
	messages := m.pubsubMessages.WithLabelValues(topic, direction, messageType)
	messages.Inc()
}
// ObservePubSubMessageLatency records latency, converted to seconds, in the
// PubSub message latency histogram for the given topic.
func (m *CHORUSMetrics) ObservePubSubMessageLatency(topic string, latency time.Duration) {
	seconds := latency.Seconds()
	m.pubsubMessageLatency.WithLabelValues(topic).Observe(seconds)
}
// Election Metrics Methods

// SetElectionTerm sets the election-term gauge to term.
func (m *CHORUSMetrics) SetElectionTerm(term int) {
	current := float64(term)
	m.electionTerm.Set(current)
}
// SetElectionState marks state as the active election state.
//
// The election-state gauge vector carries one series per state label: the
// active state is set to 1 and each of the known states listed below to 0.
// All existing series are dropped first, so a state that is not in the known
// list does not remain stuck at 1 after a later transition to another state.
func (m *CHORUSMetrics) SetElectionState(state string) {
	knownStates := [...]string{"idle", "discovering", "electing", "reconstructing", "complete"}

	// Reset deletes every child series of the vector, including any that was
	// created earlier for a state outside knownStates.
	m.electionState.Reset()

	// Re-create the known states at 0 so they are always exported.
	for _, s := range knownStates {
		m.electionState.WithLabelValues(s).Set(0)
	}

	// Set current state
	m.electionState.WithLabelValues(state).Set(1)
}
// IncrementHeartbeatsSent adds one to the heartbeats-sent counter.
func (m *CHORUSMetrics) IncrementHeartbeatsSent() {
	sent := m.heartbeatsSent
	sent.Inc()
}
// IncrementHeartbeatsReceived adds one to the heartbeats-received counter.
func (m *CHORUSMetrics) IncrementHeartbeatsReceived() {
	received := m.heartbeatsReceived
	received.Inc()
}
// IncrementLeadershipChanges adds one to the leadership-changes counter.
func (m *CHORUSMetrics) IncrementLeadershipChanges() {
	changes := m.leadershipChanges
	changes.Inc()
}
// Health Metrics Methods

// IncrementHealthCheckPassed adds one to the passed-health-checks counter
// for the given check name.
func (m *CHORUSMetrics) IncrementHealthCheckPassed(checkName string) {
	passed := m.healthChecksPassed.WithLabelValues(checkName)
	passed.Inc()
}
// IncrementHealthCheckFailed adds one to the failed-health-checks counter
// for the given check name and failure reason.
func (m *CHORUSMetrics) IncrementHealthCheckFailed(checkName, reason string) {
	failed := m.healthChecksFailed.WithLabelValues(checkName, reason)
	failed.Inc()
}
// SetSystemHealthScore sets the overall system health gauge to score. The
// gauge's help text describes the value as a 0-1 score; the value is stored
// as given, without range checking.
func (m *CHORUSMetrics) SetSystemHealthScore(score float64) {
	health := m.systemHealthScore
	health.Set(score)
}
// SetComponentHealthScore sets the health gauge of the named component to
// score. The value is stored as given, without range checking.
func (m *CHORUSMetrics) SetComponentHealthScore(component string, score float64) {
	health := m.componentHealthScore.WithLabelValues(component)
	health.Set(score)
}
// Task Metrics Methods

// SetActiveTasks sets the active-tasks gauge to count.
func (m *CHORUSMetrics) SetActiveTasks(count int) {
	active := float64(count)
	m.tasksActive.Set(active)
}
// SetQueuedTasks sets the queued-tasks gauge to count.
func (m *CHORUSMetrics) SetQueuedTasks(count int) {
	queued := float64(count)
	m.tasksQueued.Set(queued)
}
// IncrementTasksCompleted adds one to the completed-tasks counter for the
// given status and task type labels.
func (m *CHORUSMetrics) IncrementTasksCompleted(status, taskType string) {
	completed := m.tasksCompleted.WithLabelValues(status, taskType)
	completed.Inc()
}
// ObserveTaskDuration records duration, converted to seconds, in the task
// duration histogram for the given task type and status labels.
func (m *CHORUSMetrics) ObserveTaskDuration(taskType, status string, duration time.Duration) {
	seconds := duration.Seconds()
	m.taskDuration.WithLabelValues(taskType, status).Observe(seconds)
}
// SLURP Metrics Methods

// IncrementSLURPGenerated adds one to the SLURP generated-contexts counter
// for the given role and status labels.
func (m *CHORUSMetrics) IncrementSLURPGenerated(role, status string) {
	generated := m.slurpGenerated.WithLabelValues(role, status)
	generated.Inc()
}
// ObserveSLURPGenerationTime records duration, converted to seconds, in the
// SLURP generation-time histogram.
func (m *CHORUSMetrics) ObserveSLURPGenerationTime(duration time.Duration) {
	seconds := duration.Seconds()
	m.slurpGenerationTime.Observe(seconds)
}
// SetSLURPQueueLength sets the SLURP queue-length gauge to length.
func (m *CHORUSMetrics) SetSLURPQueueLength(length int) {
	queued := float64(length)
	m.slurpQueueLength.Set(queued)
}
// SHHH Metrics Methods

// IncrementSHHHFindings adds count to the SHHH findings counter for the
// given rule and severity labels. It does nothing when the receiver is nil,
// when the findings counter has not been created, or when count is not
// positive.
func (m *CHORUSMetrics) IncrementSHHHFindings(rule, severity string, count int) {
	// Cases are evaluated top to bottom, so m is checked before it is
	// dereferenced.
	switch {
	case m == nil:
		return
	case m.shhhFindings == nil:
		return
	case count <= 0:
		return
	}

	findings := m.shhhFindings.WithLabelValues(rule, severity)
	findings.Add(float64(count))
}
// UCXI Metrics Methods

// IncrementUCXIRequests adds one to the UCXI request counter for the given
// method and status labels.
func (m *CHORUSMetrics) IncrementUCXIRequests(method, status string) {
	requests := m.ucxiRequests.WithLabelValues(method, status)
	requests.Inc()
}
// ObserveUCXIResolutionLatency records latency, converted to seconds, in the
// UCXI resolution latency histogram.
func (m *CHORUSMetrics) ObserveUCXIResolutionLatency(latency time.Duration) {
	seconds := latency.Seconds()
	m.ucxiResolutionLatency.Observe(seconds)
}
// Resource Metrics Methods

// SetCPUUsage sets the CPU usage gauge to usage. The gauge's help text
// describes the value as a 0-1 ratio; the value is stored as given.
func (m *CHORUSMetrics) SetCPUUsage(usage float64) {
	cpu := m.cpuUsage
	cpu.Set(usage)
}
// SetMemoryUsage sets the memory usage gauge to usage. The gauge's help text
// describes the value as a byte count.
func (m *CHORUSMetrics) SetMemoryUsage(usage float64) {
	memory := m.memoryUsage
	memory.Set(usage)
}
// SetDiskUsage sets the disk usage gauge for the given mount point to usage.
// The gauge's help text describes the value as a 0-1 ratio; the value is
// stored as given.
func (m *CHORUSMetrics) SetDiskUsage(mountPoint string, usage float64) {
	disk := m.diskUsage.WithLabelValues(mountPoint)
	disk.Set(usage)
}
// SetGoroutines sets the goroutine-count gauge to count.
func (m *CHORUSMetrics) SetGoroutines(count int) {
	running := float64(count)
	m.goroutines.Set(running)
}
// Error Metrics Methods

// IncrementErrors adds one to the error counter for the given component and
// error type labels.
func (m *CHORUSMetrics) IncrementErrors(component, errorType string) {
	errCounter := m.errors.WithLabelValues(component, errorType)
	errCounter.Inc()
}
// IncrementPanics adds one to the panic counter.
func (m *CHORUSMetrics) IncrementPanics() {
	panicCounter := m.panics
	panicCounter.Inc()
}
// System Metrics Methods

// UpdateSystemInfo sets the system-info gauge to 1 for the series identified
// by the given node ID, version, Go version, cluster and environment labels.
func (m *CHORUSMetrics) UpdateSystemInfo(nodeID, version, goVersion, cluster, environment string) {
	info := m.systemInfo.WithLabelValues(nodeID, version, goVersion, cluster, environment)
	info.Set(1)
}
// UpdateUptime sets the uptime gauge to the number of seconds elapsed since
// the start time recorded on the receiver.
func (m *CHORUSMetrics) UpdateUptime() {
	elapsed := time.Since(m.startTime)
	m.uptime.Set(elapsed.Seconds())
}
// CollectMetrics starts a background goroutine that refreshes periodically
// collected metrics.
//
// On every tick of the system interval the uptime gauge is updated. The
// resource interval tick is currently a placeholder and collects nothing.
//
// A nil config, or an interval that is zero or negative, falls back to the
// corresponding value from DefaultMetricsConfig; time.NewTicker panics on a
// non-positive duration.
//
// The goroutine has no stop mechanism: its loop never returns, so it (and
// both tickers) live until the process exits.
func (m *CHORUSMetrics) CollectMetrics(config *MetricsConfig) {
	defaults := DefaultMetricsConfig()
	systemInterval := defaults.SystemMetricsInterval
	resourceInterval := defaults.ResourceMetricsInterval
	if config != nil {
		if config.SystemMetricsInterval > 0 {
			systemInterval = config.SystemMetricsInterval
		}
		if config.ResourceMetricsInterval > 0 {
			resourceInterval = config.ResourceMetricsInterval
		}
	}

	systemTicker := time.NewTicker(systemInterval)
	resourceTicker := time.NewTicker(resourceInterval)

	go func() {
		defer systemTicker.Stop()
		defer resourceTicker.Stop()
		for {
			select {
			case <-systemTicker.C:
				m.UpdateUptime()
				// Collect other system metrics
			case <-resourceTicker.C:
				// Collect resource metrics (would integrate with actual system monitoring)
				// m.collectResourceMetrics()
			}
		}
	}()
}