package metrics import ( "context" "fmt" "log" "net/http" "sync" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promauto" ) // BZZZMetrics provides comprehensive Prometheus metrics for the BZZZ system type BZZZMetrics struct { registry *prometheus.Registry httpServer *http.Server // System metrics systemInfo *prometheus.GaugeVec uptime prometheus.Gauge buildInfo *prometheus.GaugeVec // P2P metrics p2pConnectedPeers prometheus.Gauge p2pMessagesSent *prometheus.CounterVec p2pMessagesReceived *prometheus.CounterVec p2pMessageLatency *prometheus.HistogramVec p2pConnectionDuration *prometheus.HistogramVec p2pPeerScore *prometheus.GaugeVec // DHT metrics dhtPutOperations *prometheus.CounterVec dhtGetOperations *prometheus.CounterVec dhtOperationLatency *prometheus.HistogramVec dhtProviderRecords prometheus.Gauge dhtReplicationFactor *prometheus.GaugeVec dhtContentKeys prometheus.Gauge dhtCacheHits *prometheus.CounterVec dhtCacheMisses *prometheus.CounterVec // PubSub metrics pubsubTopics prometheus.Gauge pubsubSubscribers *prometheus.GaugeVec pubsubMessages *prometheus.CounterVec pubsubMessageLatency *prometheus.HistogramVec pubsubMessageSize *prometheus.HistogramVec // Election metrics electionTerm prometheus.Gauge electionState *prometheus.GaugeVec heartbeatsSent prometheus.Counter heartbeatsReceived prometheus.Counter leadershipChanges prometheus.Counter leaderUptime prometheus.Gauge electionLatency prometheus.Histogram // Health metrics healthChecksPassed *prometheus.CounterVec healthChecksFailed *prometheus.CounterVec healthCheckDuration *prometheus.HistogramVec systemHealthScore prometheus.Gauge componentHealthScore *prometheus.GaugeVec // Task metrics tasksActive prometheus.Gauge tasksQueued prometheus.Gauge tasksCompleted *prometheus.CounterVec taskDuration *prometheus.HistogramVec taskQueueWaitTime prometheus.Histogram // SLURP metrics (context generation) slurpGenerated *prometheus.CounterVec slurpGenerationTime prometheus.Histogram slurpQueueLength prometheus.Gauge slurpActiveJobs prometheus.Gauge slurpLeadershipEvents prometheus.Counter // UCXI metrics (protocol resolution) ucxiRequests *prometheus.CounterVec ucxiResolutionLatency prometheus.Histogram ucxiCacheHits prometheus.Counter ucxiCacheMisses prometheus.Counter ucxiContentSize prometheus.Histogram // Resource metrics cpuUsage prometheus.Gauge memoryUsage prometheus.Gauge diskUsage *prometheus.GaugeVec networkBytesIn prometheus.Counter networkBytesOut prometheus.Counter goroutines prometheus.Gauge // Error metrics errors *prometheus.CounterVec panics prometheus.Counter startTime time.Time mu sync.RWMutex } // MetricsConfig configures the metrics system type MetricsConfig struct { // HTTP server config ListenAddr string MetricsPath string // Histogram buckets LatencyBuckets []float64 SizeBuckets []float64 // Labels NodeID string Version string Environment string Cluster string // Collection intervals SystemMetricsInterval time.Duration ResourceMetricsInterval time.Duration } // DefaultMetricsConfig returns default metrics configuration func DefaultMetricsConfig() *MetricsConfig { return &MetricsConfig{ ListenAddr: ":9090", MetricsPath: "/metrics", LatencyBuckets: []float64{ 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, }, SizeBuckets: []float64{ 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, }, SystemMetricsInterval: 30 * time.Second, ResourceMetricsInterval: 15 * time.Second, } } // NewBZZZMetrics creates a new metrics collector func NewBZZZMetrics(config *MetricsConfig) *BZZZMetrics { if config == nil { config = DefaultMetricsConfig() } registry := prometheus.NewRegistry() metrics := &BZZZMetrics{ registry: registry, startTime: time.Now(), } // Initialize all metrics metrics.initializeMetrics(config) // Register with custom registry metrics.registerMetrics() return metrics } // initializeMetrics initializes all Prometheus metrics func (m *BZZZMetrics) initializeMetrics(config *MetricsConfig) { // System metrics m.systemInfo = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "bzzz_system_info", Help: "System information", }, []string{"node_id", "version", "go_version", "cluster", "environment"}, ) m.uptime = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_uptime_seconds", Help: "System uptime in seconds", }, ) // P2P metrics m.p2pConnectedPeers = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_p2p_connected_peers", Help: "Number of connected P2P peers", }, ) m.p2pMessagesSent = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_p2p_messages_sent_total", Help: "Total number of P2P messages sent", }, []string{"message_type", "peer_id"}, ) m.p2pMessagesReceived = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_p2p_messages_received_total", Help: "Total number of P2P messages received", }, []string{"message_type", "peer_id"}, ) m.p2pMessageLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "bzzz_p2p_message_latency_seconds", Help: "P2P message round-trip latency", Buckets: config.LatencyBuckets, }, []string{"message_type"}, ) // DHT metrics m.dhtPutOperations = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_dht_put_operations_total", Help: "Total number of DHT put operations", }, []string{"status"}, ) m.dhtGetOperations = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_dht_get_operations_total", Help: "Total number of DHT get operations", }, []string{"status"}, ) m.dhtOperationLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "bzzz_dht_operation_latency_seconds", Help: "DHT operation latency", Buckets: config.LatencyBuckets, }, []string{"operation", "status"}, ) m.dhtProviderRecords = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_dht_provider_records", Help: "Number of DHT provider records", }, ) m.dhtContentKeys = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_dht_content_keys", Help: "Number of DHT content keys", }, ) m.dhtReplicationFactor = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "bzzz_dht_replication_factor", Help: "DHT replication factor by key", }, []string{"key_hash"}, ) // PubSub metrics m.pubsubTopics = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_pubsub_topics", Help: "Number of active PubSub topics", }, ) m.pubsubMessages = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_pubsub_messages_total", Help: "Total number of PubSub messages", }, []string{"topic", "direction", "message_type"}, ) m.pubsubMessageLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "bzzz_pubsub_message_latency_seconds", Help: "PubSub message latency", Buckets: config.LatencyBuckets, }, []string{"topic"}, ) // Election metrics m.electionTerm = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_election_term", Help: "Current election term", }, ) m.electionState = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "bzzz_election_state", Help: "Current election state (1 for active state)", }, []string{"state"}, ) m.heartbeatsSent = promauto.NewCounter( prometheus.CounterOpts{ Name: "bzzz_heartbeats_sent_total", Help: "Total number of heartbeats sent", }, ) m.heartbeatsReceived = promauto.NewCounter( prometheus.CounterOpts{ Name: "bzzz_heartbeats_received_total", Help: "Total number of heartbeats received", }, ) m.leadershipChanges = promauto.NewCounter( prometheus.CounterOpts{ Name: "bzzz_leadership_changes_total", Help: "Total number of leadership changes", }, ) // Health metrics m.healthChecksPassed = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_health_checks_passed_total", Help: "Total number of health checks passed", }, []string{"check_name"}, ) m.healthChecksFailed = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_health_checks_failed_total", Help: "Total number of health checks failed", }, []string{"check_name", "reason"}, ) m.systemHealthScore = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_system_health_score", Help: "Overall system health score (0-1)", }, ) m.componentHealthScore = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "bzzz_component_health_score", Help: "Component health score (0-1)", }, []string{"component"}, ) // Task metrics m.tasksActive = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_tasks_active", Help: "Number of active tasks", }, ) m.tasksQueued = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_tasks_queued", Help: "Number of queued tasks", }, ) m.tasksCompleted = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_tasks_completed_total", Help: "Total number of completed tasks", }, []string{"status", "task_type"}, ) m.taskDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "bzzz_task_duration_seconds", Help: "Task execution duration", Buckets: config.LatencyBuckets, }, []string{"task_type", "status"}, ) // SLURP metrics m.slurpGenerated = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_slurp_contexts_generated_total", Help: "Total number of contexts generated by SLURP", }, []string{"role", "status"}, ) m.slurpGenerationTime = promauto.NewHistogram( prometheus.HistogramOpts{ Name: "bzzz_slurp_generation_time_seconds", Help: "SLURP context generation time", Buckets: []float64{0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0}, }, ) m.slurpQueueLength = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_slurp_queue_length", Help: "Length of SLURP generation queue", }, ) // UCXI metrics m.ucxiRequests = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_ucxi_requests_total", Help: "Total number of UCXI requests", }, []string{"method", "status"}, ) m.ucxiResolutionLatency = promauto.NewHistogram( prometheus.HistogramOpts{ Name: "bzzz_ucxi_resolution_latency_seconds", Help: "UCXI address resolution latency", Buckets: config.LatencyBuckets, }, ) // Resource metrics m.cpuUsage = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_cpu_usage_ratio", Help: "CPU usage ratio (0-1)", }, ) m.memoryUsage = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_memory_usage_bytes", Help: "Memory usage in bytes", }, ) m.diskUsage = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "bzzz_disk_usage_ratio", Help: "Disk usage ratio (0-1)", }, []string{"mount_point"}, ) m.goroutines = promauto.NewGauge( prometheus.GaugeOpts{ Name: "bzzz_goroutines", Help: "Number of goroutines", }, ) // Error metrics m.errors = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "bzzz_errors_total", Help: "Total number of errors", }, []string{"component", "error_type"}, ) m.panics = promauto.NewCounter( prometheus.CounterOpts{ Name: "bzzz_panics_total", Help: "Total number of panics", }, ) } // registerMetrics registers all metrics with the registry func (m *BZZZMetrics) registerMetrics() { // All metrics are auto-registered with the default registry // For custom registry, we would need to register manually } // StartServer starts the Prometheus metrics HTTP server func (m *BZZZMetrics) StartServer(config *MetricsConfig) error { mux := http.NewServeMux() // Use custom registry handler := promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{ EnableOpenMetrics: true, }) mux.Handle(config.MetricsPath, handler) // Health endpoint mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) w.Write([]byte("OK")) }) m.httpServer = &http.Server{ Addr: config.ListenAddr, Handler: mux, } go func() { log.Printf("Starting metrics server on %s%s", config.ListenAddr, config.MetricsPath) if err := m.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { log.Printf("Metrics server error: %v", err) } }() return nil } // StopServer stops the metrics HTTP server func (m *BZZZMetrics) StopServer() error { if m.httpServer != nil { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() return m.httpServer.Shutdown(ctx) } return nil } // P2P Metrics Methods func (m *BZZZMetrics) SetConnectedPeers(count int) { m.p2pConnectedPeers.Set(float64(count)) } func (m *BZZZMetrics) IncrementMessagesSent(messageType, peerID string) { m.p2pMessagesSent.WithLabelValues(messageType, peerID).Inc() } func (m *BZZZMetrics) IncrementMessagesReceived(messageType, peerID string) { m.p2pMessagesReceived.WithLabelValues(messageType, peerID).Inc() } func (m *BZZZMetrics) ObserveMessageLatency(messageType string, latency time.Duration) { m.p2pMessageLatency.WithLabelValues(messageType).Observe(latency.Seconds()) } // DHT Metrics Methods func (m *BZZZMetrics) IncrementDHTPutOperations(status string) { m.dhtPutOperations.WithLabelValues(status).Inc() } func (m *BZZZMetrics) IncrementDHTGetOperations(status string) { m.dhtGetOperations.WithLabelValues(status).Inc() } func (m *BZZZMetrics) ObserveDHTOperationLatency(operation, status string, latency time.Duration) { m.dhtOperationLatency.WithLabelValues(operation, status).Observe(latency.Seconds()) } func (m *BZZZMetrics) SetDHTProviderRecords(count int) { m.dhtProviderRecords.Set(float64(count)) } func (m *BZZZMetrics) SetDHTContentKeys(count int) { m.dhtContentKeys.Set(float64(count)) } func (m *BZZZMetrics) SetDHTReplicationFactor(keyHash string, factor float64) { m.dhtReplicationFactor.WithLabelValues(keyHash).Set(factor) } // PubSub Metrics Methods func (m *BZZZMetrics) SetPubSubTopics(count int) { m.pubsubTopics.Set(float64(count)) } func (m *BZZZMetrics) IncrementPubSubMessages(topic, direction, messageType string) { m.pubsubMessages.WithLabelValues(topic, direction, messageType).Inc() } func (m *BZZZMetrics) ObservePubSubMessageLatency(topic string, latency time.Duration) { m.pubsubMessageLatency.WithLabelValues(topic).Observe(latency.Seconds()) } // Election Metrics Methods func (m *BZZZMetrics) SetElectionTerm(term int) { m.electionTerm.Set(float64(term)) } func (m *BZZZMetrics) SetElectionState(state string) { // Reset all state gauges states := []string{"idle", "discovering", "electing", "reconstructing", "complete"} for _, s := range states { m.electionState.WithLabelValues(s).Set(0) } // Set current state m.electionState.WithLabelValues(state).Set(1) } func (m *BZZZMetrics) IncrementHeartbeatsSent() { m.heartbeatsSent.Inc() } func (m *BZZZMetrics) IncrementHeartbeatsReceived() { m.heartbeatsReceived.Inc() } func (m *BZZZMetrics) IncrementLeadershipChanges() { m.leadershipChanges.Inc() } // Health Metrics Methods func (m *BZZZMetrics) IncrementHealthCheckPassed(checkName string) { m.healthChecksPassed.WithLabelValues(checkName).Inc() } func (m *BZZZMetrics) IncrementHealthCheckFailed(checkName, reason string) { m.healthChecksFailed.WithLabelValues(checkName, reason).Inc() } func (m *BZZZMetrics) SetSystemHealthScore(score float64) { m.systemHealthScore.Set(score) } func (m *BZZZMetrics) SetComponentHealthScore(component string, score float64) { m.componentHealthScore.WithLabelValues(component).Set(score) } // Task Metrics Methods func (m *BZZZMetrics) SetActiveTasks(count int) { m.tasksActive.Set(float64(count)) } func (m *BZZZMetrics) SetQueuedTasks(count int) { m.tasksQueued.Set(float64(count)) } func (m *BZZZMetrics) IncrementTasksCompleted(status, taskType string) { m.tasksCompleted.WithLabelValues(status, taskType).Inc() } func (m *BZZZMetrics) ObserveTaskDuration(taskType, status string, duration time.Duration) { m.taskDuration.WithLabelValues(taskType, status).Observe(duration.Seconds()) } // SLURP Metrics Methods func (m *BZZZMetrics) IncrementSLURPGenerated(role, status string) { m.slurpGenerated.WithLabelValues(role, status).Inc() } func (m *BZZZMetrics) ObserveSLURPGenerationTime(duration time.Duration) { m.slurpGenerationTime.Observe(duration.Seconds()) } func (m *BZZZMetrics) SetSLURPQueueLength(length int) { m.slurpQueueLength.Set(float64(length)) } // UCXI Metrics Methods func (m *BZZZMetrics) IncrementUCXIRequests(method, status string) { m.ucxiRequests.WithLabelValues(method, status).Inc() } func (m *BZZZMetrics) ObserveUCXIResolutionLatency(latency time.Duration) { m.ucxiResolutionLatency.Observe(latency.Seconds()) } // Resource Metrics Methods func (m *BZZZMetrics) SetCPUUsage(usage float64) { m.cpuUsage.Set(usage) } func (m *BZZZMetrics) SetMemoryUsage(usage float64) { m.memoryUsage.Set(usage) } func (m *BZZZMetrics) SetDiskUsage(mountPoint string, usage float64) { m.diskUsage.WithLabelValues(mountPoint).Set(usage) } func (m *BZZZMetrics) SetGoroutines(count int) { m.goroutines.Set(float64(count)) } // Error Metrics Methods func (m *BZZZMetrics) IncrementErrors(component, errorType string) { m.errors.WithLabelValues(component, errorType).Inc() } func (m *BZZZMetrics) IncrementPanics() { m.panics.Inc() } // System Metrics Methods func (m *BZZZMetrics) UpdateSystemInfo(nodeID, version, goVersion, cluster, environment string) { m.systemInfo.WithLabelValues(nodeID, version, goVersion, cluster, environment).Set(1) } func (m *BZZZMetrics) UpdateUptime() { m.uptime.Set(time.Since(m.startTime).Seconds()) } // CollectMetrics starts background metric collection func (m *BZZZMetrics) CollectMetrics(config *MetricsConfig) { systemTicker := time.NewTicker(config.SystemMetricsInterval) resourceTicker := time.NewTicker(config.ResourceMetricsInterval) go func() { defer systemTicker.Stop() defer resourceTicker.Stop() for { select { case <-systemTicker.C: m.UpdateUptime() // Collect other system metrics case <-resourceTicker.C: // Collect resource metrics (would integrate with actual system monitoring) // m.collectResourceMetrics() } } }() }