package sdk

import (
	"expvar"
	"fmt"
	"sort"
	"sync"
	"time"
)

// metricsWindowSize is the maximum number of samples kept in each rolling
// window used for jitter and callback-latency statistics.
const metricsWindowSize = 100

// Metrics provides observability for the SDK through the expvar package.
//
// Counters are plain expvar variables and may be updated directly. The two
// expvar.Map fields hold derived statistics ("avg", "p95", "p99") that are
// recomputed from a rolling window of the most recent metricsWindowSize
// samples every time a sample is recorded.
type Metrics struct {
	// Connection metrics
	ConnectionStatus   *expvar.Int
	ReconnectCount     *expvar.Int
	ConnectionDuration *expvar.Int

	// Beat metrics
	BeatsReceived        *expvar.Int
	DownbeatsReceived    *expvar.Int
	BeatJitterMS         *expvar.Map
	BeatCallbackLatency  *expvar.Map
	BeatMisses           *expvar.Int
	LocalDegradationTime *expvar.Int

	// Status emission metrics
	StatusClaimsEmitted *expvar.Int
	StatusClaimErrors   *expvar.Int

	// Budget metrics
	BudgetsCreated   *expvar.Int
	BudgetsCompleted *expvar.Int
	BudgetsTimedOut  *expvar.Int

	// Error metrics
	TotalErrors *expvar.Int
	LastError   *expvar.String

	// Internal rolling windows. beatJitterSamples is guarded by jitterMutex;
	// callbackLatencies (one window per callback type) by latencyMutex.
	beatJitterSamples []float64
	jitterMutex       sync.Mutex
	callbackLatencies map[string][]float64
	latencyMutex      sync.Mutex
}

// metricsInt returns the *expvar.Int published under name, publishing a new
// one when the name is free. expvar.NewInt panics on a duplicate name, so an
// already published counter is reused instead. If the name is held by a
// variable of another type, an unpublished counter is returned so that
// recording keeps working.
func metricsInt(name string) *expvar.Int {
	switch v := expvar.Get(name).(type) {
	case nil:
		return expvar.NewInt(name)
	case *expvar.Int:
		return v
	default:
		return new(expvar.Int)
	}
}

// metricsMap is the *expvar.Map counterpart of metricsInt.
func metricsMap(name string) *expvar.Map {
	switch v := expvar.Get(name).(type) {
	case nil:
		return expvar.NewMap(name)
	case *expvar.Map:
		return v
	default:
		return new(expvar.Map).Init()
	}
}

// metricsString is the *expvar.String counterpart of metricsInt.
func metricsString(name string) *expvar.String {
	switch v := expvar.Get(name).(type) {
	case nil:
		return expvar.NewString(name)
	case *expvar.String:
		return v
	default:
		return new(expvar.String)
	}
}

// setMetricFloat stores value under key in vars. An existing *expvar.Float is
// updated in place; a new one receives its value before it is inserted, so a
// concurrent reader never observes a transient zero.
func setMetricFloat(vars *expvar.Map, key string, value float64) {
	if f, ok := vars.Get(key).(*expvar.Float); ok {
		f.Set(value)
		return
	}
	f := new(expvar.Float)
	f.Set(value)
	vars.Set(key, f)
}

// pushMetricSample appends sample to window and returns the result, dropping
// the oldest entries so that at most metricsWindowSize samples remain.
func pushMetricSample(window []float64, sample float64) []float64 {
	if len(window) >= metricsWindowSize {
		// Shift in place so the backing array is reused rather than regrown.
		n := copy(window, window[len(window)-metricsWindowSize+1:])
		window = window[:n]
	}
	return append(window, sample)
}

// NewMetrics creates a Metrics instance whose variables are published through
// expvar under names of the form "<prefix>.<group>.<metric>".
//
// Calling NewMetrics again with a prefix that is already published does not
// panic: the existing variables are reused, so both instances update the same
// counters. The connection status is set to 0 (disconnected) on return.
func NewMetrics(prefix string) *Metrics {
	m := &Metrics{
		ConnectionStatus:   metricsInt(prefix + ".connection.status"),
		ReconnectCount:     metricsInt(prefix + ".connection.reconnects"),
		ConnectionDuration: metricsInt(prefix + ".connection.duration_ms"),

		BeatsReceived:        metricsInt(prefix + ".beats.received"),
		DownbeatsReceived:    metricsInt(prefix + ".beats.downbeats"),
		BeatJitterMS:         metricsMap(prefix + ".beats.jitter_ms"),
		BeatCallbackLatency:  metricsMap(prefix + ".beats.callback_latency_ms"),
		BeatMisses:           metricsInt(prefix + ".beats.misses"),
		LocalDegradationTime: metricsInt(prefix + ".beats.degradation_ms"),

		StatusClaimsEmitted: metricsInt(prefix + ".status.claims_emitted"),
		StatusClaimErrors:   metricsInt(prefix + ".status.claim_errors"),

		BudgetsCreated:   metricsInt(prefix + ".budgets.created"),
		BudgetsCompleted: metricsInt(prefix + ".budgets.completed"),
		BudgetsTimedOut:  metricsInt(prefix + ".budgets.timed_out"),

		TotalErrors: metricsInt(prefix + ".errors.total"),
		LastError:   metricsString(prefix + ".errors.last"),

		beatJitterSamples: make([]float64, 0, metricsWindowSize),
		callbackLatencies: make(map[string][]float64),
	}

	// Initialize connection status to disconnected.
	m.ConnectionStatus.Set(0)

	return m
}

// RecordConnection marks the connection as established (status 1) and
// increments ReconnectCount. The counter is incremented on every call,
// including the first one.
func (m *Metrics) RecordConnection() {
	m.ConnectionStatus.Set(1)
	m.ReconnectCount.Add(1)
}

// RecordDisconnection marks the connection as lost (status 0).
func (m *Metrics) RecordDisconnection() {
	m.ConnectionStatus.Set(0)
}

// RecordBeat counts a received beat (and a downbeat when isDownbeat is true)
// and refreshes the jitter statistics in BeatJitterMS.
//
// Jitter is actualTime minus expectedTime in milliseconds, so a beat that
// arrives early contributes a negative sample. The "avg", "p95" and "p99"
// entries are computed over the most recent metricsWindowSize samples.
func (m *Metrics) RecordBeat(expectedTime, actualTime time.Time, isDownbeat bool) {
	m.BeatsReceived.Add(1)
	if isDownbeat {
		m.DownbeatsReceived.Add(1)
	}

	jitterMS := float64(actualTime.Sub(expectedTime).Nanoseconds()) / 1e6

	m.jitterMutex.Lock()
	defer m.jitterMutex.Unlock()

	m.beatJitterSamples = pushMetricSample(m.beatJitterSamples, jitterMS)

	avg, p95, p99 := m.calculatePercentiles(m.beatJitterSamples)
	setMetricFloat(m.BeatJitterMS, "avg", avg)
	setMetricFloat(m.BeatJitterMS, "p95", p95)
	setMetricFloat(m.BeatJitterMS, "p99", p99)
}

// RecordBeatMiss records a missed beat.
func (m *Metrics) RecordBeatMiss() {
	m.BeatMisses.Add(1)
}

// RecordCallbackLatency records how long a callback of the given type took
// and refreshes that type's statistics in BeatCallbackLatency under the keys
// "<callbackType>_avg", "<callbackType>_p95" and "<callbackType>_p99".
//
// Each callback type has its own rolling window of metricsWindowSize samples,
// so the statistics of one type are not influenced by samples of another.
func (m *Metrics) RecordCallbackLatency(duration time.Duration, callbackType string) {
	latencyMS := float64(duration.Nanoseconds()) / 1e6

	m.latencyMutex.Lock()
	defer m.latencyMutex.Unlock()

	// Tolerate a Metrics value whose window map was never initialized.
	if m.callbackLatencies == nil {
		m.callbackLatencies = make(map[string][]float64)
	}

	window := pushMetricSample(m.callbackLatencies[callbackType], latencyMS)
	m.callbackLatencies[callbackType] = window

	avg, p95, p99 := m.calculatePercentiles(window)
	setMetricFloat(m.BeatCallbackLatency, callbackType+"_avg", avg)
	setMetricFloat(m.BeatCallbackLatency, callbackType+"_p95", p95)
	setMetricFloat(m.BeatCallbackLatency, callbackType+"_p99", p99)
}

// RecordLocalDegradation adds the time spent in local degradation mode, in
// whole milliseconds, to LocalDegradationTime.
func (m *Metrics) RecordLocalDegradation(duration time.Duration) {
	m.LocalDegradationTime.Add(duration.Milliseconds())
}

// RecordStatusClaim records the outcome of a status claim emission:
// StatusClaimsEmitted is incremented on success, StatusClaimErrors otherwise.
func (m *Metrics) RecordStatusClaim(success bool) {
	if success {
		m.StatusClaimsEmitted.Add(1)
		return
	}
	m.StatusClaimErrors.Add(1)
}

// RecordBudgetCreated records the creation of a budget.
func (m *Metrics) RecordBudgetCreated() {
	m.BudgetsCreated.Add(1)
}

// RecordBudgetCompleted records the end of a budget: BudgetsTimedOut is
// incremented when timedOut is true, BudgetsCompleted otherwise.
func (m *Metrics) RecordBudgetCompleted(timedOut bool) {
	if timedOut {
		m.BudgetsTimedOut.Add(1)
		return
	}
	m.BudgetsCompleted.Add(1)
}

// RecordError increments TotalErrors and stores err as LastError.
func (m *Metrics) RecordError(err string) {
	m.TotalErrors.Add(1)
	m.LastError.Set(err)
}

// calculatePercentiles returns the mean, 95th and 99th percentile of samples.
// It returns zeros for an empty slice and does not modify samples.
//
// A percentile q is read from the sorted copy at index int(len*q), clamped to
// the last element.
func (m *Metrics) calculatePercentiles(samples []float64) (avg, p95, p99 float64) {
	if len(samples) == 0 {
		return 0, 0, 0
	}

	sum := 0.0
	for _, s := range samples {
		sum += s
	}
	avg = sum / float64(len(samples))

	// Sort a copy so the caller's window keeps its arrival order.
	sorted := make([]float64, len(samples))
	copy(sorted, samples)
	sort.Float64s(sorted)

	percentile := func(q float64) float64 {
		idx := int(float64(len(sorted)) * q)
		if idx >= len(sorted) {
			idx = len(sorted) - 1
		}
		return sorted[idx]
	}

	return avg, percentile(0.95), percentile(0.99)
}

// initMetrics creates the client's Metrics, published under the prefix
// "backbeat.sdk.<AgentID>" taken from the client configuration.
func (c *client) initMetrics() {
	prefix := fmt.Sprintf("backbeat.sdk.%s", c.config.AgentID)
	c.metrics = NewMetrics(prefix)
}

// clientWithMetrics sketches the metrics field that the client struct needs
// (the real field would go in client.go).
type clientWithMetrics struct {
	*client
	metrics *Metrics
}

// PrometheusMetrics is a placeholder for a future Prometheus integration.
type PrometheusMetrics struct {
	// This would integrate with prometheus/client_golang.
	// For now, we just use expvar, which can be scraped.
}

// GetMetricsSnapshot returns the current values of the counter metrics and
// the last error string, keyed by snake_case metric name. The jitter and
// latency maps, the connection duration and the degradation time are not
// included.
func (m *Metrics) GetMetricsSnapshot() map[string]interface{} {
	snapshot := make(map[string]interface{})

	snapshot["connection_status"] = m.ConnectionStatus.Value()
	snapshot["reconnect_count"] = m.ReconnectCount.Value()
	snapshot["beats_received"] = m.BeatsReceived.Value()
	snapshot["downbeats_received"] = m.DownbeatsReceived.Value()
	snapshot["beat_misses"] = m.BeatMisses.Value()
	snapshot["status_claims_emitted"] = m.StatusClaimsEmitted.Value()
	snapshot["status_claim_errors"] = m.StatusClaimErrors.Value()
	snapshot["budgets_created"] = m.BudgetsCreated.Value()
	snapshot["budgets_completed"] = m.BudgetsCompleted.Value()
	snapshot["budgets_timed_out"] = m.BudgetsTimedOut.Value()
	snapshot["total_errors"] = m.TotalErrors.Value()
	snapshot["last_error"] = m.LastError.Value()

	return snapshot
}

// GetHealthWithMetrics returns the client's health under the "status" key
// and, when metrics have been initialized, a metrics snapshot under the
// "metrics" key.
func (c *client) GetHealthWithMetrics() map[string]interface{} {
	health := map[string]interface{}{
		"status": c.Health(),
	}

	if c.metrics != nil {
		health["metrics"] = c.metrics.GetMetricsSnapshot()
	}

	return health
}