Implement initial scan logic and council formation for WHOOSH project kickoffs
- Replace incremental sync with full scan for new repositories - Add initial_scan status to bypass Since parameter filtering - Implement council formation detection for Design Brief issues - Add version display to WHOOSH UI header for debugging - Fix Docker token authentication with trailing newline removal - Add comprehensive council orchestration with Docker Swarm integration - Include BACKBEAT prototype integration for distributed timing - Support council-specific agent roles and deployment strategies - Transition repositories to active status after content discovery Key architectural improvements: - Full scan approach for new project detection vs incremental sync - Council formation triggered by chorus-entrypoint labeled Design Briefs - Proper token handling and authentication for Gitea API calls - Support for both initial discovery and ongoing task monitoring This enables autonomous project kickoff workflows where Design Brief issues automatically trigger formation of specialized agent councils for new projects. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
357
BACKBEAT-prototype/internal/backbeat/admin.go
Normal file
357
BACKBEAT-prototype/internal/backbeat/admin.go
Normal file
@@ -0,0 +1,357 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/rs/zerolog"
|
||||
)
|
||||
|
||||
// AdminServer provides HTTP endpoints for BACKBEAT pulse administration.
// Includes tempo control, drift monitoring, and leader status as specified.
type AdminServer struct {
	router      *mux.Router         // HTTP route multiplexer for all admin endpoints
	pulseState  *PulseState         // shared pulse state (tempo, beat index, bar position)
	metrics     *Metrics            // metrics recorder for tempo-change errors etc.
	elector     *LeaderElector      // leader election; only the leader may change tempo
	hlc         *HLC                // hybrid logical clock, used for drift reporting
	logger      zerolog.Logger      // component-scoped structured logger
	degradation *DegradationManager // degrade-local state and timer-drift tracking
}

// AdminConfig configures the admin server. All fields are collaborators
// injected by the caller; none are defaulted here.
type AdminConfig struct {
	PulseState  *PulseState
	Metrics     *Metrics
	Elector     *LeaderElector
	HLC         *HLC
	Logger      zerolog.Logger
	Degradation *DegradationManager
}
|
||||
|
||||
// TempoResponse represents the response for tempo endpoints.
type TempoResponse struct {
	CurrentBPM int    `json:"current_bpm"`           // tempo currently in effect
	PendingBPM int    `json:"pending_bpm"`           // tempo staged for the next downbeat
	CanChange  bool   `json:"can_change"`            // true only on the leader node
	NextChange string `json:"next_change,omitempty"` // RFC3339 time of next downbeat, when a change is pending
	Reason     string `json:"reason,omitempty"`      // human-readable note (e.g. "not leader")
}

// DriftResponse represents the response for drift monitoring.
type DriftResponse struct {
	TimerDriftPercent float64 `json:"timer_drift_percent"` // local timer drift as a percentage
	HLCDriftSeconds   float64 `json:"hlc_drift_seconds"`   // seconds since last HLC sync with leader
	LastSyncTime      string  `json:"last_sync_time"`      // RFC3339; only set when HLC drift > 0
	DegradationMode   bool    `json:"degradation_mode"`    // true while running on locally derived tempo
	WithinLimits      bool    `json:"within_limits"`       // BACKBEAT-PER-003: timer drift ≤ 1%
}

// LeaderResponse represents the response for leader status.
type LeaderResponse struct {
	NodeID      string                 `json:"node_id"`      // this node's identifier
	IsLeader    bool                   `json:"is_leader"`    // whether this node is the pulse leader
	Leader      string                 `json:"leader"`       // current leader's address/ID ("" if none)
	ClusterSize int                    `json:"cluster_size"` // peers + self, best-effort from raft stats
	Stats       map[string]interface{} `json:"stats"`        // raw raft statistics
}

// HealthResponse represents the health check response.
type HealthResponse struct {
	Status      string    `json:"status"` // "ok" or "degraded"
	Timestamp   time.Time `json:"timestamp"`
	Version     string    `json:"version"`
	NodeID      string    `json:"node_id"`
	IsLeader    bool      `json:"is_leader"`
	BeatIndex   int64     `json:"beat_index"`
	TempoBPM    int       `json:"tempo_bpm"`
	Degradation bool      `json:"degradation_mode"`
}
|
||||
|
||||
// NewAdminServer creates a new admin API server
|
||||
func NewAdminServer(config AdminConfig) *AdminServer {
|
||||
server := &AdminServer{
|
||||
router: mux.NewRouter(),
|
||||
pulseState: config.PulseState,
|
||||
metrics: config.Metrics,
|
||||
elector: config.Elector,
|
||||
hlc: config.HLC,
|
||||
logger: config.Logger.With().Str("component", "admin-api").Logger(),
|
||||
degradation: config.Degradation,
|
||||
}
|
||||
|
||||
server.setupRoutes()
|
||||
return server
|
||||
}
|
||||
|
||||
// setupRoutes configures all admin API routes
|
||||
func (s *AdminServer) setupRoutes() {
|
||||
// Tempo control endpoints
|
||||
s.router.HandleFunc("/tempo", s.getTempo).Methods("GET")
|
||||
s.router.HandleFunc("/tempo", s.setTempo).Methods("POST")
|
||||
|
||||
// Drift monitoring endpoint
|
||||
s.router.HandleFunc("/drift", s.getDrift).Methods("GET")
|
||||
|
||||
// Leader status endpoint
|
||||
s.router.HandleFunc("/leader", s.getLeader).Methods("GET")
|
||||
|
||||
// Health check endpoints
|
||||
s.router.HandleFunc("/health", s.getHealth).Methods("GET")
|
||||
s.router.HandleFunc("/ready", s.getReady).Methods("GET")
|
||||
s.router.HandleFunc("/live", s.getLive).Methods("GET")
|
||||
|
||||
// Metrics endpoint
|
||||
s.router.Handle("/metrics", promhttp.Handler())
|
||||
|
||||
// Debug endpoints
|
||||
s.router.HandleFunc("/status", s.getStatus).Methods("GET")
|
||||
s.router.HandleFunc("/debug/state", s.getDebugState).Methods("GET")
|
||||
}
|
||||
|
||||
// getTempo handles GET /tempo requests
|
||||
func (s *AdminServer) getTempo(w http.ResponseWriter, r *http.Request) {
|
||||
s.logger.Debug().Msg("GET /tempo request")
|
||||
|
||||
response := TempoResponse{
|
||||
CurrentBPM: s.pulseState.TempoBPM,
|
||||
PendingBPM: s.pulseState.PendingBPM,
|
||||
CanChange: s.elector.IsLeader(),
|
||||
}
|
||||
|
||||
// Check if tempo change is pending
|
||||
if s.pulseState.PendingBPM != s.pulseState.TempoBPM {
|
||||
// Calculate next downbeat time
|
||||
beatsToDownbeat := int64(s.pulseState.BarLength) - ((s.pulseState.BeatIndex - 1) % int64(s.pulseState.BarLength))
|
||||
beatDuration := time.Duration(60000/s.pulseState.TempoBPM) * time.Millisecond
|
||||
nextDownbeat := time.Now().Add(time.Duration(beatsToDownbeat) * beatDuration)
|
||||
response.NextChange = nextDownbeat.Format(time.RFC3339)
|
||||
}
|
||||
|
||||
if !response.CanChange {
|
||||
response.Reason = "not leader"
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// setTempo handles POST /tempo requests with BACKBEAT-REQ-004 validation
|
||||
func (s *AdminServer) setTempo(w http.ResponseWriter, r *http.Request) {
|
||||
s.logger.Debug().Msg("POST /tempo request")
|
||||
|
||||
// Only leader can change tempo
|
||||
if !s.elector.IsLeader() {
|
||||
s.respondError(w, http.StatusForbidden, "only leader can change tempo")
|
||||
s.metrics.RecordTempoChangeError()
|
||||
return
|
||||
}
|
||||
|
||||
var req TempoChangeRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, "invalid JSON: "+err.Error())
|
||||
s.metrics.RecordTempoChangeError()
|
||||
return
|
||||
}
|
||||
|
||||
// Validate tempo change per BACKBEAT-REQ-004
|
||||
if err := ValidateTempoChange(s.pulseState.TempoBPM, req.TempoBPM); err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||
s.metrics.RecordTempoChangeError()
|
||||
return
|
||||
}
|
||||
|
||||
// Set pending tempo - will be applied on next downbeat
|
||||
s.pulseState.PendingBPM = req.TempoBPM
|
||||
|
||||
s.logger.Info().
|
||||
Int("current_bpm", s.pulseState.TempoBPM).
|
||||
Int("pending_bpm", req.TempoBPM).
|
||||
Str("justification", req.Justification).
|
||||
Msg("tempo change scheduled")
|
||||
|
||||
response := TempoResponse{
|
||||
CurrentBPM: s.pulseState.TempoBPM,
|
||||
PendingBPM: req.TempoBPM,
|
||||
CanChange: true,
|
||||
Reason: "scheduled for next downbeat",
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// getDrift handles GET /drift requests for BACKBEAT-PER-003 monitoring
|
||||
func (s *AdminServer) getDrift(w http.ResponseWriter, r *http.Request) {
|
||||
s.logger.Debug().Msg("GET /drift request")
|
||||
|
||||
hlcDrift := s.hlc.GetDrift()
|
||||
timerDrift := s.degradation.GetTimerDrift()
|
||||
|
||||
response := DriftResponse{
|
||||
TimerDriftPercent: timerDrift * 100, // Convert to percentage
|
||||
HLCDriftSeconds: hlcDrift.Seconds(),
|
||||
DegradationMode: s.degradation.IsInDegradationMode(),
|
||||
WithinLimits: timerDrift <= 0.01, // BACKBEAT-PER-003: ≤ 1%
|
||||
}
|
||||
|
||||
// Add last sync time if available
|
||||
if hlcDrift > 0 {
|
||||
response.LastSyncTime = time.Now().Add(-hlcDrift).Format(time.RFC3339)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// getLeader handles GET /leader requests
|
||||
func (s *AdminServer) getLeader(w http.ResponseWriter, r *http.Request) {
|
||||
s.logger.Debug().Msg("GET /leader request")
|
||||
|
||||
stats := s.elector.GetStats()
|
||||
clusterSize := 1 // Default to 1 if no stats available
|
||||
if size, ok := stats["num_peers"]; ok {
|
||||
if sizeStr, ok := size.(string); ok {
|
||||
if parsed, err := strconv.Atoi(sizeStr); err == nil {
|
||||
clusterSize = parsed + 1 // Add 1 for this node
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response := LeaderResponse{
|
||||
NodeID: s.pulseState.NodeID,
|
||||
IsLeader: s.elector.IsLeader(),
|
||||
Leader: s.elector.GetLeader(),
|
||||
ClusterSize: clusterSize,
|
||||
Stats: stats,
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// getHealth handles GET /health requests
|
||||
func (s *AdminServer) getHealth(w http.ResponseWriter, r *http.Request) {
|
||||
response := HealthResponse{
|
||||
Status: "ok",
|
||||
Timestamp: time.Now(),
|
||||
Version: "2.0.0",
|
||||
NodeID: s.pulseState.NodeID,
|
||||
IsLeader: s.elector.IsLeader(),
|
||||
BeatIndex: s.pulseState.BeatIndex,
|
||||
TempoBPM: s.pulseState.TempoBPM,
|
||||
Degradation: s.degradation.IsInDegradationMode(),
|
||||
}
|
||||
|
||||
// Check if degradation mode indicates unhealthy state
|
||||
if s.degradation.IsInDegradationMode() {
|
||||
drift := s.degradation.GetTimerDrift()
|
||||
if drift > 0.05 { // 5% drift indicates serious issues
|
||||
response.Status = "degraded"
|
||||
}
|
||||
}
|
||||
|
||||
statusCode := http.StatusOK
|
||||
if response.Status != "ok" {
|
||||
statusCode = http.StatusServiceUnavailable
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(statusCode)
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// getReady handles GET /ready requests for k8s readiness probes
|
||||
func (s *AdminServer) getReady(w http.ResponseWriter, r *http.Request) {
|
||||
// Ready if we have a leader (this node or another)
|
||||
if leader := s.elector.GetLeader(); leader != "" {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("ready"))
|
||||
} else {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
w.Write([]byte("no leader"))
|
||||
}
|
||||
}
|
||||
|
||||
// getLive handles GET /live requests for k8s liveness probes
|
||||
func (s *AdminServer) getLive(w http.ResponseWriter, r *http.Request) {
|
||||
// Always live unless we're in severe degradation
|
||||
drift := s.degradation.GetTimerDrift()
|
||||
if drift > 0.10 { // 10% drift indicates critical issues
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
w.Write([]byte("severe drift"))
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("alive"))
|
||||
}
|
||||
|
||||
// getStatus handles GET /status requests for comprehensive status
|
||||
func (s *AdminServer) getStatus(w http.ResponseWriter, r *http.Request) {
|
||||
status := map[string]interface{}{
|
||||
"timestamp": time.Now(),
|
||||
"node_id": s.pulseState.NodeID,
|
||||
"cluster_id": s.pulseState.ClusterID,
|
||||
"is_leader": s.elector.IsLeader(),
|
||||
"leader": s.elector.GetLeader(),
|
||||
"beat_index": s.pulseState.BeatIndex,
|
||||
"tempo_bpm": s.pulseState.TempoBPM,
|
||||
"pending_bpm": s.pulseState.PendingBPM,
|
||||
"bar_length": s.pulseState.BarLength,
|
||||
"phases": s.pulseState.Phases,
|
||||
"degradation": s.degradation.IsInDegradationMode(),
|
||||
"uptime": time.Since(s.pulseState.StartTime),
|
||||
"raft_stats": s.elector.GetStats(),
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
// getDebugState handles GET /debug/state requests
|
||||
func (s *AdminServer) getDebugState(w http.ResponseWriter, r *http.Request) {
|
||||
debugState := map[string]interface{}{
|
||||
"pulse_state": s.pulseState,
|
||||
"hlc_drift": s.hlc.GetDrift(),
|
||||
"timer_drift": s.degradation.GetTimerDrift(),
|
||||
"leader_stats": s.elector.GetStats(),
|
||||
"degradation": s.degradation.GetState(),
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(debugState)
|
||||
}
|
||||
|
||||
// respondError sends a JSON error response
|
||||
func (s *AdminServer) respondError(w http.ResponseWriter, statusCode int, message string) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(statusCode)
|
||||
|
||||
errorResp := map[string]string{
|
||||
"error": message,
|
||||
"timestamp": time.Now().Format(time.RFC3339),
|
||||
}
|
||||
|
||||
json.NewEncoder(w).Encode(errorResp)
|
||||
}
|
||||
|
||||
// ServeHTTP implements http.Handler interface
|
||||
func (s *AdminServer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
// Add common headers
|
||||
w.Header().Set("X-BACKBEAT-Node-ID", s.pulseState.NodeID)
|
||||
w.Header().Set("X-BACKBEAT-Version", "2.0.0")
|
||||
|
||||
// Log request
|
||||
s.logger.Debug().
|
||||
Str("method", r.Method).
|
||||
Str("path", r.URL.Path).
|
||||
Str("remote_addr", r.RemoteAddr).
|
||||
Msg("admin API request")
|
||||
|
||||
s.router.ServeHTTP(w, r)
|
||||
}
|
||||
330
BACKBEAT-prototype/internal/backbeat/degradation.go
Normal file
330
BACKBEAT-prototype/internal/backbeat/degradation.go
Normal file
@@ -0,0 +1,330 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
)
|
||||
|
||||
// DegradationManager implements BACKBEAT-REQ-003 (Degrade Local).
// Manages local tempo derivation when leader is lost and reconciliation
// once leadership is restored.
type DegradationManager struct {
	mu     sync.RWMutex
	logger zerolog.Logger

	// State tracking
	inDegradationMode bool      // true while running on a locally derived tempo
	leaderLostAt      time.Time // when degradation mode started
	lastLeaderSync    time.Time // last successful sync with the leader
	localTempo        int       // tempo currently derived locally (BPM)
	originalTempo     int       // tempo at the moment the leader was lost (BPM)

	// Timing state for BACKBEAT-PER-003 compliance
	referenceTime     time.Time     // wall-clock anchor for drift calculation
	referenceBeat     int64         // beat index at the reference time
	expectedBeatTime  time.Time     // when the most recent beat should have fired
	actualBeatTime    time.Time     // when the most recent beat actually fired
	driftAccumulation time.Duration // sum of |actual - expected| per beat while degraded

	// Configuration
	maxDriftPercent   float64       // BACKBEAT-PER-003: 1% max drift (ratio, e.g. 0.01)
	syncTimeout       time.Duration
	degradationWindow time.Duration

	// Metrics
	metrics *Metrics // optional; nil-checked before every use
}

// DegradationConfig configures the degradation manager. Zero-valued fields
// are replaced with the documented defaults by NewDegradationManager.
type DegradationConfig struct {
	Logger            zerolog.Logger
	Metrics           *Metrics
	MaxDriftPercent   float64       // Default: 0.01 (1%)
	SyncTimeout       time.Duration // Default: 30s
	DegradationWindow time.Duration // Default: 5m
}
|
||||
|
||||
// NewDegradationManager creates a new degradation manager
|
||||
func NewDegradationManager(config DegradationConfig) *DegradationManager {
|
||||
// Set defaults
|
||||
if config.MaxDriftPercent == 0 {
|
||||
config.MaxDriftPercent = 0.01 // 1% as per BACKBEAT-PER-003
|
||||
}
|
||||
if config.SyncTimeout == 0 {
|
||||
config.SyncTimeout = 30 * time.Second
|
||||
}
|
||||
if config.DegradationWindow == 0 {
|
||||
config.DegradationWindow = 5 * time.Minute
|
||||
}
|
||||
|
||||
return &DegradationManager{
|
||||
logger: config.Logger.With().Str("component", "degradation").Logger(),
|
||||
metrics: config.Metrics,
|
||||
maxDriftPercent: config.MaxDriftPercent,
|
||||
syncTimeout: config.SyncTimeout,
|
||||
degradationWindow: config.DegradationWindow,
|
||||
referenceTime: time.Now(),
|
||||
lastLeaderSync: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// OnLeaderLost is called when leadership is lost, initiating degradation mode
|
||||
func (d *DegradationManager) OnLeaderLost(currentTempo int, beatIndex int64) {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
d.inDegradationMode = true
|
||||
d.leaderLostAt = now
|
||||
d.localTempo = currentTempo
|
||||
d.originalTempo = currentTempo
|
||||
d.referenceTime = now
|
||||
d.referenceBeat = beatIndex
|
||||
d.driftAccumulation = 0
|
||||
|
||||
d.logger.Warn().
|
||||
Int("tempo_bpm", currentTempo).
|
||||
Int64("beat_index", beatIndex).
|
||||
Msg("entered degradation mode - deriving local tempo")
|
||||
|
||||
if d.metrics != nil {
|
||||
d.metrics.UpdateDegradationMode(true)
|
||||
}
|
||||
}
|
||||
|
||||
// OnLeaderRecovered is called when leadership is restored
|
||||
func (d *DegradationManager) OnLeaderRecovered(leaderTempo int, leaderBeatIndex int64, hlc string) error {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
||||
if !d.inDegradationMode {
|
||||
return nil // Already recovered
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
degradationDuration := now.Sub(d.leaderLostAt)
|
||||
|
||||
d.logger.Info().
|
||||
Dur("degradation_duration", degradationDuration).
|
||||
Int("local_tempo", d.localTempo).
|
||||
Int("leader_tempo", leaderTempo).
|
||||
Int64("local_beat", d.referenceBeat).
|
||||
Int64("leader_beat", leaderBeatIndex).
|
||||
Str("leader_hlc", hlc).
|
||||
Msg("reconciling with leader after degradation")
|
||||
|
||||
// Calculate drift during degradation period
|
||||
drift := d.calculateDrift(now)
|
||||
|
||||
// Reset degradation state
|
||||
d.inDegradationMode = false
|
||||
d.lastLeaderSync = now
|
||||
d.referenceTime = now
|
||||
d.referenceBeat = leaderBeatIndex
|
||||
d.driftAccumulation = 0
|
||||
|
||||
d.logger.Info().
|
||||
Float64("drift_percent", drift*100).
|
||||
Msg("recovered from degradation mode")
|
||||
|
||||
if d.metrics != nil {
|
||||
d.metrics.UpdateDegradationMode(false)
|
||||
d.metrics.UpdateDriftMetrics(drift, 0) // Reset HLC drift
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateBeatTiming updates timing information for drift calculation
|
||||
func (d *DegradationManager) UpdateBeatTiming(expectedTime, actualTime time.Time, beatIndex int64) {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
||||
d.expectedBeatTime = expectedTime
|
||||
d.actualBeatTime = actualTime
|
||||
|
||||
// Accumulate drift if in degradation mode
|
||||
if d.inDegradationMode {
|
||||
beatDrift := actualTime.Sub(expectedTime)
|
||||
d.driftAccumulation += beatDrift.Abs()
|
||||
|
||||
// Update metrics
|
||||
if d.metrics != nil {
|
||||
drift := d.calculateDrift(actualTime)
|
||||
d.metrics.UpdateDriftMetrics(drift, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetTimerDrift returns the current timer drift ratio for BACKBEAT-PER-003
|
||||
func (d *DegradationManager) GetTimerDrift() float64 {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
|
||||
if !d.inDegradationMode {
|
||||
return 0.0 // No drift when synchronized with leader
|
||||
}
|
||||
|
||||
return d.calculateDrift(time.Now())
|
||||
}
|
||||
|
||||
// calculateDrift calculates the current drift ratio (internal method, must be called with lock)
|
||||
func (d *DegradationManager) calculateDrift(now time.Time) float64 {
|
||||
if d.referenceTime.IsZero() {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
elapsed := now.Sub(d.referenceTime)
|
||||
if elapsed <= 0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Calculate expected vs actual timing
|
||||
expectedDuration := elapsed
|
||||
actualDuration := elapsed + d.driftAccumulation
|
||||
|
||||
if expectedDuration <= 0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
drift := float64(actualDuration-expectedDuration) / float64(expectedDuration)
|
||||
return math.Abs(drift)
|
||||
}
|
||||
|
||||
// IsInDegradationMode returns true if currently in degradation mode
|
||||
func (d *DegradationManager) IsInDegradationMode() bool {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
return d.inDegradationMode
|
||||
}
|
||||
|
||||
// GetDegradationDuration returns how long we've been in degradation mode
|
||||
func (d *DegradationManager) GetDegradationDuration() time.Duration {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
|
||||
if !d.inDegradationMode {
|
||||
return 0
|
||||
}
|
||||
|
||||
return time.Since(d.leaderLostAt)
|
||||
}
|
||||
|
||||
// IsWithinDriftLimits checks if current drift is within BACKBEAT-PER-003 limits
|
||||
func (d *DegradationManager) IsWithinDriftLimits() bool {
|
||||
drift := d.GetTimerDrift()
|
||||
return drift <= d.maxDriftPercent
|
||||
}
|
||||
|
||||
// GetLocalTempo returns the current local tempo when in degradation mode
|
||||
func (d *DegradationManager) GetLocalTempo() int {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
|
||||
if !d.inDegradationMode {
|
||||
return 0 // Not applicable when not in degradation
|
||||
}
|
||||
|
||||
return d.localTempo
|
||||
}
|
||||
|
||||
// AdjustLocalTempo allows fine-tuning local tempo to minimize drift
|
||||
func (d *DegradationManager) AdjustLocalTempo(newTempo int) error {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
||||
if !d.inDegradationMode {
|
||||
return fmt.Errorf("cannot adjust local tempo when not in degradation mode")
|
||||
}
|
||||
|
||||
// Validate tempo adjustment (max 5% change from original)
|
||||
maxChange := float64(d.originalTempo) * 0.05
|
||||
change := math.Abs(float64(newTempo - d.originalTempo))
|
||||
|
||||
if change > maxChange {
|
||||
return fmt.Errorf("tempo adjustment too large: %.1f BPM (max %.1f BPM)",
|
||||
change, maxChange)
|
||||
}
|
||||
|
||||
oldTempo := d.localTempo
|
||||
d.localTempo = newTempo
|
||||
|
||||
d.logger.Info().
|
||||
Int("old_tempo", oldTempo).
|
||||
Int("new_tempo", newTempo).
|
||||
Float64("drift_percent", d.calculateDrift(time.Now())*100).
|
||||
Msg("adjusted local tempo to minimize drift")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetState returns the current degradation manager state for debugging
|
||||
func (d *DegradationManager) GetState() map[string]interface{} {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
|
||||
state := map[string]interface{}{
|
||||
"in_degradation_mode": d.inDegradationMode,
|
||||
"local_tempo": d.localTempo,
|
||||
"original_tempo": d.originalTempo,
|
||||
"drift_percent": d.calculateDrift(time.Now()) * 100,
|
||||
"within_limits": d.IsWithinDriftLimits(),
|
||||
"max_drift_percent": d.maxDriftPercent * 100,
|
||||
"reference_time": d.referenceTime,
|
||||
"reference_beat": d.referenceBeat,
|
||||
"drift_accumulation_ms": d.driftAccumulation.Milliseconds(),
|
||||
}
|
||||
|
||||
if d.inDegradationMode {
|
||||
state["degradation_duration"] = time.Since(d.leaderLostAt)
|
||||
state["leader_lost_at"] = d.leaderLostAt
|
||||
}
|
||||
|
||||
return state
|
||||
}
|
||||
|
||||
// MonitorDrift runs a background goroutine to monitor drift and alert on violations
|
||||
func (d *DegradationManager) MonitorDrift(ctx context.Context) {
|
||||
ticker := time.NewTicker(10 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
d.checkDriftLimits()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkDriftLimits monitors drift and logs warnings when limits are exceeded
|
||||
func (d *DegradationManager) checkDriftLimits() {
|
||||
d.mu.RLock()
|
||||
inDegradation := d.inDegradationMode
|
||||
drift := d.calculateDrift(time.Now())
|
||||
d.mu.RUnlock()
|
||||
|
||||
if !inDegradation {
|
||||
return // No drift monitoring when synchronized
|
||||
}
|
||||
|
||||
driftPercent := drift * 100
|
||||
|
||||
if drift > d.maxDriftPercent {
|
||||
d.logger.Warn().
|
||||
Float64("drift_percent", driftPercent).
|
||||
Float64("limit_percent", d.maxDriftPercent*100).
|
||||
Msg("BACKBEAT-PER-003 violation: timer drift exceeds 1% limit")
|
||||
} else if drift > d.maxDriftPercent*0.8 {
|
||||
// Warning at 80% of limit
|
||||
d.logger.Warn().
|
||||
Float64("drift_percent", driftPercent).
|
||||
Float64("limit_percent", d.maxDriftPercent*100).
|
||||
Msg("approaching drift limit")
|
||||
}
|
||||
}
|
||||
165
BACKBEAT-prototype/internal/backbeat/hlc.go
Normal file
165
BACKBEAT-prototype/internal/backbeat/hlc.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HLC implements Hybrid Logical Clock for BACKBEAT-REQ-003 (degrade local)
|
||||
// Provides ordering guarantees for distributed events and supports reconciliation
|
||||
type HLC struct {
|
||||
mu sync.RWMutex
|
||||
pt time.Time // physical time
|
||||
lc int64 // logical counter
|
||||
nodeID string // node identifier for uniqueness
|
||||
lastSync time.Time // last successful sync with leader
|
||||
}
|
||||
|
||||
// NewHLC creates a new Hybrid Logical Clock instance
|
||||
func NewHLC(nodeID string) *HLC {
|
||||
return &HLC{
|
||||
pt: time.Now().UTC(),
|
||||
lc: 0,
|
||||
nodeID: nodeID,
|
||||
lastSync: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
|
||||
// Next generates the next HLC timestamp
|
||||
// Format: unix_ms_hex:logical_counter_hex:node_id_suffix
|
||||
// Example: "7ffd:0001:abcd"
|
||||
func (h *HLC) Next() string {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
|
||||
// BACKBEAT-REQ-003: Support for local time derivation
|
||||
if now.After(h.pt) || now.Equal(h.pt) {
|
||||
h.pt = now
|
||||
if now.After(h.pt) {
|
||||
h.lc = 0
|
||||
} else {
|
||||
h.lc++
|
||||
}
|
||||
} else {
|
||||
h.lc++
|
||||
}
|
||||
|
||||
// Format as compact hex representation
|
||||
ptMs := h.pt.UnixMilli()
|
||||
nodeHash := h.nodeID
|
||||
if len(nodeHash) > 4 {
|
||||
nodeHash = nodeHash[:4]
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%04x:%04x:%s", ptMs&0xFFFF, h.lc&0xFFFF, nodeHash)
|
||||
}
|
||||
|
||||
// Update synchronizes with an external HLC timestamp
|
||||
// Used for BACKBEAT-REQ-003 reconciliation with leader
|
||||
func (h *HLC) Update(remoteHLC string) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
parts := strings.Split(remoteHLC, ":")
|
||||
if len(parts) != 3 {
|
||||
return fmt.Errorf("invalid HLC format: %s", remoteHLC)
|
||||
}
|
||||
|
||||
remotePt, err := strconv.ParseInt(parts[0], 16, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid physical time in HLC: %v", err)
|
||||
}
|
||||
|
||||
remoteLc, err := strconv.ParseInt(parts[1], 16, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid logical counter in HLC: %v", err)
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
remoteTime := time.UnixMilli(remotePt)
|
||||
|
||||
// Update physical time to max(local_time, remote_time, current_time)
|
||||
maxTime := now
|
||||
if remoteTime.After(maxTime) {
|
||||
maxTime = remoteTime
|
||||
}
|
||||
if h.pt.After(maxTime) {
|
||||
maxTime = h.pt
|
||||
}
|
||||
|
||||
// Update logical counter based on HLC algorithm
|
||||
if maxTime.Equal(h.pt) && maxTime.Equal(remoteTime) {
|
||||
h.lc = max(h.lc, remoteLc) + 1
|
||||
} else if maxTime.Equal(h.pt) {
|
||||
h.lc++
|
||||
} else if maxTime.Equal(remoteTime) {
|
||||
h.lc = remoteLc + 1
|
||||
} else {
|
||||
h.lc = 0
|
||||
}
|
||||
|
||||
h.pt = maxTime
|
||||
h.lastSync = now
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetDrift returns the time since last successful sync with leader
|
||||
// Used for BACKBEAT-PER-003 (SDK timer drift ≤ 1% over 1 hour)
|
||||
func (h *HLC) GetDrift() time.Duration {
|
||||
h.mu.RLock()
|
||||
defer h.mu.RUnlock()
|
||||
return time.Since(h.lastSync)
|
||||
}
|
||||
|
||||
// Compare compares two HLC timestamps
|
||||
// Returns -1 if a < b, 0 if a == b, 1 if a > b
|
||||
func (h *HLC) Compare(a, b string) int {
|
||||
partsA := strings.Split(a, ":")
|
||||
partsB := strings.Split(b, ":")
|
||||
|
||||
if len(partsA) != 3 || len(partsB) != 3 {
|
||||
return 0 // Invalid format, consider equal
|
||||
}
|
||||
|
||||
ptA, _ := strconv.ParseInt(partsA[0], 16, 64)
|
||||
ptB, _ := strconv.ParseInt(partsB[0], 16, 64)
|
||||
|
||||
if ptA != ptB {
|
||||
if ptA < ptB {
|
||||
return -1
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
lcA, _ := strconv.ParseInt(partsA[1], 16, 64)
|
||||
lcB, _ := strconv.ParseInt(partsB[1], 16, 64)
|
||||
|
||||
if lcA != lcB {
|
||||
if lcA < lcB {
|
||||
return -1
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
// If physical time and logical counter are equal, compare node IDs
|
||||
if partsA[2] != partsB[2] {
|
||||
if partsA[2] < partsB[2] {
|
||||
return -1
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func max(a, b int64) int64 {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
336
BACKBEAT-prototype/internal/backbeat/leader.go
Normal file
336
BACKBEAT-prototype/internal/backbeat/leader.go
Normal file
@@ -0,0 +1,336 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/raft"
|
||||
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
|
||||
"github.com/rs/zerolog"
|
||||
)
|
||||
|
||||
// LeaderElector implements BACKBEAT-REQ-001 (Pulse Leader).
// Provides pluggable leader election using Raft consensus.
type LeaderElector struct {
	mu             sync.RWMutex
	raft           *raft.Raft    // underlying Raft node
	nodeID         string        // unique identifier of this node
	bindAddr       string        // TCP address Raft listens on (updated to the actual port after transport setup)
	dataDir        string        // directory holding Raft log/stable/snapshot stores
	isLeader       bool          // cached leadership flag
	leaderCh       chan bool     // leadership-change notifications
	shutdownCh     chan struct{} // closed to signal shutdown
	logger         zerolog.Logger
	onBecomeLeader func() // optional callback fired on gaining leadership
	onLoseLeader   func() // optional callback fired on losing leadership
}

// BackbeatFSM implements the Raft finite state machine for BACKBEAT state.
type BackbeatFSM struct {
	mu    sync.RWMutex
	state map[string]interface{} // replicated key/value state
}

// LeaderElectorConfig configures the leader election.
type LeaderElectorConfig struct {
	NodeID         string // required: unique node identifier
	BindAddr       string // optional: defaults to 127.0.0.1 with a system-assigned port
	DataDir        string // optional: defaults to a per-node directory under os.TempDir()
	Logger         zerolog.Logger
	OnBecomeLeader func()   // invoked when this node becomes leader
	OnLoseLeader   func()   // invoked when this node loses leadership
	Bootstrap      bool     // bootstrap a new cluster with this node
	Peers          []string // initial peer addresses
}
|
||||
|
||||
// NewLeaderElector creates a new leader elector for BACKBEAT-REQ-001
|
||||
func NewLeaderElector(config LeaderElectorConfig) (*LeaderElector, error) {
|
||||
if config.NodeID == "" {
|
||||
return nil, fmt.Errorf("node ID is required")
|
||||
}
|
||||
|
||||
if config.BindAddr == "" {
|
||||
config.BindAddr = "127.0.0.1:0" // Let system assign port
|
||||
}
|
||||
|
||||
if config.DataDir == "" {
|
||||
config.DataDir = filepath.Join(os.TempDir(), "backbeat-raft-"+config.NodeID)
|
||||
}
|
||||
|
||||
// Create data directory
|
||||
if err := os.MkdirAll(config.DataDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("failed to create data directory: %v", err)
|
||||
}
|
||||
|
||||
le := &LeaderElector{
|
||||
nodeID: config.NodeID,
|
||||
bindAddr: config.BindAddr,
|
||||
dataDir: config.DataDir,
|
||||
logger: config.Logger.With().Str("component", "leader-elector").Logger(),
|
||||
leaderCh: make(chan bool, 1),
|
||||
shutdownCh: make(chan struct{}),
|
||||
onBecomeLeader: config.OnBecomeLeader,
|
||||
onLoseLeader: config.OnLoseLeader,
|
||||
}
|
||||
|
||||
if err := le.setupRaft(config.Bootstrap, config.Peers); err != nil {
|
||||
return nil, fmt.Errorf("failed to setup Raft: %v", err)
|
||||
}
|
||||
|
||||
go le.monitorLeadership()
|
||||
|
||||
return le, nil
|
||||
}
|
||||
|
||||
// setupRaft initializes the Raft consensus system: transport, snapshot
// store, BoltDB log/stable stores, the FSM, and the Raft instance itself,
// optionally bootstrapping a new cluster from this node plus the given peers.
func (le *LeaderElector) setupRaft(bootstrap bool, peers []string) error {
	// Create Raft configuration with short timeouts (heartbeat/election 1s,
	// commit/lease 500ms) suited to a low-latency local cluster.
	config := raft.DefaultConfig()
	config.LocalID = raft.ServerID(le.nodeID)
	config.HeartbeatTimeout = 1 * time.Second
	config.ElectionTimeout = 1 * time.Second
	config.CommitTimeout = 500 * time.Millisecond
	config.LeaderLeaseTimeout = 500 * time.Millisecond

	// Setup logging will be handled by Raft's default logger

	// Create transport
	addr, err := net.ResolveTCPAddr("tcp", le.bindAddr)
	if err != nil {
		return fmt.Errorf("failed to resolve bind address: %v", err)
	}

	transport, err := raft.NewTCPTransport(le.bindAddr, addr, 3, 10*time.Second, os.Stderr)
	if err != nil {
		return fmt.Errorf("failed to create transport: %v", err)
	}

	// Update bind address with actual port if it was auto-assigned
	le.bindAddr = string(transport.LocalAddr())

	// Create the snapshot store (retains the 2 most recent snapshots).
	snapshots, err := raft.NewFileSnapshotStore(le.dataDir, 2, os.Stderr)
	if err != nil {
		return fmt.Errorf("failed to create snapshot store: %v", err)
	}

	// Create the log store and stable store.
	// NOTE(review): the Bolt stores opened below are never closed on a later
	// error path in this function — confirm the leaked handles are acceptable.
	logStore, err := raftboltdb.NewBoltStore(filepath.Join(le.dataDir, "raft-log.bolt"))
	if err != nil {
		return fmt.Errorf("failed to create log store: %v", err)
	}

	stableStore, err := raftboltdb.NewBoltStore(filepath.Join(le.dataDir, "raft-stable.bolt"))
	if err != nil {
		return fmt.Errorf("failed to create stable store: %v", err)
	}

	// Create FSM holding the replicated key/value state.
	fsm := &BackbeatFSM{
		state: make(map[string]interface{}),
	}

	// Create Raft instance
	r, err := raft.NewRaft(config, fsm, logStore, stableStore, snapshots, transport)
	if err != nil {
		return fmt.Errorf("failed to create Raft instance: %v", err)
	}

	le.raft = r

	// Bootstrap cluster if needed: seed the configuration with this node
	// plus any statically-configured peers.
	if bootstrap {
		servers := []raft.Server{
			{
				ID:      config.LocalID,
				Address: transport.LocalAddr(),
			},
		}

		// Add peer servers
		for _, peer := range peers {
			servers = append(servers, raft.Server{
				ID:      raft.ServerID(peer),
				Address: raft.ServerAddress(peer),
			})
		}

		configuration := raft.Configuration{Servers: servers}
		// NOTE(review): the future returned by BootstrapCluster is discarded,
		// so bootstrap failures are silent — confirm this is intentional
		// (it errors benignly when a cluster already exists).
		r.BootstrapCluster(configuration)
	}

	return nil
}
|
||||
|
||||
// monitorLeadership watches the Raft leadership channel and translates
// transitions into the elector's state, callbacks, and leaderCh signal.
// It runs as a goroutine until shutdownCh is closed.
func (le *LeaderElector) monitorLeadership() {
	for {
		select {
		case isLeader := <-le.raft.LeaderCh():
			// Record the transition under the lock so IsLeader stays
			// consistent with the callbacks fired below.
			le.mu.Lock()
			wasLeader := le.isLeader
			le.isLeader = isLeader
			le.mu.Unlock()

			// Fire callbacks only on genuine transitions, outside the lock
			// so callbacks may themselves call IsLeader.
			if isLeader && !wasLeader {
				le.logger.Info().Msg("became leader")
				if le.onBecomeLeader != nil {
					le.onBecomeLeader()
				}
			} else if !isLeader && wasLeader {
				le.logger.Info().Msg("lost leadership")
				if le.onLoseLeader != nil {
					le.onLoseLeader()
				}
			}

			// Notify any waiting goroutines; the send is non-blocking so a
			// slow (or absent) consumer never stalls leadership handling.
			select {
			case le.leaderCh <- isLeader:
			default:
			}

		case <-le.shutdownCh:
			return
		}
	}
}
|
||||
|
||||
// IsLeader returns true if this node is the current leader
|
||||
func (le *LeaderElector) IsLeader() bool {
|
||||
le.mu.RLock()
|
||||
defer le.mu.RUnlock()
|
||||
return le.isLeader
|
||||
}
|
||||
|
||||
// GetLeader returns the current leader address
|
||||
func (le *LeaderElector) GetLeader() string {
|
||||
if le.raft == nil {
|
||||
return ""
|
||||
}
|
||||
_, leaderAddr := le.raft.LeaderWithID()
|
||||
return string(leaderAddr)
|
||||
}
|
||||
|
||||
// WaitForLeader blocks until leadership is established (this node or another)
|
||||
func (le *LeaderElector) WaitForLeader(ctx context.Context) error {
|
||||
ticker := time.NewTicker(100 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
if leader := le.GetLeader(); leader != "" {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown gracefully shuts down the leader elector
|
||||
func (le *LeaderElector) Shutdown() error {
|
||||
close(le.shutdownCh)
|
||||
|
||||
if le.raft != nil {
|
||||
shutdownFuture := le.raft.Shutdown()
|
||||
if err := shutdownFuture.Error(); err != nil {
|
||||
le.logger.Error().Err(err).Msg("failed to shutdown Raft")
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetStats returns Raft statistics for monitoring
|
||||
func (le *LeaderElector) GetStats() map[string]interface{} {
|
||||
if le.raft == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
stats := le.raft.Stats()
|
||||
result := make(map[string]interface{})
|
||||
for k, v := range stats {
|
||||
result[k] = v
|
||||
}
|
||||
|
||||
result["is_leader"] = le.IsLeader()
|
||||
result["leader"] = le.GetLeader()
|
||||
result["node_id"] = le.nodeID
|
||||
result["bind_addr"] = le.bindAddr
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// BackbeatFSM implementation
|
||||
func (fsm *BackbeatFSM) Apply(log *raft.Log) interface{} {
|
||||
fsm.mu.Lock()
|
||||
defer fsm.mu.Unlock()
|
||||
|
||||
// Parse the command
|
||||
var cmd map[string]interface{}
|
||||
if err := json.Unmarshal(log.Data, &cmd); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Apply command to state
|
||||
for k, v := range cmd {
|
||||
fsm.state[k] = v
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fsm *BackbeatFSM) Snapshot() (raft.FSMSnapshot, error) {
|
||||
fsm.mu.RLock()
|
||||
defer fsm.mu.RUnlock()
|
||||
|
||||
// Create a copy of the state
|
||||
state := make(map[string]interface{})
|
||||
for k, v := range fsm.state {
|
||||
state[k] = v
|
||||
}
|
||||
|
||||
return &BackbeatSnapshot{state: state}, nil
|
||||
}
|
||||
|
||||
func (fsm *BackbeatFSM) Restore(rc io.ReadCloser) error {
|
||||
defer rc.Close()
|
||||
|
||||
var state map[string]interface{}
|
||||
decoder := json.NewDecoder(rc)
|
||||
if err := decoder.Decode(&state); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fsm.mu.Lock()
|
||||
defer fsm.mu.Unlock()
|
||||
fsm.state = state
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// BackbeatSnapshot implements raft.FSMSnapshot over a point-in-time copy
// of the FSM's key/value state.
type BackbeatSnapshot struct {
	state map[string]interface{} // detached copy taken under the FSM read lock
}
|
||||
|
||||
func (s *BackbeatSnapshot) Persist(sink raft.SnapshotSink) error {
|
||||
encoder := json.NewEncoder(sink)
|
||||
if err := encoder.Encode(s.state); err != nil {
|
||||
sink.Cancel()
|
||||
return err
|
||||
}
|
||||
return sink.Close()
|
||||
}
|
||||
|
||||
// Release is a no-op: the snapshot only holds an in-memory map copy.
func (s *BackbeatSnapshot) Release() {}
|
||||
376
BACKBEAT-prototype/internal/backbeat/metrics.go
Normal file
376
BACKBEAT-prototype/internal/backbeat/metrics.go
Normal file
@@ -0,0 +1,376 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
// Metrics provides comprehensive observability for BACKBEAT pulse service.
// Supports BACKBEAT-PER-001, BACKBEAT-PER-002, BACKBEAT-PER-003 monitoring.
// All collectors are registered with the default Prometheus registry at
// construction time (see NewMetrics).
type Metrics struct {
	// BACKBEAT-PER-001: End-to-end delivery p95 ≤ 100ms at 2Hz
	BeatPublishDuration prometheus.Histogram // time spent publishing a beat frame to NATS
	BeatDeliveryLatency prometheus.Histogram // end-to-end beat delivery latency

	// BACKBEAT-PER-002: Pulse jitter p95 ≤ 20ms
	PulseJitter prometheus.Histogram // beat-to-beat timing jitter
	BeatTiming  prometheus.Histogram // deviation from the scheduled beat time

	// BACKBEAT-PER-003: SDK timer drift ≤ 1% over 1 hour
	TimerDrift prometheus.Gauge // local timer drift ratio
	HLCDrift   prometheus.Gauge // HLC drift from last leader sync, seconds

	// Leadership and cluster health
	IsLeader          prometheus.Gauge   // 1 when this node leads, 0 otherwise
	LeadershipChanges prometheus.Counter // total leadership transitions
	ClusterSize       prometheus.Gauge   // nodes currently in the cluster

	// Tempo and beat metrics
	CurrentTempo    prometheus.Gauge   // current tempo in BPM
	BeatCounter     prometheus.Counter // beats published
	DownbeatCounter prometheus.Counter // downbeats published
	// NOTE(review): stored by value rather than *prometheus.CounterVec —
	// legal, but the Metrics struct must then never be copied; confirm
	// this is intentional.
	PhaseTransitions prometheus.CounterVec

	// Error and degradation metrics
	TempoChangeErrors  prometheus.Counter // rejected tempo change requests
	LeadershipLoss     prometheus.Counter // times this node lost leadership
	DegradationMode    prometheus.Gauge   // 1 while in degradation mode (BACKBEAT-REQ-003)
	NATSConnectionLoss prometheus.Counter // NATS connection drops

	// Performance metrics
	BeatFrameSize     prometheus.Histogram // serialized beat-frame size, bytes
	NATSPublishErrors prometheus.Counter   // failed NATS publishes

	// BACKBEAT-OBS-002: Reverb aggregation metrics
	ReverbAgentsReporting      prometheus.Gauge     // agents reporting in current window
	ReverbOnTimeReviews        prometheus.Gauge     // on-time reviews in current window
	ReverbTempoDriftMS         prometheus.Gauge     // current tempo drift, milliseconds
	ReverbWindowsCompleted     prometheus.Counter   // windows fully processed
	ReverbClaimsProcessed      prometheus.Counter   // status claims ingested
	ReverbWindowProcessingTime prometheus.Histogram // per-window processing time
	ReverbBarReportSize        prometheus.Histogram // serialized bar-report size, bytes
	ReverbWindowsActive        prometheus.Gauge     // windows currently being aggregated
	ReverbClaimsPerWindow      prometheus.Histogram // claims per completed window
}
|
||||
|
||||
// NewMetrics creates and registers all BACKBEAT metrics
|
||||
func NewMetrics() *Metrics {
|
||||
return &Metrics{
|
||||
// BACKBEAT-PER-001: End-to-end delivery monitoring
|
||||
BeatPublishDuration: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_beat_publish_duration_seconds",
|
||||
Help: "Time spent publishing beat frames to NATS",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10), // 1ms to 1s
|
||||
}),
|
||||
|
||||
BeatDeliveryLatency: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_beat_delivery_latency_seconds",
|
||||
Help: "End-to-end beat delivery latency (BACKBEAT-PER-001: p95 ≤ 100ms)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
|
||||
}),
|
||||
|
||||
// BACKBEAT-PER-002: Pulse jitter monitoring
|
||||
PulseJitter: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_pulse_jitter_seconds",
|
||||
Help: "Beat timing jitter (BACKBEAT-PER-002: p95 ≤ 20ms)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
Buckets: []float64{0.001, 0.005, 0.010, 0.015, 0.020, 0.025, 0.050, 0.100},
|
||||
}),
|
||||
|
||||
BeatTiming: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_beat_timing_accuracy_seconds",
|
||||
Help: "Accuracy of beat timing relative to expected schedule",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 12),
|
||||
}),
|
||||
|
||||
// BACKBEAT-PER-003: Timer drift monitoring
|
||||
TimerDrift: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_timer_drift_ratio",
|
||||
Help: "Timer drift ratio (BACKBEAT-PER-003: ≤ 1% over 1 hour)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
HLCDrift: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_hlc_drift_seconds",
|
||||
Help: "HLC drift from last leader sync",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
// Leadership metrics
|
||||
IsLeader: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_is_leader",
|
||||
Help: "1 if this node is the current leader, 0 otherwise",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "cluster",
|
||||
}),
|
||||
|
||||
LeadershipChanges: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_leadership_changes_total",
|
||||
Help: "Total number of leadership changes",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "cluster",
|
||||
}),
|
||||
|
||||
ClusterSize: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_cluster_size",
|
||||
Help: "Number of nodes in the cluster",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "cluster",
|
||||
}),
|
||||
|
||||
// Tempo and beat metrics
|
||||
CurrentTempo: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_current_tempo_bpm",
|
||||
Help: "Current tempo in beats per minute",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
BeatCounter: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_beats_total",
|
||||
Help: "Total number of beats published",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
DownbeatCounter: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_downbeats_total",
|
||||
Help: "Total number of downbeats published",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
PhaseTransitions: *promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "backbeat_phase_transitions_total",
|
||||
Help: "Total number of phase transitions by phase name",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}, []string{"phase", "from_phase"}),
|
||||
|
||||
// Error metrics
|
||||
TempoChangeErrors: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_tempo_change_errors_total",
|
||||
Help: "Total number of rejected tempo change requests",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "control",
|
||||
}),
|
||||
|
||||
LeadershipLoss: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_leadership_loss_total",
|
||||
Help: "Total number of times this node lost leadership",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "cluster",
|
||||
}),
|
||||
|
||||
DegradationMode: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_degradation_mode",
|
||||
Help: "1 if running in degradation mode (BACKBEAT-REQ-003), 0 otherwise",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
}),
|
||||
|
||||
NATSConnectionLoss: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_nats_connection_loss_total",
|
||||
Help: "Total number of NATS connection losses",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "transport",
|
||||
}),
|
||||
|
||||
// Performance metrics
|
||||
BeatFrameSize: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_beat_frame_size_bytes",
|
||||
Help: "Size of serialized beat frames",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "pulse",
|
||||
Buckets: prometheus.ExponentialBuckets(100, 2, 10),
|
||||
}),
|
||||
|
||||
NATSPublishErrors: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_nats_publish_errors_total",
|
||||
Help: "Total number of NATS publish errors",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "transport",
|
||||
}),
|
||||
|
||||
// BACKBEAT-OBS-002: Reverb aggregation metrics
|
||||
ReverbAgentsReporting: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_reverb_agents_reporting",
|
||||
Help: "Number of agents reporting in current window (BACKBEAT-OBS-002)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbOnTimeReviews: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_reverb_on_time_reviews",
|
||||
Help: "Number of on-time reviews completed (BACKBEAT-OBS-002)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbTempoDriftMS: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_reverb_tempo_drift_ms",
|
||||
Help: "Current tempo drift in milliseconds (BACKBEAT-OBS-002)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbWindowsCompleted: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_reverb_windows_completed_total",
|
||||
Help: "Total number of windows completed (BACKBEAT-OBS-002)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbClaimsProcessed: promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "backbeat_reverb_claims_processed_total",
|
||||
Help: "Total number of status claims processed (BACKBEAT-OBS-002)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbWindowProcessingTime: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_reverb_window_processing_seconds",
|
||||
Help: "Time to process and emit a window report (BACKBEAT-PER-002: ≤ 1 beat)",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12), // 1ms to 4s
|
||||
}),
|
||||
|
||||
ReverbBarReportSize: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_reverb_bar_report_size_bytes",
|
||||
Help: "Size of serialized bar reports",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
Buckets: prometheus.ExponentialBuckets(100, 2, 10),
|
||||
}),
|
||||
|
||||
ReverbWindowsActive: promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "backbeat_reverb_windows_active",
|
||||
Help: "Number of active windows being aggregated",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
}),
|
||||
|
||||
ReverbClaimsPerWindow: promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "backbeat_reverb_claims_per_window",
|
||||
Help: "Number of claims processed per window",
|
||||
Namespace: "backbeat",
|
||||
Subsystem: "reverb",
|
||||
Buckets: prometheus.ExponentialBuckets(1, 2, 15), // 1 to 32k claims
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
// RecordBeatPublish records metrics for a published beat
|
||||
func (m *Metrics) RecordBeatPublish(duration time.Duration, frameSize int, isDownbeat bool, phase string) {
|
||||
m.BeatPublishDuration.Observe(duration.Seconds())
|
||||
m.BeatFrameSize.Observe(float64(frameSize))
|
||||
m.BeatCounter.Inc()
|
||||
|
||||
if isDownbeat {
|
||||
m.DownbeatCounter.Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RecordPulseJitter records beat timing jitter
|
||||
func (m *Metrics) RecordPulseJitter(jitter time.Duration) {
|
||||
m.PulseJitter.Observe(jitter.Seconds())
|
||||
}
|
||||
|
||||
// RecordBeatTiming records beat timing accuracy
|
||||
func (m *Metrics) RecordBeatTiming(expectedTime, actualTime time.Time) {
|
||||
diff := actualTime.Sub(expectedTime).Abs()
|
||||
m.BeatTiming.Observe(diff.Seconds())
|
||||
}
|
||||
|
||||
// UpdateTempoMetrics updates tempo-related metrics
|
||||
func (m *Metrics) UpdateTempoMetrics(currentBPM int) {
|
||||
m.CurrentTempo.Set(float64(currentBPM))
|
||||
}
|
||||
|
||||
// UpdateLeadershipMetrics updates leadership-related metrics
|
||||
func (m *Metrics) UpdateLeadershipMetrics(isLeader bool, clusterSize int) {
|
||||
if isLeader {
|
||||
m.IsLeader.Set(1)
|
||||
} else {
|
||||
m.IsLeader.Set(0)
|
||||
}
|
||||
m.ClusterSize.Set(float64(clusterSize))
|
||||
}
|
||||
|
||||
// RecordLeadershipChange records a leadership change event
|
||||
func (m *Metrics) RecordLeadershipChange(becameLeader bool) {
|
||||
m.LeadershipChanges.Inc()
|
||||
if !becameLeader {
|
||||
m.LeadershipLoss.Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateDriftMetrics updates drift-related metrics for BACKBEAT-PER-003
|
||||
func (m *Metrics) UpdateDriftMetrics(timerDriftRatio float64, hlcDriftSeconds float64) {
|
||||
m.TimerDrift.Set(timerDriftRatio)
|
||||
m.HLCDrift.Set(hlcDriftSeconds)
|
||||
}
|
||||
|
||||
// UpdateDegradationMode updates degradation mode status
|
||||
func (m *Metrics) UpdateDegradationMode(inDegradationMode bool) {
|
||||
if inDegradationMode {
|
||||
m.DegradationMode.Set(1)
|
||||
} else {
|
||||
m.DegradationMode.Set(0)
|
||||
}
|
||||
}
|
||||
|
||||
// RecordTempoChangeError records a tempo change error
|
||||
func (m *Metrics) RecordTempoChangeError() {
|
||||
m.TempoChangeErrors.Inc()
|
||||
}
|
||||
|
||||
// RecordNATSError records NATS-related errors
|
||||
func (m *Metrics) RecordNATSError(errorType string) {
|
||||
switch errorType {
|
||||
case "connection_loss":
|
||||
m.NATSConnectionLoss.Inc()
|
||||
case "publish_error":
|
||||
m.NATSPublishErrors.Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RecordPhaseTransition records a phase transition
|
||||
func (m *Metrics) RecordPhaseTransition(fromPhase, toPhase string) {
|
||||
m.PhaseTransitions.WithLabelValues(toPhase, fromPhase).Inc()
|
||||
}
|
||||
|
||||
// RecordReverbWindow records metrics for a completed reverb window
|
||||
func (m *Metrics) RecordReverbWindow(processingTime time.Duration, claimsCount int, agentsReporting int, onTimeReviews int, tempoDriftMS int, reportSize int) {
|
||||
m.ReverbWindowsCompleted.Inc()
|
||||
m.ReverbWindowProcessingTime.Observe(processingTime.Seconds())
|
||||
m.ReverbClaimsPerWindow.Observe(float64(claimsCount))
|
||||
m.ReverbBarReportSize.Observe(float64(reportSize))
|
||||
|
||||
// Update current window metrics
|
||||
m.ReverbAgentsReporting.Set(float64(agentsReporting))
|
||||
m.ReverbOnTimeReviews.Set(float64(onTimeReviews))
|
||||
m.ReverbTempoDriftMS.Set(float64(tempoDriftMS))
|
||||
}
|
||||
|
||||
// RecordReverbClaim records a processed status claim
|
||||
func (m *Metrics) RecordReverbClaim() {
|
||||
m.ReverbClaimsProcessed.Inc()
|
||||
}
|
||||
|
||||
// UpdateReverbActiveWindows updates the number of active windows being tracked
|
||||
func (m *Metrics) UpdateReverbActiveWindows(count int) {
|
||||
m.ReverbWindowsActive.Set(float64(count))
|
||||
}
|
||||
15
BACKBEAT-prototype/internal/backbeat/score.go
Normal file
15
BACKBEAT-prototype/internal/backbeat/score.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package backbeat
|
||||
|
||||
import (
	"errors"
	"sort"
)
|
||||
|
||||
// PhaseFor returns the phase name for a given beat index (1-indexed).
//
// The phases map holds phase name -> beat count. Fixes two defects in
// the original: (1) it iterated the map directly, and Go map iteration
// order is random, so results were nondeterministic whenever the map had
// more than one phase — phases are now visited in sorted-name order;
// (2) a beat index < 1 incorrectly matched the first phase instead of
// erroring. Callers that need a specific musical ordering should encode
// it in the phase names or pass an ordered structure — TODO confirm
// sorted-name order matches the intended score semantics.
func PhaseFor(phases map[string]int, beatIndex int) (string, error) {
	if beatIndex < 1 {
		return "", errors.New("beat index out of range")
	}

	// Deterministic traversal order.
	names := make([]string, 0, len(phases))
	for name := range phases {
		names = append(names, name)
	}
	sort.Strings(names)

	acc := 0
	for _, name := range names {
		acc += phases[name]
		if beatIndex <= acc {
			return name, nil
		}
	}
	return "", errors.New("beat index out of range")
}
|
||||
260
BACKBEAT-prototype/internal/backbeat/types.go
Normal file
260
BACKBEAT-prototype/internal/backbeat/types.go
Normal file
@@ -0,0 +1,260 @@
|
||||
package backbeat
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// BeatFrame represents the INT-A specification for BACKBEAT-REQ-002.
// BACKBEAT-REQ-002: BeatFrame must emit INT-A with hlc, beat_index,
// downbeat, phase, deadline_at, tempo_bpm. Frames are published on every
// beat; see GenerateWindowID for how WindowID is derived.
type BeatFrame struct {
	Type       string    `json:"type"`        // INT-A: always "backbeat.beatframe.v1"
	ClusterID  string    `json:"cluster_id"`  // INT-A: cluster identifier
	BeatIndex  int64     `json:"beat_index"`  // INT-A: global beat counter (not cyclic)
	Downbeat   bool      `json:"downbeat"`    // INT-A: true when beat_index % bar_length == 1
	Phase      string    `json:"phase"`       // INT-A: current phase name
	HLC        string    `json:"hlc"`         // INT-A: hybrid logical clock timestamp
	DeadlineAt time.Time `json:"deadline_at"` // INT-A: RFC3339 timestamp for beat deadline
	TempoBPM   int       `json:"tempo_bpm"`   // INT-A: current tempo in beats per minute
	WindowID   string    `json:"window_id"`   // BACKBEAT-REQ-005: deterministic window identifier
}
|
||||
|
||||
// StatusClaim represents the INT-B specification for BACKBEAT-REQ-020.
// BACKBEAT-REQ-020: StatusClaim must include type, agent_id, task_id,
// beat_index, state, beats_left, progress, notes, hlc. Claims are
// validated by ValidateStatusClaim before aggregation.
type StatusClaim struct {
	Type      string   `json:"type"`               // INT-B: always "backbeat.statusclaim.v1"
	AgentID   string   `json:"agent_id"`           // INT-B: agent identifier (e.g., "agent:xyz")
	TaskID    string   `json:"task_id"`            // INT-B: task identifier (e.g., "task:123")
	BeatIndex int64    `json:"beat_index"`         // INT-B: current beat index
	State     string   `json:"state"`              // INT-B: executing|planning|waiting|review|done|failed
	WaitFor   []string `json:"wait_for,omitempty"` // refs (e.g., hmmm://thread/...)
	BeatsLeft int      `json:"beats_left"`         // INT-B: estimated beats remaining
	Progress  float64  `json:"progress"`           // INT-B: progress ratio (0.0-1.0)
	Notes     string   `json:"notes"`              // INT-B: status description
	HLC       string   `json:"hlc"`                // INT-B: hybrid logical clock timestamp
}
|
||||
|
||||
// BarReport represents the INT-C specification for BACKBEAT-REQ-021.
// BACKBEAT-REQ-021: BarReport must emit INT-C with window_id, from_beat,
// to_beat, and KPIs at each downbeat. Reports are produced by
// WindowAggregation.GenerateBarReport.
type BarReport struct {
	Type                  string   `json:"type"`                    // INT-C: always "backbeat.barreport.v1"
	WindowID              string   `json:"window_id"`               // INT-C: deterministic window identifier
	FromBeat              int64    `json:"from_beat"`               // INT-C: starting beat index of the window
	ToBeat                int64    `json:"to_beat"`                 // INT-C: ending beat index of the window
	AgentsReporting       int      `json:"agents_reporting"`        // INT-C: number of unique agents that reported
	OnTimeReviews         int      `json:"on_time_reviews"`         // INT-C: tasks completed by deadline
	HelpPromisesFulfilled int      `json:"help_promises_fulfilled"` // INT-C: help requests fulfilled
	SecretRotationsOK     bool     `json:"secret_rotations_ok"`     // INT-C: security rotation status
	TempoDriftMS          int      `json:"tempo_drift_ms"`          // INT-C: tempo drift in milliseconds
	Issues                []string `json:"issues"`                  // INT-C: list of detected issues

	// Internal fields for aggregation (not part of INT-C)
	ClusterID   string         `json:"cluster_id,omitempty"`   // For internal routing
	StateCounts map[string]int `json:"state_counts,omitempty"` // For debugging
}
|
||||
|
||||
// PulseState represents the internal state of the pulse service.
type PulseState struct {
	ClusterID    string    // cluster this pulse belongs to
	NodeID       string    // identity of this node
	IsLeader     bool      // whether this node is the elected beat emitter
	BeatIndex    int64     // global beat counter (see BeatFrame.BeatIndex)
	TempoBPM     int       // active tempo in beats per minute
	PendingBPM   int       // presumably the tempo queued for the next downbeat (BACKBEAT-REQ-004) — confirm
	BarLength    int       // beats per bar; downbeat when (BeatIndex-1) % BarLength == 0
	Phases       []string  // phase names cycled through during a bar
	CurrentPhase int       // index into Phases
	LastDownbeat time.Time // wall-clock time of the most recent downbeat
	StartTime    time.Time // when the pulse service started
	FrozenBeats  int       // NOTE(review): appears to count beats held during freeze/degradation — confirm semantics
}
|
||||
|
||||
// TempoChangeRequest represents a tempo change request with validation.
// Requests are checked by ValidateTempoChange (±10% cap,
// BACKBEAT-REQ-004) before being applied.
type TempoChangeRequest struct {
	TempoBPM      int    `json:"tempo_bpm"`               // requested tempo in beats per minute
	Justification string `json:"justification,omitempty"` // optional rationale for the change
}
|
||||
|
||||
// GenerateWindowID creates a deterministic window ID per BACKBEAT-REQ-005:
// window_id = hex(sha256(cluster_id + ":" + downbeat_beat_index))[0:32].
func GenerateWindowID(clusterID string, downbeatBeatIndex int64) string {
	seed := fmt.Sprintf("%s:%d", clusterID, downbeatBeatIndex)
	sum := sha256.Sum256([]byte(seed))
	hexDigest := fmt.Sprintf("%x", sum)
	return hexDigest[:32]
}
|
||||
|
||||
// IsDownbeat reports whether beatIndex (1-indexed) lands on the first
// beat of its bar, i.e. is a downbeat.
func IsDownbeat(beatIndex int64, barLength int) bool {
	offsetInBar := (beatIndex - 1) % int64(barLength)
	return offsetInBar == 0
}
|
||||
|
||||
// GetDownbeatIndex returns the index of the downbeat (first beat) of the
// bar containing beatIndex (1-indexed).
func GetDownbeatIndex(beatIndex int64, barLength int) int64 {
	barNumber := (beatIndex - 1) / int64(barLength)
	return barNumber*int64(barLength) + 1
}
|
||||
|
||||
// ValidateTempoChange checks if a tempo change is within acceptable
// limits. BACKBEAT-REQ-004: changes only on next downbeat; ≤±10% delta
// cap. Returns nil when newBPM is positive and within ±10% of currentBPM.
func ValidateTempoChange(currentBPM, newBPM int) error {
	if newBPM <= 0 {
		return fmt.Errorf("invalid tempo: must be positive, got %d", newBPM)
	}

	const maxDelta = 0.10 // ±10% cap per BACKBEAT-REQ-004

	delta := float64(newBPM-currentBPM) / float64(currentBPM)
	if delta > maxDelta || delta < -maxDelta {
		return fmt.Errorf("tempo change exceeds ±10%% limit: current=%d new=%d delta=%.1f%%",
			currentBPM, newBPM, delta*100)
	}
	return nil
}
|
||||
|
||||
// ValidateStatusClaim validates a StatusClaim according to INT-B specification
|
||||
func ValidateStatusClaim(sc *StatusClaim) error {
|
||||
if sc.Type != "backbeat.statusclaim.v1" {
|
||||
return fmt.Errorf("invalid type: expected 'backbeat.statusclaim.v1', got '%s'", sc.Type)
|
||||
}
|
||||
|
||||
if sc.AgentID == "" {
|
||||
return fmt.Errorf("agent_id is required")
|
||||
}
|
||||
|
||||
if sc.TaskID == "" {
|
||||
return fmt.Errorf("task_id is required")
|
||||
}
|
||||
|
||||
if sc.BeatIndex <= 0 {
|
||||
return fmt.Errorf("beat_index must be positive, got %d", sc.BeatIndex)
|
||||
}
|
||||
|
||||
validStates := map[string]bool{
|
||||
"executing": true,
|
||||
"planning": true,
|
||||
"waiting": true,
|
||||
"review": true,
|
||||
"done": true,
|
||||
"failed": true,
|
||||
}
|
||||
|
||||
if !validStates[sc.State] {
|
||||
return fmt.Errorf("invalid state: must be one of [executing, planning, waiting, review, done, failed], got '%s'", sc.State)
|
||||
}
|
||||
|
||||
if sc.Progress < 0.0 || sc.Progress > 1.0 {
|
||||
return fmt.Errorf("progress must be between 0.0 and 1.0, got %f", sc.Progress)
|
||||
}
|
||||
|
||||
if sc.HLC == "" {
|
||||
return fmt.Errorf("hlc is required")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// WindowAggregation represents aggregated data for a window of beats
// [FromBeat, ToBeat]. Claims are folded in via AddClaim and summarized
// into a BarReport by GenerateBarReport.
type WindowAggregation struct {
	WindowID       string
	FromBeat       int64
	ToBeat         int64
	Claims         []*StatusClaim
	AgentStates    map[string]string // agent_id -> latest state
	UniqueAgents   map[string]bool   // set of agent_ids that reported
	StateCounts    map[string]int    // state -> count (per claim, not per task)
	CompletedTasks int               // claims with state "done"
	FailedTasks    int               // claims with state "failed"
	LastUpdated    time.Time         // time of the most recent AddClaim (or creation)
}
|
||||
|
||||
// NewWindowAggregation creates a new window aggregation
|
||||
func NewWindowAggregation(windowID string, fromBeat, toBeat int64) *WindowAggregation {
|
||||
return &WindowAggregation{
|
||||
WindowID: windowID,
|
||||
FromBeat: fromBeat,
|
||||
ToBeat: toBeat,
|
||||
Claims: make([]*StatusClaim, 0),
|
||||
AgentStates: make(map[string]string),
|
||||
UniqueAgents: make(map[string]bool),
|
||||
StateCounts: make(map[string]int),
|
||||
LastUpdated: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// AddClaim adds a status claim to the window aggregation
|
||||
func (wa *WindowAggregation) AddClaim(claim *StatusClaim) {
|
||||
wa.Claims = append(wa.Claims, claim)
|
||||
wa.UniqueAgents[claim.AgentID] = true
|
||||
|
||||
// Update agent's latest state
|
||||
wa.AgentStates[claim.AgentID] = claim.State
|
||||
|
||||
// Update state counts
|
||||
wa.StateCounts[claim.State]++
|
||||
|
||||
// Track completed and failed tasks
|
||||
if claim.State == "done" {
|
||||
wa.CompletedTasks++
|
||||
} else if claim.State == "failed" {
|
||||
wa.FailedTasks++
|
||||
}
|
||||
|
||||
wa.LastUpdated = time.Now()
|
||||
}
|
||||
|
||||
// GenerateBarReport generates a BarReport from the aggregated data
|
||||
func (wa *WindowAggregation) GenerateBarReport(clusterID string) *BarReport {
|
||||
// Calculate KPIs based on aggregated data
|
||||
agentsReporting := len(wa.UniqueAgents)
|
||||
onTimeReviews := wa.StateCounts["done"] // Tasks completed successfully
|
||||
|
||||
// Help promises fulfilled - placeholder calculation
|
||||
// In a real implementation, this would track help request/response pairs
|
||||
helpPromisesFulfilled := wa.StateCounts["done"] / 10 // Rough estimate
|
||||
|
||||
// Secret rotations OK - placeholder
|
||||
// In a real implementation, this would check security rotation status
|
||||
secretRotationsOK := true
|
||||
|
||||
// Tempo drift - placeholder calculation
|
||||
// In a real implementation, this would measure actual tempo drift
|
||||
tempoDriftMS := 0
|
||||
|
||||
// Detect issues based on aggregated data
|
||||
issues := make([]string, 0)
|
||||
if wa.FailedTasks > 0 {
|
||||
issues = append(issues, fmt.Sprintf("%d failed tasks detected", wa.FailedTasks))
|
||||
}
|
||||
|
||||
if agentsReporting == 0 {
|
||||
issues = append(issues, "no agents reporting in window")
|
||||
}
|
||||
|
||||
return &BarReport{
|
||||
Type: "backbeat.barreport.v1",
|
||||
WindowID: wa.WindowID,
|
||||
FromBeat: wa.FromBeat,
|
||||
ToBeat: wa.ToBeat,
|
||||
AgentsReporting: agentsReporting,
|
||||
OnTimeReviews: onTimeReviews,
|
||||
HelpPromisesFulfilled: helpPromisesFulfilled,
|
||||
SecretRotationsOK: secretRotationsOK,
|
||||
TempoDriftMS: tempoDriftMS,
|
||||
Issues: issues,
|
||||
ClusterID: clusterID,
|
||||
StateCounts: wa.StateCounts,
|
||||
}
|
||||
}
|
||||
|
||||
// Score represents a YAML-based task score for agent simulation.
type Score struct {
	Phases     map[string]int `yaml:"phases"`      // phase name -> number of beats (see PhaseFor)
	WaitBudget WaitBudget     `yaml:"wait_budget"` // waiting-time budgets
}
||||
|
||||
// WaitBudget represents waiting time budgets for different scenarios.
type WaitBudget struct {
	Help int `yaml:"help"` // budget for waiting on help (units presumably beats — TODO confirm)
}
||||
Reference in New Issue
Block a user