Implement initial scan logic and council formation for WHOOSH project kickoffs

- Replace incremental sync with full scan for new repositories
- Add initial_scan status to bypass Since parameter filtering
- Implement council formation detection for Design Brief issues
- Add version display to WHOOSH UI header for debugging
- Fix Docker token authentication with trailing newline removal
- Add comprehensive council orchestration with Docker Swarm integration
- Include BACKBEAT prototype integration for distributed timing
- Support council-specific agent roles and deployment strategies
- Transition repositories to active status after content discovery

Key architectural improvements:
- Full scan approach for new project detection vs incremental sync
- Council formation triggered by chorus-entrypoint labeled Design Briefs
- Proper token handling and authentication for Gitea API calls
- Support for both initial discovery and ongoing task monitoring

This enables autonomous project kickoff workflows where Design Brief issues
automatically trigger formation of specialized agent councils for new projects.
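
The Docker token fix above amounts to trimming the trailing newline that a Docker secret file carries before the value is used to authenticate Gitea API calls. A minimal sketch, with hypothetical paths and URLs (the actual WHOOSH client code is not reproduced in this excerpt):

```go
package main

import (
	"fmt"
	"net/http"
	"os"
	"strings"
)

// loadGiteaToken reads a Docker secret and strips the trailing newline the
// secret file usually carries; without the trim, Gitea rejects the header.
func loadGiteaToken(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("read token secret: %w", err)
	}
	return strings.TrimSpace(string(raw)), nil
}

func main() {
	token, err := loadGiteaToken("/run/secrets/gitea_token") // hypothetical mount point
	if err != nil {
		panic(err)
	}
	req, _ := http.NewRequest(http.MethodGet, "https://gitea.example.com/api/v1/repos/search", nil)
	req.Header.Set("Authorization", "token "+token)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```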

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Claude Code
Date: 2025-09-12 09:49:36 +10:00
Parent: b5c0deb6bc
Commit: 56ea52b743
74 changed files with 17778 additions and 236 deletions


@@ -0,0 +1,357 @@
package backbeat
import (
"encoding/json"
"net/http"
"strconv"
"time"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rs/zerolog"
)
// AdminServer provides HTTP endpoints for BACKBEAT pulse administration
// Includes tempo control, drift monitoring, and leader status as specified
type AdminServer struct {
router *mux.Router
pulseState *PulseState
metrics *Metrics
elector *LeaderElector
hlc *HLC
logger zerolog.Logger
degradation *DegradationManager
}
// AdminConfig configures the admin server
type AdminConfig struct {
PulseState *PulseState
Metrics *Metrics
Elector *LeaderElector
HLC *HLC
Logger zerolog.Logger
Degradation *DegradationManager
}
// TempoResponse represents the response for tempo endpoints
type TempoResponse struct {
CurrentBPM int `json:"current_bpm"`
PendingBPM int `json:"pending_bpm"`
CanChange bool `json:"can_change"`
NextChange string `json:"next_change,omitempty"`
Reason string `json:"reason,omitempty"`
}
// DriftResponse represents the response for drift monitoring
type DriftResponse struct {
TimerDriftPercent float64 `json:"timer_drift_percent"`
HLCDriftSeconds float64 `json:"hlc_drift_seconds"`
LastSyncTime string `json:"last_sync_time"`
DegradationMode bool `json:"degradation_mode"`
WithinLimits bool `json:"within_limits"`
}
// LeaderResponse represents the response for leader status
type LeaderResponse struct {
NodeID string `json:"node_id"`
IsLeader bool `json:"is_leader"`
Leader string `json:"leader"`
ClusterSize int `json:"cluster_size"`
Stats map[string]interface{} `json:"stats"`
}
// HealthResponse represents the health check response
type HealthResponse struct {
Status string `json:"status"`
Timestamp time.Time `json:"timestamp"`
Version string `json:"version"`
NodeID string `json:"node_id"`
IsLeader bool `json:"is_leader"`
BeatIndex int64 `json:"beat_index"`
TempoBPM int `json:"tempo_bpm"`
Degradation bool `json:"degradation_mode"`
}
// NewAdminServer creates a new admin API server
func NewAdminServer(config AdminConfig) *AdminServer {
server := &AdminServer{
router: mux.NewRouter(),
pulseState: config.PulseState,
metrics: config.Metrics,
elector: config.Elector,
hlc: config.HLC,
logger: config.Logger.With().Str("component", "admin-api").Logger(),
degradation: config.Degradation,
}
server.setupRoutes()
return server
}
// setupRoutes configures all admin API routes
func (s *AdminServer) setupRoutes() {
// Tempo control endpoints
s.router.HandleFunc("/tempo", s.getTempo).Methods("GET")
s.router.HandleFunc("/tempo", s.setTempo).Methods("POST")
// Drift monitoring endpoint
s.router.HandleFunc("/drift", s.getDrift).Methods("GET")
// Leader status endpoint
s.router.HandleFunc("/leader", s.getLeader).Methods("GET")
// Health check endpoints
s.router.HandleFunc("/health", s.getHealth).Methods("GET")
s.router.HandleFunc("/ready", s.getReady).Methods("GET")
s.router.HandleFunc("/live", s.getLive).Methods("GET")
// Metrics endpoint
s.router.Handle("/metrics", promhttp.Handler())
// Debug endpoints
s.router.HandleFunc("/status", s.getStatus).Methods("GET")
s.router.HandleFunc("/debug/state", s.getDebugState).Methods("GET")
}
// getTempo handles GET /tempo requests
func (s *AdminServer) getTempo(w http.ResponseWriter, r *http.Request) {
s.logger.Debug().Msg("GET /tempo request")
response := TempoResponse{
CurrentBPM: s.pulseState.TempoBPM,
PendingBPM: s.pulseState.PendingBPM,
CanChange: s.elector.IsLeader(),
}
// Check if tempo change is pending
if s.pulseState.PendingBPM != s.pulseState.TempoBPM {
// Calculate next downbeat time
beatsToDownbeat := int64(s.pulseState.BarLength) - ((s.pulseState.BeatIndex - 1) % int64(s.pulseState.BarLength))
beatDuration := time.Duration(60000/s.pulseState.TempoBPM) * time.Millisecond
nextDownbeat := time.Now().Add(time.Duration(beatsToDownbeat) * beatDuration)
response.NextChange = nextDownbeat.Format(time.RFC3339)
}
if !response.CanChange {
response.Reason = "not leader"
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
}
// setTempo handles POST /tempo requests with BACKBEAT-REQ-004 validation
func (s *AdminServer) setTempo(w http.ResponseWriter, r *http.Request) {
s.logger.Debug().Msg("POST /tempo request")
// Only leader can change tempo
if !s.elector.IsLeader() {
s.respondError(w, http.StatusForbidden, "only leader can change tempo")
s.metrics.RecordTempoChangeError()
return
}
var req TempoChangeRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
s.respondError(w, http.StatusBadRequest, "invalid JSON: "+err.Error())
s.metrics.RecordTempoChangeError()
return
}
// Validate tempo change per BACKBEAT-REQ-004
if err := ValidateTempoChange(s.pulseState.TempoBPM, req.TempoBPM); err != nil {
s.respondError(w, http.StatusBadRequest, err.Error())
s.metrics.RecordTempoChangeError()
return
}
// Set pending tempo - will be applied on next downbeat
s.pulseState.PendingBPM = req.TempoBPM
s.logger.Info().
Int("current_bpm", s.pulseState.TempoBPM).
Int("pending_bpm", req.TempoBPM).
Str("justification", req.Justification).
Msg("tempo change scheduled")
response := TempoResponse{
CurrentBPM: s.pulseState.TempoBPM,
PendingBPM: req.TempoBPM,
CanChange: true,
Reason: "scheduled for next downbeat",
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
}
// getDrift handles GET /drift requests for BACKBEAT-PER-003 monitoring
func (s *AdminServer) getDrift(w http.ResponseWriter, r *http.Request) {
s.logger.Debug().Msg("GET /drift request")
hlcDrift := s.hlc.GetDrift()
timerDrift := s.degradation.GetTimerDrift()
response := DriftResponse{
TimerDriftPercent: timerDrift * 100, // Convert to percentage
HLCDriftSeconds: hlcDrift.Seconds(),
DegradationMode: s.degradation.IsInDegradationMode(),
WithinLimits: timerDrift <= 0.01, // BACKBEAT-PER-003: ≤ 1%
}
// Add last sync time if available
if hlcDrift > 0 {
response.LastSyncTime = time.Now().Add(-hlcDrift).Format(time.RFC3339)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
}
// getLeader handles GET /leader requests
func (s *AdminServer) getLeader(w http.ResponseWriter, r *http.Request) {
s.logger.Debug().Msg("GET /leader request")
stats := s.elector.GetStats()
clusterSize := 1 // Default to 1 if no stats available
if size, ok := stats["num_peers"]; ok {
if sizeStr, ok := size.(string); ok {
if parsed, err := strconv.Atoi(sizeStr); err == nil {
clusterSize = parsed + 1 // Add 1 for this node
}
}
}
response := LeaderResponse{
NodeID: s.pulseState.NodeID,
IsLeader: s.elector.IsLeader(),
Leader: s.elector.GetLeader(),
ClusterSize: clusterSize,
Stats: stats,
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
}
// getHealth handles GET /health requests
func (s *AdminServer) getHealth(w http.ResponseWriter, r *http.Request) {
response := HealthResponse{
Status: "ok",
Timestamp: time.Now(),
Version: "2.0.0",
NodeID: s.pulseState.NodeID,
IsLeader: s.elector.IsLeader(),
BeatIndex: s.pulseState.BeatIndex,
TempoBPM: s.pulseState.TempoBPM,
Degradation: s.degradation.IsInDegradationMode(),
}
// Check if degradation mode indicates unhealthy state
if s.degradation.IsInDegradationMode() {
drift := s.degradation.GetTimerDrift()
if drift > 0.05 { // 5% drift indicates serious issues
response.Status = "degraded"
}
}
statusCode := http.StatusOK
if response.Status != "ok" {
statusCode = http.StatusServiceUnavailable
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(statusCode)
json.NewEncoder(w).Encode(response)
}
// getReady handles GET /ready requests for k8s readiness probes
func (s *AdminServer) getReady(w http.ResponseWriter, r *http.Request) {
// Ready if we have a leader (this node or another)
if leader := s.elector.GetLeader(); leader != "" {
w.WriteHeader(http.StatusOK)
w.Write([]byte("ready"))
} else {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("no leader"))
}
}
// getLive handles GET /live requests for k8s liveness probes
func (s *AdminServer) getLive(w http.ResponseWriter, r *http.Request) {
// Always live unless we're in severe degradation
drift := s.degradation.GetTimerDrift()
if drift > 0.10 { // 10% drift indicates critical issues
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("severe drift"))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte("alive"))
}
// getStatus handles GET /status requests for comprehensive status
func (s *AdminServer) getStatus(w http.ResponseWriter, r *http.Request) {
status := map[string]interface{}{
"timestamp": time.Now(),
"node_id": s.pulseState.NodeID,
"cluster_id": s.pulseState.ClusterID,
"is_leader": s.elector.IsLeader(),
"leader": s.elector.GetLeader(),
"beat_index": s.pulseState.BeatIndex,
"tempo_bpm": s.pulseState.TempoBPM,
"pending_bpm": s.pulseState.PendingBPM,
"bar_length": s.pulseState.BarLength,
"phases": s.pulseState.Phases,
"degradation": s.degradation.IsInDegradationMode(),
"uptime": time.Since(s.pulseState.StartTime),
"raft_stats": s.elector.GetStats(),
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(status)
}
// getDebugState handles GET /debug/state requests
func (s *AdminServer) getDebugState(w http.ResponseWriter, r *http.Request) {
debugState := map[string]interface{}{
"pulse_state": s.pulseState,
"hlc_drift": s.hlc.GetDrift(),
"timer_drift": s.degradation.GetTimerDrift(),
"leader_stats": s.elector.GetStats(),
"degradation": s.degradation.GetState(),
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(debugState)
}
// respondError sends a JSON error response
func (s *AdminServer) respondError(w http.ResponseWriter, statusCode int, message string) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(statusCode)
errorResp := map[string]string{
"error": message,
"timestamp": time.Now().Format(time.RFC3339),
}
json.NewEncoder(w).Encode(errorResp)
}
// ServeHTTP implements http.Handler interface
func (s *AdminServer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// Add common headers
w.Header().Set("X-BACKBEAT-Node-ID", s.pulseState.NodeID)
w.Header().Set("X-BACKBEAT-Version", "2.0.0")
// Log request
s.logger.Debug().
Str("method", r.Method).
Str("path", r.URL.Path).
Str("remote_addr", r.RemoteAddr).
Msg("admin API request")
s.router.ServeHTTP(w, r)
}
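
For context, here is a minimal sketch of how these pieces could be wired into an HTTP server. The `main` package and import path are assumptions; beyond zerolog and net/http it only calls constructors and types defined in the files in this commit:

```go
package main

import (
	"net/http"
	"os"
	"time"

	"github.com/rs/zerolog"

	backbeat "example.com/whoosh/internal/backbeat" // assumed import path
)

func main() {
	logger := zerolog.New(os.Stdout).With().Timestamp().Logger()

	metrics := backbeat.NewMetrics()
	hlc := backbeat.NewHLC("node-1")
	degradation := backbeat.NewDegradationManager(backbeat.DegradationConfig{
		Logger:  logger,
		Metrics: metrics,
	})

	elector, err := backbeat.NewLeaderElector(backbeat.LeaderElectorConfig{
		NodeID:    "node-1",
		Bootstrap: true, // single-node bootstrap for local testing
		Logger:    logger,
	})
	if err != nil {
		logger.Fatal().Err(err).Msg("leader elector setup failed")
	}

	pulse := &backbeat.PulseState{
		ClusterID:  "chorus",
		NodeID:     "node-1",
		BeatIndex:  1,
		TempoBPM:   120,
		PendingBPM: 120,
		BarLength:  8,
		Phases:     []string{"plan", "work", "review"},
		StartTime:  time.Now(),
	}

	admin := backbeat.NewAdminServer(backbeat.AdminConfig{
		PulseState:  pulse,
		Metrics:     metrics,
		Elector:     elector,
		HLC:         hlc,
		Logger:      logger,
		Degradation: degradation,
	})

	// AdminServer implements http.Handler, so it can be served directly.
	if err := http.ListenAndServe(":8099", admin); err != nil {
		logger.Error().Err(err).Msg("admin server stopped")
	}
	elector.Shutdown()
}
```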


@@ -0,0 +1,330 @@
package backbeat
import (
"context"
"fmt"
"math"
"sync"
"time"
"github.com/rs/zerolog"
)
// DegradationManager implements BACKBEAT-REQ-003 (Degrade Local).
// It derives tempo locally while the leader is lost and reconciles state when the leader returns.
type DegradationManager struct {
mu sync.RWMutex
logger zerolog.Logger
// State tracking
inDegradationMode bool
leaderLostAt time.Time
lastLeaderSync time.Time
localTempo int
originalTempo int
// Timing state for BACKBEAT-PER-003 compliance
referenceTime time.Time
referenceBeat int64
expectedBeatTime time.Time
actualBeatTime time.Time
driftAccumulation time.Duration
// Configuration
maxDriftPercent float64 // BACKBEAT-PER-003: 1% max drift
syncTimeout time.Duration
degradationWindow time.Duration
// Metrics
metrics *Metrics
}
// DegradationConfig configures the degradation manager
type DegradationConfig struct {
Logger zerolog.Logger
Metrics *Metrics
MaxDriftPercent float64 // Default: 0.01 (1%)
SyncTimeout time.Duration // Default: 30s
DegradationWindow time.Duration // Default: 5m
}
// NewDegradationManager creates a new degradation manager
func NewDegradationManager(config DegradationConfig) *DegradationManager {
// Set defaults
if config.MaxDriftPercent == 0 {
config.MaxDriftPercent = 0.01 // 1% as per BACKBEAT-PER-003
}
if config.SyncTimeout == 0 {
config.SyncTimeout = 30 * time.Second
}
if config.DegradationWindow == 0 {
config.DegradationWindow = 5 * time.Minute
}
return &DegradationManager{
logger: config.Logger.With().Str("component", "degradation").Logger(),
metrics: config.Metrics,
maxDriftPercent: config.MaxDriftPercent,
syncTimeout: config.SyncTimeout,
degradationWindow: config.DegradationWindow,
referenceTime: time.Now(),
lastLeaderSync: time.Now(),
}
}
// OnLeaderLost is called when leadership is lost, initiating degradation mode
func (d *DegradationManager) OnLeaderLost(currentTempo int, beatIndex int64) {
d.mu.Lock()
defer d.mu.Unlock()
now := time.Now()
d.inDegradationMode = true
d.leaderLostAt = now
d.localTempo = currentTempo
d.originalTempo = currentTempo
d.referenceTime = now
d.referenceBeat = beatIndex
d.driftAccumulation = 0
d.logger.Warn().
Int("tempo_bpm", currentTempo).
Int64("beat_index", beatIndex).
Msg("entered degradation mode - deriving local tempo")
if d.metrics != nil {
d.metrics.UpdateDegradationMode(true)
}
}
// OnLeaderRecovered is called when leadership is restored
func (d *DegradationManager) OnLeaderRecovered(leaderTempo int, leaderBeatIndex int64, hlc string) error {
d.mu.Lock()
defer d.mu.Unlock()
if !d.inDegradationMode {
return nil // Already recovered
}
now := time.Now()
degradationDuration := now.Sub(d.leaderLostAt)
d.logger.Info().
Dur("degradation_duration", degradationDuration).
Int("local_tempo", d.localTempo).
Int("leader_tempo", leaderTempo).
Int64("local_beat", d.referenceBeat).
Int64("leader_beat", leaderBeatIndex).
Str("leader_hlc", hlc).
Msg("reconciling with leader after degradation")
// Calculate drift during degradation period
drift := d.calculateDrift(now)
// Reset degradation state
d.inDegradationMode = false
d.lastLeaderSync = now
d.referenceTime = now
d.referenceBeat = leaderBeatIndex
d.driftAccumulation = 0
d.logger.Info().
Float64("drift_percent", drift*100).
Msg("recovered from degradation mode")
if d.metrics != nil {
d.metrics.UpdateDegradationMode(false)
d.metrics.UpdateDriftMetrics(drift, 0) // Reset HLC drift
}
return nil
}
// UpdateBeatTiming updates timing information for drift calculation
func (d *DegradationManager) UpdateBeatTiming(expectedTime, actualTime time.Time, beatIndex int64) {
d.mu.Lock()
defer d.mu.Unlock()
d.expectedBeatTime = expectedTime
d.actualBeatTime = actualTime
// Accumulate drift if in degradation mode
if d.inDegradationMode {
beatDrift := actualTime.Sub(expectedTime)
d.driftAccumulation += beatDrift.Abs()
// Update metrics
if d.metrics != nil {
drift := d.calculateDrift(actualTime)
d.metrics.UpdateDriftMetrics(drift, 0)
}
}
}
// GetTimerDrift returns the current timer drift ratio for BACKBEAT-PER-003
func (d *DegradationManager) GetTimerDrift() float64 {
d.mu.RLock()
defer d.mu.RUnlock()
if !d.inDegradationMode {
return 0.0 // No drift when synchronized with leader
}
return d.calculateDrift(time.Now())
}
// calculateDrift calculates the current drift ratio (internal method, must be called with lock)
func (d *DegradationManager) calculateDrift(now time.Time) float64 {
if d.referenceTime.IsZero() {
return 0.0
}
elapsed := now.Sub(d.referenceTime)
if elapsed <= 0 {
return 0.0
}
// Calculate expected vs actual timing
expectedDuration := elapsed
actualDuration := elapsed + d.driftAccumulation
if expectedDuration <= 0 {
return 0.0
}
drift := float64(actualDuration-expectedDuration) / float64(expectedDuration)
return math.Abs(drift)
}
// IsInDegradationMode returns true if currently in degradation mode
func (d *DegradationManager) IsInDegradationMode() bool {
d.mu.RLock()
defer d.mu.RUnlock()
return d.inDegradationMode
}
// GetDegradationDuration returns how long we've been in degradation mode
func (d *DegradationManager) GetDegradationDuration() time.Duration {
d.mu.RLock()
defer d.mu.RUnlock()
if !d.inDegradationMode {
return 0
}
return time.Since(d.leaderLostAt)
}
// IsWithinDriftLimits checks if current drift is within BACKBEAT-PER-003 limits
func (d *DegradationManager) IsWithinDriftLimits() bool {
drift := d.GetTimerDrift()
return drift <= d.maxDriftPercent
}
// GetLocalTempo returns the current local tempo when in degradation mode
func (d *DegradationManager) GetLocalTempo() int {
d.mu.RLock()
defer d.mu.RUnlock()
if !d.inDegradationMode {
return 0 // Not applicable when not in degradation
}
return d.localTempo
}
// AdjustLocalTempo allows fine-tuning local tempo to minimize drift
func (d *DegradationManager) AdjustLocalTempo(newTempo int) error {
d.mu.Lock()
defer d.mu.Unlock()
if !d.inDegradationMode {
return fmt.Errorf("cannot adjust local tempo when not in degradation mode")
}
// Validate tempo adjustment (max 5% change from original)
maxChange := float64(d.originalTempo) * 0.05
change := math.Abs(float64(newTempo - d.originalTempo))
if change > maxChange {
return fmt.Errorf("tempo adjustment too large: %.1f BPM (max %.1f BPM)",
change, maxChange)
}
oldTempo := d.localTempo
d.localTempo = newTempo
d.logger.Info().
Int("old_tempo", oldTempo).
Int("new_tempo", newTempo).
Float64("drift_percent", d.calculateDrift(time.Now())*100).
Msg("adjusted local tempo to minimize drift")
return nil
}
// GetState returns the current degradation manager state for debugging
func (d *DegradationManager) GetState() map[string]interface{} {
d.mu.RLock()
defer d.mu.RUnlock()
state := map[string]interface{}{
"in_degradation_mode": d.inDegradationMode,
"local_tempo": d.localTempo,
"original_tempo": d.originalTempo,
"drift_percent": d.calculateDrift(time.Now()) * 100,
"within_limits": d.IsWithinDriftLimits(),
"max_drift_percent": d.maxDriftPercent * 100,
"reference_time": d.referenceTime,
"reference_beat": d.referenceBeat,
"drift_accumulation_ms": d.driftAccumulation.Milliseconds(),
}
if d.inDegradationMode {
state["degradation_duration"] = time.Since(d.leaderLostAt)
state["leader_lost_at"] = d.leaderLostAt
}
return state
}
// MonitorDrift runs a background goroutine to monitor drift and alert on violations
func (d *DegradationManager) MonitorDrift(ctx context.Context) {
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
d.checkDriftLimits()
}
}
}
// checkDriftLimits monitors drift and logs warnings when limits are exceeded
func (d *DegradationManager) checkDriftLimits() {
d.mu.RLock()
inDegradation := d.inDegradationMode
drift := d.calculateDrift(time.Now())
d.mu.RUnlock()
if !inDegradation {
return // No drift monitoring when synchronized
}
driftPercent := drift * 100
if drift > d.maxDriftPercent {
d.logger.Warn().
Float64("drift_percent", driftPercent).
Float64("limit_percent", d.maxDriftPercent*100).
Msg("BACKBEAT-PER-003 violation: timer drift exceeds 1% limit")
} else if drift > d.maxDriftPercent*0.8 {
// Warning at 80% of limit
d.logger.Warn().
Float64("drift_percent", driftPercent).
Float64("limit_percent", d.maxDriftPercent*100).
Msg("approaching drift limit")
}
}
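
A sketch of the degrade-and-reconcile lifecycle this manager implements for BACKBEAT-REQ-003. The import path, beat numbers, and HLC value are illustrative assumptions:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"

	"github.com/rs/zerolog"

	backbeat "example.com/whoosh/internal/backbeat" // assumed import path
)

func main() {
	dm := backbeat.NewDegradationManager(backbeat.DegradationConfig{
		Logger: zerolog.New(os.Stdout),
		// Metrics omitted: the manager guards every metrics call against nil.
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go dm.MonitorDrift(ctx) // logs warnings as drift approaches the 1% limit

	// Leader lost at beat 42 while running at 120 BPM: derive tempo locally.
	dm.OnLeaderLost(120, 42)

	// Each locally scheduled beat reports expected vs. actual firing time.
	expected := time.Now()
	actual := expected.Add(3 * time.Millisecond)
	dm.UpdateBeatTiming(expected, actual, 43)

	fmt.Printf("drift=%.4f%% within limits=%v\n",
		dm.GetTimerDrift()*100, dm.IsWithinDriftLimits())

	// Leader returns at beat 45 with its HLC: reconcile and leave degradation.
	if err := dm.OnLeaderRecovered(120, 45, "7ffd:0001:abcd"); err != nil {
		fmt.Println("reconcile failed:", err)
	}
}
```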


@@ -0,0 +1,165 @@
package backbeat
import (
"fmt"
"strconv"
"strings"
"sync"
"time"
)
// HLC implements Hybrid Logical Clock for BACKBEAT-REQ-003 (degrade local)
// Provides ordering guarantees for distributed events and supports reconciliation
type HLC struct {
mu sync.RWMutex
pt time.Time // physical time
lc int64 // logical counter
nodeID string // node identifier for uniqueness
lastSync time.Time // last successful sync with leader
}
// NewHLC creates a new Hybrid Logical Clock instance
func NewHLC(nodeID string) *HLC {
return &HLC{
pt: time.Now().UTC(),
lc: 0,
nodeID: nodeID,
lastSync: time.Now().UTC(),
}
}
// Next generates the next HLC timestamp
// Format: unix_ms_hex:logical_counter_hex:node_id_suffix
// Example: "7ffd:0001:abcd"
func (h *HLC) Next() string {
h.mu.Lock()
defer h.mu.Unlock()
now := time.Now().UTC()
// BACKBEAT-REQ-003: Support for local time derivation
if now.After(h.pt) {
// Physical time advanced: adopt it and reset the logical counter.
h.pt = now
h.lc = 0
} else {
// Physical time did not advance (or went backwards): keep pt, bump the counter.
h.lc++
}
// Format as compact hex representation
ptMs := h.pt.UnixMilli()
nodeHash := h.nodeID
if len(nodeHash) > 4 {
nodeHash = nodeHash[:4]
}
return fmt.Sprintf("%04x:%04x:%s", ptMs&0xFFFF, h.lc&0xFFFF, nodeHash)
}
// Update synchronizes with an external HLC timestamp
// Used for BACKBEAT-REQ-003 reconciliation with leader
func (h *HLC) Update(remoteHLC string) error {
h.mu.Lock()
defer h.mu.Unlock()
parts := strings.Split(remoteHLC, ":")
if len(parts) != 3 {
return fmt.Errorf("invalid HLC format: %s", remoteHLC)
}
remotePt, err := strconv.ParseInt(parts[0], 16, 64)
if err != nil {
return fmt.Errorf("invalid physical time in HLC: %v", err)
}
remoteLc, err := strconv.ParseInt(parts[1], 16, 64)
if err != nil {
return fmt.Errorf("invalid logical counter in HLC: %v", err)
}
now := time.Now().UTC()
remoteTime := time.UnixMilli(remotePt)
// Update physical time to max(local_time, remote_time, current_time)
maxTime := now
if remoteTime.After(maxTime) {
maxTime = remoteTime
}
if h.pt.After(maxTime) {
maxTime = h.pt
}
// Update logical counter based on HLC algorithm
if maxTime.Equal(h.pt) && maxTime.Equal(remoteTime) {
h.lc = max(h.lc, remoteLc) + 1
} else if maxTime.Equal(h.pt) {
h.lc++
} else if maxTime.Equal(remoteTime) {
h.lc = remoteLc + 1
} else {
h.lc = 0
}
h.pt = maxTime
h.lastSync = now
return nil
}
// GetDrift returns the time since last successful sync with leader
// Used for BACKBEAT-PER-003 (SDK timer drift ≤ 1% over 1 hour)
func (h *HLC) GetDrift() time.Duration {
h.mu.RLock()
defer h.mu.RUnlock()
return time.Since(h.lastSync)
}
// Compare compares two HLC timestamps
// Returns -1 if a < b, 0 if a == b, 1 if a > b
func (h *HLC) Compare(a, b string) int {
partsA := strings.Split(a, ":")
partsB := strings.Split(b, ":")
if len(partsA) != 3 || len(partsB) != 3 {
return 0 // Invalid format, consider equal
}
ptA, _ := strconv.ParseInt(partsA[0], 16, 64)
ptB, _ := strconv.ParseInt(partsB[0], 16, 64)
if ptA != ptB {
if ptA < ptB {
return -1
}
return 1
}
lcA, _ := strconv.ParseInt(partsA[1], 16, 64)
lcB, _ := strconv.ParseInt(partsB[1], 16, 64)
if lcA != lcB {
if lcA < lcB {
return -1
}
return 1
}
// If physical time and logical counter are equal, compare node IDs
if partsA[2] != partsB[2] {
if partsA[2] < partsB[2] {
return -1
}
return 1
}
return 0
}
func max(a, b int64) int64 {
if a > b {
return a
}
return b
}
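
A usage sketch for the clock; the import path and node IDs are assumptions:

```go
package main

import (
	"fmt"

	backbeat "example.com/whoosh/internal/backbeat" // assumed import path
)

func main() {
	local := backbeat.NewHLC("node-a")
	remote := backbeat.NewHLC("node-b")

	a := local.Next() // e.g. "xxxx:0000:node" (truncated physical time, counter, node prefix)
	b := remote.Next()

	// Fold a remote (leader) timestamp into the local clock during reconciliation.
	if err := local.Update(b); err != nil {
		fmt.Println("bad remote HLC:", err)
		return
	}

	// Compare returns -1, 0, or 1 for ordering two HLC strings.
	fmt.Println("order(a,b):", local.Compare(a, b))
	fmt.Println("since last sync:", local.GetDrift())
}
```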


@@ -0,0 +1,336 @@
package backbeat
import (
"context"
"encoding/json"
"fmt"
"io"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/hashicorp/raft"
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
"github.com/rs/zerolog"
)
// LeaderElector implements BACKBEAT-REQ-001 (Pulse Leader)
// Provides pluggable leader election using Raft consensus
type LeaderElector struct {
mu sync.RWMutex
raft *raft.Raft
nodeID string
bindAddr string
dataDir string
isLeader bool
leaderCh chan bool
shutdownCh chan struct{}
logger zerolog.Logger
onBecomeLeader func()
onLoseLeader func()
}
// FSM implements the Raft finite state machine for BACKBEAT state
type BackbeatFSM struct {
mu sync.RWMutex
state map[string]interface{}
}
// LeaderElectorConfig configures the leader election
type LeaderElectorConfig struct {
NodeID string
BindAddr string
DataDir string
Logger zerolog.Logger
OnBecomeLeader func()
OnLoseLeader func()
Bootstrap bool
Peers []string
}
// NewLeaderElector creates a new leader elector for BACKBEAT-REQ-001
func NewLeaderElector(config LeaderElectorConfig) (*LeaderElector, error) {
if config.NodeID == "" {
return nil, fmt.Errorf("node ID is required")
}
if config.BindAddr == "" {
config.BindAddr = "127.0.0.1:0" // Let system assign port
}
if config.DataDir == "" {
config.DataDir = filepath.Join(os.TempDir(), "backbeat-raft-"+config.NodeID)
}
// Create data directory
if err := os.MkdirAll(config.DataDir, 0755); err != nil {
return nil, fmt.Errorf("failed to create data directory: %v", err)
}
le := &LeaderElector{
nodeID: config.NodeID,
bindAddr: config.BindAddr,
dataDir: config.DataDir,
logger: config.Logger.With().Str("component", "leader-elector").Logger(),
leaderCh: make(chan bool, 1),
shutdownCh: make(chan struct{}),
onBecomeLeader: config.OnBecomeLeader,
onLoseLeader: config.OnLoseLeader,
}
if err := le.setupRaft(config.Bootstrap, config.Peers); err != nil {
return nil, fmt.Errorf("failed to setup Raft: %v", err)
}
go le.monitorLeadership()
return le, nil
}
// setupRaft initializes the Raft consensus system
func (le *LeaderElector) setupRaft(bootstrap bool, peers []string) error {
// Create Raft configuration
config := raft.DefaultConfig()
config.LocalID = raft.ServerID(le.nodeID)
config.HeartbeatTimeout = 1 * time.Second
config.ElectionTimeout = 1 * time.Second
config.CommitTimeout = 500 * time.Millisecond
config.LeaderLeaseTimeout = 500 * time.Millisecond
// Setup logging will be handled by Raft's default logger
// Create transport
addr, err := net.ResolveTCPAddr("tcp", le.bindAddr)
if err != nil {
return fmt.Errorf("failed to resolve bind address: %v", err)
}
transport, err := raft.NewTCPTransport(le.bindAddr, addr, 3, 10*time.Second, os.Stderr)
if err != nil {
return fmt.Errorf("failed to create transport: %v", err)
}
// Update bind address with actual port if it was auto-assigned
le.bindAddr = string(transport.LocalAddr())
// Create the snapshot store
snapshots, err := raft.NewFileSnapshotStore(le.dataDir, 2, os.Stderr)
if err != nil {
return fmt.Errorf("failed to create snapshot store: %v", err)
}
// Create the log store and stable store
logStore, err := raftboltdb.NewBoltStore(filepath.Join(le.dataDir, "raft-log.bolt"))
if err != nil {
return fmt.Errorf("failed to create log store: %v", err)
}
stableStore, err := raftboltdb.NewBoltStore(filepath.Join(le.dataDir, "raft-stable.bolt"))
if err != nil {
return fmt.Errorf("failed to create stable store: %v", err)
}
// Create FSM
fsm := &BackbeatFSM{
state: make(map[string]interface{}),
}
// Create Raft instance
r, err := raft.NewRaft(config, fsm, logStore, stableStore, snapshots, transport)
if err != nil {
return fmt.Errorf("failed to create Raft instance: %v", err)
}
le.raft = r
// Bootstrap cluster if needed
if bootstrap {
servers := []raft.Server{
{
ID: config.LocalID,
Address: transport.LocalAddr(),
},
}
// Add peer servers
for _, peer := range peers {
servers = append(servers, raft.Server{
ID: raft.ServerID(peer),
Address: raft.ServerAddress(peer),
})
}
configuration := raft.Configuration{Servers: servers}
r.BootstrapCluster(configuration)
}
return nil
}
// monitorLeadership watches for leadership changes
func (le *LeaderElector) monitorLeadership() {
for {
select {
case isLeader := <-le.raft.LeaderCh():
le.mu.Lock()
wasLeader := le.isLeader
le.isLeader = isLeader
le.mu.Unlock()
if isLeader && !wasLeader {
le.logger.Info().Msg("became leader")
if le.onBecomeLeader != nil {
le.onBecomeLeader()
}
} else if !isLeader && wasLeader {
le.logger.Info().Msg("lost leadership")
if le.onLoseLeader != nil {
le.onLoseLeader()
}
}
// Notify any waiting goroutines
select {
case le.leaderCh <- isLeader:
default:
}
case <-le.shutdownCh:
return
}
}
}
// IsLeader returns true if this node is the current leader
func (le *LeaderElector) IsLeader() bool {
le.mu.RLock()
defer le.mu.RUnlock()
return le.isLeader
}
// GetLeader returns the current leader address
func (le *LeaderElector) GetLeader() string {
if le.raft == nil {
return ""
}
_, leaderAddr := le.raft.LeaderWithID()
return string(leaderAddr)
}
// WaitForLeader blocks until leadership is established (this node or another)
func (le *LeaderElector) WaitForLeader(ctx context.Context) error {
ticker := time.NewTicker(100 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
if leader := le.GetLeader(); leader != "" {
return nil
}
}
}
}
// Shutdown gracefully shuts down the leader elector
func (le *LeaderElector) Shutdown() error {
close(le.shutdownCh)
if le.raft != nil {
shutdownFuture := le.raft.Shutdown()
if err := shutdownFuture.Error(); err != nil {
le.logger.Error().Err(err).Msg("failed to shutdown Raft")
return err
}
}
return nil
}
// GetStats returns Raft statistics for monitoring
func (le *LeaderElector) GetStats() map[string]interface{} {
if le.raft == nil {
return nil
}
stats := le.raft.Stats()
result := make(map[string]interface{})
for k, v := range stats {
result[k] = v
}
result["is_leader"] = le.IsLeader()
result["leader"] = le.GetLeader()
result["node_id"] = le.nodeID
result["bind_addr"] = le.bindAddr
return result
}
// BackbeatFSM implementation
func (fsm *BackbeatFSM) Apply(log *raft.Log) interface{} {
fsm.mu.Lock()
defer fsm.mu.Unlock()
// Parse the command
var cmd map[string]interface{}
if err := json.Unmarshal(log.Data, &cmd); err != nil {
return err
}
// Apply command to state
for k, v := range cmd {
fsm.state[k] = v
}
return nil
}
func (fsm *BackbeatFSM) Snapshot() (raft.FSMSnapshot, error) {
fsm.mu.RLock()
defer fsm.mu.RUnlock()
// Create a copy of the state
state := make(map[string]interface{})
for k, v := range fsm.state {
state[k] = v
}
return &BackbeatSnapshot{state: state}, nil
}
func (fsm *BackbeatFSM) Restore(rc io.ReadCloser) error {
defer rc.Close()
var state map[string]interface{}
decoder := json.NewDecoder(rc)
if err := decoder.Decode(&state); err != nil {
return err
}
fsm.mu.Lock()
defer fsm.mu.Unlock()
fsm.state = state
return nil
}
// BackbeatSnapshot implements raft.FSMSnapshot
type BackbeatSnapshot struct {
state map[string]interface{}
}
func (s *BackbeatSnapshot) Persist(sink raft.SnapshotSink) error {
encoder := json.NewEncoder(sink)
if err := encoder.Encode(s.state); err != nil {
sink.Cancel()
return err
}
return sink.Close()
}
func (s *BackbeatSnapshot) Release() {}


@@ -0,0 +1,376 @@
package backbeat
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Metrics provides comprehensive observability for BACKBEAT pulse service
// Supports BACKBEAT-PER-001, BACKBEAT-PER-002, BACKBEAT-PER-003 monitoring
type Metrics struct {
// BACKBEAT-PER-001: End-to-end delivery p95 ≤ 100ms at 2Hz
BeatPublishDuration prometheus.Histogram
BeatDeliveryLatency prometheus.Histogram
// BACKBEAT-PER-002: Pulse jitter p95 ≤ 20ms
PulseJitter prometheus.Histogram
BeatTiming prometheus.Histogram
// BACKBEAT-PER-003: SDK timer drift ≤ 1% over 1 hour
TimerDrift prometheus.Gauge
HLCDrift prometheus.Gauge
// Leadership and cluster health
IsLeader prometheus.Gauge
LeadershipChanges prometheus.Counter
ClusterSize prometheus.Gauge
// Tempo and beat metrics
CurrentTempo prometheus.Gauge
BeatCounter prometheus.Counter
DownbeatCounter prometheus.Counter
PhaseTransitions prometheus.CounterVec
// Error and degradation metrics
TempoChangeErrors prometheus.Counter
LeadershipLoss prometheus.Counter
DegradationMode prometheus.Gauge
NATSConnectionLoss prometheus.Counter
// Performance metrics
BeatFrameSize prometheus.Histogram
NATSPublishErrors prometheus.Counter
// BACKBEAT-OBS-002: Reverb aggregation metrics
ReverbAgentsReporting prometheus.Gauge
ReverbOnTimeReviews prometheus.Gauge
ReverbTempoDriftMS prometheus.Gauge
ReverbWindowsCompleted prometheus.Counter
ReverbClaimsProcessed prometheus.Counter
ReverbWindowProcessingTime prometheus.Histogram
ReverbBarReportSize prometheus.Histogram
ReverbWindowsActive prometheus.Gauge
ReverbClaimsPerWindow prometheus.Histogram
}
// NewMetrics creates and registers all BACKBEAT metrics
func NewMetrics() *Metrics {
return &Metrics{
// BACKBEAT-PER-001: End-to-end delivery monitoring
BeatPublishDuration: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_beat_publish_duration_seconds",
Help: "Time spent publishing beat frames to NATS",
Namespace: "backbeat",
Subsystem: "pulse",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10), // 1ms to 1s
}),
BeatDeliveryLatency: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_beat_delivery_latency_seconds",
Help: "End-to-end beat delivery latency (BACKBEAT-PER-001: p95 ≤ 100ms)",
Namespace: "backbeat",
Subsystem: "pulse",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
}),
// BACKBEAT-PER-002: Pulse jitter monitoring
PulseJitter: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_pulse_jitter_seconds",
Help: "Beat timing jitter (BACKBEAT-PER-002: p95 ≤ 20ms)",
Namespace: "backbeat",
Subsystem: "pulse",
Buckets: []float64{0.001, 0.005, 0.010, 0.015, 0.020, 0.025, 0.050, 0.100},
}),
BeatTiming: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_beat_timing_accuracy_seconds",
Help: "Accuracy of beat timing relative to expected schedule",
Namespace: "backbeat",
Subsystem: "pulse",
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 12),
}),
// BACKBEAT-PER-003: Timer drift monitoring
TimerDrift: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_timer_drift_ratio",
Help: "Timer drift ratio (BACKBEAT-PER-003: ≤ 1% over 1 hour)",
Namespace: "backbeat",
Subsystem: "pulse",
}),
HLCDrift: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_hlc_drift_seconds",
Help: "HLC drift from last leader sync",
Namespace: "backbeat",
Subsystem: "pulse",
}),
// Leadership metrics
IsLeader: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_is_leader",
Help: "1 if this node is the current leader, 0 otherwise",
Namespace: "backbeat",
Subsystem: "cluster",
}),
LeadershipChanges: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_leadership_changes_total",
Help: "Total number of leadership changes",
Namespace: "backbeat",
Subsystem: "cluster",
}),
ClusterSize: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_cluster_size",
Help: "Number of nodes in the cluster",
Namespace: "backbeat",
Subsystem: "cluster",
}),
// Tempo and beat metrics
CurrentTempo: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_current_tempo_bpm",
Help: "Current tempo in beats per minute",
Namespace: "backbeat",
Subsystem: "pulse",
}),
BeatCounter: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_beats_total",
Help: "Total number of beats published",
Namespace: "backbeat",
Subsystem: "pulse",
}),
DownbeatCounter: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_downbeats_total",
Help: "Total number of downbeats published",
Namespace: "backbeat",
Subsystem: "pulse",
}),
PhaseTransitions: *promauto.NewCounterVec(prometheus.CounterOpts{
Name: "backbeat_phase_transitions_total",
Help: "Total number of phase transitions by phase name",
Namespace: "backbeat",
Subsystem: "pulse",
}, []string{"phase", "from_phase"}),
// Error metrics
TempoChangeErrors: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_tempo_change_errors_total",
Help: "Total number of rejected tempo change requests",
Namespace: "backbeat",
Subsystem: "control",
}),
LeadershipLoss: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_leadership_loss_total",
Help: "Total number of times this node lost leadership",
Namespace: "backbeat",
Subsystem: "cluster",
}),
DegradationMode: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_degradation_mode",
Help: "1 if running in degradation mode (BACKBEAT-REQ-003), 0 otherwise",
Namespace: "backbeat",
Subsystem: "pulse",
}),
NATSConnectionLoss: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_nats_connection_loss_total",
Help: "Total number of NATS connection losses",
Namespace: "backbeat",
Subsystem: "transport",
}),
// Performance metrics
BeatFrameSize: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_beat_frame_size_bytes",
Help: "Size of serialized beat frames",
Namespace: "backbeat",
Subsystem: "pulse",
Buckets: prometheus.ExponentialBuckets(100, 2, 10),
}),
NATSPublishErrors: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_nats_publish_errors_total",
Help: "Total number of NATS publish errors",
Namespace: "backbeat",
Subsystem: "transport",
}),
// BACKBEAT-OBS-002: Reverb aggregation metrics
ReverbAgentsReporting: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_reverb_agents_reporting",
Help: "Number of agents reporting in current window (BACKBEAT-OBS-002)",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbOnTimeReviews: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_reverb_on_time_reviews",
Help: "Number of on-time reviews completed (BACKBEAT-OBS-002)",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbTempoDriftMS: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_reverb_tempo_drift_ms",
Help: "Current tempo drift in milliseconds (BACKBEAT-OBS-002)",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbWindowsCompleted: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_reverb_windows_completed_total",
Help: "Total number of windows completed (BACKBEAT-OBS-002)",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbClaimsProcessed: promauto.NewCounter(prometheus.CounterOpts{
Name: "backbeat_reverb_claims_processed_total",
Help: "Total number of status claims processed (BACKBEAT-OBS-002)",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbWindowProcessingTime: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_reverb_window_processing_seconds",
Help: "Time to process and emit a window report (BACKBEAT-PER-002: ≤ 1 beat)",
Namespace: "backbeat",
Subsystem: "reverb",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12), // 1ms to 4s
}),
ReverbBarReportSize: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_reverb_bar_report_size_bytes",
Help: "Size of serialized bar reports",
Namespace: "backbeat",
Subsystem: "reverb",
Buckets: prometheus.ExponentialBuckets(100, 2, 10),
}),
ReverbWindowsActive: promauto.NewGauge(prometheus.GaugeOpts{
Name: "backbeat_reverb_windows_active",
Help: "Number of active windows being aggregated",
Namespace: "backbeat",
Subsystem: "reverb",
}),
ReverbClaimsPerWindow: promauto.NewHistogram(prometheus.HistogramOpts{
Name: "backbeat_reverb_claims_per_window",
Help: "Number of claims processed per window",
Namespace: "backbeat",
Subsystem: "reverb",
Buckets: prometheus.ExponentialBuckets(1, 2, 15), // 1 to 32k claims
}),
}
}
// RecordBeatPublish records metrics for a published beat
func (m *Metrics) RecordBeatPublish(duration time.Duration, frameSize int, isDownbeat bool, phase string) {
m.BeatPublishDuration.Observe(duration.Seconds())
m.BeatFrameSize.Observe(float64(frameSize))
m.BeatCounter.Inc()
if isDownbeat {
m.DownbeatCounter.Inc()
}
}
// RecordPulseJitter records beat timing jitter
func (m *Metrics) RecordPulseJitter(jitter time.Duration) {
m.PulseJitter.Observe(jitter.Seconds())
}
// RecordBeatTiming records beat timing accuracy
func (m *Metrics) RecordBeatTiming(expectedTime, actualTime time.Time) {
diff := actualTime.Sub(expectedTime).Abs()
m.BeatTiming.Observe(diff.Seconds())
}
// UpdateTempoMetrics updates tempo-related metrics
func (m *Metrics) UpdateTempoMetrics(currentBPM int) {
m.CurrentTempo.Set(float64(currentBPM))
}
// UpdateLeadershipMetrics updates leadership-related metrics
func (m *Metrics) UpdateLeadershipMetrics(isLeader bool, clusterSize int) {
if isLeader {
m.IsLeader.Set(1)
} else {
m.IsLeader.Set(0)
}
m.ClusterSize.Set(float64(clusterSize))
}
// RecordLeadershipChange records a leadership change event
func (m *Metrics) RecordLeadershipChange(becameLeader bool) {
m.LeadershipChanges.Inc()
if !becameLeader {
m.LeadershipLoss.Inc()
}
}
// UpdateDriftMetrics updates drift-related metrics for BACKBEAT-PER-003
func (m *Metrics) UpdateDriftMetrics(timerDriftRatio float64, hlcDriftSeconds float64) {
m.TimerDrift.Set(timerDriftRatio)
m.HLCDrift.Set(hlcDriftSeconds)
}
// UpdateDegradationMode updates degradation mode status
func (m *Metrics) UpdateDegradationMode(inDegradationMode bool) {
if inDegradationMode {
m.DegradationMode.Set(1)
} else {
m.DegradationMode.Set(0)
}
}
// RecordTempoChangeError records a tempo change error
func (m *Metrics) RecordTempoChangeError() {
m.TempoChangeErrors.Inc()
}
// RecordNATSError records NATS-related errors
func (m *Metrics) RecordNATSError(errorType string) {
switch errorType {
case "connection_loss":
m.NATSConnectionLoss.Inc()
case "publish_error":
m.NATSPublishErrors.Inc()
}
}
// RecordPhaseTransition records a phase transition
func (m *Metrics) RecordPhaseTransition(fromPhase, toPhase string) {
m.PhaseTransitions.WithLabelValues(toPhase, fromPhase).Inc()
}
// RecordReverbWindow records metrics for a completed reverb window
func (m *Metrics) RecordReverbWindow(processingTime time.Duration, claimsCount int, agentsReporting int, onTimeReviews int, tempoDriftMS int, reportSize int) {
m.ReverbWindowsCompleted.Inc()
m.ReverbWindowProcessingTime.Observe(processingTime.Seconds())
m.ReverbClaimsPerWindow.Observe(float64(claimsCount))
m.ReverbBarReportSize.Observe(float64(reportSize))
// Update current window metrics
m.ReverbAgentsReporting.Set(float64(agentsReporting))
m.ReverbOnTimeReviews.Set(float64(onTimeReviews))
m.ReverbTempoDriftMS.Set(float64(tempoDriftMS))
}
// RecordReverbClaim records a processed status claim
func (m *Metrics) RecordReverbClaim() {
m.ReverbClaimsProcessed.Inc()
}
// UpdateReverbActiveWindows updates the number of active windows being tracked
func (m *Metrics) UpdateReverbActiveWindows(count int) {
m.ReverbWindowsActive.Set(float64(count))
}
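
As a sketch of how the pulse loop might feed the PER-001/PER-002 instruments after publishing a beat (an illustrative helper written as if it lived in this package; the real publish path is elsewhere in the service):

```go
// recordPublishedBeat is an illustrative helper, not part of the committed file.
func recordPublishedBeat(m *Metrics, publishStart, expected, actual time.Time,
	frameSize int, downbeat bool, phase string) {
	m.RecordBeatPublish(time.Since(publishStart), frameSize, downbeat, phase) // BACKBEAT-PER-001
	m.RecordPulseJitter(actual.Sub(expected).Abs())                           // BACKBEAT-PER-002
	m.RecordBeatTiming(expected, actual)
}
```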


@@ -0,0 +1,15 @@
package backbeat
import "errors"
// PhaseFor returns the phase name for a given beat index (1-indexed) by
// summing per-phase beat budgets. Note: Go map iteration order is not
// deterministic, so with more than one phase the boundaries are not stable;
// callers that need a fixed phase ordering should use an ordered structure.
func PhaseFor(phases map[string]int, beatIndex int) (string, error) {
acc := 0
for name, n := range phases {
acc += n
if beatIndex <= acc {
return name, nil
}
}
return "", errors.New("beat index out of range")
}
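
A quick usage sketch, written as if inside this package (fmt import assumed); the single phase sidesteps the ordering caveat noted above:

```go
func examplePhaseFor() {
	phases := map[string]int{"work": 8}

	if name, err := PhaseFor(phases, 3); err == nil {
		fmt.Println(name) // "work"
	}
	if _, err := PhaseFor(phases, 9); err != nil {
		fmt.Println(err) // "beat index out of range"
	}
}
```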


@@ -0,0 +1,260 @@
package backbeat
import (
"crypto/sha256"
"fmt"
"time"
)
// BeatFrame represents the INT-A specification for BACKBEAT-REQ-002
// BACKBEAT-REQ-002: BeatFrame must emit INT-A with hlc, beat_index, downbeat, phase, deadline_at, tempo_bpm
type BeatFrame struct {
Type string `json:"type"` // INT-A: always "backbeat.beatframe.v1"
ClusterID string `json:"cluster_id"` // INT-A: cluster identifier
BeatIndex int64 `json:"beat_index"` // INT-A: global beat counter (not cyclic)
Downbeat bool `json:"downbeat"` // INT-A: true when beat_index % bar_length == 1
Phase string `json:"phase"` // INT-A: current phase name
HLC string `json:"hlc"` // INT-A: hybrid logical clock timestamp
DeadlineAt time.Time `json:"deadline_at"` // INT-A: RFC3339 timestamp for beat deadline
TempoBPM int `json:"tempo_bpm"` // INT-A: current tempo in beats per minute
WindowID string `json:"window_id"` // BACKBEAT-REQ-005: deterministic window identifier
}
// StatusClaim represents the INT-B specification for BACKBEAT-REQ-020
// BACKBEAT-REQ-020: StatusClaim must include type, agent_id, task_id, beat_index, state, beats_left, progress, notes, hlc
type StatusClaim struct {
Type string `json:"type"` // INT-B: always "backbeat.statusclaim.v1"
AgentID string `json:"agent_id"` // INT-B: agent identifier (e.g., "agent:xyz")
TaskID string `json:"task_id"` // INT-B: task identifier (e.g., "task:123")
BeatIndex int64 `json:"beat_index"` // INT-B: current beat index
State string `json:"state"` // INT-B: executing|planning|waiting|review|done|failed
WaitFor []string `json:"wait_for,omitempty"` // refs (e.g., hmmm://thread/...)
BeatsLeft int `json:"beats_left"` // INT-B: estimated beats remaining
Progress float64 `json:"progress"` // INT-B: progress ratio (0.0-1.0)
Notes string `json:"notes"` // INT-B: status description
HLC string `json:"hlc"` // INT-B: hybrid logical clock timestamp
}
// BarReport represents the INT-C specification for BACKBEAT-REQ-021
// BACKBEAT-REQ-021: BarReport must emit INT-C with window_id, from_beat, to_beat, and KPIs at each downbeat
type BarReport struct {
Type string `json:"type"` // INT-C: always "backbeat.barreport.v1"
WindowID string `json:"window_id"` // INT-C: deterministic window identifier
FromBeat int64 `json:"from_beat"` // INT-C: starting beat index of the window
ToBeat int64 `json:"to_beat"` // INT-C: ending beat index of the window
AgentsReporting int `json:"agents_reporting"` // INT-C: number of unique agents that reported
OnTimeReviews int `json:"on_time_reviews"` // INT-C: tasks completed by deadline
HelpPromisesFulfilled int `json:"help_promises_fulfilled"` // INT-C: help requests fulfilled
SecretRotationsOK bool `json:"secret_rotations_ok"` // INT-C: security rotation status
TempoDriftMS int `json:"tempo_drift_ms"` // INT-C: tempo drift in milliseconds
Issues []string `json:"issues"` // INT-C: list of detected issues
// Internal fields for aggregation (not part of INT-C)
ClusterID string `json:"cluster_id,omitempty"` // For internal routing
StateCounts map[string]int `json:"state_counts,omitempty"` // For debugging
}
// PulseState represents the internal state of the pulse service
type PulseState struct {
ClusterID string
NodeID string
IsLeader bool
BeatIndex int64
TempoBPM int
PendingBPM int
BarLength int
Phases []string
CurrentPhase int
LastDownbeat time.Time
StartTime time.Time
FrozenBeats int
}
// TempoChangeRequest represents a tempo change request with validation
type TempoChangeRequest struct {
TempoBPM int `json:"tempo_bpm"`
Justification string `json:"justification,omitempty"`
}
// GenerateWindowID creates a deterministic window ID per BACKBEAT-REQ-005
// BACKBEAT-REQ-005: window_id = hex(sha256(cluster_id + ":" + downbeat_beat_index))[0:32]
func GenerateWindowID(clusterID string, downbeatBeatIndex int64) string {
input := fmt.Sprintf("%s:%d", clusterID, downbeatBeatIndex)
hash := sha256.Sum256([]byte(input))
return fmt.Sprintf("%x", hash)[:32]
}
// IsDownbeat determines if a given beat index represents a downbeat
func IsDownbeat(beatIndex int64, barLength int) bool {
return (beatIndex-1)%int64(barLength) == 0
}
// GetDownbeatIndex calculates the downbeat index for a given beat
func GetDownbeatIndex(beatIndex int64, barLength int) int64 {
return ((beatIndex-1)/int64(barLength))*int64(barLength) + 1
}
// ValidateTempoChange checks if a tempo change is within acceptable limits
// BACKBEAT-REQ-004: Changes only on next downbeat; ≤±10% delta cap
func ValidateTempoChange(currentBPM, newBPM int) error {
if newBPM <= 0 {
return fmt.Errorf("invalid tempo: must be positive, got %d", newBPM)
}
// Calculate percentage change
delta := float64(newBPM-currentBPM) / float64(currentBPM)
maxDelta := 0.10 // 10% as per BACKBEAT-REQ-004
if delta > maxDelta || delta < -maxDelta {
return fmt.Errorf("tempo change exceeds ±10%% limit: current=%d new=%d delta=%.1f%%",
currentBPM, newBPM, delta*100)
}
return nil
}
// ValidateStatusClaim validates a StatusClaim according to INT-B specification
func ValidateStatusClaim(sc *StatusClaim) error {
if sc.Type != "backbeat.statusclaim.v1" {
return fmt.Errorf("invalid type: expected 'backbeat.statusclaim.v1', got '%s'", sc.Type)
}
if sc.AgentID == "" {
return fmt.Errorf("agent_id is required")
}
if sc.TaskID == "" {
return fmt.Errorf("task_id is required")
}
if sc.BeatIndex <= 0 {
return fmt.Errorf("beat_index must be positive, got %d", sc.BeatIndex)
}
validStates := map[string]bool{
"executing": true,
"planning": true,
"waiting": true,
"review": true,
"done": true,
"failed": true,
}
if !validStates[sc.State] {
return fmt.Errorf("invalid state: must be one of [executing, planning, waiting, review, done, failed], got '%s'", sc.State)
}
if sc.Progress < 0.0 || sc.Progress > 1.0 {
return fmt.Errorf("progress must be between 0.0 and 1.0, got %f", sc.Progress)
}
if sc.HLC == "" {
return fmt.Errorf("hlc is required")
}
return nil
}
// WindowAggregation represents aggregated data for a window
type WindowAggregation struct {
WindowID string
FromBeat int64
ToBeat int64
Claims []*StatusClaim
AgentStates map[string]string // agent_id -> latest state
UniqueAgents map[string]bool // set of agent_ids that reported
StateCounts map[string]int // state -> count
CompletedTasks int // tasks with state "done"
FailedTasks int // tasks with state "failed"
LastUpdated time.Time
}
// NewWindowAggregation creates a new window aggregation
func NewWindowAggregation(windowID string, fromBeat, toBeat int64) *WindowAggregation {
return &WindowAggregation{
WindowID: windowID,
FromBeat: fromBeat,
ToBeat: toBeat,
Claims: make([]*StatusClaim, 0),
AgentStates: make(map[string]string),
UniqueAgents: make(map[string]bool),
StateCounts: make(map[string]int),
LastUpdated: time.Now(),
}
}
// AddClaim adds a status claim to the window aggregation
func (wa *WindowAggregation) AddClaim(claim *StatusClaim) {
wa.Claims = append(wa.Claims, claim)
wa.UniqueAgents[claim.AgentID] = true
// Update agent's latest state
wa.AgentStates[claim.AgentID] = claim.State
// Update state counts
wa.StateCounts[claim.State]++
// Track completed and failed tasks
if claim.State == "done" {
wa.CompletedTasks++
} else if claim.State == "failed" {
wa.FailedTasks++
}
wa.LastUpdated = time.Now()
}
// GenerateBarReport generates a BarReport from the aggregated data
func (wa *WindowAggregation) GenerateBarReport(clusterID string) *BarReport {
// Calculate KPIs based on aggregated data
agentsReporting := len(wa.UniqueAgents)
onTimeReviews := wa.StateCounts["done"] // Tasks completed successfully
// Help promises fulfilled - placeholder calculation
// In a real implementation, this would track help request/response pairs
helpPromisesFulfilled := wa.StateCounts["done"] / 10 // Rough estimate
// Secret rotations OK - placeholder
// In a real implementation, this would check security rotation status
secretRotationsOK := true
// Tempo drift - placeholder calculation
// In a real implementation, this would measure actual tempo drift
tempoDriftMS := 0
// Detect issues based on aggregated data
issues := make([]string, 0)
if wa.FailedTasks > 0 {
issues = append(issues, fmt.Sprintf("%d failed tasks detected", wa.FailedTasks))
}
if agentsReporting == 0 {
issues = append(issues, "no agents reporting in window")
}
return &BarReport{
Type: "backbeat.barreport.v1",
WindowID: wa.WindowID,
FromBeat: wa.FromBeat,
ToBeat: wa.ToBeat,
AgentsReporting: agentsReporting,
OnTimeReviews: onTimeReviews,
HelpPromisesFulfilled: helpPromisesFulfilled,
SecretRotationsOK: secretRotationsOK,
TempoDriftMS: tempoDriftMS,
Issues: issues,
ClusterID: clusterID,
StateCounts: wa.StateCounts,
}
}
// Score represents a YAML-based task score for agent simulation
type Score struct {
Phases map[string]int `yaml:"phases"`
WaitBudget WaitBudget `yaml:"wait_budget"`
}
// WaitBudget represents waiting time budgets for different scenarios
type WaitBudget struct {
Help int `yaml:"help"`
}