// Package sdk provides the BACKBEAT Go SDK for enabling CHORUS services
// to become BACKBEAT-aware with beat synchronization and status emission.
package sdk
|
|
|
|
import (
|
|
"context"
|
|
"crypto/ed25519"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/nats-io/nats.go"
|
|
)
|
|
|
|
// Client interface defines the core BACKBEAT SDK functionality
|
|
// Implements BACKBEAT-REQ-040, 041, 042, 043, 044
|
|
type Client interface {
|
|
// Beat subscription (BACKBEAT-REQ-040)
|
|
OnBeat(callback func(BeatFrame)) error
|
|
OnDownbeat(callback func(BeatFrame)) error
|
|
|
|
// Status emission (BACKBEAT-REQ-041)
|
|
EmitStatusClaim(claim StatusClaim) error
|
|
|
|
// Beat budgets (BACKBEAT-REQ-042)
|
|
WithBeatBudget(n int, fn func() error) error
|
|
|
|
// Utilities
|
|
GetCurrentBeat() int64
|
|
GetCurrentWindow() string
|
|
IsInWindow(windowID string) bool
|
|
GetCurrentTempo() int
|
|
GetTempoDrift() time.Duration
|
|
|
|
// Lifecycle management
|
|
Start(ctx context.Context) error
|
|
Stop() error
|
|
Health() HealthStatus
|
|
}
|
|
|
|
// Config holds the settings used to build an SDK client.
type Config struct {
	// ClusterID is the BACKBEAT cluster identifier.
	ClusterID string
	// AgentID is the unique agent identifier.
	AgentID string
	// NATSUrl is the NATS connection URL.
	NATSUrl string
	// SigningKey is the Ed25519 private key used for signing (BACKBEAT-REQ-044).
	SigningKey ed25519.PrivateKey
	// Logger is the structured logger.
	Logger *slog.Logger
	// JitterTolerance is the maximum jitter tolerance (default: 50ms).
	JitterTolerance time.Duration
	// ReconnectDelay is the NATS reconnection delay (default: 1s).
	ReconnectDelay time.Duration
	// MaxReconnects is the maximum number of reconnection attempts
	// (default: -1 for infinite).
	MaxReconnects int
}
|
|
|
|
// DefaultConfig returns a Config with sensible defaults
|
|
func DefaultConfig() *Config {
|
|
return &Config{
|
|
JitterTolerance: 50 * time.Millisecond,
|
|
ReconnectDelay: 1 * time.Second,
|
|
MaxReconnects: -1, // Infinite reconnects
|
|
Logger: slog.Default(),
|
|
}
|
|
}
|
|
|
|
// BeatFrame is a single beat frame together with its timing information.
// Field names on the wire are given by the json tags.
type BeatFrame struct {
	Type       string    `json:"type"`
	ClusterID  string    `json:"cluster_id"`
	BeatIndex  int64     `json:"beat_index"`
	Downbeat   bool      `json:"downbeat"`
	Phase      string    `json:"phase"`
	HLC        string    `json:"hlc"`
	DeadlineAt time.Time `json:"deadline_at"`
	TempoBPM   int       `json:"tempo_bpm"`
	WindowID   string    `json:"window_id"`
}
|
|
|
|
// StatusClaim is the payload of a status claim emission.
type StatusClaim struct {
	// Fields filled in by the SDK.

	// Type is always "backbeat.statusclaim.v1".
	Type string `json:"type"`
	// AgentID is auto-populated from config.
	AgentID string `json:"agent_id"`
	// TaskID is auto-generated if not provided.
	TaskID string `json:"task_id"`
	// BeatIndex is auto-populated from the current beat.
	BeatIndex int64 `json:"beat_index"`
	// HLC is auto-populated from the current HLC.
	HLC string `json:"hlc"`

	// Fields supplied by the user.

	// State is one of executing|planning|waiting|review|done|failed.
	State string `json:"state"`
	// WaitFor lists refs (e.g., hmmm://thread/...).
	WaitFor []string `json:"wait_for,omitempty"`
	// BeatsLeft is the estimated number of beats remaining.
	BeatsLeft int `json:"beats_left"`
	// Progress is the progress ratio (0.0-1.0).
	Progress float64 `json:"progress"`
	// Notes is the status description.
	Notes string `json:"notes"`
}
|
|
|
|
// HealthStatus is a snapshot of the SDK client's current health.
type HealthStatus struct {
	Connected        bool          `json:"connected"`
	LastBeat         int64         `json:"last_beat"`
	LastBeatTime     time.Time     `json:"last_beat_time"`
	TimeDrift        time.Duration `json:"time_drift"`
	ReconnectCount   int           `json:"reconnect_count"`
	LocalDegradation bool          `json:"local_degradation"`
	CurrentTempo     int           `json:"current_tempo"`
	TempoDrift       time.Duration `json:"tempo_drift"`
	MeasuredBPM      float64       `json:"measured_bpm"`
	Errors           []string      `json:"errors,omitempty"`
}
|
|
|
|
// LegacyBeatInfo carries legacy {bar,beat} information.
// It exists for BACKBEAT-REQ-043 compatibility.
type LegacyBeatInfo struct {
	Bar  int `json:"bar"`
	Beat int `json:"beat"`
}
|
|
|
|
// tempoSample is one tempo measurement used for drift calculation.
type tempoSample struct {
	BeatIndex    int64
	Tempo        int
	MeasuredTime time.Time
	// ActualBPM is the measured BPM based on inter-beat timing.
	ActualBPM float64
}
|
|
|
|
// client implements the Client interface
|
|
type client struct {
|
|
config *Config
|
|
nc *nats.Conn
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
wg sync.WaitGroup
|
|
|
|
// Beat tracking
|
|
currentBeat int64
|
|
currentWindow string
|
|
currentHLC string
|
|
lastBeatTime time.Time
|
|
currentTempo int // Current tempo in BPM
|
|
lastTempo int // Last known tempo for drift calculation
|
|
tempoHistory []tempoSample // History for drift calculation
|
|
beatMutex sync.RWMutex
|
|
|
|
// Callbacks
|
|
beatCallbacks []func(BeatFrame)
|
|
downbeatCallbacks []func(BeatFrame)
|
|
callbackMutex sync.RWMutex
|
|
|
|
// Health and metrics
|
|
reconnectCount int
|
|
localDegradation bool
|
|
errors []string
|
|
errorMutex sync.RWMutex
|
|
metrics *Metrics
|
|
|
|
// Beat budget tracking
|
|
budgetContexts map[string]context.CancelFunc
|
|
budgetMutex sync.Mutex
|
|
|
|
// Legacy compatibility
|
|
legacyWarned bool
|
|
legacyMutex sync.Mutex
|
|
}
|
|
|
|
// NewClient creates a new BACKBEAT SDK client
|
|
func NewClient(config *Config) Client {
|
|
if config.Logger == nil {
|
|
config.Logger = slog.Default()
|
|
}
|
|
|
|
c := &client{
|
|
config: config,
|
|
beatCallbacks: make([]func(BeatFrame), 0),
|
|
downbeatCallbacks: make([]func(BeatFrame), 0),
|
|
budgetContexts: make(map[string]context.CancelFunc),
|
|
errors: make([]string, 0),
|
|
tempoHistory: make([]tempoSample, 0, 100),
|
|
currentTempo: 60, // Default to 60 BPM
|
|
}
|
|
|
|
// Initialize metrics
|
|
prefix := fmt.Sprintf("backbeat.sdk.%s", config.AgentID)
|
|
c.metrics = NewMetrics(prefix)
|
|
|
|
return c
|
|
}
|
|
|
|
// Start initializes the client and begins beat synchronization
|
|
func (c *client) Start(ctx context.Context) error {
|
|
c.ctx, c.cancel = context.WithCancel(ctx)
|
|
|
|
if err := c.connect(); err != nil {
|
|
return fmt.Errorf("failed to connect to NATS: %w", err)
|
|
}
|
|
|
|
c.wg.Add(1)
|
|
go c.beatSubscriptionLoop()
|
|
|
|
c.config.Logger.Info("BACKBEAT SDK client started",
|
|
slog.String("cluster_id", c.config.ClusterID),
|
|
slog.String("agent_id", c.config.AgentID))
|
|
|
|
return nil
|
|
}
|
|
|
|
// Stop gracefully stops the client
|
|
func (c *client) Stop() error {
|
|
if c.cancel != nil {
|
|
c.cancel()
|
|
}
|
|
|
|
// Cancel all active beat budgets
|
|
c.budgetMutex.Lock()
|
|
for id, cancel := range c.budgetContexts {
|
|
cancel()
|
|
delete(c.budgetContexts, id)
|
|
}
|
|
c.budgetMutex.Unlock()
|
|
|
|
if c.nc != nil {
|
|
c.nc.Close()
|
|
}
|
|
|
|
c.wg.Wait()
|
|
|
|
c.config.Logger.Info("BACKBEAT SDK client stopped")
|
|
return nil
|
|
}
|
|
|
|
// OnBeat registers a callback for beat events (BACKBEAT-REQ-040)
|
|
func (c *client) OnBeat(callback func(BeatFrame)) error {
|
|
if callback == nil {
|
|
return fmt.Errorf("callback cannot be nil")
|
|
}
|
|
|
|
c.callbackMutex.Lock()
|
|
defer c.callbackMutex.Unlock()
|
|
|
|
c.beatCallbacks = append(c.beatCallbacks, callback)
|
|
return nil
|
|
}
|
|
|
|
// OnDownbeat registers a callback for downbeat events (BACKBEAT-REQ-040)
|
|
func (c *client) OnDownbeat(callback func(BeatFrame)) error {
|
|
if callback == nil {
|
|
return fmt.Errorf("callback cannot be nil")
|
|
}
|
|
|
|
c.callbackMutex.Lock()
|
|
defer c.callbackMutex.Unlock()
|
|
|
|
c.downbeatCallbacks = append(c.downbeatCallbacks, callback)
|
|
return nil
|
|
}
|
|
|
|
// EmitStatusClaim emits a status claim (BACKBEAT-REQ-041)
|
|
func (c *client) EmitStatusClaim(claim StatusClaim) error {
|
|
// Auto-populate required fields
|
|
claim.Type = "backbeat.statusclaim.v1"
|
|
claim.AgentID = c.config.AgentID
|
|
claim.BeatIndex = c.GetCurrentBeat()
|
|
claim.HLC = c.getCurrentHLC()
|
|
|
|
// Auto-generate task ID if not provided
|
|
if claim.TaskID == "" {
|
|
claim.TaskID = fmt.Sprintf("task:%s", uuid.New().String()[:8])
|
|
}
|
|
|
|
// Validate the claim
|
|
if err := c.validateStatusClaim(&claim); err != nil {
|
|
return fmt.Errorf("invalid status claim: %w", err)
|
|
}
|
|
|
|
// Sign the claim if signing key is available (BACKBEAT-REQ-044)
|
|
if c.config.SigningKey != nil {
|
|
if err := c.signStatusClaim(&claim); err != nil {
|
|
return fmt.Errorf("failed to sign status claim: %w", err)
|
|
}
|
|
}
|
|
|
|
// Publish to NATS
|
|
data, err := json.Marshal(claim)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal status claim: %w", err)
|
|
}
|
|
|
|
subject := fmt.Sprintf("backbeat.status.%s", c.config.ClusterID)
|
|
headers := c.createHeaders()
|
|
|
|
msg := &nats.Msg{
|
|
Subject: subject,
|
|
Data: data,
|
|
Header: headers,
|
|
}
|
|
|
|
if err := c.nc.PublishMsg(msg); err != nil {
|
|
c.addError(fmt.Sprintf("failed to publish status claim: %v", err))
|
|
c.metrics.RecordStatusClaim(false)
|
|
return fmt.Errorf("failed to publish status claim: %w", err)
|
|
}
|
|
|
|
c.metrics.RecordStatusClaim(true)
|
|
c.config.Logger.Debug("Status claim emitted",
|
|
slog.String("agent_id", claim.AgentID),
|
|
slog.String("task_id", claim.TaskID),
|
|
slog.String("state", claim.State),
|
|
slog.Int64("beat_index", claim.BeatIndex))
|
|
|
|
return nil
|
|
}
|
|
|
|
// WithBeatBudget executes a function with a beat-based timeout (BACKBEAT-REQ-042)
|
|
func (c *client) WithBeatBudget(n int, fn func() error) error {
|
|
if n <= 0 {
|
|
return fmt.Errorf("beat budget must be positive, got %d", n)
|
|
}
|
|
|
|
// Calculate timeout based on current tempo
|
|
currentBeat := c.GetCurrentBeat()
|
|
beatDuration := c.getBeatDuration()
|
|
timeout := time.Duration(n) * beatDuration
|
|
|
|
// Use background context if client context is not set (for testing)
|
|
baseCtx := c.ctx
|
|
if baseCtx == nil {
|
|
baseCtx = context.Background()
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(baseCtx, timeout)
|
|
defer cancel()
|
|
|
|
// Track the budget context for cancellation
|
|
budgetID := uuid.New().String()
|
|
c.budgetMutex.Lock()
|
|
c.budgetContexts[budgetID] = cancel
|
|
c.budgetMutex.Unlock()
|
|
|
|
// Record budget creation
|
|
c.metrics.RecordBudgetCreated()
|
|
|
|
defer func() {
|
|
c.budgetMutex.Lock()
|
|
delete(c.budgetContexts, budgetID)
|
|
c.budgetMutex.Unlock()
|
|
}()
|
|
|
|
// Execute function with timeout
|
|
done := make(chan error, 1)
|
|
go func() {
|
|
done <- fn()
|
|
}()
|
|
|
|
select {
|
|
case err := <-done:
|
|
c.metrics.RecordBudgetCompleted(false) // Not timed out
|
|
if err != nil {
|
|
c.config.Logger.Debug("Beat budget function completed with error",
|
|
slog.Int("budget", n),
|
|
slog.Int64("start_beat", currentBeat),
|
|
slog.String("error", err.Error()))
|
|
} else {
|
|
c.config.Logger.Debug("Beat budget function completed successfully",
|
|
slog.Int("budget", n),
|
|
slog.Int64("start_beat", currentBeat))
|
|
}
|
|
return err
|
|
case <-ctx.Done():
|
|
c.metrics.RecordBudgetCompleted(true) // Timed out
|
|
c.config.Logger.Warn("Beat budget exceeded",
|
|
slog.Int("budget", n),
|
|
slog.Int64("start_beat", currentBeat),
|
|
slog.Duration("timeout", timeout))
|
|
return fmt.Errorf("beat budget of %d beats exceeded", n)
|
|
}
|
|
}
|
|
|
|
// GetCurrentBeat returns the current beat index
|
|
func (c *client) GetCurrentBeat() int64 {
|
|
c.beatMutex.RLock()
|
|
defer c.beatMutex.RUnlock()
|
|
return c.currentBeat
|
|
}
|
|
|
|
// GetCurrentWindow returns the current window ID
|
|
func (c *client) GetCurrentWindow() string {
|
|
c.beatMutex.RLock()
|
|
defer c.beatMutex.RUnlock()
|
|
return c.currentWindow
|
|
}
|
|
|
|
// IsInWindow checks if we're currently in the specified window
|
|
func (c *client) IsInWindow(windowID string) bool {
|
|
return c.GetCurrentWindow() == windowID
|
|
}
|
|
|
|
// GetCurrentTempo returns the current tempo in BPM
|
|
func (c *client) GetCurrentTempo() int {
|
|
c.beatMutex.RLock()
|
|
defer c.beatMutex.RUnlock()
|
|
return c.currentTempo
|
|
}
|
|
|
|
// GetTempoDrift calculates the drift between expected and actual tempo
|
|
func (c *client) GetTempoDrift() time.Duration {
|
|
c.beatMutex.RLock()
|
|
defer c.beatMutex.RUnlock()
|
|
|
|
if len(c.tempoHistory) < 2 {
|
|
return 0
|
|
}
|
|
|
|
// Calculate average measured BPM from recent samples
|
|
historyLen := len(c.tempoHistory)
|
|
recentCount := 10
|
|
if historyLen < recentCount {
|
|
recentCount = historyLen
|
|
}
|
|
|
|
recent := c.tempoHistory[historyLen-recentCount:]
|
|
if len(recent) < 2 {
|
|
recent = c.tempoHistory
|
|
}
|
|
|
|
totalBPM := 0.0
|
|
for _, sample := range recent {
|
|
totalBPM += sample.ActualBPM
|
|
}
|
|
avgMeasuredBPM := totalBPM / float64(len(recent))
|
|
|
|
// Calculate drift
|
|
expectedBeatDuration := 60.0 / float64(c.currentTempo)
|
|
actualBeatDuration := 60.0 / avgMeasuredBPM
|
|
|
|
drift := actualBeatDuration - expectedBeatDuration
|
|
return time.Duration(drift * float64(time.Second))
|
|
}
|
|
|
|
// Health returns the current health status
|
|
func (c *client) Health() HealthStatus {
|
|
c.errorMutex.RLock()
|
|
errors := make([]string, len(c.errors))
|
|
copy(errors, c.errors)
|
|
c.errorMutex.RUnlock()
|
|
|
|
c.beatMutex.RLock()
|
|
timeDrift := time.Since(c.lastBeatTime)
|
|
currentTempo := c.currentTempo
|
|
|
|
// Calculate measured BPM from recent tempo history
|
|
measuredBPM := 60.0 // Default
|
|
if len(c.tempoHistory) > 0 {
|
|
historyLen := len(c.tempoHistory)
|
|
recentCount := 5
|
|
if historyLen < recentCount {
|
|
recentCount = historyLen
|
|
}
|
|
|
|
recent := c.tempoHistory[historyLen-recentCount:]
|
|
totalBPM := 0.0
|
|
for _, sample := range recent {
|
|
totalBPM += sample.ActualBPM
|
|
}
|
|
measuredBPM = totalBPM / float64(len(recent))
|
|
}
|
|
c.beatMutex.RUnlock()
|
|
|
|
tempoDrift := c.GetTempoDrift()
|
|
|
|
return HealthStatus{
|
|
Connected: c.nc != nil && c.nc.IsConnected(),
|
|
LastBeat: c.GetCurrentBeat(),
|
|
LastBeatTime: c.lastBeatTime,
|
|
TimeDrift: timeDrift,
|
|
ReconnectCount: c.reconnectCount,
|
|
LocalDegradation: c.localDegradation,
|
|
CurrentTempo: currentTempo,
|
|
TempoDrift: tempoDrift,
|
|
MeasuredBPM: measuredBPM,
|
|
Errors: errors,
|
|
}
|
|
} |