Integrate BACKBEAT SDK and resolve KACHING license validation

Major integrations and fixes:
- Added BACKBEAT SDK integration for P2P operation timing
- Implemented beat-aware status tracking for distributed operations
- Added Docker secrets support for secure license management
- Resolved KACHING license validation via HTTPS/TLS
- Updated docker-compose configuration for clean stack deployment
- Disabled rollback policies to prevent deployment failures
- Added license credential storage (CHORUS-DEV-MULTI-001)

Technical improvements:
- BACKBEAT P2P operation tracking with phase management
- Enhanced configuration system with file-based secrets
- Improved error handling for license validation
- Clean separation of KACHING and CHORUS deployment stacks

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
anthonyrawlins
2025-09-06 07:56:26 +10:00
parent 543ab216f9
commit 9bdcbe0447
4730 changed files with 1480093 additions and 1916 deletions

View File

@@ -0,0 +1,426 @@
package sdk
import (
"crypto/ed25519"
"crypto/sha256"
"encoding/json"
"fmt"
"time"
"github.com/nats-io/nats.go"
)
// connect dials the NATS server at c.config.NATSUrl and stores the resulting
// connection in c.nc.
//
// The connection is configured with the reconnect delay and reconnect limit
// from the client config, plus handlers that forward reconnect, disconnect and
// close events to the metrics recorder and the configured logger. It returns a
// wrapped error when the initial dial fails.
func (c *client) connect() error {
	// Invoked by the NATS client after a reconnect.
	onReconnect := func(conn *nats.Conn) {
		c.reconnectCount++
		c.metrics.RecordConnection()
		c.config.Logger.Info("NATS reconnected",
			"reconnect_count", c.reconnectCount,
			"url", conn.ConnectedUrl())
	}

	// Invoked on disconnect; a nil cause is ignored entirely.
	onDisconnect := func(_ *nats.Conn, cause error) {
		if cause == nil {
			return
		}
		c.metrics.RecordDisconnection()
		c.addError(fmt.Sprintf("NATS disconnected: %v", cause))
		c.config.Logger.Warn("NATS disconnected", "error", cause)
	}

	// Invoked when the connection is closed.
	onClosed := func(_ *nats.Conn) {
		c.metrics.RecordDisconnection()
		c.config.Logger.Info("NATS connection closed")
	}

	conn, err := nats.Connect(
		c.config.NATSUrl,
		nats.ReconnectWait(c.config.ReconnectDelay),
		nats.MaxReconnects(c.config.MaxReconnects),
		nats.ReconnectHandler(onReconnect),
		nats.DisconnectErrHandler(onDisconnect),
		nats.ClosedHandler(onClosed),
	)
	if err != nil {
		c.metrics.RecordError(fmt.Sprintf("NATS connection failed: %v", err))
		return fmt.Errorf("failed to connect to NATS: %w", err)
	}

	c.nc = conn
	c.metrics.RecordConnection()
	c.config.Logger.Info("Connected to NATS", "url", conn.ConnectedUrl())
	return nil
}
// beatSubscriptionLoop subscribes to the cluster's beat subject and then
// watches for missing beats until the client context is cancelled.
//
// Once per second it compares the time since the last recorded beat with two
// beat intervals at the current tempo (see getBeatDuration, which falls back
// to 60 BPM when no tempo is known). When that gap is exceeded it switches to
// local degradation mode and emits a synthetic beat; otherwise it leaves
// degradation mode if it was active.
//
// The function calls c.wg.Done when it returns, and returns early if the
// subscription cannot be created.
func (c *client) beatSubscriptionLoop() {
	defer c.wg.Done()

	subject := fmt.Sprintf("backbeat.beat.%s", c.config.ClusterID)

	// Subscribe to beat frames
	sub, err := c.nc.Subscribe(subject, c.handleBeatFrame)
	if err != nil {
		c.addError(fmt.Sprintf("failed to subscribe to beats: %v", err))
		c.config.Logger.Error("Failed to subscribe to beats", "error", err)
		return
	}
	// Best-effort cleanup: the returned error is deliberately not inspected.
	defer sub.Unsubscribe()

	c.config.Logger.Info("Beat subscription active", "subject", subject)

	// Local watchdog that checks for missing beats once per second.
	localTicker := time.NewTicker(1 * time.Second)
	defer localTicker.Stop()

	for {
		select {
		case <-c.ctx.Done():
			return
		case <-localTicker.C:
			c.beatMutex.RLock()
			timeSinceLastBeat := time.Since(c.lastBeatTime)
			c.beatMutex.RUnlock()

			// Two beat intervals at the current tempo. A fixed two-second
			// limit would flag every gap between beats as a lost pulse for
			// any tempo slower than 30 BPM. getBeatDuration takes beatMutex
			// itself, so it must be called after the read lock is released.
			degradationThreshold := 2 * c.getBeatDuration()

			// NOTE(review): c.localDegradation is read and written here
			// without a lock, and handleBeatFrame (the subscription callback)
			// also writes it — confirm whether these can run concurrently.
			// NOTE(review): handleLocalDegradationBeat refreshes
			// c.lastBeatTime, so the next tick can take the "exit" branch
			// even though no real beat arrived — confirm this is intended.
			if timeSinceLastBeat > degradationThreshold {
				if !c.localDegradation {
					c.localDegradation = true
					c.config.Logger.Warn("Entering local degradation mode",
						"time_since_last_beat", timeSinceLastBeat)
				}
				c.handleLocalDegradationBeat()
				c.metrics.RecordLocalDegradation(timeSinceLastBeat)
			} else if c.localDegradation {
				// A beat was recorded recently enough: leave degradation mode.
				c.localDegradation = false
				c.config.Logger.Info("Exiting local degradation mode")
			}
		}
	}
}
// handleBeatFrame is the message handler registered for the beat subject.
//
// It decodes the message payload into a BeatFrame, rejects frames whose Type
// is not "backbeat.beatframe.v1", logs arrival jitter that exceeds the
// configured tolerance, updates the client's beat/window/HLC/tempo state,
// records a tempo sample and beat metrics, clears local degradation mode, and
// finally runs every registered beat callback (plus downbeat callbacks when
// the frame is a downbeat), each in its own goroutine.
func (c *client) handleBeatFrame(msg *nats.Msg) {
	var frame BeatFrame
	if err := json.Unmarshal(msg.Data, &frame); err != nil {
		c.addError(fmt.Sprintf("failed to unmarshal beat frame: %v", err))
		return
	}
	if frame.Type != "backbeat.beatframe.v1" {
		c.addError(fmt.Sprintf("invalid beat frame type: %s", frame.Type))
		return
	}

	receivedAt := time.Now()

	// A beat is expected one beat duration before its deadline. The duration
	// used here is based on the tempo known before this frame is applied.
	expectedArrival := frame.DeadlineAt.Add(-c.getBeatDuration())
	if jitter := receivedAt.Sub(expectedArrival); jitter.Abs() > c.config.JitterTolerance {
		c.config.Logger.Debug("Beat jitter detected",
			"jitter", jitter,
			"tolerance", c.config.JitterTolerance,
			"beat_index", frame.BeatIndex)
	}

	// Everything between Lock and Unlock touches beat state. The unlock is
	// explicit (not deferred) because getBeatDuration below takes the same
	// mutex.
	c.beatMutex.Lock()
	c.currentBeat = frame.BeatIndex
	c.currentWindow = frame.WindowID
	c.currentHLC = frame.HLC

	// Remember the previous tempo whenever the announced tempo changes.
	if frame.TempoBPM != c.currentTempo {
		c.lastTempo = c.currentTempo
		c.currentTempo = frame.TempoBPM
	}

	// Measured BPM from the gap to the previous beat; stays at 60 when there
	// is no previous beat or the gap is not positive.
	measuredBPM := 60.0
	if !c.lastBeatTime.IsZero() {
		if gap := receivedAt.Sub(c.lastBeatTime); gap > 0 {
			measuredBPM = 60.0 / gap.Seconds()
		}
	}

	c.tempoHistory = append(c.tempoHistory, tempoSample{
		BeatIndex:    frame.BeatIndex,
		Tempo:        frame.TempoBPM,
		MeasuredTime: receivedAt,
		ActualBPM:    measuredBPM,
	})
	// Bound the history to the most recent 100 samples.
	if len(c.tempoHistory) > 100 {
		c.tempoHistory = c.tempoHistory[1:]
	}

	c.lastBeatTime = receivedAt
	c.beatMutex.Unlock()

	// The expected time is recomputed here, after the tempo update above, so
	// it reflects the tempo carried by this frame.
	c.metrics.RecordBeat(frame.DeadlineAt.Add(-c.getBeatDuration()), receivedAt, frame.Downbeat)

	// A received beat ends local degradation mode.
	if c.localDegradation {
		c.localDegradation = false
		c.config.Logger.Info("Exiting local degradation mode - beat received")
	}

	// Snapshot the callback lists under the read lock so the callbacks
	// themselves run without holding it.
	c.callbackMutex.RLock()
	onBeat := append([]func(BeatFrame)(nil), c.beatCallbacks...)
	var onDownbeat []func(BeatFrame)
	if frame.Downbeat {
		onDownbeat = append(onDownbeat, c.downbeatCallbacks...)
	}
	c.callbackMutex.RUnlock()

	// One goroutine per callback so a slow callback does not hold up this
	// handler. onDownbeat is empty unless the frame is a downbeat.
	for _, cb := range onBeat {
		go c.safeExecuteCallback(cb, frame, "beat")
	}
	for _, cb := range onDownbeat {
		go c.safeExecuteCallback(cb, frame, "downbeat")
	}

	c.config.Logger.Debug("Beat processed",
		"beat_index", frame.BeatIndex,
		"downbeat", frame.Downbeat,
		"phase", frame.Phase,
		"window_id", frame.WindowID)
}
// handleLocalDegradationBeat advances the local beat counter by one and
// fabricates a BeatFrame with Phase "degraded" for it.
//
// The synthetic frame's window ID and HLC become the client's current values,
// the last-beat time is refreshed, and all registered beat callbacks (plus
// downbeat callbacks on every fourth beat) are run in their own goroutines.
func (c *client) handleLocalDegradationBeat() {
	c.beatMutex.Lock()
	c.currentBeat++
	index := c.currentBeat
	stamp := time.Now()

	synthetic := BeatFrame{
		Type:      "backbeat.beatframe.v1",
		ClusterID: c.config.ClusterID,
		BeatIndex: index,
		// Beats 1, 5, 9, ... are treated as downbeats (four beats per bar).
		Downbeat:   (index-1)%4 == 0,
		Phase:      "degraded",
		HLC:        fmt.Sprintf("%d-0", stamp.UnixNano()),
		DeadlineAt: stamp.Add(time.Second),
		// NOTE(review): the frame advertises 2 BPM while its deadline is one
		// second away — confirm which of the two is intended.
		TempoBPM: 2,
		WindowID: c.generateDegradedWindowID(index),
	}

	c.currentWindow = synthetic.WindowID
	c.currentHLC = synthetic.HLC
	c.lastBeatTime = stamp
	c.beatMutex.Unlock()

	// Snapshot the callback lists under the read lock so the callbacks run
	// without holding it.
	c.callbackMutex.RLock()
	onBeat := append([]func(BeatFrame)(nil), c.beatCallbacks...)
	var onDownbeat []func(BeatFrame)
	if synthetic.Downbeat {
		onDownbeat = append(onDownbeat, c.downbeatCallbacks...)
	}
	c.callbackMutex.RUnlock()

	for _, cb := range onBeat {
		go c.safeExecuteCallback(cb, synthetic, "degraded-beat")
	}
	// onDownbeat is empty unless the synthetic frame is a downbeat.
	for _, cb := range onDownbeat {
		go c.safeExecuteCallback(cb, synthetic, "degraded-downbeat")
	}
}
// safeExecuteCallback runs callback with beat and shields the caller from a
// panic inside it.
//
// A recovered panic is added to the client's error list, reported to metrics
// and logged. When the callback returns normally its latency is recorded
// under callbackType, and a warning is logged if it took longer than 5ms.
// No latency is recorded for a callback that panics.
func (c *client) safeExecuteCallback(callback func(BeatFrame), beat BeatFrame, callbackType string) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		errMsg := fmt.Sprintf("panic in %s callback: %v", callbackType, r)
		c.addError(errMsg)
		c.metrics.RecordError(errMsg)
		c.config.Logger.Error("Callback panic recovered",
			"type", callbackType,
			"panic", r,
			"beat_index", beat.BeatIndex)
	}()

	began := time.Now()
	callback(beat)
	elapsed := time.Since(began)

	c.metrics.RecordCallbackLatency(elapsed, callbackType)

	// Fast callbacks need no further attention.
	if elapsed <= 5*time.Millisecond {
		return
	}
	c.config.Logger.Warn("Slow callback detected",
		"type", callbackType,
		"duration", elapsed,
		"beat_index", beat.BeatIndex)
}
// validateStatusClaim checks that a status claim is well-formed.
//
// It returns an error when:
//   - claim is nil,
//   - State is empty or not one of executing, planning, waiting, review,
//     done, failed,
//   - Progress is outside [0.0, 1.0] or is NaN,
//   - BeatsLeft is negative.
//
// It returns nil when every check passes. The claim is not modified.
func (c *client) validateStatusClaim(claim *StatusClaim) error {
	if claim == nil {
		return fmt.Errorf("status claim is required")
	}

	if claim.State == "" {
		return fmt.Errorf("state is required")
	}

	switch claim.State {
	case "executing", "planning", "waiting", "review", "done", "failed":
		// Recognised state.
	default:
		return fmt.Errorf("invalid state: must be one of [executing, planning, waiting, review, done, failed], got '%s'", claim.State)
	}

	// Written as a negated in-range test on purpose: every comparison with
	// NaN is false, so "p < 0 || p > 1" would let NaN through.
	if !(claim.Progress >= 0.0 && claim.Progress <= 1.0) {
		return fmt.Errorf("progress must be between 0.0 and 1.0, got %f", claim.Progress)
	}

	if claim.BeatsLeft < 0 {
		return fmt.Errorf("beats_left must be non-negative, got %d", claim.BeatsLeft)
	}

	return nil
}
// signStatusClaim signs a status claim using Ed25519 (BACKBEAT-REQ-044).
//
// The signature is computed over the JSON encoding of the claim as it is when
// this function is called, and is then appended to claim.Notes as
// " [sig:<hex>]". The signed bytes therefore do not include the signature
// suffix, and calling this function again appends a further suffix.
//
// It returns an error, leaving the claim untouched, when no signing key is
// configured, when the key is not ed25519.PrivateKeySize bytes long, or when
// the claim cannot be marshalled.
func (c *client) signStatusClaim(claim *StatusClaim) error {
	if c.config.SigningKey == nil {
		return fmt.Errorf("signing key not configured")
	}
	// ed25519.Sign panics on a key of the wrong length; report it as an
	// ordinary error instead.
	if n := len(c.config.SigningKey); n != ed25519.PrivateKeySize {
		return fmt.Errorf("invalid signing key length: got %d bytes, want %d", n, ed25519.PrivateKeySize)
	}

	// Create canonical representation for signing
	canonical, err := json.Marshal(claim)
	if err != nil {
		return fmt.Errorf("failed to marshal claim for signing: %w", err)
	}

	// Sign the canonical representation
	signature := ed25519.Sign(c.config.SigningKey, canonical)

	// Add signature to notes (temporary until proper signature field added)
	claim.Notes += fmt.Sprintf(" [sig:%x]", signature)

	return nil
}
// createHeaders builds the NATS header set attached to outgoing messages:
// the current window ID ("x-window-id") and HLC ("x-hlc"), both required by
// BACKBEAT-REQ-044, plus the configured agent ID ("x-agent-id") for routing.
func (c *client) createHeaders() nats.Header {
	windowID := c.GetCurrentWindow()
	hlc := c.getCurrentHLC()

	hdr := nats.Header{}
	hdr.Add("x-window-id", windowID)
	hdr.Add("x-hlc", hlc)
	hdr.Add("x-agent-id", c.config.AgentID)
	return hdr
}
// getCurrentHLC returns the HLC string most recently stored on the client.
// When none has been stored yet it returns a fallback of the form
// "<unix-nanoseconds>-0" built from the current wall-clock time.
func (c *client) getCurrentHLC() string {
	c.beatMutex.RLock()
	defer c.beatMutex.RUnlock()

	hlc := c.currentHLC
	if hlc == "" {
		// Nothing stored yet: synthesise a value from the local clock.
		hlc = fmt.Sprintf("%d-0", time.Now().UnixNano())
	}
	return hlc
}
// getBeatDuration returns the length of one beat at the client's current
// tempo, i.e. 60 seconds divided by the BPM, truncated to whole milliseconds.
// A tempo of zero or less is treated as 60 BPM.
func (c *client) getBeatDuration() time.Duration {
	c.beatMutex.RLock()
	bpm := c.currentTempo
	c.beatMutex.RUnlock()

	// No usable tempo yet: assume one beat per second.
	if bpm <= 0 {
		bpm = 60
	}

	millisPerBeat := 60.0 / float64(bpm) * 1000
	return time.Duration(millisPerBeat) * time.Millisecond
}
// generateDegradedWindowID derives the window ID used for synthetic beats.
//
// The ID is "deg-" followed by the first 28 hex characters of the SHA-256 of
// "<cluster-id>:degraded:<beatIndex/4>", 32 characters in total. Beat indices
// with the same quotient by four therefore share a window ID.
func (c *client) generateDegradedWindowID(beatIndex int64) string {
	bar := beatIndex / 4 // four beats per bar assumed
	digest := sha256.Sum256([]byte(fmt.Sprintf("%s:degraded:%d", c.config.ClusterID, bar)))

	// 14 bytes encode to 28 hex characters; with the 4-character prefix that
	// gives the 32-character ID.
	return fmt.Sprintf("deg-%x", digest[:14])
}
// addError appends err, prefixed with the current time as "[HH:MM:SS] ", to
// the client's error list and reports the same string to the metrics
// recorder.
//
// When the list already holds 10 or more entries the oldest one is dropped
// first, which bounds its size. Identical messages are not merged: each call
// adds a new entry.
func (c *client) addError(err string) {
	c.errorMutex.Lock()
	defer c.errorMutex.Unlock()

	// Make room by discarding the oldest entry.
	if len(c.errors) >= 10 {
		c.errors = c.errors[1:]
	}

	entry := fmt.Sprintf("[%s] %s", time.Now().Format("15:04:05"), err)
	c.errors = append(c.errors, entry)

	c.metrics.RecordError(entry)
}
// Legacy compatibility functions for BACKBEAT-REQ-043
// ConvertLegacyBeat maps a legacy {bar, beat} pair to a beat index, assuming
// four beats per bar: (bar-1)*4 + beat. The first call on a client logs a
// migration warning; later calls are silent.
func (c *client) ConvertLegacyBeat(bar, beat int) int64 {
	c.legacyMutex.Lock()
	firstUse := !c.legacyWarned
	if firstUse {
		c.config.Logger.Warn("Legacy {bar,beat} format detected - please migrate to beat_index",
			"bar", bar, "beat", beat)
		c.legacyWarned = true
	}
	c.legacyMutex.Unlock()

	completedBars := bar - 1
	return int64(completedBars*4 + beat)
}
// GetLegacyBeatInfo expresses the current beat index as a legacy 1-based
// {bar, beat} pair, assuming four beats per bar. A beat index of zero or less
// is reported as bar 1, beat 1.
func (c *client) GetLegacyBeatInfo() LegacyBeatInfo {
	idx := c.GetCurrentBeat()
	if idx <= 0 {
		return LegacyBeatInfo{Bar: 1, Beat: 1}
	}

	// Shift to a zero-based position so the quotient and remainder by four
	// give the bar and the beat within it.
	zeroBased := idx - 1
	return LegacyBeatInfo{
		Bar:  int(zeroBased/4) + 1,
		Beat: int(zeroBased%4) + 1,
	}
}