package sdk

import (
	"crypto/ed25519"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"time"

	"github.com/nats-io/nats.go"
)

// connect dials the NATS server at c.config.NATSUrl and stores the resulting
// connection in c.nc.
//
// The initial dial is attempted once; a failure is recorded in metrics and
// returned wrapped. After a successful dial, reconnection is delegated to the
// NATS client via the ReconnectWait/MaxReconnects options, and the registered
// handlers keep metrics, the error list and the log up to date.
func (c *client) connect() error {
	opts := []nats.Option{
		nats.ReconnectWait(c.config.ReconnectDelay),
		nats.MaxReconnects(c.config.MaxReconnects),
		nats.ReconnectHandler(func(nc *nats.Conn) {
			c.reconnectCount++
			c.metrics.RecordConnection()
			c.config.Logger.Info("NATS reconnected",
				"reconnect_count", c.reconnectCount,
				"url", nc.ConnectedUrl())
		}),
		nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
			// A nil error is not treated as a failure, so nothing is recorded.
			if err != nil {
				c.metrics.RecordDisconnection()
				c.addError(fmt.Sprintf("NATS disconnected: %v", err))
				c.config.Logger.Warn("NATS disconnected", "error", err)
			}
		}),
		nats.ClosedHandler(func(nc *nats.Conn) {
			c.metrics.RecordDisconnection()
			c.config.Logger.Info("NATS connection closed")
		}),
	}

	nc, err := nats.Connect(c.config.NATSUrl, opts...)
	if err != nil {
		c.metrics.RecordError(fmt.Sprintf("NATS connection failed: %v", err))
		return fmt.Errorf("failed to connect to NATS: %w", err)
	}

	c.nc = nc
	c.metrics.RecordConnection()
	c.config.Logger.Info("Connected to NATS", "url", nc.ConnectedUrl())

	return nil
}

// beatSubscriptionLoop subscribes to the cluster's beat subject and, once per
// second, checks whether beats are still arriving.
//
// When more than two seconds have passed since the last beat the client enters
// local degradation mode and emits one synthetic beat per tick. It stays in
// that mode until handleBeatFrame receives a real beat and clears the flag.
// (Synthetic beats refresh lastBeatTime, so the elapsed time alone cannot be
// used to decide when to leave the mode.)
//
// The loop runs until c.ctx is cancelled and signals c.wg when it returns.
func (c *client) beatSubscriptionLoop() {
	defer c.wg.Done()

	subject := fmt.Sprintf("backbeat.beat.%s", c.config.ClusterID)

	// Subscribe to beat frames
	sub, err := c.nc.Subscribe(subject, c.handleBeatFrame)
	if err != nil {
		c.addError(fmt.Sprintf("failed to subscribe to beats: %v", err))
		c.config.Logger.Error("Failed to subscribe to beats", "error", err)
		return
	}
	// Best-effort cleanup: the loop is exiting, so an unsubscribe error is
	// not actionable here.
	defer func() { _ = sub.Unsubscribe() }()

	c.config.Logger.Info("Beat subscription active", "subject", subject)

	// Start local degradation timer for fallback timing
	localTicker := time.NewTicker(1 * time.Second) // Default 60 BPM fallback
	defer localTicker.Stop()

	for {
		select {
		case <-c.ctx.Done():
			return

		case <-localTicker.C:
			// The degradation flag is shared with handleBeatFrame, which is
			// invoked by the NATS subscription, so it is read and written
			// under beatMutex together with lastBeatTime.
			c.beatMutex.Lock()
			timeSinceLastBeat := time.Since(c.lastBeatTime)
			wasDegraded := c.localDegradation
			// Enter when more than 2 beat intervals have passed; once
			// entered, remain degraded until a real beat clears the flag.
			degraded := wasDegraded || timeSinceLastBeat > 2*time.Second
			c.localDegradation = degraded
			c.beatMutex.Unlock()

			if !degraded {
				continue
			}

			if !wasDegraded {
				c.config.Logger.Warn("Entering local degradation mode",
					"time_since_last_beat", timeSinceLastBeat)
			}

			// handleLocalDegradationBeat takes beatMutex itself, so it must
			// be called with the lock released.
			c.handleLocalDegradationBeat()
			c.metrics.RecordLocalDegradation(timeSinceLastBeat)
		}
	}
}

// handleBeatFrame is the NATS message handler for beat frames.
//
// It decodes and validates the frame, updates the client's beat state (index,
// window, HLC, tempo, tempo history, last beat time), clears local degradation
// mode, records beat metrics and finally fans the frame out to the registered
// beat and downbeat callbacks. Malformed or wrongly typed frames are added to
// the error list and dropped.
func (c *client) handleBeatFrame(msg *nats.Msg) {
	var beatFrame BeatFrame
	if err := json.Unmarshal(msg.Data, &beatFrame); err != nil {
		c.addError(fmt.Sprintf("failed to unmarshal beat frame: %v", err))
		return
	}

	// Validate beat frame
	if beatFrame.Type != "backbeat.beatframe.v1" {
		c.addError(fmt.Sprintf("invalid beat frame type: %s", beatFrame.Type))
		return
	}

	now := time.Now()

	// Update internal state
	c.beatMutex.Lock()
	c.currentBeat = beatFrame.BeatIndex
	c.currentWindow = beatFrame.WindowID
	c.currentHLC = beatFrame.HLC

	// Track tempo changes
	if c.currentTempo != beatFrame.TempoBPM {
		c.lastTempo = c.currentTempo
		c.currentTempo = beatFrame.TempoBPM
	}

	// Calculate actual BPM from inter-beat timing
	actualBPM := 60.0 // Default when there is no previous beat to measure against
	if !c.lastBeatTime.IsZero() {
		interBeatDuration := now.Sub(c.lastBeatTime)
		if interBeatDuration > 0 {
			actualBPM = 60.0 / interBeatDuration.Seconds()
		}
	}

	// Record tempo sample for drift analysis
	c.tempoHistory = append(c.tempoHistory, tempoSample{
		BeatIndex:    beatFrame.BeatIndex,
		Tempo:        beatFrame.TempoBPM,
		MeasuredTime: now,
		ActualBPM:    actualBPM,
	})

	// Keep only last 100 samples
	if len(c.tempoHistory) > 100 {
		c.tempoHistory = c.tempoHistory[1:]
	}

	// A real beat ends local degradation mode. The flag is shared with
	// beatSubscriptionLoop, so it is cleared while beatMutex is held.
	wasDegraded := c.localDegradation
	c.localDegradation = false

	c.lastBeatTime = now
	c.beatMutex.Unlock()

	// A beat should arrive one beat duration before its deadline. The
	// expected time is computed once, after the tempo update above, so the
	// jitter check and the metrics agree and both use this frame's tempo.
	expectedTime := beatFrame.DeadlineAt.Add(-c.getBeatDuration())

	// Check for jitter tolerance
	jitter := now.Sub(expectedTime)
	if jitter.Abs() > c.config.JitterTolerance {
		c.config.Logger.Debug("Beat jitter detected",
			"jitter", jitter,
			"tolerance", c.config.JitterTolerance,
			"beat_index", beatFrame.BeatIndex)
	}

	// Record beat metrics
	c.metrics.RecordBeat(expectedTime, now, beatFrame.Downbeat)

	if wasDegraded {
		c.config.Logger.Info("Exiting local degradation mode - beat received")
	}

	// Execute beat callbacks with error handling
	c.dispatchBeatCallbacks(beatFrame, "beat", "downbeat")

	c.config.Logger.Debug("Beat processed",
		"beat_index", beatFrame.BeatIndex,
		"downbeat", beatFrame.Downbeat,
		"phase", beatFrame.Phase,
		"window_id", beatFrame.WindowID)
}

// handleLocalDegradationBeat advances the beat counter by one, builds a
// synthetic beat frame marked with phase "degraded", stores its window ID, HLC
// and timestamp as the current state, and delivers it to the registered
// callbacks exactly like a real beat.
func (c *client) handleLocalDegradationBeat() {
	c.beatMutex.Lock()
	c.currentBeat++

	// Generate synthetic beat frame
	now := time.Now()
	beatFrame := BeatFrame{
		Type:       "backbeat.beatframe.v1",
		ClusterID:  c.config.ClusterID,
		BeatIndex:  c.currentBeat,
		Downbeat:   (c.currentBeat-1)%4 == 0, // Assume 4/4 time signature
		Phase:      "degraded",
		HLC:        fmt.Sprintf("%d-0", now.UnixNano()),
		DeadlineAt: now.Add(time.Second), // 1 second deadline in degradation
		// Default 2 BPM (30-second beats) - reasonable for distributed systems.
		// NOTE(review): the caller ticks once per second and the deadline
		// above is one second away, which corresponds to 60 BPM rather than
		// 2 BPM - confirm which tempo is intended.
		TempoBPM: 2,
		WindowID: c.generateDegradedWindowID(c.currentBeat),
	}

	c.currentWindow = beatFrame.WindowID
	c.currentHLC = beatFrame.HLC
	c.lastBeatTime = now
	c.beatMutex.Unlock()

	// Execute callbacks same as normal beats
	c.dispatchBeatCallbacks(beatFrame, "degraded-beat", "degraded-downbeat")
}

// dispatchBeatCallbacks delivers frame to every registered beat callback and,
// when frame.Downbeat is set, to every registered downbeat callback.
//
// The callback lists are copied under callbackMutex and the lock is released
// before anything runs. Each callback is started in its own goroutine via
// safeExecuteCallback so a slow or panicking callback cannot block the caller.
// beatType and downbeatType are the labels passed on for metrics and logging.
func (c *client) dispatchBeatCallbacks(frame BeatFrame, beatType, downbeatType string) {
	c.callbackMutex.RLock()
	beatCallbacks := make([]func(BeatFrame), len(c.beatCallbacks))
	copy(beatCallbacks, c.beatCallbacks)

	var downbeatCallbacks []func(BeatFrame)
	if frame.Downbeat {
		downbeatCallbacks = make([]func(BeatFrame), len(c.downbeatCallbacks))
		copy(downbeatCallbacks, c.downbeatCallbacks)
	}
	c.callbackMutex.RUnlock()

	for _, callback := range beatCallbacks {
		go c.safeExecuteCallback(callback, frame, beatType)
	}

	// downbeatCallbacks is nil unless this frame is a downbeat.
	for _, callback := range downbeatCallbacks {
		go c.safeExecuteCallback(callback, frame, downbeatType)
	}
}

// safeExecuteCallback runs callback(beat) with panic recovery.
//
// A panic is converted into an entry in the error list and an error log line
// instead of crashing the process. On normal return the call duration is
// recorded under callbackType, and calls slower than 5ms are logged as a
// warning.
func (c *client) safeExecuteCallback(callback func(BeatFrame), beat BeatFrame, callbackType string) {
	defer func() {
		if r := recover(); r != nil {
			errMsg := fmt.Sprintf("panic in %s callback: %v", callbackType, r)
			// addError also forwards the message to c.metrics.RecordError,
			// so it must not be recorded a second time here.
			c.addError(errMsg)
			c.config.Logger.Error("Callback panic recovered",
				"type", callbackType,
				"panic", r,
				"beat_index", beat.BeatIndex)
		}
	}()

	start := time.Now()
	callback(beat)
	duration := time.Since(start)

	// Record callback latency metrics
	c.metrics.RecordCallbackLatency(duration, callbackType)

	// Warn about slow callbacks
	if duration > 5*time.Millisecond {
		c.config.Logger.Warn("Slow callback detected",
			"type", callbackType,
			"duration", duration,
			"beat_index", beat.BeatIndex)
	}
}

// validateStatusClaim checks that claim is well formed.
//
// It returns an error when claim is nil, when State is empty or not one of
// executing, planning, waiting, review, done or failed, when Progress is not a
// number within [0.0, 1.0], or when BeatsLeft is negative. It returns nil for
// a valid claim.
func (c *client) validateStatusClaim(claim *StatusClaim) error {
	if claim == nil {
		return fmt.Errorf("status claim is required")
	}

	if claim.State == "" {
		return fmt.Errorf("state is required")
	}

	switch claim.State {
	case "executing", "planning", "waiting", "review", "done", "failed":
		// valid
	default:
		return fmt.Errorf("invalid state: must be one of [executing, planning, waiting, review, done, failed], got '%s'", claim.State)
	}

	// Written as a negated range check so that NaN, which fails every
	// comparison, is rejected as well.
	if !(claim.Progress >= 0.0 && claim.Progress <= 1.0) {
		return fmt.Errorf("progress must be between 0.0 and 1.0, got %f", claim.Progress)
	}

	if claim.BeatsLeft < 0 {
		return fmt.Errorf("beats_left must be non-negative, got %d", claim.BeatsLeft)
	}

	return nil
}

// signStatusClaim signs a status claim using Ed25519 (BACKBEAT-REQ-044).
//
// The claim is marshalled to JSON, the bytes are signed with
// c.config.SigningKey, and the hex signature is appended to claim.Notes as
// " [sig:<hex>]". An error is returned when claim is nil, when no signing key
// is configured, when the key does not have the Ed25519 private key length, or
// when the claim cannot be marshalled.
func (c *client) signStatusClaim(claim *StatusClaim) error {
	if claim == nil {
		return fmt.Errorf("status claim is required")
	}

	if len(c.config.SigningKey) == 0 {
		return fmt.Errorf("signing key not configured")
	}

	// ed25519.Sign panics on a key of the wrong length; report it instead.
	if len(c.config.SigningKey) != ed25519.PrivateKeySize {
		return fmt.Errorf("invalid signing key length: got %d, want %d",
			len(c.config.SigningKey), ed25519.PrivateKeySize)
	}

	// Create canonical representation for signing
	canonical, err := json.Marshal(claim)
	if err != nil {
		return fmt.Errorf("failed to marshal claim for signing: %w", err)
	}

	// Sign the canonical representation
	signature := ed25519.Sign(c.config.SigningKey, canonical)

	// Add signature to notes (temporary until proper signature field added)
	claim.Notes += fmt.Sprintf(" [sig:%x]", signature)

	return nil
}

// createHeaders builds the NATS headers attached to outgoing messages: the
// current window ID, the current HLC and the configured agent ID.
func (c *client) createHeaders() nats.Header {
	headers := make(nats.Header)

	// Add window ID header (BACKBEAT-REQ-044)
	headers.Add("x-window-id", c.GetCurrentWindow())

	// Add HLC header (BACKBEAT-REQ-044)
	headers.Add("x-hlc", c.getCurrentHLC())

	// Add agent ID for routing
	headers.Add("x-agent-id", c.config.AgentID)

	return headers
}

// getCurrentHLC returns the HLC stored from the most recent beat. When none
// has been stored yet it returns a fallback of the form "<unix-nanos>-0" built
// from the current wall-clock time.
func (c *client) getCurrentHLC() string {
	c.beatMutex.RLock()
	defer c.beatMutex.RUnlock()

	if c.currentHLC != "" {
		return c.currentHLC
	}

	// Generate fallback HLC
	return fmt.Sprintf("%d-0", time.Now().UnixNano())
}

// getBeatDuration returns the length of one beat at the current tempo
// (60 seconds / BPM). A tempo of zero or less is treated as 60 BPM.
func (c *client) getBeatDuration() time.Duration {
	c.beatMutex.RLock()
	tempo := c.currentTempo
	c.beatMutex.RUnlock()

	if tempo <= 0 {
		tempo = 60 // Default to 60 BPM if no tempo information available
	}

	// Scale in floating point before converting so tempos that do not divide
	// 60 evenly are not truncated to whole milliseconds.
	return time.Duration(60.0 / float64(tempo) * float64(time.Second))
}

// generateDegradedWindowID returns the window ID used for synthetic beats: the
// first 32 characters of "deg-" followed by the hex SHA-256 of
// "<ClusterID>:degraded:<beatIndex/4>". All beats in the same 4-beat bar
// therefore share one window ID.
func (c *client) generateDegradedWindowID(beatIndex int64) string {
	// Use similar algorithm to regular window ID but mark as degraded
	input := fmt.Sprintf("%s:degraded:%d", c.config.ClusterID, beatIndex/4) // Assume 4-beat bars
	hash := sha256.Sum256([]byte(input))
	return fmt.Sprintf("deg-%x", hash)[:32]
}

// addError appends a timestamped copy of err to the client's bounded error
// list and forwards it to the metrics recorder. Messages are not
// deduplicated; the oldest entry is dropped once the list holds 10.
func (c *client) addError(err string) {
	c.errorMutex.Lock()
	defer c.errorMutex.Unlock()

	// Keep only the last 10 errors to prevent memory leaks
	if len(c.errors) >= 10 {
		c.errors = c.errors[1:]
	}

	timestampedErr := fmt.Sprintf("[%s] %s", time.Now().Format("15:04:05"), err)
	c.errors = append(c.errors, timestampedErr)

	// Record error in metrics
	c.metrics.RecordError(timestampedErr)
}

// Legacy compatibility functions for BACKBEAT-REQ-043

// ConvertLegacyBeat converts a legacy 1-based {bar,beat} pair to a beat_index,
// assuming 4 beats per bar. A migration warning is logged the first time it is
// called on this client.
func (c *client) ConvertLegacyBeat(bar, beat int) int64 {
	c.legacyMutex.Lock()
	if !c.legacyWarned {
		c.config.Logger.Warn("Legacy {bar,beat} format detected - please migrate to beat_index",
			"bar", bar, "beat", beat)
		c.legacyWarned = true
	}
	c.legacyMutex.Unlock()

	// Convert assuming 4 beats per bar (standard). Widen before multiplying
	// so the arithmetic cannot overflow where int is 32 bits.
	return (int64(bar)-1)*4 + int64(beat)
}

// GetLegacyBeatInfo converts the current beat_index to the legacy 1-based
// {bar,beat} format, assuming 4 beats per bar. A beat_index of zero or less
// maps to bar 1, beat 1.
func (c *client) GetLegacyBeatInfo() LegacyBeatInfo {
	beatIndex := c.GetCurrentBeat()
	if beatIndex <= 0 {
		return LegacyBeatInfo{Bar: 1, Beat: 1}
	}

	// Convert assuming 4 beats per bar
	bar := int((beatIndex-1)/4) + 1
	beat := int((beatIndex-1)%4) + 1

	return LegacyBeatInfo{Bar: bar, Beat: beat}
}