backbeat: add module sources

2025-10-17 08:56:25 +11:00
parent 627d15b3f7
commit 4b4eb16efb
48 changed files with 11636 additions and 0 deletions
--- a/pkg/sdk/examples/examples_test.go
+++ b/pkg/sdk/examples/examples_test.go
@@ -0,0 +1,520 @@
+package examples
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/chorus-services/backbeat/pkg/sdk"
+)
+
+var testCounter int
+
+// generateUniqueAgentID generates unique agent IDs for tests to avoid expvar conflicts
+func generateUniqueAgentID(prefix string) string {
+	testCounter++
+	return fmt.Sprintf("%s-%d", prefix, testCounter)
+}
+
+// Test helper interface for both *testing.T and *testing.B
+type testHelper interface {
+	Fatalf(format string, args ...interface{})
+}
+
+// Test helper to create a test client configuration
+func createTestConfig(t testHelper, agentIDPrefix string) *sdk.Config {
+	_, signingKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		t.Fatalf("Failed to generate signing key: %v", err)
+	}
+	
+	config := sdk.DefaultConfig()
+	config.ClusterID = "test-cluster"
+	config.AgentID = generateUniqueAgentID(agentIDPrefix)
+	config.NATSUrl = "nats://localhost:4222" // Assumes NATS is running for tests
+	config.SigningKey = signingKey
+	
+	return config
+}
+
+// TestSimpleAgentPattern tests the simple agent usage pattern
+func TestSimpleAgentPattern(t *testing.T) {
+	config := createTestConfig(t, "test-simple-agent")
+	client := sdk.NewClient(config)
+	
+	// Context for timeout control (used in full integration tests)
+	_ = context.Background()
+	
+	// Track callback invocations  
+	var beatCount, downbeatCount int
+	
+	// Register callbacks
+	err := client.OnBeat(func(beat sdk.BeatFrame) {
+		beatCount++
+		t.Logf("Beat received: %d (downbeat: %v)", beat.BeatIndex, beat.Downbeat)
+	})
+	if err != nil {
+		t.Fatalf("Failed to register beat callback: %v", err)
+	}
+	
+	err = client.OnDownbeat(func(beat sdk.BeatFrame) {
+		downbeatCount++
+		t.Logf("Downbeat received: %d", beat.BeatIndex)
+	})
+	if err != nil {
+		t.Fatalf("Failed to register downbeat callback: %v", err)
+	}
+	
+	// Use variables to prevent unused warnings
+	_ = beatCount
+	_ = downbeatCount
+	
+	// This test only checks if the client can be configured and started
+	// without errors. Full integration tests would require running services.
+	
+	// Test health status before starting
+	health := client.Health()
+	if health.Connected {
+		t.Error("Client should not be connected before Start()")
+	}
+	
+	// Test that we can create status claims
+	err = client.EmitStatusClaim(sdk.StatusClaim{
+		State:     "planning",
+		BeatsLeft: 10,
+		Progress:  0.0,
+		Notes:     "Test status claim",
+	})
+	// This should fail because client isn't started
+	if err == nil {
+		t.Error("EmitStatusClaim should fail when client not started")
+	}
+}
+
+// TestBeatBudgetPattern tests the beat budget usage pattern
+func TestBeatBudgetPattern(t *testing.T) {
+	config := createTestConfig(t, "test-budget-agent")
+	client := sdk.NewClient(config)
+	
+	// Test beat budget without starting client (should work for timeout logic)
+	err := client.WithBeatBudget(2, func() error {
+		time.Sleep(100 * time.Millisecond) // Quick task
+		return nil
+	})
+	
+	// This may fail due to no beat timing available, but shouldn't panic
+	if err != nil {
+		t.Logf("Beat budget failed as expected (no timing): %v", err)
+	}
+	
+	// Test invalid budget
+	err = client.WithBeatBudget(0, func() error {
+		return nil
+	})
+	if err == nil {
+		t.Error("WithBeatBudget should fail with zero budget")
+	}
+	
+	err = client.WithBeatBudget(-1, func() error {
+		return nil
+	})
+	if err == nil {
+		t.Error("WithBeatBudget should fail with negative budget")
+	}
+}
+
+// TestClientConfiguration tests various client configuration scenarios
+func TestClientConfiguration(t *testing.T) {
+	// Test with minimal config
+	config := &sdk.Config{
+		ClusterID: "test",
+		AgentID:   "test-agent",
+		NATSUrl:   "nats://localhost:4222",
+	}
+	
+	client := sdk.NewClient(config)
+	if client == nil {
+		t.Fatal("NewClient should not return nil")
+	}
+	
+	// Test health before start
+	health := client.Health()
+	if health.Connected {
+		t.Error("New client should not be connected")
+	}
+	
+	// Test utilities with no beat data
+	beat := client.GetCurrentBeat()
+	if beat != 0 {
+		t.Errorf("GetCurrentBeat should return 0 initially, got %d", beat)
+	}
+	
+	window := client.GetCurrentWindow()
+	if window != "" {
+		t.Errorf("GetCurrentWindow should return empty string initially, got %s", window)
+	}
+	
+	// Test IsInWindow
+	if client.IsInWindow("any-window") {
+		t.Error("IsInWindow should return false with no current window")
+	}
+}
+
+// TestStatusClaimValidation tests status claim validation
+func TestStatusClaimValidation(t *testing.T) {
+	config := createTestConfig(t, "test-validation")
+	client := sdk.NewClient(config)
+	
+	// Test various invalid status claims
+	testCases := []struct {
+		name    string
+		claim   sdk.StatusClaim
+		wantErr bool
+	}{
+		{
+			name: "valid claim",
+			claim: sdk.StatusClaim{
+				State:     "executing",
+				BeatsLeft: 5,
+				Progress:  0.5,
+				Notes:     "Test note",
+			},
+			wantErr: false, // Will still error due to no connection, but validation should pass
+		},
+		{
+			name: "invalid state",
+			claim: sdk.StatusClaim{
+				State:     "invalid",
+				BeatsLeft: 5,
+				Progress:  0.5,
+				Notes:     "Test note",
+			},
+			wantErr: true,
+		},
+		{
+			name: "negative progress",
+			claim: sdk.StatusClaim{
+				State:     "executing",
+				BeatsLeft: 5,
+				Progress:  -0.1,
+				Notes:     "Test note",
+			},
+			wantErr: true,
+		},
+		{
+			name: "progress too high",
+			claim: sdk.StatusClaim{
+				State:     "executing",
+				BeatsLeft: 5,
+				Progress:  1.1,
+				Notes:     "Test note",
+			},
+			wantErr: true,
+		},
+		{
+			name: "negative beats left",
+			claim: sdk.StatusClaim{
+				State:     "executing",
+				BeatsLeft: -1,
+				Progress:  0.5,
+				Notes:     "Test note",
+			},
+			wantErr: true,
+		},
+	}
+	
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := client.EmitStatusClaim(tc.claim)
+			
+			if tc.wantErr && err == nil {
+				t.Error("Expected error but got none")
+			}
+			
+			// Note: All will error due to no connection, but we're testing validation
+			if err != nil {
+				t.Logf("Error (expected): %v", err)
+			}
+		})
+	}
+}
+
+// BenchmarkStatusClaimEmission benchmarks status claim creation and validation
+func BenchmarkStatusClaimEmission(b *testing.B) {
+	config := createTestConfig(b, "benchmark-agent")
+	client := sdk.NewClient(config)
+	
+	claim := sdk.StatusClaim{
+		State:     "executing",
+		BeatsLeft: 10,
+		Progress:  0.75,
+		Notes:     "Benchmark test claim",
+	}
+	
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			// This will fail due to no connection, but measures validation overhead
+			client.EmitStatusClaim(claim)
+		}
+	})
+}
+
+// BenchmarkBeatCallbacks benchmarks callback execution
+func BenchmarkBeatCallbacks(b *testing.B) {
+	config := createTestConfig(b, "callback-benchmark")
+	client := sdk.NewClient(config)
+	
+	// Register a simple callback
+	client.OnBeat(func(beat sdk.BeatFrame) {
+		// Minimal processing
+		_ = beat.BeatIndex
+	})
+	
+	// Create a mock beat frame
+	beatFrame := sdk.BeatFrame{
+		Type:       "backbeat.beatframe.v1",
+		ClusterID:  "test",
+		BeatIndex:  1,
+		Downbeat:   false,
+		Phase:      "test",
+		HLC:        "123-0",
+		WindowID:   "test-window",
+		TempoBPM:   2,  // 30-second beats - much more reasonable for testing
+	}
+	
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			// Simulate callback execution
+			// Note: This doesn't actually invoke callbacks since client isn't started
+			_ = beatFrame
+		}
+	})
+}
+
+// TestDetermineState tests the state determination logic from simple_agent.go
+func TestDetermineState(t *testing.T) {
+	tests := []struct {
+		total    int64
+		completed int64
+		expected string
+	}{
+		{0, 0, "waiting"},
+		{5, 5, "done"},
+		{5, 3, "executing"},
+		{5, 0, "planning"},
+		{10, 8, "executing"},
+		{1, 1, "done"},
+	}
+
+	for _, test := range tests {
+		result := determineState(test.total, test.completed)
+		if result != test.expected {
+			t.Errorf("determineState(%d, %d) = %s; expected %s", 
+				test.total, test.completed, result, test.expected)
+		}
+	}
+}
+
+// TestCalculateBeatsLeft tests the beats remaining calculation from simple_agent.go
+func TestCalculateBeatsLeft(t *testing.T) {
+	tests := []struct {
+		total    int64
+		completed int64
+		expected int
+	}{
+		{0, 0, 0},
+		{5, 5, 0},
+		{5, 3, 10}, // (5-3) * 5 = 10
+		{10, 0, 50}, // 10 * 5 = 50
+		{1, 0, 5},   // 1 * 5 = 5
+	}
+
+	for _, test := range tests {
+		result := calculateBeatsLeft(test.total, test.completed)
+		if result != test.expected {
+			t.Errorf("calculateBeatsLeft(%d, %d) = %d; expected %d", 
+				test.total, test.completed, result, test.expected)
+		}
+	}
+}
+
+// TestTaskStructure tests Task struct from task_processor.go
+func TestTaskStructure(t *testing.T) {
+	task := &Task{
+		ID:          "test-task-123",
+		Description: "Test processing task",
+		BeatBudget:  8,
+		WorkTime:    3 * time.Second,
+		Created:     time.Now(),
+	}
+
+	if task.ID == "" {
+		t.Error("Expected task ID to be set")
+	}
+
+	if task.Description == "" {
+		t.Error("Expected task description to be set")
+	}
+
+	if task.BeatBudget <= 0 {
+		t.Error("Expected positive beat budget")
+	}
+
+	if task.WorkTime <= 0 {
+		t.Error("Expected positive work time")
+	}
+
+	if task.Created.IsZero() {
+		t.Error("Expected creation time to be set")
+	}
+}
+
+// TestServiceHealthStructure tests ServiceHealth struct from service_monitor.go
+func TestServiceHealthStructure(t *testing.T) {
+	health := &ServiceHealth{
+		ServiceName:   "test-service",
+		Status:        "healthy",
+		LastCheck:     time.Now(),
+		ResponseTime:  150 * time.Millisecond,
+		ErrorCount:    0,
+		Uptime:        5 * time.Minute,
+	}
+
+	if health.ServiceName == "" {
+		t.Error("Expected service name to be set")
+	}
+
+	validStatuses := []string{"healthy", "degraded", "unhealthy", "unknown"}
+	validStatus := false
+	for _, status := range validStatuses {
+		if health.Status == status {
+			validStatus = true
+			break
+		}
+	}
+	if !validStatus {
+		t.Errorf("Expected valid status, got: %s", health.Status)
+	}
+
+	if health.ResponseTime < 0 {
+		t.Error("Expected non-negative response time")
+	}
+
+	if health.ErrorCount < 0 {
+		t.Error("Expected non-negative error count")
+	}
+}
+
+// TestSystemMetricsStructure tests SystemMetrics struct from service_monitor.go
+func TestSystemMetricsStructure(t *testing.T) {
+	metrics := &SystemMetrics{
+		CPUPercent:     25.5,
+		MemoryPercent:  67.8,
+		GoroutineCount: 42,
+		HeapSizeMB:     128.5,
+	}
+
+	if metrics.CPUPercent < 0 || metrics.CPUPercent > 100 {
+		t.Error("Expected CPU percentage between 0 and 100")
+	}
+
+	if metrics.MemoryPercent < 0 || metrics.MemoryPercent > 100 {
+		t.Error("Expected memory percentage between 0 and 100")
+	}
+
+	if metrics.GoroutineCount < 0 {
+		t.Error("Expected non-negative goroutine count")
+	}
+
+	if metrics.HeapSizeMB < 0 {
+		t.Error("Expected non-negative heap size")
+	}
+}
+
+// TestHealthScoreCalculation tests calculateHealthScore from service_monitor.go
+func TestHealthScoreCalculation(t *testing.T) {
+	tests := []struct {
+		summary  map[string]int
+		expected float64
+	}{
+		{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 0, "unknown": 0}, 0.0},
+		{map[string]int{"healthy": 4, "degraded": 0, "unhealthy": 0, "unknown": 0}, 1.0},
+		{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 4, "unknown": 0}, 0.0},
+		{map[string]int{"healthy": 2, "degraded": 2, "unhealthy": 0, "unknown": 0}, 0.75},
+		{map[string]int{"healthy": 1, "degraded": 1, "unhealthy": 1, "unknown": 1}, 0.4375},
+	}
+
+	for i, test := range tests {
+		result := calculateHealthScore(test.summary)
+		if result != test.expected {
+			t.Errorf("Test %d: calculateHealthScore(%v) = %.4f; expected %.4f", 
+				i, test.summary, result, test.expected)
+		}
+	}
+}
+
+// TestDetermineOverallState tests determineOverallState from service_monitor.go
+func TestDetermineOverallState(t *testing.T) {
+	tests := []struct {
+		summary  map[string]int
+		expected string
+	}{
+		{map[string]int{"healthy": 3, "degraded": 0, "unhealthy": 0, "unknown": 0}, "done"},
+		{map[string]int{"healthy": 2, "degraded": 1, "unhealthy": 0, "unknown": 0}, "executing"},
+		{map[string]int{"healthy": 1, "degraded": 1, "unhealthy": 1, "unknown": 0}, "failed"},
+		{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 0, "unknown": 3}, "waiting"},
+		{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 1, "unknown": 0}, "failed"},
+	}
+
+	for i, test := range tests {
+		result := determineOverallState(test.summary)
+		if result != test.expected {
+			t.Errorf("Test %d: determineOverallState(%v) = %s; expected %s", 
+				i, test.summary, result, test.expected)
+		}
+	}
+}
+
+// TestFormatHealthSummary tests formatHealthSummary from service_monitor.go
+func TestFormatHealthSummary(t *testing.T) {
+	summary := map[string]int{
+		"healthy":   3,
+		"degraded":  2,
+		"unhealthy": 1,
+		"unknown":   0,
+	}
+
+	result := formatHealthSummary(summary)
+	expected := "H:3 D:2 U:1 ?:0"
+
+	if result != expected {
+		t.Errorf("formatHealthSummary() = %s; expected %s", result, expected)
+	}
+}
+
+// TestCollectSystemMetrics tests collectSystemMetrics from service_monitor.go
+func TestCollectSystemMetrics(t *testing.T) {
+	metrics := collectSystemMetrics()
+
+	if metrics.GoroutineCount <= 0 {
+		t.Error("Expected positive goroutine count")
+	}
+
+	if metrics.HeapSizeMB < 0 {
+		t.Error("Expected non-negative heap size")
+	}
+
+	// Note: CPU and Memory percentages are simplified in the example implementation
+	if metrics.CPUPercent < 0 {
+		t.Error("Expected non-negative CPU percentage")
+	}
+
+	if metrics.MemoryPercent < 0 {
+		t.Error("Expected non-negative memory percentage")
+	}
+}
--- a/pkg/sdk/examples/service_monitor.go
+++ b/pkg/sdk/examples/service_monitor.go
@@ -0,0 +1,326 @@
+package examples
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"os"
+	"os/signal"
+	"runtime"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/chorus-services/backbeat/pkg/sdk"
+)
+
+// ServiceHealth represents the health status of a monitored service
+type ServiceHealth struct {
+	ServiceName   string    `json:"service_name"`
+	Status        string    `json:"status"` // healthy, degraded, unhealthy
+	LastCheck     time.Time `json:"last_check"`
+	ResponseTime  time.Duration `json:"response_time"`
+	ErrorCount    int       `json:"error_count"`
+	Uptime        time.Duration `json:"uptime"`
+}
+
+// SystemMetrics represents system-level metrics
+type SystemMetrics struct {
+	CPUPercent    float64 `json:"cpu_percent"`
+	MemoryPercent float64 `json:"memory_percent"`
+	GoroutineCount int    `json:"goroutine_count"`
+	HeapSizeMB    float64 `json:"heap_size_mb"`
+}
+
+// ServiceMonitor demonstrates health monitoring with beat-aligned reporting
+// This example shows how to integrate BACKBEAT with service monitoring
+func ServiceMonitor() {
+	// Generate a signing key for this example
+	_, signingKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		slog.Error("Failed to generate signing key", "error", err)
+		return
+	}
+	
+	// Create SDK configuration
+	config := sdk.DefaultConfig()
+	config.ClusterID = "chorus-dev"
+	config.AgentID = "service-monitor"
+	config.NATSUrl = "nats://localhost:4222"
+	config.SigningKey = signingKey
+	config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
+		Level: slog.LevelInfo,
+	}))
+	
+	// Create BACKBEAT client
+	client := sdk.NewClient(config)
+	
+	// Services to monitor (example endpoints)
+	monitoredServices := map[string]string{
+		"pulse-service":  "http://localhost:8080/health",
+		"reverb-service": "http://localhost:8081/health",
+		"nats-server":    "http://localhost:8222/varz", // NATS monitoring endpoint
+	}
+	
+	// Health tracking
+	var (
+		healthStatus = make(map[string]*ServiceHealth)
+		healthMutex  sync.RWMutex
+		startTime    = time.Now()
+	)
+	
+	// Initialize health status
+	for serviceName := range monitoredServices {
+		healthStatus[serviceName] = &ServiceHealth{
+			ServiceName: serviceName,
+			Status:      "unknown",
+			LastCheck:   time.Time{},
+		}
+	}
+	
+	// Register beat callback for frequent health checks
+	client.OnBeat(func(beat sdk.BeatFrame) {
+		// Perform health checks every 4 beats (reduce frequency)
+		if beat.BeatIndex%4 == 0 {
+			performHealthChecks(monitoredServices, healthStatus, &healthMutex)
+		}
+		
+		// Emit status claim with current health summary
+		if beat.BeatIndex%2 == 0 {
+			healthSummary := generateHealthSummary(healthStatus, &healthMutex)
+			systemMetrics := collectSystemMetrics()
+			
+			state := determineOverallState(healthSummary)
+			notes := fmt.Sprintf("Services: %s | CPU: %.1f%% | Mem: %.1f%% | Goroutines: %d",
+				formatHealthSummary(healthSummary),
+				systemMetrics.CPUPercent,
+				systemMetrics.MemoryPercent,
+				systemMetrics.GoroutineCount)
+			
+			err := client.EmitStatusClaim(sdk.StatusClaim{
+				State:     state,
+				BeatsLeft: 0, // Monitoring is continuous
+				Progress:  calculateHealthScore(healthSummary),
+				Notes:     notes,
+			})
+			if err != nil {
+				slog.Error("Failed to emit status claim", "error", err)
+			}
+		}
+	})
+	
+	// Register downbeat callback for detailed reporting
+	client.OnDownbeat(func(beat sdk.BeatFrame) {
+		healthMutex.RLock()
+		healthData, _ := json.MarshalIndent(healthStatus, "", "  ")
+		healthMutex.RUnlock()
+		
+		systemMetrics := collectSystemMetrics()
+		uptime := time.Since(startTime)
+		
+		slog.Info("Service health report",
+			"beat_index", beat.BeatIndex,
+			"window_id", beat.WindowID,
+			"uptime", uptime.String(),
+			"cpu_percent", systemMetrics.CPUPercent,
+			"memory_percent", systemMetrics.MemoryPercent,
+			"heap_mb", systemMetrics.HeapSizeMB,
+			"goroutines", systemMetrics.GoroutineCount,
+		)
+		
+		// Log health details
+		slog.Debug("Detailed health status", "health_data", string(healthData))
+		
+		// Emit comprehensive status for the bar
+		healthSummary := generateHealthSummary(healthStatus, &healthMutex)
+		err := client.EmitStatusClaim(sdk.StatusClaim{
+			State:     "review", // Downbeat is review time
+			BeatsLeft: 0,
+			Progress:  calculateHealthScore(healthSummary),
+			Notes:     fmt.Sprintf("Bar %d health review: %s", beat.BeatIndex/4, formatDetailedHealth(healthSummary, systemMetrics)),
+		})
+		if err != nil {
+			slog.Error("Failed to emit downbeat status", "error", err)
+		}
+	})
+	
+	// Setup graceful shutdown
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	
+	// Handle shutdown signals
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	
+	go func() {
+		<-sigChan
+		slog.Info("Shutdown signal received")
+		cancel()
+	}()
+	
+	// Start the client
+	if err := client.Start(ctx); err != nil {
+		slog.Error("Failed to start BACKBEAT client", "error", err)
+		return
+	}
+	defer client.Stop()
+	
+	slog.Info("Service monitor started - use Ctrl+C to stop",
+		"monitored_services", len(monitoredServices))
+	
+	// Expose metrics endpoint
+	go func() {
+		http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
+			healthMutex.RLock()
+			data := make(map[string]interface{})
+			data["health"] = healthStatus
+			data["system"] = collectSystemMetrics()
+			data["backbeat"] = client.Health()
+			healthMutex.RUnlock()
+			
+			w.Header().Set("Content-Type", "application/json")
+			json.NewEncoder(w).Encode(data)
+		})
+		
+		slog.Info("Metrics endpoint available", "url", "http://localhost:9090/metrics")
+		if err := http.ListenAndServe(":9090", nil); err != nil {
+			slog.Error("Metrics server failed", "error", err)
+		}
+	}()
+	
+	// Wait for shutdown
+	<-ctx.Done()
+	slog.Info("Service monitor shutting down")
+}
+
+// performHealthChecks checks the health of all monitored services
+func performHealthChecks(services map[string]string, healthStatus map[string]*ServiceHealth, mutex *sync.RWMutex) {
+	for serviceName, endpoint := range services {
+		go func(name, url string) {
+			start := time.Now()
+			
+			client := &http.Client{Timeout: 5 * time.Second}
+			resp, err := client.Get(url)
+			responseTime := time.Since(start)
+			
+			mutex.Lock()
+			health := healthStatus[name]
+			health.LastCheck = time.Now()
+			health.ResponseTime = responseTime
+			
+			if err != nil {
+				health.ErrorCount++
+				health.Status = "unhealthy"
+				slog.Warn("Health check failed",
+					"service", name,
+					"endpoint", url,
+					"error", err,
+					"response_time", responseTime)
+			} else {
+				if resp.StatusCode >= 200 && resp.StatusCode < 300 {
+					health.Status = "healthy"
+				} else if resp.StatusCode >= 300 && resp.StatusCode < 500 {
+					health.Status = "degraded"
+				} else {
+					health.Status = "unhealthy"
+					health.ErrorCount++
+				}
+				resp.Body.Close()
+				
+				if responseTime > 2*time.Second {
+					health.Status = "degraded" // Slow response
+				}
+				
+				slog.Debug("Health check completed",
+					"service", name,
+					"status", health.Status,
+					"response_time", responseTime,
+					"status_code", resp.StatusCode)
+			}
+			mutex.Unlock()
+		}(serviceName, endpoint)
+	}
+}
+
+// generateHealthSummary creates a summary of service health
+func generateHealthSummary(healthStatus map[string]*ServiceHealth, mutex *sync.RWMutex) map[string]int {
+	mutex.RLock()
+	defer mutex.RUnlock()
+	
+	summary := map[string]int{
+		"healthy":   0,
+		"degraded":  0,
+		"unhealthy": 0,
+		"unknown":   0,
+	}
+	
+	for _, health := range healthStatus {
+		summary[health.Status]++
+	}
+	
+	return summary
+}
+
+// determineOverallState determines the overall system state
+func determineOverallState(healthSummary map[string]int) string {
+	if healthSummary["unhealthy"] > 0 {
+		return "failed"
+	}
+	if healthSummary["degraded"] > 0 {
+		return "executing" // Degraded but still working
+	}
+	if healthSummary["healthy"] > 0 {
+		return "done"
+	}
+	return "waiting" // All unknown
+}
+
+// calculateHealthScore calculates a health score (0.0-1.0)
+func calculateHealthScore(healthSummary map[string]int) float64 {
+	total := healthSummary["healthy"] + healthSummary["degraded"] + healthSummary["unhealthy"] + healthSummary["unknown"]
+	if total == 0 {
+		return 0.0
+	}
+	
+	// Weight the scores: healthy=1.0, degraded=0.5, unhealthy=0.0, unknown=0.25
+	score := float64(healthSummary["healthy"])*1.0 + 
+		float64(healthSummary["degraded"])*0.5 + 
+		float64(healthSummary["unknown"])*0.25
+	
+	return score / float64(total)
+}
+
+// formatHealthSummary creates a compact string representation
+func formatHealthSummary(healthSummary map[string]int) string {
+	return fmt.Sprintf("H:%d D:%d U:%d ?:%d",
+		healthSummary["healthy"],
+		healthSummary["degraded"],
+		healthSummary["unhealthy"],
+		healthSummary["unknown"])
+}
+
+// formatDetailedHealth creates detailed health information
+func formatDetailedHealth(healthSummary map[string]int, systemMetrics SystemMetrics) string {
+	return fmt.Sprintf("Health: %s, CPU: %.1f%%, Mem: %.1f%%, Heap: %.1fMB",
+		formatHealthSummary(healthSummary),
+		systemMetrics.CPUPercent,
+		systemMetrics.MemoryPercent,
+		systemMetrics.HeapSizeMB)
+}
+
+// collectSystemMetrics collects basic system metrics
+func collectSystemMetrics() SystemMetrics {
+	var mem runtime.MemStats
+	runtime.ReadMemStats(&mem)
+	
+	return SystemMetrics{
+		CPUPercent:     0.0, // Would need external package like gopsutil for real CPU metrics
+		MemoryPercent:  float64(mem.Sys) / (1024 * 1024 * 1024) * 100, // Rough approximation
+		GoroutineCount: runtime.NumGoroutine(),
+		HeapSizeMB:     float64(mem.HeapSys) / (1024 * 1024),
+	}
+}
--- a/pkg/sdk/examples/simple_agent.go
+++ b/pkg/sdk/examples/simple_agent.go
@@ -0,0 +1,150 @@
+// Package examples demonstrates BACKBEAT SDK usage patterns
+package examples
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"fmt"
+	"log/slog"
+	"os"
+	"os/signal"
+	"sync/atomic"
+	"syscall"
+	"time"
+
+	"github.com/chorus-services/backbeat/pkg/sdk"
+)
+
+// SimpleAgent demonstrates basic BACKBEAT SDK usage
+// This example shows the minimal integration pattern for CHORUS services
+func SimpleAgent() {
+	// Generate a signing key for this example
+	_, signingKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		slog.Error("Failed to generate signing key", "error", err)
+		return
+	}
+	
+	// Create SDK configuration
+	config := sdk.DefaultConfig()
+	config.ClusterID = "chorus-dev"
+	config.AgentID = "simple-agent"
+	config.NATSUrl = "nats://localhost:4222" // Adjust for your setup
+	config.SigningKey = signingKey
+	config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
+		Level: slog.LevelInfo,
+	}))
+	
+	// Create BACKBEAT client
+	client := sdk.NewClient(config)
+	
+	// Track some simple state
+	var taskCounter int64
+	var completedTasks int64
+	
+	// Register beat callback - this runs on every beat
+	client.OnBeat(func(beat sdk.BeatFrame) {
+		currentTasks := atomic.LoadInt64(&taskCounter)
+		completed := atomic.LoadInt64(&completedTasks)
+		
+		// Emit status every few beats
+		if beat.BeatIndex%3 == 0 {
+			progress := 0.0
+			if currentTasks > 0 {
+				progress = float64(completed) / float64(currentTasks)
+			}
+			
+			err := client.EmitStatusClaim(sdk.StatusClaim{
+				State:     determineState(currentTasks, completed),
+				BeatsLeft: calculateBeatsLeft(currentTasks, completed),
+				Progress:  progress,
+				Notes:     fmt.Sprintf("Processing tasks: %d/%d", completed, currentTasks),
+			})
+			if err != nil {
+				slog.Error("Failed to emit status claim", "error", err)
+			}
+		}
+	})
+	
+	// Register downbeat callback - this runs at the start of each bar
+	client.OnDownbeat(func(beat sdk.BeatFrame) {
+		slog.Info("Bar started",
+			"beat_index", beat.BeatIndex,
+			"window_id", beat.WindowID,
+			"phase", beat.Phase)
+		
+		// Start new tasks at the beginning of bars
+		atomic.AddInt64(&taskCounter, 2) // Add 2 new tasks per bar
+	})
+	
+	// Setup graceful shutdown
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	
+	// Handle shutdown signals
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	
+	go func() {
+		<-sigChan
+		slog.Info("Shutdown signal received")
+		cancel()
+	}()
+	
+	// Start the client
+	if err := client.Start(ctx); err != nil {
+		slog.Error("Failed to start BACKBEAT client", "error", err)
+		return
+	}
+	defer client.Stop()
+	
+	slog.Info("Simple agent started - use Ctrl+C to stop")
+	
+	// Simulate some work - complete tasks periodically
+	ticker := time.NewTicker(2 * time.Second)
+	defer ticker.Stop()
+	
+	for {
+		select {
+		case <-ctx.Done():
+			slog.Info("Shutting down simple agent")
+			return
+		case <-ticker.C:
+			// Complete a task if we have any pending
+			current := atomic.LoadInt64(&taskCounter)
+			completed := atomic.LoadInt64(&completedTasks)
+			if completed < current {
+				atomic.AddInt64(&completedTasks, 1)
+				slog.Debug("Completed a task",
+					"completed", completed+1,
+					"total", current)
+			}
+		}
+	}
+}
+
+// determineState calculates the current state based on task progress
+func determineState(total, completed int64) string {
+	if total == 0 {
+		return "waiting"
+	}
+	if completed == total {
+		return "done"
+	}
+	if completed > 0 {
+		return "executing"
+	}
+	return "planning"
+}
+
+// calculateBeatsLeft estimates beats remaining based on current progress
+func calculateBeatsLeft(total, completed int64) int {
+	if total == 0 || completed >= total {
+		return 0
+	}
+	
+	remaining := total - completed
+	// Assume each task takes about 5 beats to complete
+	return int(remaining * 5)
+}
--- a/pkg/sdk/examples/task_processor.go
+++ b/pkg/sdk/examples/task_processor.go
@@ -0,0 +1,259 @@
+package examples
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"fmt"
+	"log/slog"
+	"math"
+	mathRand "math/rand"
+	"os"
+	"os/signal"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/chorus-services/backbeat/pkg/sdk"
+)
+
+// Task represents a work item with beat budget requirements
+type Task struct {
+	ID          string
+	Description string
+	BeatBudget  int           // Maximum beats allowed for completion
+	WorkTime    time.Duration // Simulated work duration
+	Created     time.Time
+}
+
+// TaskProcessor demonstrates beat budget usage and timeout management
+// This example shows how to use beat budgets for reliable task execution
+func TaskProcessor() {
+	// Generate a signing key for this example
+	_, signingKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		slog.Error("Failed to generate signing key", "error", err)
+		return
+	}
+	
+	// Create SDK configuration
+	config := sdk.DefaultConfig()
+	config.ClusterID = "chorus-dev"
+	config.AgentID = "task-processor"
+	config.NATSUrl = "nats://localhost:4222"
+	config.SigningKey = signingKey
+	config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
+		Level: slog.LevelDebug,
+	}))
+	
+	// Create BACKBEAT client
+	client := sdk.NewClient(config)
+	
+	// Task management
+	var (
+		taskQueue     = make(chan *Task, 100)
+		activeTasks   = make(map[string]*Task)
+		completedTasks = 0
+		failedTasks   = 0
+		taskMutex     sync.RWMutex
+	)
+	
+	// Register beat callback for status reporting
+	client.OnBeat(func(beat sdk.BeatFrame) {
+		taskMutex.RLock()
+		activeCount := len(activeTasks)
+		taskMutex.RUnlock()
+		
+		// Emit status every 2 beats
+		if beat.BeatIndex%2 == 0 {
+			state := "waiting"
+			if activeCount > 0 {
+				state = "executing"
+			}
+			
+			progress := float64(completedTasks) / float64(completedTasks+failedTasks+activeCount+len(taskQueue))
+			if math.IsNaN(progress) {
+				progress = 0.0
+			}
+			
+			err := client.EmitStatusClaim(sdk.StatusClaim{
+				State:     state,
+				BeatsLeft: activeCount * 5, // Estimate 5 beats per active task
+				Progress:  progress,
+				Notes:     fmt.Sprintf("Active: %d, Completed: %d, Failed: %d, Queue: %d", 
+					activeCount, completedTasks, failedTasks, len(taskQueue)),
+			})
+			if err != nil {
+				slog.Error("Failed to emit status claim", "error", err)
+			}
+		}
+	})
+	
+	// Register downbeat callback to create new tasks
+	client.OnDownbeat(func(beat sdk.BeatFrame) {
+		slog.Info("New bar - creating tasks",
+			"beat_index", beat.BeatIndex,
+			"window_id", beat.WindowID)
+		
+		// Create 1-3 new tasks each bar
+		numTasks := mathRand.Intn(3) + 1
+		for i := 0; i < numTasks; i++ {
+			task := &Task{
+				ID:          fmt.Sprintf("task-%d-%d", beat.BeatIndex, i),
+				Description: fmt.Sprintf("Process data batch %d", i),
+				BeatBudget:  mathRand.Intn(8) + 2, // 2-10 beat budget
+				WorkTime:    time.Duration(mathRand.Intn(3)+1) * time.Second, // 1-4 seconds of work
+				Created:     time.Now(),
+			}
+			
+			select {
+			case taskQueue <- task:
+				slog.Debug("Task created", "task_id", task.ID, "budget", task.BeatBudget)
+			default:
+				slog.Warn("Task queue full, dropping task", "task_id", task.ID)
+			}
+		}
+	})
+	
+	// Setup graceful shutdown
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	
+	// Handle shutdown signals
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	
+	go func() {
+		<-sigChan
+		slog.Info("Shutdown signal received")
+		cancel()
+	}()
+	
+	// Start the client
+	if err := client.Start(ctx); err != nil {
+		slog.Error("Failed to start BACKBEAT client", "error", err)
+		return
+	}
+	defer client.Stop()
+	
+	slog.Info("Task processor started - use Ctrl+C to stop")
+	
+	// Start task workers
+	const numWorkers = 3
+	for i := 0; i < numWorkers; i++ {
+		go func(workerID int) {
+			for {
+				select {
+				case <-ctx.Done():
+					return
+				case task := <-taskQueue:
+					processTaskWithBudget(ctx, client, task, workerID, &taskMutex, activeTasks, &completedTasks, &failedTasks)
+				}
+			}
+		}(i)
+	}
+	
+	// Wait for shutdown
+	<-ctx.Done()
+	slog.Info("Task processor shutting down")
+}
+
+// processTaskWithBudget processes a task using BACKBEAT beat budgets
+func processTaskWithBudget(
+	ctx context.Context,
+	client sdk.Client, 
+	task *Task, 
+	workerID int,
+	taskMutex *sync.RWMutex,
+	activeTasks map[string]*Task,
+	completedTasks *int,
+	failedTasks *int,
+) {
+	// Add task to active tasks
+	taskMutex.Lock()
+	activeTasks[task.ID] = task
+	taskMutex.Unlock()
+	
+	// Remove from active tasks when done
+	defer func() {
+		taskMutex.Lock()
+		delete(activeTasks, task.ID)
+		taskMutex.Unlock()
+	}()
+	
+	slog.Info("Processing task",
+		"worker", workerID,
+		"task_id", task.ID,
+		"budget", task.BeatBudget,
+		"work_time", task.WorkTime)
+	
+	// Use beat budget to execute the task
+	err := client.WithBeatBudget(task.BeatBudget, func() error {
+		// Emit starting status
+		client.EmitStatusClaim(sdk.StatusClaim{
+			TaskID:    task.ID,
+			State:     "executing",
+			BeatsLeft: task.BeatBudget,
+			Progress:  0.0,
+			Notes:     fmt.Sprintf("Worker %d processing %s", workerID, task.Description),
+		})
+		
+		// Simulate work with progress updates
+		steps := 5
+		stepDuration := task.WorkTime / time.Duration(steps)
+		
+		for step := 0; step < steps; step++ {
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(stepDuration):
+				progress := float64(step+1) / float64(steps)
+				
+				client.EmitStatusClaim(sdk.StatusClaim{
+					TaskID:    task.ID,
+					State:     "executing",
+					BeatsLeft: int(float64(task.BeatBudget) * (1.0 - progress)),
+					Progress:  progress,
+					Notes:     fmt.Sprintf("Worker %d step %d/%d", workerID, step+1, steps),
+				})
+			}
+		}
+		
+		return nil
+	})
+	
+	// Handle completion or timeout
+	if err != nil {
+		slog.Warn("Task failed or timed out",
+			"worker", workerID,
+			"task_id", task.ID,
+			"error", err)
+		
+		*failedTasks++
+		
+		// Emit failure status
+		client.EmitStatusClaim(sdk.StatusClaim{
+			TaskID:    task.ID,
+			State:     "failed",
+			BeatsLeft: 0,
+			Progress:  0.0,
+			Notes:     fmt.Sprintf("Worker %d failed: %s", workerID, err.Error()),
+		})
+	} else {
+		slog.Info("Task completed successfully",
+			"worker", workerID,
+			"task_id", task.ID,
+			"duration", time.Since(task.Created))
+		
+		*completedTasks++
+		
+		// Emit completion status
+		client.EmitStatusClaim(sdk.StatusClaim{
+			TaskID:    task.ID,
+			State:     "done",
+			BeatsLeft: 0,
+			Progress:  1.0,
+			Notes:     fmt.Sprintf("Worker %d completed %s", workerID, task.Description),
+		})
+	}
+}