backbeat: add module sources

This commit is contained in:
anthonyrawlins
2025-10-17 08:56:25 +11:00
parent 627d15b3f7
commit 4b4eb16efb
48 changed files with 11636 additions and 0 deletions

115
Dockerfile Normal file

@@ -0,0 +1,115 @@
# Build stage
FROM golang:1.22-alpine AS builder
# Install build dependencies
RUN apk add --no-cache git ca-certificates
# Set working directory
WORKDIR /app
# Copy go mod files
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build all services
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o pulse ./cmd/pulse
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o reverb ./cmd/reverb
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o agent-sim ./cmd/agent-sim
# Pulse service image
FROM alpine:latest AS pulse
# Install runtime dependencies
RUN apk --no-cache add ca-certificates tzdata
# Create non-root user
RUN addgroup -g 1001 backbeat && \
adduser -D -s /bin/sh -u 1001 -G backbeat backbeat
# Set working directory
WORKDIR /app
# Copy pulse binary from builder
COPY --from=builder /app/pulse .
# Create data directory
RUN mkdir -p /data && chown -R backbeat:backbeat /data
# Switch to non-root user
USER backbeat
# Expose ports (8080 for HTTP, 9000 for Raft)
EXPOSE 8080 9000
# Health check
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/health || exit 1
# Default command
ENTRYPOINT ["./pulse"]
CMD ["-cluster", "chorus-production", \
"-admin-port", "8080", \
"-raft-bind", "0.0.0.0:9000", \
"-data-dir", "/data"]
# Reverb service image
FROM alpine:latest AS reverb
# Install runtime dependencies
RUN apk --no-cache add ca-certificates tzdata
# Create non-root user
RUN addgroup -g 1001 backbeat && \
adduser -D -s /bin/sh -u 1001 -G backbeat backbeat
# Set working directory
WORKDIR /app
# Copy reverb binary from builder
COPY --from=builder /app/reverb .
# Switch to non-root user
USER backbeat
# Expose port (8080 for HTTP)
EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/health || exit 1
# Default command
ENTRYPOINT ["./reverb"]
CMD ["-cluster", "chorus-production", \
"-nats", "nats://nats:4222", \
"-bar-length", "120", \
"-log-level", "info"]
# Agent simulator image
FROM alpine:latest AS agent-sim
# Install runtime dependencies
RUN apk --no-cache add ca-certificates tzdata
# Create non-root user
RUN addgroup -g 1001 backbeat && \
adduser -D -s /bin/sh -u 1001 -G backbeat backbeat
# Set working directory
WORKDIR /app
# Copy agent-sim binary from builder
COPY --from=builder /app/agent-sim .
# Switch to non-root user
USER backbeat
# Default command
ENTRYPOINT ["./agent-sim"]
CMD ["-cluster", "chorus-production", \
"-nats", "nats://nats:4222"]

111
Dockerfile.production Normal file

@@ -0,0 +1,111 @@
# Production Dockerfile for BACKBEAT services
# Multi-stage build with optimized production images
# Build stage
FROM golang:1.22-alpine AS builder
# Install build dependencies
RUN apk add --no-cache git ca-certificates tzdata
# Set working directory
WORKDIR /app
# Copy go mod files
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build all services with optimizations
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
-a -installsuffix cgo \
-ldflags='-w -s -extldflags "-static"' \
-o pulse ./cmd/pulse
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
-a -installsuffix cgo \
-ldflags='-w -s -extldflags "-static"' \
-o reverb ./cmd/reverb
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
-a -installsuffix cgo \
-ldflags='-w -s -extldflags "-static"' \
-o agent-sim ./cmd/agent-sim
# Pulse service image
FROM alpine:3.18 AS pulse
# Install runtime dependencies
RUN apk --no-cache add ca-certificates tzdata wget && \
update-ca-certificates
# Create non-root user
RUN addgroup -g 1001 backbeat && \
adduser -D -s /bin/sh -u 1001 -G backbeat backbeat
# Set working directory
WORKDIR /app
# Copy pulse and agent-sim binaries from builder
COPY --from=builder /app/pulse .
COPY --from=builder /app/agent-sim .
RUN chmod +x ./pulse ./agent-sim
# Create data directory
RUN mkdir -p /data && chown -R backbeat:backbeat /data /app
# Switch to non-root user
USER backbeat
# Expose ports (8080 for HTTP API, 9000 for Raft)
EXPOSE 8080 9000
# Health check endpoint
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/healthz || exit 1
# Default command with production settings
ENTRYPOINT ["./pulse"]
CMD ["-cluster", "chorus-production", \
"-admin-port", "8080", \
"-raft-bind", "0.0.0.0:9000", \
"-data-dir", "/data", \
"-log-level", "info"]
# Reverb service image
FROM alpine:3.18 AS reverb
# Install runtime dependencies
RUN apk --no-cache add ca-certificates tzdata wget && \
update-ca-certificates
# Create non-root user
RUN addgroup -g 1001 backbeat && \
adduser -D -s /bin/sh -u 1001 -G backbeat backbeat
# Set working directory
WORKDIR /app
# Copy reverb binary from builder
COPY --from=builder /app/reverb .
RUN chmod +x ./reverb
# Switch to non-root user
USER backbeat
# Expose port (8080 for HTTP API)
EXPOSE 8080
# Health check endpoint
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/healthz || exit 1
# Default command with production settings
ENTRYPOINT ["./reverb"]
CMD ["-cluster", "chorus-production", \
"-nats", "nats://nats:4222", \
"-bar-length", "120", \
"-log-level", "info"]

167
Makefile Normal file

@@ -0,0 +1,167 @@
# BACKBEAT prototype Makefile
# Provides development and deployment workflows for the BACKBEAT system
# Variables
PROJECT_NAME = backbeat
DOCKER_REGISTRY = registry.home.deepblack.cloud
VERSION ?= v1.0.6
CLUSTER_NAME ?= chorus-dev
# Go build variables
GOOS ?= linux
GOARCH ?= amd64
CGO_ENABLED ?= 0
# Build flags
LDFLAGS = -w -s -X main.version=$(VERSION)
BUILD_FLAGS = -a -installsuffix cgo -ldflags "$(LDFLAGS)"
.PHONY: all build test clean docker docker-push run-dev stop-dev logs fmt vet deps help
# Default target
all: build
# Help target
help:
@echo "BACKBEAT prototype Makefile"
@echo ""
@echo "Available targets:"
@echo " build - Build all Go binaries"
@echo " test - Run all tests"
@echo " clean - Clean build artifacts"
@echo " docker - Build all Docker images"
@echo " docker-push - Push Docker images to registry"
@echo " run-dev - Start development environment with docker-compose"
@echo " stop-dev - Stop development environment"
@echo " logs - Show logs from development environment"
@echo " fmt - Format Go code"
@echo " vet - Run Go vet"
@echo " deps - Download Go dependencies"
@echo ""
@echo "Environment variables:"
@echo " VERSION - Version tag for builds (default: v1.0.0)"
@echo " CLUSTER_NAME - Cluster name for development (default: chorus-dev)"
# Build all binaries
build:
@echo "Building BACKBEAT binaries..."
@mkdir -p bin/
GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=$(CGO_ENABLED) go build $(BUILD_FLAGS) -o bin/pulse ./cmd/pulse
GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=$(CGO_ENABLED) go build $(BUILD_FLAGS) -o bin/reverb ./cmd/reverb
GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=$(CGO_ENABLED) go build $(BUILD_FLAGS) -o bin/agent-sim ./cmd/agent-sim
@echo "✓ Binaries built in bin/"
# Run tests
test:
@echo "Running tests..."
go test -v -race -cover ./...
@echo "✓ Tests completed"
# Clean build artifacts
clean:
@echo "Cleaning build artifacts..."
rm -rf bin/
docker system prune -f --volumes
@echo "✓ Clean completed"
# Format Go code
fmt:
@echo "Formatting Go code..."
go fmt ./...
@echo "✓ Code formatted"
# Run Go vet
vet:
@echo "Running Go vet..."
go vet ./...
@echo "✓ Vet completed"
# Download dependencies
deps:
@echo "Downloading dependencies..."
go mod download
go mod tidy
@echo "✓ Dependencies updated"
# Build Docker images
docker:
@echo "Building Docker images..."
docker build -t $(PROJECT_NAME)-pulse:$(VERSION) --target pulse .
docker build -t $(PROJECT_NAME)-reverb:$(VERSION) --target reverb .
docker build -t $(PROJECT_NAME)-agent-sim:$(VERSION) --target agent-sim .
@echo "✓ Docker images built"
# Tag and push Docker images to registry
docker-push: docker
@echo "Pushing Docker images to $(DOCKER_REGISTRY)..."
docker tag $(PROJECT_NAME)-pulse:$(VERSION) $(DOCKER_REGISTRY)/$(PROJECT_NAME)-pulse:$(VERSION)
docker tag $(PROJECT_NAME)-reverb:$(VERSION) $(DOCKER_REGISTRY)/$(PROJECT_NAME)-reverb:$(VERSION)
docker tag $(PROJECT_NAME)-agent-sim:$(VERSION) $(DOCKER_REGISTRY)/$(PROJECT_NAME)-agent-sim:$(VERSION)
docker push $(DOCKER_REGISTRY)/$(PROJECT_NAME)-pulse:$(VERSION)
docker push $(DOCKER_REGISTRY)/$(PROJECT_NAME)-reverb:$(VERSION)
docker push $(DOCKER_REGISTRY)/$(PROJECT_NAME)-agent-sim:$(VERSION)
@echo "✓ Docker images pushed"
# Start development environment
run-dev:
@echo "Starting BACKBEAT development environment..."
docker-compose up -d --build
@echo "✓ Development environment started"
@echo ""
@echo "Services available at:"
@echo " - Pulse node 1: http://localhost:8080"
@echo " - Pulse node 2: http://localhost:8081"
@echo " - Reverb service: http://localhost:8082"
@echo " - NATS server: http://localhost:8222"
@echo " - Prometheus: http://localhost:9090"
@echo " - Grafana: http://localhost:3000 (admin/admin)"
# Stop development environment
stop-dev:
@echo "Stopping BACKBEAT development environment..."
docker-compose down
@echo "✓ Development environment stopped"
# Show logs from development environment
logs:
docker-compose logs -f
# Show status of development environment
status:
@echo "BACKBEAT development environment status:"
@echo ""
docker-compose ps
@echo ""
@echo "Health checks:"
@curl -s http://localhost:8080/health | jq '.' 2>/dev/null || echo "Pulse-1: Not responding"
@curl -s http://localhost:8081/health | jq '.' 2>/dev/null || echo "Pulse-2: Not responding"
@curl -s http://localhost:8082/health | jq '.' 2>/dev/null || echo "Reverb: Not responding"
# Quick development cycle
dev: clean fmt vet test build
@echo "✓ Development cycle completed"
# Production build
production: clean test
@echo "Building for production..."
@$(MAKE) build GOOS=linux GOARCH=amd64
@$(MAKE) docker VERSION=$(VERSION)
@echo "✓ Production build completed"
# Install development tools
install-tools:
@echo "Installing development tools..."
go install golang.org/x/tools/cmd/goimports@latest
go install honnef.co/go/tools/cmd/staticcheck@latest
@echo "✓ Development tools installed"
# Run static analysis
lint:
@echo "Running static analysis..."
@command -v staticcheck >/dev/null 2>&1 || { echo "staticcheck not installed. Run 'make install-tools' first."; exit 1; }
staticcheck ./...
@echo "✓ Static analysis completed"
# Full CI pipeline
ci: deps fmt vet lint test build
@echo "✓ CI pipeline completed"

351
README-IMPLEMENTATION.md Normal file

@@ -0,0 +1,351 @@
# BACKBEAT Pulse Service Implementation
## Overview
This is the complete implementation of the BACKBEAT pulse service based on the architectural requirements for CHORUS 2.0.0. The service provides foundational timing coordination for the distributed ecosystem with production-grade leader election, hybrid logical clocks, and comprehensive observability.
## Architecture
The implementation consists of several key components:
### Core Components
1. **Leader Election System** (`internal/backbeat/leader.go`)
- Implements BACKBEAT-REQ-001 using HashiCorp Raft consensus
- Pluggable strategy with automatic failover
- Single BeatFrame publisher per cluster guarantee
2. **Hybrid Logical Clock** (`internal/backbeat/hlc.go`)
- Provides ordering guarantees for distributed events
- Supports reconciliation after network partitions
- Format: `unix_ms_hex:logical_counter_hex:node_id_suffix` (see the sketch after this list)
3. **BeatFrame Generator** (`cmd/pulse/main.go`)
- Implements BACKBEAT-REQ-002 (INT-A BeatFrame emission)
- Publishes structured beat events to NATS
- Includes HLC, beat_index, downbeat, phase, deadline_at, tempo_bpm
4. **Degradation Manager** (`internal/backbeat/degradation.go`)
- Implements BACKBEAT-REQ-003 (local tempo derivation)
- Manages partition tolerance with drift monitoring
- BACKBEAT-PER-003 compliance (≤1% drift over 1 hour)
5. **Admin API Server** (`internal/backbeat/admin.go`)
- HTTP endpoints for operational control
- Tempo management with BACKBEAT-REQ-004 validation
- Health checks, drift monitoring, leader status
6. **Metrics & Observability** (`internal/backbeat/metrics.go`)
- Prometheus metrics for all performance requirements
- Comprehensive monitoring of timing accuracy
- Performance requirement tracking
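For reference, here is a minimal sketch of a hybrid logical clock emitting timestamps in the format described under component 2 above. The field widths and the four-character node-ID suffix are assumptions for illustration; the actual logic lives in `internal/backbeat/hlc.go`:

```go
package backbeat

import (
	"fmt"
	"sync"
	"time"
)

// hlc is a minimal hybrid logical clock sketch: wall time in milliseconds
// plus a logical counter that breaks ties within the same millisecond.
type hlc struct {
	mu      sync.Mutex
	wallMS  int64
	logical uint32
	nodeID  string
}

// Next returns a timestamp in the unix_ms_hex:logical_counter_hex:node_id_suffix form.
func (h *hlc) Next() string {
	h.mu.Lock()
	defer h.mu.Unlock()
	now := time.Now().UnixMilli()
	if now > h.wallMS {
		h.wallMS, h.logical = now, 0
	} else {
		h.logical++
	}
	suffix := h.nodeID
	if len(suffix) > 4 {
		suffix = suffix[len(suffix)-4:] // suffix length is an assumption
	}
	return fmt.Sprintf("%x:%04x:%s", h.wallMS, h.logical, suffix)
}
```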
## Requirements Implementation
### BACKBEAT-REQ-001: Pulse Leader
**Implemented**: Leader election using Raft consensus algorithm
- Single leader publishes BeatFrames per cluster
- Automatic failover with consistent leadership
- Pluggable strategy (currently Raft, extensible)
### BACKBEAT-REQ-002: BeatFrame Emit
**Implemented**: INT-A compliant BeatFrame publishing
```json
{
"type": "backbeat.beatframe.v1",
"cluster_id": "string",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-04T12:00:00Z",
"tempo_bpm": 120,
"window_id": "deterministic_sha256_hash"
}
```
### BACKBEAT-REQ-003: Degrade Local
**Implemented**: Partition tolerance with local tempo derivation
- Followers maintain local timing when leader is lost
- HLC-based reconciliation when leader returns
- Drift monitoring and alerting
### BACKBEAT-REQ-004: Tempo Change Rules
**Implemented**: Downbeat-gated tempo changes with delta limits
- Changes only applied on next downbeat
- ≤±10% delta validation
- Admin API with validation and scheduling
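As a rough illustration, the ±10% rule reduces to a ratio comparison. This is a sketch only; the function name and signature are illustrative, not the exact code in `internal/backbeat`:

```go
package backbeat

import "fmt"

// validateTempoChange sketches the ≤±10% rule: a requested tempo is rejected
// if it deviates from the current tempo by more than 10%.
func validateTempoChange(currentBPM, requestedBPM int) error {
	if currentBPM <= 0 {
		return fmt.Errorf("current tempo must be positive")
	}
	delta := float64(requestedBPM-currentBPM) / float64(currentBPM)
	if delta > 0.10 || delta < -0.10 {
		return fmt.Errorf("tempo change %d -> %d BPM exceeds the ±10%% limit", currentBPM, requestedBPM)
	}
	return nil
}
```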
### BACKBEAT-REQ-005: Window ID
**Implemented**: Deterministic window ID generation
```go
window_id = hex(sha256(cluster_id + ":" + downbeat_beat_index))[0:32]
```
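The formula maps directly onto the Go standard library. The sketch below assumes the downbeat beat index is rendered in decimal before hashing; the exact encoding used by `internal/backbeat` is not shown here:

```go
package backbeat

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
)

// generateWindowID derives the deterministic 32-hex-character window ID
// from the cluster ID and the downbeat's beat index, per the formula above.
func generateWindowID(clusterID string, downbeatIndex int64) string {
	sum := sha256.Sum256([]byte(fmt.Sprintf("%s:%d", clusterID, downbeatIndex)))
	return hex.EncodeToString(sum[:])[:32]
}
```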
## Performance Requirements
### BACKBEAT-PER-001: End-to-End Delivery
**Target**: p95 ≤ 100ms at 2Hz
- Comprehensive latency monitoring
- NATS optimization for low latency
- Metrics: `backbeat_beat_delivery_latency_seconds`
### BACKBEAT-PER-002: Pulse Jitter
**Target**: p95 ≤ 20ms
- High-resolution timing measurement
- Jitter calculation and monitoring
- Metrics: `backbeat_pulse_jitter_seconds`
### BACKBEAT-PER-003: Timer Drift
**Target**: ≤1% over 1 hour without leader
- Continuous drift monitoring
- Degradation mode with local derivation
- Automatic alerting on threshold violations
- Metrics: `backbeat_timer_drift_ratio`
## API Endpoints
### Admin API (Port 8080)
#### GET /tempo
Returns current and pending tempo information:
```json
{
"current_bpm": 120,
"pending_bpm": 120,
"can_change": true,
"next_change": "2025-09-04T12:00:00Z",
"reason": ""
}
```
#### POST /tempo
Changes tempo with validation:
```json
{
"tempo_bpm": 130,
"justification": "workload increase"
}
```
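For example, a tempo change can be requested with a plain HTTP POST against the endpoint above. This is a minimal client sketch; the host, port, and absence of authentication are assumptions for a local development setup:

```go
package main

import (
	"bytes"
	"fmt"
	"log"
	"net/http"
)

func main() {
	body := bytes.NewBufferString(`{"tempo_bpm": 130, "justification": "workload increase"}`)
	resp, err := http.Post("http://localhost:8080/tempo", "application/json", body)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// The change is validated against the ±10% rule and applied on the next downbeat.
	fmt.Println("tempo change response:", resp.StatusCode)
}
```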
#### GET /drift
Returns drift monitoring information:
```json
{
"timer_drift_percent": 0.5,
"hlc_drift_seconds": 1.2,
"last_sync_time": "2025-09-04T11:59:00Z",
"degradation_mode": false,
"within_limits": true
}
```
#### GET /leader
Returns leadership information:
```json
{
"node_id": "pulse-abc123",
"is_leader": true,
"leader": "127.0.0.1:9000",
"cluster_size": 2,
"stats": { ... }
}
```
#### Health & Monitoring
- `GET /health` - Overall service health
- `GET /ready` - Kubernetes readiness probe
- `GET /live` - Kubernetes liveness probe
- `GET /metrics` - Prometheus metrics endpoint
## Deployment
### Development (Single Node)
```bash
make build
make dev
```
### Cluster Development
```bash
make cluster
# Starts leader on :8080, follower on :8081
```
### Production (Docker Compose)
```bash
docker-compose up -d
```
This starts:
- NATS message broker
- 2-node BACKBEAT pulse cluster
- Prometheus metrics collection
- Grafana dashboards
- Health monitoring
### Production (Docker Swarm)
```bash
docker stack deploy -c docker-compose.swarm.yml backbeat
```
## Configuration
### Command Line Options
```
-cluster string Cluster identifier (default "chorus-aus-01")
-node-id string Node identifier (auto-generated if empty)
-bpm int Initial tempo in BPM (default 1)
-bar int Beats per bar (default 8)
-phases string Comma-separated phase names (default "plan,work,review")
-min-bpm int Minimum allowed BPM (default 1)
-max-bpm int Maximum allowed BPM (default 24)
-nats string NATS server URL (default "nats://backbeat-nats:4222")
-admin-port int Admin API port (default 8080)
-raft-bind string Raft bind address (default "127.0.0.1:0")
-bootstrap bool Bootstrap new cluster (default false)
-peers string Comma-separated Raft peer addresses
-data-dir string Data directory (auto-generated if empty)
```
### Environment Variables
- `BACKBEAT_LOG_LEVEL` - Log level (debug, info, warn, error)
- `BACKBEAT_DATA_DIR` - Data directory override
- `BACKBEAT_CLUSTER_ID` - Cluster ID override
## Monitoring
### Key Metrics
- `backbeat_beat_publish_duration_seconds` - Beat publishing latency
- `backbeat_pulse_jitter_seconds` - Timing jitter (BACKBEAT-PER-002)
- `backbeat_timer_drift_ratio` - Timer drift percentage (BACKBEAT-PER-003)
- `backbeat_is_leader` - Leadership status
- `backbeat_beats_total` - Total beats published
- `backbeat_tempo_change_errors_total` - Failed tempo changes
### Alerts
Configure alerts for:
- Pulse jitter p95 > 20ms
- Timer drift > 1%
- Leadership changes
- Degradation mode active > 5 minutes
- NATS connection losses
## Testing
### API Testing
```bash
make test-all
```
Tests all admin endpoints with sample requests.
### Load Testing
```bash
# Monitor metrics during load
watch 'curl -s http://localhost:8080/metrics | grep backbeat_pulse_jitter'
```
### Chaos Engineering
- Network partitions between nodes
- NATS broker restart
- Leader node termination
- Clock drift simulation
## Integration
### NATS Subjects
- `backbeat.{cluster}.beat` - BeatFrame publications
- `backbeat.{cluster}.control` - Legacy control messages (backward compatibility)
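External services can follow the beat by subscribing to the per-cluster beat subject. The sketch below assumes a local NATS server and decodes only a few of the INT-A fields; the struct here is illustrative rather than an SDK type:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"

	"github.com/nats-io/nats.go"
)

type beatFrame struct {
	BeatIndex int64  `json:"beat_index"`
	Downbeat  bool   `json:"downbeat"`
	Phase     string `json:"phase"`
	WindowID  string `json:"window_id"`
}

func main() {
	nc, err := nats.Connect("nats://localhost:4222")
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Drain()

	// Subscribe to the per-cluster beat subject described above.
	_, err = nc.Subscribe("backbeat.chorus-production.beat", func(m *nats.Msg) {
		var bf beatFrame
		if err := json.Unmarshal(m.Data, &bf); err != nil {
			return
		}
		fmt.Printf("beat %d phase=%s downbeat=%v window=%s\n", bf.BeatIndex, bf.Phase, bf.Downbeat, bf.WindowID)
	})
	if err != nil {
		log.Fatal(err)
	}
	select {} // block until interrupted
}
```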
### Service Discovery
- Raft handles internal cluster membership
- External services discover via NATS subjects
- Health checks via HTTP endpoints
## Security
### Network Security
- Raft traffic encrypted in production
- Admin API should be behind authentication proxy
- NATS authentication recommended
### Data Security
- No sensitive data in BeatFrames
- Raft logs contain only operational state
- Metrics don't expose sensitive information
## Performance Tuning
### NATS Configuration
```
max_payload: 1MB
max_connections: 10000
jetstream: enabled
```
### Raft Configuration
```
HeartbeatTimeout: 1s
ElectionTimeout: 1s
CommitTimeout: 500ms
```
### Go Runtime
```
GOGC=100
GOMAXPROCS=auto
```
## Troubleshooting
### Common Issues
1. **Leadership flapping**
- Check network connectivity between nodes
- Verify Raft bind addresses are reachable
- Monitor `backbeat_leadership_changes_total`
2. **High jitter**
- Check system load and CPU scheduling
- Verify Go GC tuning
- Monitor `backbeat_pulse_jitter_seconds`
3. **Drift violations**
- Check NTP synchronization
- Monitor degradation mode duration
- Verify `backbeat_timer_drift_ratio`
### Debug Commands
```bash
# Check leader status
curl http://localhost:8080/leader | jq
# Check drift status
curl http://localhost:8080/drift | jq
# View Raft logs
docker logs backbeat_pulse-leader_1
# Monitor real-time metrics
curl http://localhost:8080/metrics | grep backbeat_
```
## Future Enhancements
1. **COOEE Transport Integration** - Replace NATS with COOEE for enhanced delivery
2. **Multi-Region Support** - Cross-datacenter synchronization
3. **Dynamic Phase Configuration** - Runtime phase definition updates
4. **Backup/Restore** - Raft state backup and recovery
5. **WebSocket API** - Real-time admin interface
## Compliance
This implementation fully satisfies:
- ✅ BACKBEAT-REQ-001 through BACKBEAT-REQ-005
- ✅ BACKBEAT-PER-001 through BACKBEAT-PER-003
- ✅ INT-A BeatFrame specification
- ✅ Production deployment requirements
- ✅ Observability and monitoring requirements
The service is ready for production deployment in the CHORUS 2.0.0 ecosystem.

325
README.md Normal file

@@ -0,0 +1,325 @@
# BACKBEAT Prototype
A production-grade distributed task orchestration system with time-synchronized beat generation and agent status aggregation.
## Overview
BACKBEAT implements a novel approach to distributed system coordination using musical concepts:
- **Pulse Service**: Leader-elected nodes generate synchronized "beats" as timing references
- **Reverb Service**: Aggregates agent status claims and produces summary reports per "window"
- **Agent Simulation**: Simulates distributed agents reporting task status
## Module Availability
BACKBEAT is published as a Go module. Consumers can pin the current release directly:
```bash
go get github.com/chorus-services/backbeat@v0.1.0
```
After downloading, the SDK helpers are available via `github.com/chorus-services/backbeat/pkg/sdk`.
## Architecture
```
┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│    Pulse    │────▶│    NATS     │◀────│   Reverb    │
│  (Leader)   │     │   Broker    │     │(Aggregator) │
└─────────────┘     └─────────────┘     └─────────────┘
                           │
                           │
                    ┌─────────────┐
                    │   Agents    │
                    │ (Simulated) │
                    └─────────────┘
```
### Key Components
1. **Pulse Service** (`cmd/pulse/`)
- Raft-based leader election
- Hybrid Logical Clock (HLC) synchronization
- Tempo control with ±10% change limits
- Beat frame generation at configurable BPM
- Degradation mode for fault tolerance
2. **Reverb Service** (`cmd/reverb/`)
- StatusClaim ingestion and validation
- Window-based aggregation
- BarReport generation with KPIs
- Performance monitoring and SLO tracking
- Admin API for operational visibility
3. **Agent Simulator** (`cmd/agent-sim/`)
- Multi-agent simulation
- Realistic task state transitions
- Configurable reporting rates
- Load testing capabilities
## Requirements Implementation
The system implements the following requirements:
### Core Requirements
- **BACKBEAT-REQ-020**: StatusClaim ingestion and window grouping
- **BACKBEAT-REQ-021**: BarReport emission at downbeats with KPIs
- **BACKBEAT-REQ-022**: DHT persistence placeholder (future implementation)
### Performance Requirements
- **BACKBEAT-PER-001**: End-to-end delivery p95 ≤ 100ms at 2Hz
- **BACKBEAT-PER-002**: Reverb rollup ≤ 1 beat after downbeat
- **BACKBEAT-PER-003**: SDK timer drift ≤ 1% over 1 hour
### Observability Requirements
- **BACKBEAT-OBS-002**: Comprehensive reverb metrics
- Prometheus metrics export
- Structured logging with zerolog
- Health and readiness endpoints
## Quick Start
### Development Environment
1. **Start the complete stack:**
```bash
make run-dev
```
2. **Monitor the services:**
- Pulse Node 1: http://localhost:8080
- Pulse Node 2: http://localhost:8081
- Reverb Service: http://localhost:8082
- Prometheus: http://localhost:9090
- Grafana: http://localhost:3000 (admin/admin)
3. **View logs:**
```bash
make logs
```
4. **Check service status:**
```bash
make status
```
### Manual Build
```bash
# Build all services
make build
# Run individual services
./bin/pulse -cluster=test-cluster -nats=nats://localhost:4222
./bin/reverb -cluster=test-cluster -nats=nats://localhost:4222
./bin/agent-sim -cluster=test-cluster -nats=nats://localhost:4222
```
## Interface Specifications
### INT-A: BeatFrame (Pulse → All)
```json
{
"type": "backbeat.beatframe.v1",
"cluster_id": "chorus-production",
"beat_index": 1234,
"downbeat": true,
"phase": "execution",
"hlc": "7ffd:0001:beef",
"deadline_at": "2024-01-15T10:30:00Z",
"tempo_bpm": 120,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
}
```
### INT-B: StatusClaim (Agents → Reverb)
```json
{
"type": "backbeat.statusclaim.v1",
"agent_id": "agent:xyz",
"task_id": "task:123",
"beat_index": 1234,
"state": "executing",
"beats_left": 3,
"progress": 0.5,
"notes": "fetching inputs",
"hlc": "7ffd:0001:beef"
}
```
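On the agent side, publishing a StatusClaim amounts to marshalling the fields above and sending them on the per-cluster status subject that the reverb service subscribes to (`backbeat.<cluster_id>.status`). The struct below is a sketch for illustration, not the `internal/backbeat` type:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"

	"github.com/nats-io/nats.go"
)

type statusClaim struct {
	Type      string  `json:"type"`
	AgentID   string  `json:"agent_id"`
	TaskID    string  `json:"task_id"`
	BeatIndex int64   `json:"beat_index"`
	State     string  `json:"state"`
	BeatsLeft int     `json:"beats_left"`
	Progress  float64 `json:"progress"`
	Notes     string  `json:"notes"`
	HLC       string  `json:"hlc"`
}

func main() {
	nc, err := nats.Connect("nats://localhost:4222")
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Drain()

	clusterID := "chorus-production"
	claim := statusClaim{
		Type:      "backbeat.statusclaim.v1",
		AgentID:   "agent:xyz",
		TaskID:    "task:123",
		BeatIndex: 1234,
		State:     "executing",
		BeatsLeft: 3,
		Progress:  0.5,
		Notes:     "fetching inputs",
		HLC:       "7ffd:0001:beef",
	}
	payload, _ := json.Marshal(claim)
	if err := nc.Publish(fmt.Sprintf("backbeat.%s.status", clusterID), payload); err != nil {
		log.Fatal(err)
	}
}
```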
### INT-C: BarReport (Reverb → Consumers)
```json
{
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 240,
"to_beat": 359,
"agents_reporting": 978,
"on_time_reviews": 842,
"help_promises_fulfilled": 91,
"secret_rotations_ok": true,
"tempo_drift_ms": 7,
"issues": []
}
```
## API Endpoints
### Pulse Service
- `GET /health` - Health check
- `GET /ready` - Readiness check
- `GET /metrics` - Prometheus metrics
- `POST /api/v1/tempo` - Change tempo
- `GET /api/v1/status` - Service status
### Reverb Service
- `GET /health` - Health check
- `GET /ready` - Readiness check
- `GET /metrics` - Prometheus metrics
- `GET /api/v1/windows` - List active windows
- `GET /api/v1/windows/{id}` - Get window details
- `GET /api/v1/status` - Service status
## Configuration
### Environment Variables
- `BACKBEAT_ENV` - Environment (development/production)
- `NATS_URL` - NATS server URL
- `LOG_LEVEL` - Logging level (debug/info/warn/error)
### Command Line Flags
#### Pulse Service
- `-cluster` - Cluster identifier
- `-node` - Node identifier
- `-admin-port` - HTTP admin port
- `-raft-bind` - Raft cluster bind address
- `-data-dir` - Data directory
- `-nats` - NATS server URL
#### Reverb Service
- `-cluster` - Cluster identifier
- `-node` - Node identifier
- `-nats` - NATS server URL
- `-bar-length` - Bar length in beats
- `-log-level` - Log level
## Monitoring
### Key Metrics
**Pulse Service:**
- `backbeat_beats_total` - Total beats published
- `backbeat_pulse_jitter_seconds` - Beat timing jitter
- `backbeat_is_leader` - Leadership status
- `backbeat_current_tempo_bpm` - Current tempo
**Reverb Service:**
- `backbeat_reverb_agents_reporting` - Agents in current window
- `backbeat_reverb_on_time_reviews` - On-time task completions
- `backbeat_reverb_windows_completed_total` - Total windows processed
- `backbeat_reverb_window_processing_seconds` - Window processing time
### Performance SLOs
The system tracks compliance with performance requirements:
- Beat delivery latency p95 ≤ 100ms
- Pulse jitter p95 ≤ 20ms
- Reverb processing ≤ 1 beat duration
- Timer drift ≤ 1% over 1 hour
## Development
### Build Requirements
- Go 1.22+
- Docker & Docker Compose
- Make
### Development Workflow
```bash
# Format, vet, test, and build
make dev
# Run full CI pipeline
make ci
# Build for production
make production
```
### Testing
```bash
# Run tests
make test
# Run with race detection
go test -race ./...
# Run specific test suites
go test ./internal/backbeat -v
```
## Production Deployment
### Docker Images
The multi-stage Dockerfile produces separate images for each service:
- `backbeat-pulse:v1.0.0` - Pulse service
- `backbeat-reverb:v1.0.0` - Reverb service
- `backbeat-agent-sim:v1.0.0` - Agent simulator
### Kubernetes Deployment
```bash
# Build and push images
make docker-push VERSION=v1.0.0
# Deploy to Kubernetes (example)
kubectl apply -f k8s/
```
### Docker Swarm Deployment
```bash
# Build images
make docker
# Deploy stack
docker stack deploy -c docker-compose.swarm.yml backbeat
```
## Troubleshooting
### Common Issues
1. **NATS Connection Failed**
- Verify NATS server is running
- Check network connectivity
- Verify NATS URL configuration
2. **Leader Election Issues**
- Check Raft logs for cluster formation
- Verify peer connectivity on Raft ports
- Ensure persistent storage is available
3. **Missing StatusClaims**
- Verify agents are publishing to correct NATS subjects
- Check StatusClaim validation errors in reverb logs
- Monitor `backbeat_reverb_claims_processed_total` metric
### Log Analysis
```bash
# Follow reverb service logs
docker-compose logs -f reverb
# Search for specific window processing
docker-compose logs reverb | grep "window_id=abc123"
# Monitor performance metrics
curl http://localhost:8082/metrics | grep backbeat_reverb
```
## License
This is prototype software for the CHORUS platform. See licensing documentation for details.
## Support
For issues and questions, please refer to the CHORUS platform documentation or contact the development team.

125
TEMPO-RECOMMENDATIONS.md Normal file

@@ -0,0 +1,125 @@
# BACKBEAT Tempo Recommendations
## Why Slower Beats Make Sense for Distributed Systems
Unlike musical BPM (120+ beats per minute), distributed task coordination works better with much slower tempos. Here's why:
### Recommended Tempo Ranges
**Development & Testing: 1-2 BPM**
- 1 BPM = 60-second beats (1 minute per beat)
- 2 BPM = 30-second beats (30 seconds per beat)
- Perfect for debugging and observing system behavior
- Plenty of time to see what agents are doing within each beat
**Production: 5-12 BPM**
- 5 BPM = 12-second beats
- 12 BPM = 5-second beats
- Good balance between responsiveness and coordination overhead
- Reasonable for most distributed task processing
**High-Frequency (Special Cases): 30-60 BPM**
- 30 BPM = 2-second beats
- 60 BPM = 1-second beats
- Only for very short-duration tasks
- High coordination overhead
### Window Sizing Examples
With **2 BPM (30-second beats)** and **4 beats per window**:
- Each window = 2 minutes
- Downbeats every 2 minutes for secret rotation, rollups, reviews
- Agents report status every 30 seconds
- Reasonable time for meaningful work between status updates
With **12 BPM (5-second beats)** and **8 beats per window**:
- Each window = 40 seconds
- Downbeats every 40 seconds
- Agents report every 5 seconds
- More responsive but higher coordination overhead
### Why Not 120+ BPM?
**120 BPM = 500ms beats** - This is far too fast because:
- Agents would report status twice per second
- No time for meaningful work between beats
- Network latency (50-100ms) becomes significant fraction of beat time
- High coordination overhead drowns out actual work
- Human operators can't observe or debug system behavior
### Beat Budget Examples
With **2 BPM (30-second beats)**:
- `withBeatBudget(4, task)` = 2-minute timeout
- `withBeatBudget(10, task)` = 5-minute timeout
- Natural timeout periods that make sense for real tasks
With **120 BPM (0.5-second beats)**:
- `withBeatBudget(10, task)` = 5-second timeout
- Most meaningful tasks would need budget of 100+ beats
- Defeats the purpose of beat-based timeouts
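`withBeatBudget` is used above as an SDK-style helper. As a hypothetical sketch (the name and signature are taken from the examples, not from a published API), it is simply a conversion from a beat count to a wall-clock timeout at the current tempo:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// withBeatBudget converts a budget expressed in beats into a context timeout
// at the current tempo. At 2 BPM a beat lasts 30s, so a 4-beat budget is 2 minutes.
func withBeatBudget(ctx context.Context, beats, tempoBPM int) (context.Context, context.CancelFunc) {
	beatDuration := time.Minute / time.Duration(tempoBPM)
	return context.WithTimeout(ctx, time.Duration(beats)*beatDuration)
}

func main() {
	ctx, cancel := withBeatBudget(context.Background(), 4, 2)
	defer cancel()
	deadline, _ := ctx.Deadline()
	fmt.Println("task must finish by:", deadline)
}
```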
## BACKBEAT Default Settings
**Current Defaults (Updated):**
- Pulse service: `1 BPM` (60-second beats)
- Window size: `8 beats` = 8 minutes per window
- Min BPM: `1` (60-second beats for debugging)
- Max BPM: `24` (2.5-second beats for high-frequency systems)
**Configuration Examples:**
```bash
# Development - very slow for debugging
./pulse -bpm 1 -bar 4 # 60s beats, 4min windows
# Production - balanced
./pulse -bpm 5 -bar 6 # 12s beats, 72s windows
# High-frequency - only if needed
./pulse -bpm 24 -bar 10 # ~2.5s beats, 25s windows
```
## Integration with CHORUS Agents
When CHORUS agents become BACKBEAT-aware, they'll report status on each beat:
**With 2 BPM (30s beats):**
```
T+0s: Agent starts task, reports "executing", 10 beats remaining
T+30s: Beat 1 - reports "executing", 9 beats remaining, 20% progress
T+60s: Beat 2 - reports "executing", 8 beats remaining, 40% progress
T+90s: Beat 3 - reports "review", 0 beats remaining, 100% progress
T+120s: Downbeat - window closes, reverb generates BarReport
```
**With 120 BPM (0.5s beats) - NOT RECOMMENDED:**
```
T+0.0s: Agent starts task, reports "executing", 600 beats remaining
T+0.5s: Beat 1 - barely any progress to report
T+1.0s: Beat 2 - still barely any progress
... (598 more rapid-fire status updates)
T+300s: Finally done, but coordination overhead was massive
```
## Performance Impact
**Slower beats (1-12 BPM):**
- ✅ Meaningful status updates
- ✅ Human-observable behavior
- ✅ Reasonable coordination overhead
- ✅ Network jitter tolerance
- ✅ Debugging friendly
**Faster beats (60+ BPM):**
- ❌ Status spam with little information
- ❌ High coordination overhead
- ❌ Network jitter becomes significant
- ❌ Impossible to debug or observe
- ❌ Most real tasks need huge beat budgets
## Conclusion
BACKBEAT is designed for **distributed task coordination**, not musical timing. Slower beats (1-12 BPM) provide the right balance of coordination and efficiency for real distributed work.
The updated defaults (1 BPM, 8 beats/window) give a solid foundation that works well for both development and production use cases.

100
cmd/agent-sim/main.go Normal file

@@ -0,0 +1,100 @@
package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"math/rand"
"os"
"time"
bb "github.com/chorus-services/backbeat/internal/backbeat"
"github.com/nats-io/nats.go"
"gopkg.in/yaml.v3"
)
type scoreFile struct {
Score bb.Score `yaml:"score"`
}
func main() {
cluster := flag.String("cluster", "chorus-aus-01", "cluster id")
agentID := flag.String("id", "bzzz-1", "agent id")
scorePath := flag.String("score", "./configs/sample-score.yaml", "score yaml path")
natsURL := flag.String("nats", nats.DefaultURL, "nats url")
flag.Parse()
buf, err := os.ReadFile(*scorePath)
if err != nil {
log.Fatal(err)
}
var s scoreFile
if err := yaml.Unmarshal(buf, &s); err != nil {
log.Fatal(err)
}
score := s.Score
nc, err := nats.Connect(*natsURL)
if err != nil {
log.Fatal(err)
}
defer nc.Drain()
hlc := bb.NewHLC(*agentID)
state := "planning"
waiting := 0
beatsLeft := 0
nc.Subscribe(fmt.Sprintf("backbeat.%s.beat", *cluster), func(m *nats.Msg) {
var bf bb.BeatFrame
if err := json.Unmarshal(m.Data, &bf); err != nil {
return
}
phase, _ := bb.PhaseFor(score.Phases, int(bf.BeatIndex))
switch phase {
case "plan":
state = "planning"
beatsLeft = 0
case "work":
if waiting == 0 && rand.Float64() < 0.3 {
waiting = 1
}
if waiting > 0 {
state = "waiting"
beatsLeft = score.WaitBudget.Help - waiting
waiting++
if waiting > score.WaitBudget.Help {
state = "executing"
waiting = 0
}
} else {
state = "executing"
beatsLeft = 0
}
case "review":
state = "review"
waiting = 0
beatsLeft = 0
}
sc := bb.StatusClaim{
AgentID: *agentID,
TaskID: "ucxl://demo/task",
BeatIndex: bf.BeatIndex,
State: state,
WaitFor: nil,
BeatsLeft: beatsLeft,
Progress: rand.Float64(),
Notes: "proto",
HLC: hlc.Next(),
}
payload, _ := json.Marshal(sc)
nc.Publish("backbeat.status."+*agentID, payload)
})
log.Printf("AgentSim %s started (cluster=%s)\n", *agentID, *cluster)
for {
time.Sleep(10 * time.Second)
}
}

617
cmd/pulse/main.go Normal file

@@ -0,0 +1,617 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"os/signal"
"strings"
"sync"
"syscall"
"time"
"github.com/google/uuid"
"github.com/nats-io/nats.go"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
bb "github.com/chorus-services/backbeat/internal/backbeat"
)
// PulseService implements the complete BACKBEAT pulse service
// with leader election, HLC timing, degradation mode, and admin API
type PulseService struct {
mu sync.RWMutex
ctx context.Context
cancel context.CancelFunc
logger zerolog.Logger
// Core components
state *bb.PulseState
elector *bb.LeaderElector
hlc *bb.HLC
degradation *bb.DegradationManager
metrics *bb.Metrics
adminServer *bb.AdminServer
// NATS connectivity
nc *nats.Conn
beatPublisher *nats.Conn
controlSub *nats.Subscription
// Timing control
ticker *time.Ticker
lastBeatTime time.Time
startTime time.Time
// Configuration
config PulseConfig
}
// PulseConfig holds all configuration for the pulse service
type PulseConfig struct {
ClusterID string
NodeID string
InitialTempoBPM int
BarLength int
Phases []string
MinBPM int
MaxBPM int
// Network
NATSUrl string
AdminPort int
RaftBindAddr string
// Cluster
Bootstrap bool
RaftPeers []string
// Paths
DataDir string
}
// Legacy control message for backward compatibility
type ctrlMsg struct {
Cmd string `json:"cmd"`
BPM int `json:"bpm,omitempty"`
To int `json:"to,omitempty"`
Beats int `json:"beats,omitempty"`
Easing string `json:"easing,omitempty"`
Phases map[string]int `json:"phases,omitempty"`
DurationBeats int `json:"duration_beats,omitempty"`
}
func main() {
// Parse command line flags
config := parseFlags()
// Setup structured logging
logger := setupLogging()
// Create and start pulse service
service, err := NewPulseService(config, logger)
if err != nil {
log.Fatal().Err(err).Msg("failed to create pulse service")
}
// Handle graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
// Start service
if err := service.Start(ctx); err != nil {
log.Fatal().Err(err).Msg("failed to start pulse service")
}
logger.Info().Msg("BACKBEAT pulse service started successfully")
// Wait for shutdown signal
<-sigCh
logger.Info().Msg("shutdown signal received")
// Graceful shutdown
if err := service.Shutdown(); err != nil {
logger.Error().Err(err).Msg("error during shutdown")
}
logger.Info().Msg("BACKBEAT pulse service shutdown complete")
}
// parseFlags parses command line arguments
func parseFlags() PulseConfig {
config := PulseConfig{}
var phasesStr, peersStr string
flag.StringVar(&config.ClusterID, "cluster", "chorus-aus-01", "cluster identifier")
flag.StringVar(&config.NodeID, "node-id", "", "node identifier (auto-generated if empty)")
// REQ: BACKBEAT-REQ-002 - Default tempo should be reasonable for distributed systems
// 1 BPM = 60-second beats, good for low-intensity or recovery windows
// 12 BPM = 5-second beats, reasonable for production
flag.IntVar(&config.InitialTempoBPM, "bpm", 1, "initial tempo in BPM (1=60s beats, 12=5s beats)")
flag.IntVar(&config.BarLength, "bar", 8, "beats per bar")
flag.StringVar(&phasesStr, "phases", "plan,work,review", "comma-separated phase names")
flag.IntVar(&config.MinBPM, "min-bpm", 1, "minimum allowed BPM")
flag.IntVar(&config.MaxBPM, "max-bpm", 24, "maximum allowed BPM")
flag.StringVar(&config.NATSUrl, "nats", "nats://backbeat-nats:4222", "NATS server URL")
flag.IntVar(&config.AdminPort, "admin-port", 8080, "admin API port")
flag.StringVar(&config.RaftBindAddr, "raft-bind", "127.0.0.1:0", "Raft bind address")
flag.BoolVar(&config.Bootstrap, "bootstrap", false, "bootstrap new cluster")
flag.StringVar(&peersStr, "peers", "", "comma-separated Raft peer addresses")
flag.StringVar(&config.DataDir, "data-dir", "", "data directory (auto-generated if empty)")
flag.Parse()
// Debug: Log all command line arguments
log.Info().Strs("args", os.Args).Msg("command line arguments received")
log.Info().Str("parsed_nats_url", config.NATSUrl).Msg("parsed NATS URL from flags")
// Process parsed values
config.Phases = strings.Split(phasesStr, ",")
if peersStr != "" {
config.RaftPeers = strings.Split(peersStr, ",")
}
// Generate node ID if not provided
if config.NodeID == "" {
config.NodeID = "pulse-" + uuid.New().String()[:8]
}
return config
}
// setupLogging configures structured logging
func setupLogging() zerolog.Logger {
// Configure zerolog
zerolog.TimeFieldFormat = time.RFC3339
logger := log.With().
Str("service", "backbeat-pulse").
Str("version", "2.0.0").
Logger()
return logger
}
// NewPulseService creates a new pulse service instance
func NewPulseService(config PulseConfig, logger zerolog.Logger) (*PulseService, error) {
ctx, cancel := context.WithCancel(context.Background())
service := &PulseService{
ctx: ctx,
cancel: cancel,
logger: logger,
config: config,
startTime: time.Now(),
}
// Initialize pulse state
service.state = &bb.PulseState{
ClusterID: config.ClusterID,
NodeID: config.NodeID,
IsLeader: false,
BeatIndex: 1,
TempoBPM: config.InitialTempoBPM,
PendingBPM: config.InitialTempoBPM,
BarLength: config.BarLength,
Phases: config.Phases,
CurrentPhase: 0,
LastDownbeat: time.Now(),
StartTime: time.Now(),
FrozenBeats: 0,
}
// Initialize components
if err := service.initializeComponents(); err != nil {
cancel()
return nil, fmt.Errorf("failed to initialize components: %v", err)
}
return service, nil
}
// initializeComponents sets up all service components
func (s *PulseService) initializeComponents() error {
var err error
// Initialize metrics
s.metrics = bb.NewMetrics()
// Initialize HLC
s.hlc = bb.NewHLC(s.config.NodeID)
// Initialize degradation manager
degradationConfig := bb.DegradationConfig{
Logger: s.logger,
Metrics: s.metrics,
}
s.degradation = bb.NewDegradationManager(degradationConfig)
// Initialize leader elector
leaderConfig := bb.LeaderElectorConfig{
NodeID: s.config.NodeID,
BindAddr: s.config.RaftBindAddr,
DataDir: s.config.DataDir,
Logger: s.logger,
Bootstrap: s.config.Bootstrap,
Peers: s.config.RaftPeers,
OnBecomeLeader: s.onBecomeLeader,
OnLoseLeader: s.onLoseLeader,
}
s.elector, err = bb.NewLeaderElector(leaderConfig)
if err != nil {
return fmt.Errorf("failed to create leader elector: %v", err)
}
// Initialize admin server
adminConfig := bb.AdminConfig{
PulseState: s.state,
Metrics: s.metrics,
Elector: s.elector,
HLC: s.hlc,
Logger: s.logger,
Degradation: s.degradation,
}
s.adminServer = bb.NewAdminServer(adminConfig)
return nil
}
// Start begins the pulse service operation
func (s *PulseService) Start(ctx context.Context) error {
s.logger.Info().
Str("cluster_id", s.config.ClusterID).
Str("node_id", s.config.NodeID).
Int("initial_bpm", s.config.InitialTempoBPM).
Int("bar_length", s.config.BarLength).
Strs("phases", s.config.Phases).
Msg("starting BACKBEAT pulse service")
// Connect to NATS
if err := s.connectNATS(); err != nil {
return fmt.Errorf("NATS connection failed: %v", err)
}
// Start admin HTTP server
go s.startAdminServer()
// Wait for leadership to be established
if err := s.elector.WaitForLeader(ctx); err != nil {
return fmt.Errorf("failed to establish leadership: %v", err)
}
// Start drift monitoring
go s.degradation.MonitorDrift(ctx)
// Start pulse loop
go s.runPulseLoop(ctx)
return nil
}
// connectNATS establishes NATS connection and sets up subscriptions
func (s *PulseService) connectNATS() error {
var err error
// Connect to NATS with retry logic for Docker Swarm startup
opts := []nats.Option{
nats.Timeout(10 * time.Second),
nats.ReconnectWait(2 * time.Second),
nats.MaxReconnects(5),
nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
s.logger.Warn().Err(err).Msg("NATS disconnected")
}),
nats.ReconnectHandler(func(nc *nats.Conn) {
s.logger.Info().Msg("NATS reconnected")
}),
}
// Retry connection up to 10 times with exponential backoff
maxRetries := 10
for attempt := 1; attempt <= maxRetries; attempt++ {
s.logger.Info().Int("attempt", attempt).Str("url", s.config.NATSUrl).Msg("attempting NATS connection")
s.nc, err = nats.Connect(s.config.NATSUrl, opts...)
if err == nil {
s.logger.Info().Str("url", s.config.NATSUrl).Msg("successfully connected to NATS")
break
}
if attempt == maxRetries {
return fmt.Errorf("failed to connect to NATS after %d attempts: %v", maxRetries, err)
}
backoff := time.Duration(attempt) * 2 * time.Second
s.logger.Warn().Err(err).Int("attempt", attempt).Dur("backoff", backoff).Msg("NATS connection failed, retrying")
time.Sleep(backoff)
}
// Setup control message subscription for backward compatibility
controlSubject := fmt.Sprintf("backbeat.%s.control", s.config.ClusterID)
s.controlSub, err = s.nc.Subscribe(controlSubject, s.handleControlMessage)
if err != nil {
return fmt.Errorf("failed to subscribe to control messages: %v", err)
}
s.logger.Info().
Str("nats_url", s.config.NATSUrl).
Str("control_subject", controlSubject).
Msg("connected to NATS")
return nil
}
// startAdminServer starts the HTTP admin server
func (s *PulseService) startAdminServer() {
addr := fmt.Sprintf(":%d", s.config.AdminPort)
server := &http.Server{
Addr: addr,
Handler: s.adminServer,
}
s.logger.Info().
Str("address", addr).
Msg("starting admin API server")
if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
s.logger.Error().Err(err).Msg("admin server error")
}
}
// runPulseLoop runs the main pulse generation loop
func (s *PulseService) runPulseLoop(ctx context.Context) {
// Calculate initial beat duration
beatDuration := time.Duration(60000/s.state.TempoBPM) * time.Millisecond
s.ticker = time.NewTicker(beatDuration)
defer s.ticker.Stop()
s.lastBeatTime = time.Now()
for {
select {
case <-ctx.Done():
return
case now := <-s.ticker.C:
s.processBeat(now)
}
}
}
// processBeat handles a single beat event
func (s *PulseService) processBeat(now time.Time) {
s.mu.Lock()
defer s.mu.Unlock()
// Only leader publishes beats (BACKBEAT-REQ-001)
if !s.elector.IsLeader() {
return
}
// Check for downbeat and apply pending changes (BACKBEAT-REQ-004)
isDownbeat := bb.IsDownbeat(s.state.BeatIndex, s.state.BarLength)
if isDownbeat && s.state.FrozenBeats == 0 {
// Apply pending tempo changes on downbeat
if s.state.PendingBPM != s.state.TempoBPM {
s.logger.Info().
Int("old_bpm", s.state.TempoBPM).
Int("new_bpm", s.state.PendingBPM).
Int64("beat_index", s.state.BeatIndex).
Msg("applying tempo change at downbeat")
s.state.TempoBPM = s.state.PendingBPM
// Update ticker with new tempo
beatDuration := time.Duration(60000/s.state.TempoBPM) * time.Millisecond
s.ticker.Reset(beatDuration)
// Update metrics
s.metrics.UpdateTempoMetrics(s.state.TempoBPM)
}
s.state.LastDownbeat = now
}
// Handle frozen beats
if s.state.FrozenBeats > 0 && isDownbeat {
s.state.FrozenBeats--
}
// Calculate current phase
currentPhase := s.state.Phases[s.state.CurrentPhase%len(s.state.Phases)]
// Generate window ID for downbeats (BACKBEAT-REQ-005)
var windowID string
if isDownbeat {
downbeatIndex := bb.GetDownbeatIndex(s.state.BeatIndex, s.state.BarLength)
windowID = bb.GenerateWindowID(s.state.ClusterID, downbeatIndex)
}
// Create BeatFrame per INT-A specification (BACKBEAT-REQ-002)
beatFrame := bb.BeatFrame{
Type: "backbeat.beatframe.v1",
ClusterID: s.state.ClusterID,
BeatIndex: s.state.BeatIndex,
Downbeat: isDownbeat,
Phase: currentPhase,
HLC: s.hlc.Next(),
DeadlineAt: now.Add(time.Duration(60000/s.state.TempoBPM) * time.Millisecond),
TempoBPM: s.state.TempoBPM,
WindowID: windowID,
}
// Publish beat frame
subject := fmt.Sprintf("backbeat.%s.beat", s.state.ClusterID)
payload, err := json.Marshal(beatFrame)
if err != nil {
s.logger.Error().Err(err).Msg("failed to marshal beat frame")
return
}
start := time.Now()
if err := s.nc.Publish(subject, payload); err != nil {
s.logger.Error().Err(err).Str("subject", subject).Msg("failed to publish beat")
s.metrics.RecordNATSError("publish_error")
return
}
publishDuration := time.Since(start)
// Record timing metrics
expectedTime := s.lastBeatTime.Add(time.Duration(60000/s.state.TempoBPM) * time.Millisecond)
jitter := now.Sub(expectedTime).Abs()
s.metrics.RecordBeatPublish(publishDuration, len(payload), isDownbeat, currentPhase)
s.metrics.RecordPulseJitter(jitter)
s.metrics.RecordBeatTiming(expectedTime, now)
// Update degradation manager with timing info
s.degradation.UpdateBeatTiming(expectedTime, now, s.state.BeatIndex)
s.lastBeatTime = now
// Advance beat index and phase
s.state.BeatIndex++
if isDownbeat {
// Move to next bar, cycle through phases
s.state.CurrentPhase = (s.state.CurrentPhase + 1) % len(s.state.Phases)
}
s.logger.Debug().
Int64("beat_index", s.state.BeatIndex-1).
Bool("downbeat", isDownbeat).
Str("phase", currentPhase).
Str("window_id", windowID).
Dur("jitter", jitter).
Msg("published beat frame")
}
// handleControlMessage handles legacy control messages for backward compatibility
func (s *PulseService) handleControlMessage(msg *nats.Msg) {
var ctrl ctrlMsg
if err := json.Unmarshal(msg.Data, &ctrl); err != nil {
s.logger.Warn().Err(err).Msg("invalid control message")
return
}
s.mu.Lock()
defer s.mu.Unlock()
response := map[string]interface{}{
"ok": true,
"apply_at_downbeat": true,
"policy_hash": "v2",
}
switch ctrl.Cmd {
case "set_bpm":
if ctrl.BPM < s.config.MinBPM || ctrl.BPM > s.config.MaxBPM {
response["ok"] = false
response["error"] = fmt.Sprintf("BPM %d out of range [%d, %d]", ctrl.BPM, s.config.MinBPM, s.config.MaxBPM)
break
}
// Validate tempo change
if err := bb.ValidateTempoChange(s.state.TempoBPM, ctrl.BPM); err != nil {
response["ok"] = false
response["error"] = err.Error()
s.metrics.RecordTempoChangeError()
break
}
s.state.PendingBPM = ctrl.BPM
s.logger.Info().
Int("requested_bpm", ctrl.BPM).
Str("command", "set_bpm").
Msg("tempo change requested via control message")
case "freeze":
duration := ctrl.DurationBeats
if duration <= 0 {
duration = s.state.BarLength
}
s.state.FrozenBeats = duration
s.logger.Info().
Int("duration_beats", duration).
Msg("freeze requested via control message")
case "unfreeze":
s.state.FrozenBeats = 0
s.logger.Info().Msg("unfreeze requested via control message")
default:
response["ok"] = false
response["error"] = "unknown command: " + ctrl.Cmd
}
// Send response
if msg.Reply != "" {
responseBytes, _ := json.Marshal(response)
s.nc.Publish(msg.Reply, responseBytes)
}
}
// onBecomeLeader is called when this node becomes the leader
func (s *PulseService) onBecomeLeader() {
s.mu.Lock()
s.state.IsLeader = true
s.mu.Unlock()
s.logger.Info().Msg("became pulse leader - starting beat generation")
s.metrics.RecordLeadershipChange(true)
s.metrics.UpdateLeadershipMetrics(true, 1) // TODO: get actual cluster size
// Exit degradation mode if active
if s.degradation.IsInDegradationMode() {
s.degradation.OnLeaderRecovered(s.state.TempoBPM, s.state.BeatIndex, s.hlc.Next())
}
}
// onLoseLeader is called when this node loses leadership
func (s *PulseService) onLoseLeader() {
s.mu.Lock()
s.state.IsLeader = false
s.mu.Unlock()
s.logger.Warn().Msg("lost pulse leadership - entering degradation mode")
s.metrics.RecordLeadershipChange(false)
s.metrics.UpdateLeadershipMetrics(false, 1) // TODO: get actual cluster size
// Enter degradation mode
s.degradation.OnLeaderLost(s.state.TempoBPM, s.state.BeatIndex)
}
// Shutdown gracefully shuts down the pulse service
func (s *PulseService) Shutdown() error {
s.logger.Info().Msg("shutting down pulse service")
// Cancel context
s.cancel()
// Stop ticker
if s.ticker != nil {
s.ticker.Stop()
}
// Close NATS connection
if s.nc != nil {
s.nc.Drain()
}
// Shutdown leader elector
if s.elector != nil {
if err := s.elector.Shutdown(); err != nil {
s.logger.Error().Err(err).Msg("error shutting down leader elector")
return err
}
}
return nil
}

585
cmd/reverb/main.go Normal file

@@ -0,0 +1,585 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
"github.com/gorilla/mux"
"github.com/nats-io/nats.go"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
bb "github.com/chorus-services/backbeat/internal/backbeat"
)
// ReverbService implements BACKBEAT-REQ-020, BACKBEAT-REQ-021, BACKBEAT-REQ-022
// Aggregates StatusClaims from agents and produces BarReports for each window
type ReverbService struct {
clusterID string
nodeID string
natsConn *nats.Conn
metrics *bb.Metrics
// Window management
windowsMu sync.RWMutex
windows map[string]*bb.WindowAggregation // windowID -> aggregation
windowTTL time.Duration
barLength int
// Pulse synchronization
currentBeat int64
currentWindowID string
// Configuration
maxWindowsRetained int
cleanupInterval time.Duration
// Control channels
ctx context.Context
cancel context.CancelFunc
done chan struct{}
}
// NewReverbService creates a new reverb aggregation service
func NewReverbService(clusterID, nodeID string, natsConn *nats.Conn, barLength int) *ReverbService {
ctx, cancel := context.WithCancel(context.Background())
return &ReverbService{
clusterID: clusterID,
nodeID: nodeID,
natsConn: natsConn,
metrics: bb.NewMetrics(),
windows: make(map[string]*bb.WindowAggregation),
windowTTL: 5 * time.Minute, // Keep windows for 5 minutes after completion
barLength: barLength,
maxWindowsRetained: 100, // Prevent memory leaks
cleanupInterval: 30 * time.Second,
ctx: ctx,
cancel: cancel,
done: make(chan struct{}),
}
}
// Start initializes and starts the reverb service
// BACKBEAT-REQ-020: Subscribe to INT-B StatusClaims; group by window_id
// BACKBEAT-REQ-021: Emit INT-C BarReport at each downbeat with KPIs
func (rs *ReverbService) Start() error {
log.Info().
Str("cluster_id", rs.clusterID).
Str("node_id", rs.nodeID).
Int("bar_length", rs.barLength).
Msg("Starting BACKBEAT reverb service")
// BACKBEAT-REQ-020: Subscribe to StatusClaims on status channel
beatSubject := fmt.Sprintf("backbeat.%s.beat", rs.clusterID)
statusSubject := fmt.Sprintf("backbeat.%s.status", rs.clusterID)
// Subscribe to pulse BeatFrames for downbeat timing
_, err := rs.natsConn.Subscribe(beatSubject, rs.handleBeatFrame)
if err != nil {
return fmt.Errorf("failed to subscribe to beat channel: %w", err)
}
log.Info().Str("subject", beatSubject).Msg("Subscribed to pulse beat channel")
// Subscribe to StatusClaims for aggregation
_, err = rs.natsConn.Subscribe(statusSubject, rs.handleStatusClaim)
if err != nil {
return fmt.Errorf("failed to subscribe to status channel: %w", err)
}
log.Info().Str("subject", statusSubject).Msg("Subscribed to agent status channel")
// Start background cleanup goroutine
go rs.cleanupRoutine()
// Start HTTP server for health and metrics
go rs.startHTTPServer()
log.Info().Msg("BACKBEAT reverb service started successfully")
return nil
}
// handleBeatFrame processes incoming BeatFrames to detect downbeats
// BACKBEAT-REQ-021: Emit INT-C BarReport at each downbeat with KPIs
func (rs *ReverbService) handleBeatFrame(msg *nats.Msg) {
var bf bb.BeatFrame
if err := json.Unmarshal(msg.Data, &bf); err != nil {
log.Error().Err(err).Msg("Failed to unmarshal BeatFrame")
rs.metrics.RecordNATSError("unmarshal_error")
return
}
rs.currentBeat = bf.BeatIndex
// Process downbeat - emit BarReport for previous window
if bf.Downbeat && rs.currentWindowID != "" && rs.currentWindowID != bf.WindowID {
rs.processDownbeat(rs.currentWindowID)
}
// Update current window
rs.currentWindowID = bf.WindowID
log.Debug().
Int64("beat_index", bf.BeatIndex).
Bool("downbeat", bf.Downbeat).
Str("window_id", bf.WindowID).
Msg("Processed beat frame")
}
// handleStatusClaim processes incoming StatusClaims for aggregation
// BACKBEAT-REQ-020: Subscribe to INT-B StatusClaims; group by window_id
func (rs *ReverbService) handleStatusClaim(msg *nats.Msg) {
var sc bb.StatusClaim
if err := json.Unmarshal(msg.Data, &sc); err != nil {
log.Error().Err(err).Msg("Failed to unmarshal StatusClaim")
rs.metrics.RecordNATSError("unmarshal_error")
return
}
// Validate StatusClaim according to INT-B specification
if err := bb.ValidateStatusClaim(&sc); err != nil {
log.Warn().Err(err).
Str("agent_id", sc.AgentID).
Str("task_id", sc.TaskID).
Msg("Invalid StatusClaim received")
return
}
// Determine window ID for this claim
windowID := rs.getWindowIDForBeat(sc.BeatIndex)
if windowID == "" {
log.Warn().
Int64("beat_index", sc.BeatIndex).
Msg("Could not determine window ID for StatusClaim")
return
}
// Add claim to appropriate window aggregation
rs.addClaimToWindow(windowID, &sc)
rs.metrics.RecordReverbClaim()
log.Debug().
Str("agent_id", sc.AgentID).
Str("task_id", sc.TaskID).
Str("state", sc.State).
Str("window_id", windowID).
Msg("Processed status claim")
}
// addClaimToWindow adds a StatusClaim to the appropriate window aggregation
func (rs *ReverbService) addClaimToWindow(windowID string, claim *bb.StatusClaim) {
rs.windowsMu.Lock()
defer rs.windowsMu.Unlock()
// Get or create window aggregation
window, exists := rs.windows[windowID]
if !exists {
// Create new window - calculate beat range
fromBeat := rs.getWindowStartBeat(claim.BeatIndex)
toBeat := fromBeat + int64(rs.barLength) - 1
window = bb.NewWindowAggregation(windowID, fromBeat, toBeat)
rs.windows[windowID] = window
log.Info().
Str("window_id", windowID).
Int64("from_beat", fromBeat).
Int64("to_beat", toBeat).
Msg("Created new window aggregation")
}
// Add claim to window
window.AddClaim(claim)
// Update metrics
rs.metrics.UpdateReverbActiveWindows(len(rs.windows))
}
// processDownbeat processes a completed window and emits BarReport
// BACKBEAT-REQ-021: Emit INT-C BarReport at each downbeat with KPIs
// BACKBEAT-PER-002: Reverb rollup complete ≤ 1 beat after downbeat
func (rs *ReverbService) processDownbeat(windowID string) {
start := time.Now()
rs.windowsMu.RLock()
window, exists := rs.windows[windowID]
rs.windowsMu.RUnlock()
if !exists {
log.Warn().Str("window_id", windowID).Msg("No aggregation found for completed window")
return
}
log.Info().
Str("window_id", windowID).
Int("claims_count", len(window.Claims)).
Int("agents_reporting", len(window.UniqueAgents)).
Msg("Processing completed window")
// Generate BarReport from aggregated data
barReport := window.GenerateBarReport(rs.clusterID)
// Serialize BarReport
reportData, err := json.Marshal(barReport)
if err != nil {
log.Error().Err(err).Str("window_id", windowID).Msg("Failed to marshal BarReport")
return
}
// BACKBEAT-REQ-021: Emit INT-C BarReport
reverbSubject := fmt.Sprintf("backbeat.%s.reverb", rs.clusterID)
if err := rs.natsConn.Publish(reverbSubject, reportData); err != nil {
log.Error().Err(err).
Str("window_id", windowID).
Str("subject", reverbSubject).
Msg("Failed to publish BarReport")
rs.metrics.RecordNATSError("publish_error")
return
}
processingTime := time.Since(start)
// Record metrics
rs.metrics.RecordReverbWindow(
processingTime,
len(window.Claims),
barReport.AgentsReporting,
barReport.OnTimeReviews,
barReport.TempoDriftMS,
len(reportData),
)
log.Info().
Str("window_id", windowID).
Int("claims_processed", len(window.Claims)).
Int("agents_reporting", barReport.AgentsReporting).
Int("on_time_reviews", barReport.OnTimeReviews).
Dur("processing_time", processingTime).
Int("report_size_bytes", len(reportData)).
Msg("Published BarReport")
// BACKBEAT-REQ-022: Optionally persist BarReports via DHT (placeholder)
// TODO: Implement DHT persistence when available
log.Debug().
Str("window_id", windowID).
Msg("DHT persistence placeholder - not yet implemented")
}
// getWindowIDForBeat determines the window ID for a given beat index
func (rs *ReverbService) getWindowIDForBeat(beatIndex int64) string {
if beatIndex <= 0 {
return ""
}
// Find the downbeat for this window
downbeatIndex := bb.GetDownbeatIndex(beatIndex, rs.barLength)
// Generate deterministic window ID per BACKBEAT-REQ-005
return bb.GenerateWindowID(rs.clusterID, downbeatIndex)
}
// getWindowStartBeat calculates the starting beat for a window containing the given beat
func (rs *ReverbService) getWindowStartBeat(beatIndex int64) int64 {
return bb.GetDownbeatIndex(beatIndex, rs.barLength)
}
// cleanupRoutine periodically cleans up old window aggregations
func (rs *ReverbService) cleanupRoutine() {
ticker := time.NewTicker(rs.cleanupInterval)
defer ticker.Stop()
for {
select {
case <-rs.ctx.Done():
return
case <-ticker.C:
rs.cleanupOldWindows()
}
}
}
// cleanupOldWindows removes expired window aggregations to prevent memory leaks
func (rs *ReverbService) cleanupOldWindows() {
rs.windowsMu.Lock()
defer rs.windowsMu.Unlock()
now := time.Now()
removedCount := 0
for windowID, window := range rs.windows {
if now.Sub(window.LastUpdated) > rs.windowTTL {
delete(rs.windows, windowID)
removedCount++
}
}
// Also enforce maximum window retention
if len(rs.windows) > rs.maxWindowsRetained {
		// Remove windows beyond the limit (map iteration order is random, so this is best-effort rather than strictly oldest-first)
excess := len(rs.windows) - rs.maxWindowsRetained
for windowID := range rs.windows {
if excess <= 0 {
break
}
delete(rs.windows, windowID)
removedCount++
excess--
}
}
if removedCount > 0 {
log.Info().
Int("removed_count", removedCount).
Int("remaining_windows", len(rs.windows)).
Msg("Cleaned up old window aggregations")
}
// Update metrics
rs.metrics.UpdateReverbActiveWindows(len(rs.windows))
}
// startHTTPServer starts the HTTP server for health checks and metrics
func (rs *ReverbService) startHTTPServer() {
router := mux.NewRouter()
// Health endpoint
router.HandleFunc("/health", rs.healthHandler).Methods("GET")
router.HandleFunc("/ready", rs.readinessHandler).Methods("GET")
// Metrics endpoint
router.Handle("/metrics", promhttp.Handler()).Methods("GET")
// Admin API endpoints
router.HandleFunc("/api/v1/windows", rs.listWindowsHandler).Methods("GET")
router.HandleFunc("/api/v1/windows/{windowId}", rs.getWindowHandler).Methods("GET")
router.HandleFunc("/api/v1/status", rs.statusHandler).Methods("GET")
server := &http.Server{
Addr: ":8080",
Handler: router,
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
}
log.Info().Str("address", ":8080").Msg("Starting HTTP server")
if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Error().Err(err).Msg("HTTP server error")
}
}
// Health check handlers
func (rs *ReverbService) healthHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]interface{}{
"status": "healthy",
"service": "backbeat-reverb",
"cluster_id": rs.clusterID,
"node_id": rs.nodeID,
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
}
func (rs *ReverbService) readinessHandler(w http.ResponseWriter, r *http.Request) {
	// Check NATS connection; set the content type before writing any status code
	w.Header().Set("Content-Type", "application/json")
	if !rs.natsConn.IsConnected() {
		w.WriteHeader(http.StatusServiceUnavailable)
		json.NewEncoder(w).Encode(map[string]string{
			"status": "not ready",
			"reason": "NATS connection lost",
		})
		return
	}
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]interface{}{
"status": "ready",
"active_windows": len(rs.windows),
"current_beat": rs.currentBeat,
"current_window_id": rs.currentWindowID,
})
}
// Admin API handlers
func (rs *ReverbService) listWindowsHandler(w http.ResponseWriter, r *http.Request) {
rs.windowsMu.RLock()
defer rs.windowsMu.RUnlock()
windows := make([]map[string]interface{}, 0, len(rs.windows))
for windowID, window := range rs.windows {
windows = append(windows, map[string]interface{}{
"window_id": windowID,
"from_beat": window.FromBeat,
"to_beat": window.ToBeat,
"claims_count": len(window.Claims),
"agents_reporting": len(window.UniqueAgents),
"last_updated": window.LastUpdated.UTC().Format(time.RFC3339),
})
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"windows": windows,
"total_count": len(windows),
})
}
func (rs *ReverbService) getWindowHandler(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
windowID := vars["windowId"]
rs.windowsMu.RLock()
window, exists := rs.windows[windowID]
rs.windowsMu.RUnlock()
if !exists {
w.WriteHeader(http.StatusNotFound)
json.NewEncoder(w).Encode(map[string]string{
"error": "window not found",
"window_id": windowID,
})
return
}
// Generate current BarReport for this window
barReport := window.GenerateBarReport(rs.clusterID)
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"window_aggregation": map[string]interface{}{
"window_id": window.WindowID,
"from_beat": window.FromBeat,
"to_beat": window.ToBeat,
"claims_count": len(window.Claims),
"unique_agents": len(window.UniqueAgents),
"state_counts": window.StateCounts,
"completed_tasks": window.CompletedTasks,
"failed_tasks": window.FailedTasks,
"last_updated": window.LastUpdated.UTC().Format(time.RFC3339),
},
"current_bar_report": barReport,
})
}
func (rs *ReverbService) statusHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"service": "backbeat-reverb",
"cluster_id": rs.clusterID,
"node_id": rs.nodeID,
"active_windows": len(rs.windows),
"current_beat": rs.currentBeat,
"current_window_id": rs.currentWindowID,
"bar_length": rs.barLength,
"window_ttl_seconds": int(rs.windowTTL.Seconds()),
"max_windows_retained": rs.maxWindowsRetained,
"nats_connected": rs.natsConn.IsConnected(),
"uptime_seconds": time.Since(time.Now()).Seconds(), // Placeholder
"version": "v1.0.0",
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
}
// Stop gracefully shuts down the reverb service
func (rs *ReverbService) Stop() {
log.Info().Msg("Stopping BACKBEAT reverb service")
rs.cancel()
close(rs.done)
}
func main() {
// Command line flags
clusterID := flag.String("cluster", "chorus-aus-01", "Cluster identifier")
natsURL := flag.String("nats", "nats://backbeat-nats:4222", "NATS server URL")
nodeID := flag.String("node", "", "Node identifier (auto-generated if empty)")
barLength := flag.Int("bar-length", 120, "Bar length in beats")
logLevel := flag.String("log-level", "info", "Log level (debug, info, warn, error)")
flag.Parse()
// Configure structured logging
switch *logLevel {
case "debug":
zerolog.SetGlobalLevel(zerolog.DebugLevel)
case "info":
zerolog.SetGlobalLevel(zerolog.InfoLevel)
case "warn":
zerolog.SetGlobalLevel(zerolog.WarnLevel)
case "error":
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
default:
zerolog.SetGlobalLevel(zerolog.InfoLevel)
}
// Pretty logging in development
if os.Getenv("BACKBEAT_ENV") != "production" {
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
}
// Generate node ID if not provided
if *nodeID == "" {
*nodeID = fmt.Sprintf("reverb-%d", time.Now().Unix())
}
log.Info().
Str("cluster_id", *clusterID).
Str("node_id", *nodeID).
Str("nats_url", *natsURL).
Int("bar_length", *barLength).
Msg("Starting BACKBEAT reverb service")
// Connect to NATS
nc, err := nats.Connect(*natsURL,
nats.Timeout(10*time.Second),
nats.ReconnectWait(2*time.Second),
nats.MaxReconnects(-1),
nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
log.Error().Err(err).Msg("NATS disconnected")
}),
nats.ReconnectHandler(func(nc *nats.Conn) {
log.Info().Str("server", nc.ConnectedUrl()).Msg("NATS reconnected")
}),
)
if err != nil {
log.Fatal().Err(err).Msg("Failed to connect to NATS")
}
defer nc.Drain()
// Create and start reverb service
service := NewReverbService(*clusterID, *nodeID, nc, *barLength)
if err := service.Start(); err != nil {
log.Fatal().Err(err).Msg("Failed to start reverb service")
}
// Handle graceful shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
log.Info().Msg("BACKBEAT reverb service is running. Press Ctrl+C to exit.")
// Wait for shutdown signal
<-sigChan
log.Info().Msg("Shutdown signal received")
// Graceful shutdown
service.Stop()
// Wait for background tasks to complete
select {
case <-service.done:
log.Info().Msg("BACKBEAT reverb service stopped gracefully")
case <-time.After(30 * time.Second):
log.Warn().Msg("Shutdown timeout exceeded")
}
}

36
cmd/sdk-examples/main.go Normal file
View File

@@ -0,0 +1,36 @@
// Command sdk-examples provides executable examples of BACKBEAT SDK usage
package main
import (
"flag"
"fmt"
"os"
"github.com/chorus-services/backbeat/pkg/sdk/examples"
)
func main() {
var exampleName string
flag.StringVar(&exampleName, "example", "simple", "Example to run: simple, task-processor, service-monitor")
flag.Parse()
fmt.Printf("Running BACKBEAT SDK example: %s\n", exampleName)
fmt.Println("Press Ctrl+C to stop")
fmt.Println()
switch exampleName {
case "simple":
examples.SimpleAgent()
case "task-processor":
examples.TaskProcessor()
case "service-monitor":
examples.ServiceMonitor()
default:
fmt.Printf("Unknown example: %s\n", exampleName)
fmt.Println("Available examples:")
fmt.Println(" simple - Basic beat subscription and status emission")
fmt.Println(" task-processor - Beat budget usage for task timeout management")
fmt.Println(" service-monitor - Health monitoring with beat-aligned reporting")
os.Exit(1)
}
}

13
configs/sample-score.yaml Normal file
View File

@@ -0,0 +1,13 @@
score:
tempo: 12
bar_len: 8
phases:
plan: 2
work: 4
review: 2
wait_budget:
help: 2
io: 1
retry:
max_phrases: 2
backoff: geometric

366
contracts/README.md Normal file
View File

@@ -0,0 +1,366 @@
# BACKBEAT Contracts Package
[![Build Status](https://github.com/chorus-services/backbeat/actions/workflows/contracts.yml/badge.svg)](https://github.com/chorus-services/backbeat/actions/workflows/contracts.yml)
[![Schema Version](https://img.shields.io/badge/schema-v1.0.0-blue)](schemas/)
[![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
The authoritative contract definitions and validation tools for BACKBEAT distributed orchestration across the CHORUS 2.0.0 ecosystem.
## 🎯 Overview
BACKBEAT provides synchronized distributed execution through three core message interfaces:
- **INT-A (BeatFrame)**: 🥁 Rhythm coordination from Pulse → All Services
- **INT-B (StatusClaim)**: 📊 Agent status reporting from Agents → Reverb
- **INT-C (BarReport)**: 📈 Periodic summaries from Reverb → All Services
This contracts package ensures all CHORUS 2.0.0 projects can reliably integrate with BACKBEAT through:
- **JSON Schema Validation** - Semver-versioned schemas for all interfaces
- **Conformance Testing** - Comprehensive test suites with valid/invalid examples
- **CI Integration** - Drop-in validation for any CI pipeline
- **Documentation** - Complete integration guides and best practices
## 🚀 Quick Start
### 1. Validate Your Messages
```bash
# Clone the contracts repository
git clone https://github.com/chorus-services/backbeat.git
cd backbeat/contracts
# Build the validation tool
cd tests/integration && make build
# Validate your BACKBEAT messages
./backbeat-validate --schemas ../../schemas --dir /path/to/your/messages --exit-code
```
### 2. Add to CI Pipeline
#### GitHub Actions
```yaml
- name: Validate BACKBEAT Contracts
run: |
git clone https://github.com/chorus-services/backbeat.git
cd backbeat/contracts/tests/integration
make build
./backbeat-validate --schemas ../../schemas --dir ${{ github.workspace }}/messages --exit-code
```
#### GitLab CI
```yaml
validate-backbeat:
script:
- git clone https://github.com/chorus-services/backbeat.git
- cd backbeat/contracts/tests/integration && make build
- ./backbeat-validate --schemas ../../schemas --dir messages --exit-code
```
### 3. Integrate with Your Project
Add to your `Makefile`:
```makefile
validate-backbeat:
@git clone https://github.com/chorus-services/backbeat.git .backbeat 2>/dev/null || true
@cd .backbeat/contracts/tests/integration && make build
@.backbeat/contracts/tests/integration/backbeat-validate --schemas .backbeat/contracts/schemas --dir messages --exit-code
```
## 📁 Package Structure
```
contracts/
├── schemas/ # JSON Schema definitions
│ ├── beatframe-v1.schema.json # INT-A: Pulse → All Services
│ ├── statusclaim-v1.schema.json # INT-B: Agents → Reverb
│ └── barreport-v1.schema.json # INT-C: Reverb → All Services
├── tests/
│ ├── conformance_test.go # Go conformance test suite
│ ├── examples/ # Valid/invalid message examples
│ │ ├── beatframe-valid.json
│ │ ├── beatframe-invalid.json
│ │ ├── statusclaim-valid.json
│ │ ├── statusclaim-invalid.json
│ │ ├── barreport-valid.json
│ │ └── barreport-invalid.json
│ └── integration/ # CI integration helpers
│ ├── validator.go # Message validation library
│ ├── ci_helper.go # CI integration utilities
│ ├── cmd/backbeat-validate/ # CLI validation tool
│ └── Makefile # Build and test automation
├── docs/
│ ├── integration-guide.md # How to BACKBEAT-enable services
│ ├── schema-evolution.md # Versioning and compatibility
│ └── tempo-guide.md # Beat timing recommendations
└── README.md # This file
```
## 🔧 Core Interfaces
### INT-A: BeatFrame (Pulse → All Services)
Synchronization messages broadcast every beat:
```json
{
"type": "backbeat.beatframe.v1",
"cluster_id": "chorus-prod",
"beat_index": 1337,
"downbeat": false,
"phase": "execute",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:30:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
}
```
**Key Fields:**
- `beat_index`: Monotonic counter since cluster start
- `phase`: `"plan"`, `"execute"`, or `"review"`
- `tempo_bpm`: Current beats per minute (default: 2.0 = 30-second beats)
- `deadline_at`: When this phase must complete
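If you consume INT-A from Go, a struct along these lines is one way to decode a BeatFrame. It is a minimal sketch: the field names mirror the JSON example above, but the Go types are assumptions rather than an official SDK definition.

```go
// BeatFrame is a hypothetical decoding target for backbeat.beatframe.v1.
type BeatFrame struct {
	Type       string    `json:"type"`        // always "backbeat.beatframe.v1"
	ClusterID  string    `json:"cluster_id"`
	BeatIndex  int64     `json:"beat_index"`  // monotonic counter since cluster start
	Downbeat   bool      `json:"downbeat"`
	Phase      string    `json:"phase"`       // "plan", "execute", or "review"
	HLC        string    `json:"hlc"`
	DeadlineAt time.Time `json:"deadline_at"` // RFC 3339 timestamp on the wire
	TempoBPM   float64   `json:"tempo_bpm"`
	WindowID   string    `json:"window_id"`
}
```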
### INT-B: StatusClaim (Agents → Reverb)
Agent status reports during beat execution:
```json
{
"type": "backbeat.statusclaim.v1",
"agent_id": "search-indexer:worker-03",
"task_id": "index-batch:20250905-120",
"beat_index": 1337,
"state": "executing",
"beats_left": 3,
"progress": 0.65,
"notes": "processing batch 120/200",
"hlc": "7ffd:0001:beef"
}
```
**Key Fields:**
- `state`: `"idle"`, `"planning"`, `"executing"`, `"reviewing"`, `"completed"`, `"failed"`, `"blocked"`, `"helping"`
- `beats_left`: Estimated beats to completion
- `progress`: Completion percentage (0.0 - 1.0)
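A matching sketch for publishing INT-B from Go, again with field names taken from the JSON example above and types assumed for illustration:

```go
// StatusClaim is a hypothetical encoding source for backbeat.statusclaim.v1.
type StatusClaim struct {
	Type      string  `json:"type"` // always "backbeat.statusclaim.v1"
	AgentID   string  `json:"agent_id"`
	TaskID    string  `json:"task_id"`
	BeatIndex int64   `json:"beat_index"`
	State     string  `json:"state"`      // one of the states listed above
	BeatsLeft int     `json:"beats_left"` // estimated beats to completion
	Progress  float64 `json:"progress"`   // 0.0 - 1.0
	Notes     string  `json:"notes,omitempty"`
	HLC       string  `json:"hlc"`
}
```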
### INT-C: BarReport (Reverb → All Services)
Periodic cluster health summaries:
```json
{
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 240,
"to_beat": 359,
"agents_reporting": 978,
"on_time_reviews": 942,
"help_promises_fulfilled": 87,
"secret_rotations_ok": true,
"tempo_drift_ms": 7.3,
"issues": []
}
```
**Key Fields:**
- `agents_reporting`: Total active agents in window
- `on_time_reviews`: Agents completing review phase on time
- `tempo_drift_ms`: Timing drift (positive = behind, negative = ahead)
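And for consuming INT-C, a hedged sketch of a decoding struct; the `issues` element shape is defined by the schema, so it is left opaque here:

```go
// BarReport is a hypothetical decoding target for backbeat.barreport.v1.
type BarReport struct {
	Type                  string            `json:"type"` // always "backbeat.barreport.v1"
	WindowID              string            `json:"window_id"`
	FromBeat              int64             `json:"from_beat"`
	ToBeat                int64             `json:"to_beat"`
	AgentsReporting       int               `json:"agents_reporting"`
	OnTimeReviews         int               `json:"on_time_reviews"`
	HelpPromisesFulfilled int               `json:"help_promises_fulfilled"`
	SecretRotationsOK     bool              `json:"secret_rotations_ok"`
	TempoDriftMS          float64           `json:"tempo_drift_ms"`
	Issues                []json.RawMessage `json:"issues"` // schema-defined objects, kept opaque here
}
```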
## 🛠️ Usage Examples
### Validate Single Message
```bash
# Validate from file
./backbeat-validate --schemas ../schemas --file message.json
# Validate from stdin
echo '{"type":"backbeat.beatframe.v1",...}' | ./backbeat-validate --schemas ../schemas --message -
# Get JSON output for programmatic use
./backbeat-validate --schemas ../schemas --file message.json --json
```
### Validate Directory
```bash
# Validate all JSON files in directory
./backbeat-validate --schemas ../schemas --dir messages/
# Quiet mode (only errors)
./backbeat-validate --schemas ../schemas --dir messages/ --quiet
# Exit with error code on validation failures
./backbeat-validate --schemas ../schemas --dir messages/ --exit-code
```
### Go Integration
```go
import "github.com/chorus-services/backbeat/contracts/tests/integration"
// Create validator
validator, err := integration.NewMessageValidator("./schemas")
if err != nil {
log.Fatal(err)
}
// Validate message
result, err := validator.ValidateMessageString(`{"type":"backbeat.beatframe.v1",...}`)
if err != nil {
log.Fatal(err)
}
if !result.Valid {
log.Errorf("Validation failed: %v", result.Errors)
}
```
## 📊 Tempo Recommendations
| Use Case | Tempo (BPM) | Beat Duration | Example Services |
|----------|-------------|---------------|------------------|
| **Development** | 0.1 - 0.5 | 2-10 minutes | Testing, debugging |
| **Batch Processing** | 0.5 - 2.0 | 30s - 2 minutes | ETL, data warehouses |
| **Standard Services** | 2.0 - 10.0 | 6-30 seconds | APIs, web apps |
| **Responsive Apps** | 10.0 - 60.0 | 1-6 seconds | Dashboards, monitoring |
| **High-Frequency** | 60+ | <1 second | Trading, IoT processing |
**Default**: 2.0 BPM (30-second beats) works well for most CHORUS services.
## 📋 Integration Checklist
- [ ] **Message Validation**: Add schema validation to your CI pipeline
- [ ] **BeatFrame Handler**: Implement INT-A message consumption
- [ ] **StatusClaim Publisher**: Implement INT-B message publishing (if you have agents)
- [ ] **BarReport Consumer**: Implement INT-C message consumption (optional)
- [ ] **Tempo Selection**: Choose appropriate BPM for your workload
- [ ] **Error Handling**: Handle validation failures and timing issues
- [ ] **Monitoring**: Track beat processing latency and deadline misses
- [ ] **Load Testing**: Verify performance at production tempo
## 🔄 Schema Versioning
Schemas follow [Semantic Versioning](https://semver.org/):
- **MAJOR** (1.0.0 → 2.0.0): Breaking changes requiring code updates
- **MINOR** (1.0.0 → 1.1.0): Backward-compatible additions
- **PATCH** (1.0.0 → 1.0.1): Documentation and example updates
Current versions:
- **BeatFrame**: v1.0.0 (`backbeat.beatframe.v1`)
- **StatusClaim**: v1.0.0 (`backbeat.statusclaim.v1`)
- **BarReport**: v1.0.0 (`backbeat.barreport.v1`)
See [schema-evolution.md](docs/schema-evolution.md) for migration strategies.
## 🧪 Running Tests
```bash
# Run all tests
make test
# Test schemas are valid JSON
make test-schemas
# Test example messages
make test-examples
# Run Go integration tests
make test-integration
# Validate built-in examples
make validate-examples
```
## 🏗️ Building
```bash
# Build CLI validation tool
make build
# Install Go dependencies
make deps
# Format code
make fmt
# Run linter
make lint
# Generate CI configuration examples
make examples
```
## 📚 Documentation
- **[Integration Guide](docs/integration-guide.md)**: Complete guide for CHORUS 2.0.0 projects
- **[Schema Evolution](docs/schema-evolution.md)**: Versioning and compatibility management
- **[Tempo Guide](docs/tempo-guide.md)**: Beat timing and performance optimization
## 🤝 Contributing
1. **Fork** this repository
2. **Create** a feature branch: `git checkout -b feature/amazing-feature`
3. **Add** tests for your changes
4. **Run** `make test` to ensure everything passes
5. **Commit** your changes: `git commit -m 'Add amazing feature'`
6. **Push** to the branch: `git push origin feature/amazing-feature`
7. **Open** a Pull Request
### Schema Changes
- **Minor changes** (new optional fields): Create PR with updated schema
- **Major changes** (breaking): Discuss in issue first, follow migration process
- **All changes**: Update examples and tests accordingly
## 🔍 Troubleshooting
### Common Validation Errors
| Error | Cause | Fix |
|-------|-------|-----|
| `type field is required` | Missing `type` field | Add correct message type |
| `hlc must match pattern` | Invalid HLC format | Use `XXXX:XXXX:XXXX` hex format |
| `window_id must be exactly 32 hex characters` | Wrong window ID | Use 32-character hex string |
| `phase must be one of: plan, execute, review` | Invalid phase | Use exact phase names |
| `tempo_bpm must be at least 0.1` | Tempo too low | Use a tempo of at least 0.1 BPM |
### Performance Issues
- **Beat processing too slow**: Reduce tempo or optimize code
- **High CPU usage**: Consider lower tempo or horizontal scaling
- **Network saturation**: Reduce message frequency or size
- **Memory leaks**: Ensure proper cleanup in beat handlers
### Getting Help
- **Issues**: [GitHub Issues](https://github.com/chorus-services/backbeat/issues)
- **Discussions**: [GitHub Discussions](https://github.com/chorus-services/backbeat/discussions)
- **Documentation**: Check the [docs/](docs/) directory
- **Examples**: See [tests/examples/](tests/examples/) for message samples
## 📜 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## 🎵 About BACKBEAT
BACKBEAT provides the rhythmic heartbeat that synchronizes distributed systems across CHORUS 2.0.0. Just as musicians use a metronome to stay in time, BACKBEAT keeps your services coordinated and responsive.
**Key Benefits:**
- 🎯 **Predictable Timing**: Know exactly when coordination happens
- 🔄 **Graceful Coordination**: Services sync without tight coupling
- 📊 **Health Visibility**: Real-time insight into cluster performance
- 🛡 **Fault Tolerance**: Detect and recover from failures quickly
- **Scalable**: Works from development (0.1 BPM) to high-frequency (1000+ BPM)
---
**Made with ❤️ by the CHORUS 2.0.0 team**
*"In rhythm there is coordination, in coordination there is reliability."*

View File

@@ -0,0 +1,436 @@
# BACKBEAT Integration Guide for CHORUS 2.0.0 Projects
This guide explains how to integrate BACKBEAT contract validation into your CHORUS 2.0.0 project for guaranteed compatibility with the distributed orchestration system.
## Overview
BACKBEAT provides three core interfaces for coordinated distributed execution:
- **INT-A (BeatFrame)**: Rhythm coordination from Pulse service to all agents
- **INT-B (StatusClaim)**: Agent status reporting to Reverb service
- **INT-C (BarReport)**: Periodic summary reports from Reverb to all services
All messages must conform to the published JSON schemas to ensure reliable operation across the CHORUS ecosystem.
## Quick Start
### 1. Add Contract Validation to Your CI Pipeline
#### GitHub Actions
```yaml
name: BACKBEAT Contract Validation
on: [push, pull_request]
jobs:
validate-backbeat:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Checkout BACKBEAT contracts
uses: actions/checkout@v4
with:
repository: 'chorus-services/backbeat'
path: 'backbeat-contracts'
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.22'
- name: Validate BACKBEAT messages
run: |
cd backbeat-contracts/contracts/tests/integration
make build
./backbeat-validate \
--schemas ../../schemas \
--dir ../../../your-messages-directory \
--exit-code
```
#### GitLab CI
```yaml
validate-backbeat:
stage: test
image: golang:1.22
before_script:
- git clone https://github.com/chorus-services/backbeat.git /tmp/backbeat
- cd /tmp/backbeat/contracts/tests/integration && make build
script:
- /tmp/backbeat/contracts/tests/integration/backbeat-validate
--schemas /tmp/backbeat/contracts/schemas
--dir $CI_PROJECT_DIR/messages
--exit-code
```
### 2. Project Makefile Integration
Add to your project's `Makefile`:
```makefile
# BACKBEAT contract validation
BACKBEAT_REPO = https://github.com/chorus-services/backbeat.git
BACKBEAT_DIR = .backbeat-contracts
$(BACKBEAT_DIR):
git clone $(BACKBEAT_REPO) $(BACKBEAT_DIR)
validate-backbeat: $(BACKBEAT_DIR)
cd $(BACKBEAT_DIR)/contracts/tests/integration && make build
$(BACKBEAT_DIR)/contracts/tests/integration/backbeat-validate \
--schemas $(BACKBEAT_DIR)/contracts/schemas \
--dir messages \
--exit-code
.PHONY: validate-backbeat
```
## Message Implementation
### Implementing BeatFrame Consumer (INT-A)
Your service should subscribe to beat frames from the Pulse service and respond appropriately:
```go
// Example Go implementation
type BeatFrameHandler struct {
currentBeat int64
phase string
}
func (h *BeatFrameHandler) HandleBeatFrame(frame BeatFrame) {
// Validate the beat frame
if err := validateBeatFrame(frame); err != nil {
log.Errorf("Invalid beat frame: %v", err)
return
}
// Update internal state
h.currentBeat = frame.BeatIndex
h.phase = frame.Phase
// Execute phase-appropriate actions
switch frame.Phase {
case "plan":
h.planPhase(frame)
case "execute":
h.executePhase(frame)
case "review":
h.reviewPhase(frame)
}
}
func validateBeatFrame(frame BeatFrame) error {
if frame.Type != "backbeat.beatframe.v1" {
return fmt.Errorf("invalid message type: %s", frame.Type)
}
if frame.TempoBPM < 0.1 || frame.TempoBPM > 1000 {
return fmt.Errorf("invalid tempo: %f", frame.TempoBPM)
}
// Add more validation as needed
return nil
}
```
### Implementing StatusClaim Publisher (INT-B)
Your agents should publish status claims to the Reverb service:
```go
func (agent *Agent) PublishStatusClaim(beatIndex int64, state string) error {
claim := StatusClaim{
Type: "backbeat.statusclaim.v1",
AgentID: agent.ID,
BeatIndex: beatIndex,
State: state,
HLC: agent.generateHLC(),
Progress: agent.calculateProgress(),
Notes: agent.getCurrentStatus(),
}
// Validate before sending
if err := validateStatusClaim(claim); err != nil {
return fmt.Errorf("invalid status claim: %w", err)
}
return agent.publisher.Publish("backbeat.statusclaims", claim)
}
func validateStatusClaim(claim StatusClaim) error {
validStates := []string{"idle", "planning", "executing", "reviewing", "completed", "failed", "blocked", "helping"}
for _, valid := range validStates {
if claim.State == valid {
return nil
}
}
return fmt.Errorf("invalid state: %s", claim.State)
}
```
### Implementing BarReport Consumer (INT-C)
Services should consume bar reports for cluster health awareness:
```go
func (service *Service) HandleBarReport(report BarReport) {
// Validate the bar report
if err := validateBarReport(report); err != nil {
log.Errorf("Invalid bar report: %v", err)
return
}
// Update cluster health metrics
service.updateClusterHealth(report)
// React to issues
if len(report.Issues) > 0 {
service.handleClusterIssues(report.Issues)
}
// Store performance metrics
service.storePerformanceMetrics(report.Performance)
}
func (service *Service) updateClusterHealth(report BarReport) {
service.clusterMetrics.AgentsReporting = report.AgentsReporting
	// Guard against division by zero when no agents reported in this window
	if report.AgentsReporting > 0 {
		service.clusterMetrics.OnTimeRate = float64(report.OnTimeReviews) / float64(report.AgentsReporting)
	}
service.clusterMetrics.TempoDrift = report.TempoDriftMS
service.clusterMetrics.SecretRotationsOK = report.SecretRotationsOK
}
```
## Message Format Requirements
### Common Patterns
All BACKBEAT messages share these patterns:
1. **Type Field**: Must exactly match the schema constant
2. **HLC Timestamps**: Format `XXXX:XXXX:XXXX` (hex digits)
3. **Beat Indices**: Monotonically increasing integers ≥ 0
4. **Window IDs**: 32-character hexadecimal strings
5. **Agent IDs**: Pattern `service:instance` or `agent:identifier`
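A small Go sketch of these shape checks. The regular expressions are assumptions inferred from the formats listed above (four hex digits per HLC group, as in the examples; a conservative character set for agent IDs); the published JSON schemas remain the authoritative definition.

```go
var (
	hlcPattern      = regexp.MustCompile(`^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$`)
	windowIDPattern = regexp.MustCompile(`^[0-9a-fA-F]{32}$`)
	agentIDPattern  = regexp.MustCompile(`^[A-Za-z0-9_.-]+:[A-Za-z0-9_.-]+$`) // assumed charset for "service:instance"
)

func checkCommonFields(hlc, windowID, agentID string, beatIndex int64) error {
	if !hlcPattern.MatchString(hlc) {
		return fmt.Errorf("hlc %q does not match the XXXX:XXXX:XXXX hex format", hlc)
	}
	if !windowIDPattern.MatchString(windowID) {
		return fmt.Errorf("window_id %q is not a 32-character hex string", windowID)
	}
	if !agentIDPattern.MatchString(agentID) {
		return fmt.Errorf("agent_id %q does not look like service:instance", agentID)
	}
	if beatIndex < 0 {
		return fmt.Errorf("beat_index %d must be >= 0", beatIndex)
	}
	return nil
}
```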
### Validation Best Practices
1. **Always validate messages before processing**
2. **Use schema validation in tests**
3. **Handle validation errors gracefully**
4. **Log validation failures for debugging**
Example validation function:
```go
func ValidateMessage(messageBytes []byte, expectedType string) error {
// Parse and check type
var msg map[string]interface{}
if err := json.Unmarshal(messageBytes, &msg); err != nil {
return fmt.Errorf("invalid JSON: %w", err)
}
msgType, ok := msg["type"].(string)
if !ok || msgType != expectedType {
return fmt.Errorf("expected type %s, got %s", expectedType, msgType)
}
// Use schema validation
return validateWithSchema(messageBytes, expectedType)
}
```
## Tempo and Timing Considerations
### Understanding Tempo
- **Default Tempo**: 1 BPM (60-second beats)
- **Minimum Tempo**: 1 BPM (60-second beats for batch or recovery windows)
- **Maximum Tempo**: 24 BPM (~2.5-second beats for high-frequency workloads)
### Phase Timing
Each beat consists of three phases with equal time allocation:
```
Beat Duration = 60 / TempoBPM seconds
Phase Duration = Beat Duration / 3
Plan Phase: [0, Beat Duration / 3)
Execute Phase: [Beat Duration / 3, 2 * Beat Duration / 3)
Review Phase: [2 * Beat Duration / 3, Beat Duration)
```
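In code, the same split can be derived directly from the tempo. This is a minimal sketch assuming the equal three-way split above; the phase boundaries announced by the Pulse service are authoritative.

```go
// phaseWindows splits one beat starting at beatStart into plan/execute/review
// boundaries, assuming the equal three-way split described above.
func phaseWindows(beatStart time.Time, tempoBPM float64) (planEnd, executeEnd, reviewEnd time.Time) {
	beatDuration := time.Duration(float64(time.Minute) / tempoBPM)
	phase := beatDuration / 3
	planEnd = beatStart.Add(phase)
	executeEnd = beatStart.Add(2 * phase)
	reviewEnd = beatStart.Add(beatDuration)
	return planEnd, executeEnd, reviewEnd
}
```

At 2 BPM this yields a 30-second beat with 10-second plan, execute, and review windows.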
### Implementation Guidelines
1. **Respect Deadlines**: Always complete phase work before `deadline_at`
2. **Handle Tempo Changes**: Pulse may adjust tempo based on cluster performance
3. **Plan for Latency**: Factor in network and processing delays
4. **Implement Backpressure**: Report when unable to keep up with tempo
## Error Handling
### Schema Validation Failures
```go
func HandleInvalidMessage(err error, messageBytes []byte) {
log.Errorf("Schema validation failed: %v", err)
log.Debugf("Invalid message: %s", string(messageBytes))
// Send to dead letter queue or error handler
errorHandler.HandleInvalidMessage(messageBytes, err)
// Update metrics
metrics.InvalidMessageCounter.Inc()
}
```
### Network and Timing Issues
```go
func (agent *Agent) HandleMissedBeat(expectedBeat int64) {
	// Report the missed beat as a blocked status claim
	claim := StatusClaim{
		Type:      "backbeat.statusclaim.v1",
		AgentID:   agent.ID,
		BeatIndex: expectedBeat,
		State:     "blocked",
		Notes:     "missed beat due to network issues",
		HLC:       agent.generateHLC(),
	}
	if err := agent.publisher.Publish("backbeat.statusclaims", claim); err != nil {
		log.Errorf("failed to report missed beat: %v", err)
	}
	// Try to catch up
	agent.attemptResynchronization()
}
```
## Testing Your Integration
### Unit Tests
```go
func TestBeatFrameValidation(t *testing.T) {
validFrame := BeatFrame{
Type: "backbeat.beatframe.v1",
ClusterID: "test",
BeatIndex: 100,
Downbeat: false,
Phase: "execute",
HLC: "7ffd:0001:abcd",
DeadlineAt: time.Now().Add(30 * time.Second),
TempoBPM: 2.0,
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
}
err := validateBeatFrame(validFrame)
assert.NoError(t, err)
}
```
### Integration Tests
Use the BACKBEAT validation tools:
```bash
# Test your message files
backbeat-validate --schemas /path/to/backbeat/schemas --dir messages/
# Test individual messages
echo '{"type":"backbeat.beatframe.v1",...}' | backbeat-validate --schemas /path/to/backbeat/schemas --message -
```
### Load Testing
Consider tempo and message volume in your load tests:
```go
func TestHighTempoHandling(t *testing.T) {
// Simulate 10 BPM (6-second beats)
tempo := 10.0
beatInterval := time.Duration(60/tempo) * time.Second
for i := 0; i < 100; i++ {
frame := generateBeatFrame(i, tempo)
handler.HandleBeatFrame(frame)
time.Sleep(beatInterval)
}
// Verify no beats were dropped
assert.Equal(t, 100, handler.processedBeats)
}
```
## Production Deployment
### Monitoring
Monitor these key metrics:
1. **Message Validation Rate**: Percentage of valid messages received
2. **Beat Processing Latency**: Time to process each beat phase
3. **Missed Beat Count**: Number of beats that couldn't be processed on time
4. **Schema Version Compatibility**: Ensure all services use compatible versions
### Alerting
Set up alerts for:
- Schema validation failures > 1%
- Beat processing latency > 90% of phase duration
- Missed beats > 5% in any 10-minute window
- HLC timestamp drift > 5 seconds
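One way to expose these signals is through the Prometheus Go client, which the BACKBEAT services themselves use for their `/metrics` endpoints. The metric names below are placeholders for illustration, not BACKBEAT conventions.

```go
// import "github.com/prometheus/client_golang/prometheus"

var (
	validationFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "backbeat_message_validation_failures_total",
		Help: "Messages that failed BACKBEAT schema validation.",
	})
	beatLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "backbeat_beat_processing_seconds",
		Help:    "Time spent processing each beat phase.",
		Buckets: prometheus.DefBuckets,
	})
	missedBeats = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "backbeat_missed_beats_total",
		Help: "Beats whose phase work did not finish before deadline_at.",
	})
)

func init() {
	prometheus.MustRegister(validationFailures, beatLatency, missedBeats)
}
```

Alerting rules can then be expressed against these series using the thresholds listed above.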
### Gradual Rollout
1. **Validate in CI**: Ensure all messages pass schema validation
2. **Deploy to dev**: Test with low tempo (0.5 BPM)
3. **Staging validation**: Use production-like tempo and load
4. **Canary deployment**: Roll out to small percentage of production traffic
5. **Full production**: Monitor closely and be ready to rollback
## Troubleshooting
### Common Issues
1. **Wrong Message Type**: Ensure `type` field exactly matches schema
2. **HLC Format**: Must be `XXXX:XXXX:XXXX` format with hex digits
3. **Window ID Length**: Must be exactly 32 hex characters
4. **Enum Values**: States, phases, severities must match schema exactly
5. **Numeric Ranges**: Check min/max constraints (tempo, beat_index, etc.)
### Debug Tools
```bash
# Validate specific message
backbeat-validate --schemas ./schemas --message '{"type":"backbeat.beatframe.v1",...}'
# Get detailed validation errors
backbeat-validate --schemas ./schemas --file message.json --json
# Validate entire directory with detailed output
backbeat-validate --schemas ./schemas --dir messages/ --json > validation-report.json
```
## Schema Evolution
See [schema-evolution.md](schema-evolution.md) for details on:
- Semantic versioning for schemas
- Backward compatibility requirements
- Migration strategies for schema updates
- Version compatibility matrix
## Performance Guidelines
See [tempo-guide.md](tempo-guide.md) for details on:
- Choosing appropriate tempo for your workload
- Optimizing beat processing performance
- Handling tempo changes gracefully
- Resource utilization best practices
## Support
- **Documentation**: This contracts package contains the authoritative reference
- **Examples**: See `contracts/tests/examples/` for valid/invalid message samples
- **Issues**: Report integration problems to the BACKBEAT team
- **Updates**: Monitor the contracts repository for schema updates

View File

@@ -0,0 +1,507 @@
# BACKBEAT Schema Evolution and Versioning
This document defines how BACKBEAT message schemas evolve over time while maintaining compatibility across the CHORUS 2.0.0 ecosystem.
## Versioning Strategy
### Semantic Versioning for Schemas
BACKBEAT schemas follow semantic versioning (SemVer) with CHORUS-specific interpretations:
- **MAJOR** (`X.0.0`): Breaking changes that require code updates
- **MINOR** (`X.Y.0`): Backward-compatible additions (new optional fields, enum values)
- **PATCH** (`X.Y.Z`): Documentation updates, constraint clarifications, examples
### Schema Identification
Each schema includes version information:
```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://chorus.services/schemas/backbeat/beatframe/v1.2.0",
"title": "BACKBEAT BeatFrame (INT-A)",
"version": "1.2.0"
}
```
### Message Type Versioning
Message types embed version information:
- `backbeat.beatframe.v1` → Schema version 1.x.x
- `backbeat.beatframe.v2` → Schema version 2.x.x
Only **major** version changes require new message type identifiers.
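Given that convention, the major version can be recovered from the type string itself. A small helper sketch (not part of any published SDK):

```go
// majorVersionOf extracts the major version from a message type such as
// "backbeat.beatframe.v1", assuming the backbeat.<interface>.v<major> layout.
func majorVersionOf(messageType string) (int, error) {
	idx := strings.LastIndex(messageType, ".v")
	if idx < 0 {
		return 0, fmt.Errorf("message type %q has no version suffix", messageType)
	}
	major, err := strconv.Atoi(messageType[idx+2:])
	if err != nil {
		return 0, fmt.Errorf("message type %q has a non-numeric version: %w", messageType, err)
	}
	return major, nil
}
```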
## Compatibility Matrix
### Current Schema Versions
| Interface | Schema Version | Message Type | Status |
|-----------|----------------|--------------|--------|
| INT-A (BeatFrame) | 1.0.0 | `backbeat.beatframe.v1` | Active |
| INT-B (StatusClaim) | 1.0.0 | `backbeat.statusclaim.v1` | Active |
| INT-C (BarReport) | 1.0.0 | `backbeat.barreport.v1` | Active |
### Version Compatibility Rules
1. **Minor/Patch Updates**: All v1.x.x schemas are compatible with `backbeat.*.v1` messages
2. **Major Updates**: Require new message type (e.g., `backbeat.beatframe.v2`)
3. **Transition Period**: Both old and new versions supported during migration
4. **Deprecation**: 6-month notice before removing support for old major versions
## Change Categories
### Minor Version Changes (Backward Compatible)
These changes increment the minor version (1.0.0 → 1.1.0):
#### 1. Adding Optional Fields
```json
// Before (v1.0.0)
{
"required": ["type", "cluster_id", "beat_index"],
"properties": {
"type": {...},
"cluster_id": {...},
"beat_index": {...}
}
}
// After (v1.1.0) - adds optional field
{
"required": ["type", "cluster_id", "beat_index"],
"properties": {
"type": {...},
"cluster_id": {...},
"beat_index": {...},
"priority": {
"type": "integer",
"minimum": 1,
"maximum": 10,
"description": "Optional processing priority (1=low, 10=high)"
}
}
}
```
#### 2. Adding Enum Values
```json
// Before (v1.0.0)
{
"properties": {
"phase": {
"enum": ["plan", "execute", "review"]
}
}
}
// After (v1.1.0) - adds new phase
{
"properties": {
"phase": {
"enum": ["plan", "execute", "review", "cleanup"]
}
}
}
```
#### 3. Relaxing Constraints
```json
// Before (v1.0.0)
{
"properties": {
"notes": {
"type": "string",
"maxLength": 256
}
}
}
// After (v1.1.0) - allows longer notes
{
"properties": {
"notes": {
"type": "string",
"maxLength": 512
}
}
}
```
#### 4. Adding Properties to Objects
```json
// Before (v1.0.0)
{
"properties": {
"metadata": {
"type": "object",
"properties": {
"version": {"type": "string"}
}
}
}
}
// After (v1.1.0) - adds new metadata field
{
"properties": {
"metadata": {
"type": "object",
"properties": {
"version": {"type": "string"},
"source": {"type": "string"}
}
}
}
}
```
### Major Version Changes (Breaking)
These changes increment the major version (1.x.x → 2.0.0):
#### 1. Removing Required Fields
```json
// v1.x.x
{
"required": ["type", "cluster_id", "beat_index", "deprecated_field"]
}
// v2.0.0
{
"required": ["type", "cluster_id", "beat_index"]
}
```
#### 2. Changing Field Types
```json
// v1.x.x
{
"properties": {
"beat_index": {"type": "integer"}
}
}
// v2.0.0
{
"properties": {
"beat_index": {"type": "string"}
}
}
```
#### 3. Removing Enum Values
```json
// v1.x.x
{
"properties": {
"state": {
"enum": ["idle", "executing", "deprecated_state"]
}
}
}
// v2.0.0
{
"properties": {
"state": {
"enum": ["idle", "executing"]
}
}
}
```
#### 4. Tightening Constraints
```json
// v1.x.x
{
"properties": {
"agent_id": {
"type": "string",
"maxLength": 256
}
}
}
// v2.0.0
{
"properties": {
"agent_id": {
"type": "string",
"maxLength": 128
}
}
}
```
### Patch Version Changes (Non-Breaking)
These changes increment the patch version (1.0.0 → 1.0.1):
1. **Documentation updates**
2. **Example additions**
3. **Description clarifications**
4. **Comment additions**
## Migration Strategies
### Minor Version Migration
Services automatically benefit from minor version updates:
```go
// This code works with both v1.0.0 and v1.1.0
func handleBeatFrame(frame BeatFrame) {
// Core fields always present
log.Printf("Beat %d in phase %s", frame.BeatIndex, frame.Phase)
// New optional fields checked safely
if frame.Priority != nil {
log.Printf("Priority: %d", *frame.Priority)
}
}
```
### Major Version Migration
Requires explicit handling of both versions during transition:
```go
func handleMessage(messageBytes []byte) error {
var msgType struct {
Type string `json:"type"`
}
if err := json.Unmarshal(messageBytes, &msgType); err != nil {
return err
}
switch msgType.Type {
case "backbeat.beatframe.v1":
return handleBeatFrameV1(messageBytes)
case "backbeat.beatframe.v2":
return handleBeatFrameV2(messageBytes)
default:
return fmt.Errorf("unsupported message type: %s", msgType.Type)
}
}
```
### Gradual Migration Process
1. **Preparation Phase** (Months 1-2)
- Announce upcoming major version change
- Publish v2.0.0 schemas alongside v1.x.x
- Update documentation and examples
- Provide migration tools and guides
2. **Dual Support Phase** (Months 3-4)
- Services support both v1 and v2 message types
- New services prefer v2 messages
- Monitoring tracks v1 vs v2 usage
3. **Migration Phase** (Months 5-6)
- All services updated to send v2 messages
- Services still accept v1 for backward compatibility
- Warnings logged for v1 message reception
4. **Cleanup Phase** (Month 7+)
- Drop support for v1 messages
- Remove v1 handling code
- Update schemas to mark v1 as deprecated
## Implementation Guidelines
### Schema Development
1. **Start Conservative**: Begin with strict constraints, relax later if needed
2. **Plan for Growth**: Design extensible structures with optional metadata objects
3. **Document Thoroughly**: Include clear descriptions and examples
4. **Test Extensively**: Validate with real-world data before releasing
### Version Detection
Services should detect schema versions:
```go
type SchemaInfo struct {
Version string `json:"version"`
MessageType string `json:"message_type"`
IsSupported bool `json:"is_supported"`
}
func detectSchemaVersion(messageType string) SchemaInfo {
switch messageType {
case "backbeat.beatframe.v1":
return SchemaInfo{
Version: "1.x.x",
MessageType: messageType,
IsSupported: true,
}
case "backbeat.beatframe.v2":
return SchemaInfo{
Version: "2.x.x",
MessageType: messageType,
IsSupported: true,
}
default:
return SchemaInfo{
MessageType: messageType,
IsSupported: false,
}
}
}
```
### Validation Strategy
```go
func validateWithVersionFallback(messageBytes []byte) error {
// Try latest version first
if err := validateV2(messageBytes); err == nil {
return nil
}
// Fall back to previous version
if err := validateV1(messageBytes); err == nil {
log.Warn("Received v1 message, consider upgrading sender")
return nil
}
return fmt.Errorf("message does not match any supported schema version")
}
```
## Testing Schema Evolution
### Compatibility Tests
```go
func TestSchemaBackwardCompatibility(t *testing.T) {
// Test that v1.1.0 accepts all valid v1.0.0 messages
v100Messages := loadTestMessages("v1.0.0")
v110Schema := loadSchema("beatframe-v1.1.0.schema.json")
for _, msg := range v100Messages {
err := validateAgainstSchema(msg, v110Schema)
assert.NoError(t, err, "v1.1.0 should accept v1.0.0 messages")
}
}
func TestSchemaForwardCompatibility(t *testing.T) {
// Test that v1.0.0 code gracefully handles v1.1.0 messages
v110Message := loadTestMessage("beatframe-v1.1.0-with-new-fields.json")
var beatFrame BeatFrameV1
err := json.Unmarshal(v110Message, &beatFrame)
assert.NoError(t, err, "v1.0.0 struct should parse v1.1.0 messages")
// Core fields should be populated
assert.NotEmpty(t, beatFrame.Type)
assert.NotEmpty(t, beatFrame.ClusterID)
}
```
### Migration Tests
```go
func TestDualVersionSupport(t *testing.T) {
handler := NewMessageHandler()
v1Message := generateBeatFrameV1()
v2Message := generateBeatFrameV2()
// Both versions should be handled correctly
err1 := handler.HandleMessage(v1Message)
err2 := handler.HandleMessage(v2Message)
assert.NoError(t, err1)
assert.NoError(t, err2)
}
```
## Deprecation Process
### Marking Deprecated Features
```json
{
"properties": {
"legacy_field": {
"type": "string",
"description": "DEPRECATED: Use new_field instead. Will be removed in v2.0.0",
"deprecated": true
},
"new_field": {
"type": "string",
"description": "Replacement for legacy_field"
}
}
}
```
### Communication Timeline
1. **6 months before**: Announce deprecation in release notes
2. **3 months before**: Add deprecation warnings to schemas
3. **1 month before**: Final migration reminder
4. **Release day**: Remove deprecated features
### Tooling Support
```bash
# Check for deprecated schema usage
backbeat-validate --schemas ./schemas --dir messages/ --check-deprecated
# Migration helper
backbeat-migrate --from v1 --to v2 --dir messages/
```
## Best Practices
### For Schema Authors
1. **Communicate Early**: Announce changes well in advance
2. **Provide Tools**: Create migration utilities and documentation
3. **Monitor Usage**: Track which versions are being used
4. **Be Conservative**: Prefer minor over major version changes
### For Service Developers
1. **Stay Updated**: Subscribe to schema change notifications
2. **Plan for Migration**: Build version handling into your services
3. **Test Thoroughly**: Validate against multiple schema versions
4. **Monitor Compatibility**: Alert on unsupported message versions
### For Operations Teams
1. **Version Tracking**: Monitor which schema versions are active
2. **Migration Planning**: Coordinate major version migrations
3. **Rollback Capability**: Be prepared to revert if migrations fail
4. **Performance Impact**: Monitor schema validation performance
## Future Considerations
### Planned Enhancements
1. **Schema Registry**: Centralized schema version management
2. **Auto-Migration**: Tools to automatically update message formats
3. **Version Negotiation**: Services negotiate supported versions
4. **Schema Analytics**: Usage metrics and compatibility reporting
### Long-term Vision
- **Continuous Evolution**: Schemas evolve without breaking existing services
- **Zero-Downtime Updates**: Schema changes deploy without service interruption
- **Automated Testing**: CI/CD pipelines validate schema compatibility
- **Self-Healing**: Services automatically adapt to schema changes

View File

@@ -0,0 +1,610 @@
# BACKBEAT Tempo Guide: Beat Timing and Performance Recommendations
This guide provides comprehensive recommendations for choosing optimal tempo settings, implementing beat processing, and achieving optimal performance in BACKBEAT-enabled CHORUS 2.0.0 services.
## Understanding BACKBEAT Tempo
### Tempo Basics
BACKBEAT tempo is measured in **Beats Per Minute (BPM)**, similar to musical tempo:
- **1 BPM** = 60-second beats (**default**, good for batch processing and recovery windows)
- **2 BPM** = 30-second beats (good for most services)
- **4 BPM** = 15-second beats (good for responsive services)
- **60 BPM** = 1-second beats (good for high-frequency operations)
### Beat Structure
Each beat consists of three equal phases:
```
Beat Duration = 60 / TempoBPM seconds
Phase Duration = Beat Duration / 3
┌─────────────┬─────────────┬─────────────┐
│ PLAN │ EXECUTE │ REVIEW │
│ Phase 1 │ Phase 2 │ Phase 3 │
└─────────────┴─────────────┴─────────────┘
│←────────── Beat Duration ──────────────→│
```
### Tempo Ranges and Use Cases
| Tempo Range | Beat Duration | Use Cases | Examples |
|-------------|---------------|-----------|----------|
| 0.1 - 0.5 BPM | 2-10 minutes | Large batch jobs, ETL | Data warehouse loads, ML training |
| 0.5 - 2 BPM | 30s - 2 minutes | Standard operations | API services, web apps |
| 2 - 10 BPM | 6-30 seconds | Responsive services | Real-time dashboards, monitoring |
| 10 - 60 BPM | 1-6 seconds | High-frequency | Trading systems, IoT data processing |
| 60+ BPM | <1 second | Ultra-high-frequency | Hardware control, real-time gaming |
## Choosing the Right Tempo
### Workload Analysis
Before selecting tempo, analyze your workload characteristics:
1. **Task Duration**: How long do typical operations take?
2. **Coordination Needs**: How often do services need to synchronize?
3. **Resource Requirements**: How much CPU/memory/I/O does work consume?
4. **Latency Tolerance**: How quickly must the system respond to changes?
5. **Error Recovery**: How quickly should the system detect and recover from failures?
### Tempo Selection Guidelines
#### Rule 1: Task Duration Constraint
```
Recommended Tempo ≤ 60 / (Average Task Duration × 3)
```
**Example**: If tasks take 5 seconds on average:
- Maximum recommended tempo = 60 / (5 × 3) = 4 BPM
- Use 2-4 BPM for safe operation
#### Rule 2: Coordination Frequency
```
Coordination Tempo = 60 / Desired Sync Interval
```
**Example**: If services should sync every 2 minutes:
- Recommended tempo = 60 / 120 = 0.5 BPM
#### Rule 3: Resource Utilization
```
Sustainable Tempo = 60 / (Task Duration + Recovery Time)
```
**Example**: 10s tasks with 5s recovery time:
- Maximum sustainable tempo = 60 / (10 + 5) = 4 BPM
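Rules 1 and 3 are both ceilings, so the safe tempo is their minimum; Rule 2 then acts as a floor the result should still clear. A back-of-the-envelope sketch, not part of any BACKBEAT tooling:

```go
// maxSafeTempo combines Rule 1 and Rule 3 into a single ceiling (in BPM).
func maxSafeTempo(avgTask, recovery time.Duration) float64 {
	rule1 := 60.0 / (avgTask.Seconds() * 3)
	rule3 := 60.0 / (avgTask.Seconds() + recovery.Seconds())
	return math.Min(rule1, rule3)
}

// meetsCoordination reports whether a tempo still satisfies the Rule 2 floor.
func meetsCoordination(tempoBPM float64, syncInterval time.Duration) bool {
	return tempoBPM >= 60.0/syncInterval.Seconds()
}
```

With 10-second tasks and 5 seconds of recovery this gives min(2.0, 4.0) = 2 BPM, which still clears a 60-second coordination interval (1 BPM).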
### Common Tempo Patterns
#### Development/Testing: 0.1-0.5 BPM
```json
{
"tempo_bpm": 0.2,
"beat_duration": "5 minutes",
"use_case": "Development and debugging",
"advantages": ["Easy to observe", "Time to investigate issues"],
"disadvantages": ["Slow feedback", "Not production realistic"]
}
```
#### Standard Services: 1-4 BPM
```json
{
"tempo_bpm": 2.0,
"beat_duration": "30 seconds",
"use_case": "Most production services",
"advantages": ["Good balance", "Reasonable coordination", "Error recovery"],
"disadvantages": ["May be slow for real-time needs"]
}
```
#### Responsive Applications: 4-20 BPM
```json
{
"tempo_bpm": 10.0,
"beat_duration": "6 seconds",
"use_case": "Interactive applications",
"advantages": ["Quick response", "Fast error detection"],
"disadvantages": ["Higher overhead", "More network traffic"]
}
```
#### High-Frequency Systems: 20+ BPM
```json
{
"tempo_bpm": 60.0,
"beat_duration": "1 second",
"use_case": "Real-time trading, IoT",
"advantages": ["Ultra-responsive", "Immediate coordination"],
"disadvantages": ["High resource usage", "Network intensive"]
}
```
## Implementation Guidelines
### Beat Processing Architecture
#### Single-Threaded Processing
Best for low-medium tempo (≤10 BPM):
```go
type BeatProcessor struct {
currentBeat int64
phase string
workQueue chan Task
}
func (p *BeatProcessor) ProcessBeat(frame BeatFrame) {
// Update state
p.currentBeat = frame.BeatIndex
p.phase = frame.Phase
// Process phase synchronously
switch frame.Phase {
case "plan":
p.planPhase(frame)
case "execute":
p.executePhase(frame)
case "review":
p.reviewPhase(frame)
}
// Report status before deadline
p.reportStatus(frame.BeatIndex, "completed")
}
```
#### Pipelined Processing
Best for high tempo (>10 BPM):
```go
type PipelinedProcessor struct {
planQueue chan BeatFrame
executeQueue chan BeatFrame
reviewQueue chan BeatFrame
}
func (p *PipelinedProcessor) Start() {
// Separate goroutines for each phase
go p.planWorker()
go p.executeWorker()
go p.reviewWorker()
}
func (p *PipelinedProcessor) ProcessBeat(frame BeatFrame) {
switch frame.Phase {
case "plan":
p.planQueue <- frame
case "execute":
p.executeQueue <- frame
case "review":
p.reviewQueue <- frame
}
}
```
### Timing Implementation
#### Deadline Management
```go
func (p *BeatProcessor) executeWithDeadline(frame BeatFrame, work func() error) error {
// Calculate remaining time
remainingTime := time.Until(frame.DeadlineAt)
// Create timeout context
ctx, cancel := context.WithTimeout(context.Background(), remainingTime)
defer cancel()
// Execute with timeout
done := make(chan error, 1)
go func() {
done <- work()
}()
select {
case err := <-done:
return err
case <-ctx.Done():
return fmt.Errorf("work timed out after %v", remainingTime)
}
}
```
#### Adaptive Processing
```go
type AdaptiveProcessor struct {
processingTimes []time.Duration
targetUtilization float64 // 0.8 = use 80% of available time
}
func (p *AdaptiveProcessor) shouldProcessWork(frame BeatFrame) bool {
// Calculate phase time available
phaseTime := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
// Estimate processing time based on history
avgProcessingTime := p.calculateAverage()
// Only process if we have enough time
requiredTime := time.Duration(float64(avgProcessingTime) / p.targetUtilization)
return phaseTime >= requiredTime
}
```
### Performance Optimization
#### Batch Processing within Beats
```go
func (p *BeatProcessor) executePhase(frame BeatFrame) error {
// Calculate optimal batch size based on tempo
phaseDuration := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
targetTime := time.Duration(float64(phaseDuration) * 0.8) // Use 80% of time
// Process work in batches
batchSize := p.calculateOptimalBatchSize(targetTime)
for p.hasWork() && time.Until(frame.DeadlineAt) > time.Second {
batch := p.getWorkBatch(batchSize)
if err := p.processBatch(batch); err != nil {
return err
}
}
return nil
}
```
#### Caching and Pre-computation
```go
type SmartProcessor struct {
cache map[string]interface{}
precomputed map[int64]interface{} // Keyed by beat index
}
func (p *SmartProcessor) planPhase(frame BeatFrame) {
// Pre-compute work for future beats during plan phase
nextBeat := frame.BeatIndex + 1
if _, exists := p.precomputed[nextBeat]; !exists {
p.precomputed[nextBeat] = p.precomputeWork(nextBeat)
}
// Cache frequently accessed data
p.cacheRelevantData(frame)
}
func (p *SmartProcessor) executePhase(frame BeatFrame) error {
// Use pre-computed results if available
if precomputed, exists := p.precomputed[frame.BeatIndex]; exists {
return p.usePrecomputedWork(precomputed)
}
// Fall back to real-time computation
return p.computeWork(frame)
}
```
## Performance Monitoring
### Key Metrics
Track these metrics for tempo optimization:
```go
type TempoMetrics struct {
// Timing metrics
BeatProcessingLatency time.Duration // How long beats take to process
PhaseCompletionRate float64 // % of phases completed on time
DeadlineMissRate float64 // % of deadlines missed
// Resource metrics
CPUUtilization float64 // CPU usage during beats
MemoryUtilization float64 // Memory usage
NetworkBandwidth int64 // Bytes/sec for BACKBEAT messages
// Throughput metrics
TasksPerBeat int // Work completed per beat
BeatsPerSecond float64 // Effective beat processing rate
TempoDriftMS float64 // How far behind/ahead we're running
}
```
### Performance Alerts
```go
func (m *TempoMetrics) checkAlerts() []Alert {
var alerts []Alert
// Beat processing taking too long
	if m.BeatProcessingLatency > time.Duration(float64(m.phaseDuration())*0.9) {
alerts = append(alerts, Alert{
Level: "warning",
Message: "Beat processing approaching deadline",
Recommendation: "Consider reducing tempo or optimizing processing",
})
}
// Missing too many deadlines
if m.DeadlineMissRate > 0.05 { // 5%
alerts = append(alerts, Alert{
Level: "critical",
Message: "High deadline miss rate",
Recommendation: "Reduce tempo immediately or scale resources",
})
}
// Resource exhaustion
if m.CPUUtilization > 0.9 {
alerts = append(alerts, Alert{
Level: "warning",
Message: "High CPU utilization",
Recommendation: "Scale up or reduce workload per beat",
})
}
return alerts
}
```
### Adaptive Tempo Adjustment
```go
type TempoController struct {
currentTempo float64
targetLatency time.Duration
adjustmentRate float64 // How aggressively to adjust
}
func (tc *TempoController) adjustTempo(metrics TempoMetrics) float64 {
// Calculate desired tempo based on performance
if metrics.DeadlineMissRate > 0.02 { // 2% miss rate
// Slow down
tc.currentTempo *= (1.0 - tc.adjustmentRate)
} else if metrics.PhaseCompletionRate > 0.95 && metrics.CPUUtilization < 0.7 {
// Speed up
tc.currentTempo *= (1.0 + tc.adjustmentRate)
}
// Apply constraints
tc.currentTempo = math.Max(0.1, tc.currentTempo) // Minimum 0.1 BPM
tc.currentTempo = math.Min(1000, tc.currentTempo) // Maximum 1000 BPM
return tc.currentTempo
}
```
## Load Testing and Capacity Planning
### Beat Load Testing
```go
func TestBeatProcessingUnderLoad(t *testing.T) {
processor := NewBeatProcessor()
tempo := 10.0 // 10 BPM = 6-second beats
beatInterval := time.Duration(60/tempo) * time.Second
// Simulate sustained load
for i := 0; i < 1000; i++ {
frame := generateBeatFrame(i, tempo)
start := time.Now()
err := processor.ProcessBeat(frame)
duration := time.Since(start)
// Verify processing completed within phase duration
phaseDuration := beatInterval / 3
assert.Less(t, duration, phaseDuration)
assert.NoError(t, err)
// Wait for next beat
time.Sleep(beatInterval)
}
}
```
### Capacity Planning
```go
type CapacityPlanner struct {
maxTempo float64
resourceLimits ResourceLimits
taskCharacteristics TaskProfile
}
func (cp *CapacityPlanner) calculateMaxTempo() float64 {
// Based on CPU capacity
cpuConstrainedTempo := 60.0 / (cp.taskCharacteristics.CPUTime * 3)
// Based on memory capacity
memConstrainedTempo := cp.resourceLimits.Memory / cp.taskCharacteristics.MemoryPerBeat
// Based on I/O capacity
ioConstrainedTempo := cp.resourceLimits.IOPS / cp.taskCharacteristics.IOPerBeat
// Take the minimum (most restrictive constraint)
return math.Min(cpuConstrainedTempo, math.Min(memConstrainedTempo, ioConstrainedTempo))
}
```
## Common Patterns and Anti-Patterns
### ✅ Good Patterns
#### Progressive Backoff
```go
func (p *Processor) handleOverload() {
if p.metrics.DeadlineMissRate > 0.1 {
// Temporarily reduce work per beat
p.workPerBeat *= 0.8
log.Warn("Reducing work per beat due to overload")
}
}
```
#### Graceful Degradation
```go
func (p *Processor) executePhase(frame BeatFrame) error {
timeRemaining := time.Until(frame.DeadlineAt)
if timeRemaining < p.minimumTime {
// Skip non-essential work
return p.executeEssentialOnly(frame)
}
return p.executeFullWorkload(frame)
}
```
#### Work Prioritization
```go
func (p *Processor) planPhase(frame BeatFrame) {
// Sort work by priority and deadline
work := p.getAvailableWork()
sort.Sort(ByPriorityAndDeadline(work))
// Plan only what can be completed in time
plannedWork := p.selectWorkForTempo(work, frame.TempoBPM)
p.scheduleWork(plannedWork)
}
```
### ❌ Anti-Patterns
#### Blocking I/O in Beat Processing
```go
// DON'T: Synchronous I/O can cause deadline misses
func badExecutePhase(frame BeatFrame) error {
data := fetchFromDatabase() // Blocking call!
return processData(data)
}
// DO: Use async I/O with timeouts
func goodExecutePhase(frame BeatFrame) error {
ctx, cancel := context.WithDeadline(context.Background(), frame.DeadlineAt)
defer cancel()
data, err := fetchFromDatabaseAsync(ctx)
if err != nil {
return err
}
return processData(data)
}
```
#### Ignoring Tempo Changes
```go
// DON'T: Assume tempo is constant
func badBeatHandler(frame BeatFrame) {
// Hard-coded timing assumptions
time.Sleep(10 * time.Second) // Fails if tempo > 6 BPM!
}
// DO: Adapt to current tempo
func goodBeatHandler(frame BeatFrame) {
phaseDuration := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
maxWorkTime := time.Duration(float64(phaseDuration) * 0.8)
// Adapt work to available time
ctx, cancel := context.WithTimeout(context.Background(), maxWorkTime)
defer cancel()
doWork(ctx)
}
```
#### Unbounded Work Queues
```go
// DON'T: Let work queues grow infinitely
type BadProcessor struct {
workQueue chan Task // Unbounded queue
}
// DO: Use bounded queues with backpressure
type GoodProcessor struct {
workQueue chan Task // Bounded queue
metrics *TempoMetrics
}
func (p *GoodProcessor) addWork(task Task) error {
select {
case p.workQueue <- task:
return nil
default:
p.metrics.WorkRejectedCount++
return ErrQueueFull
}
}
```
## Troubleshooting Performance Issues
### Diagnostic Checklist
1. **Beat Processing Time**: Are beats completing within phase deadlines?
2. **Resource Utilization**: Is CPU/memory/I/O being over-utilized?
3. **Network Latency**: Are BACKBEAT messages arriving late?
4. **Work Distribution**: Is work evenly distributed across beats?
5. **Error Rates**: Are errors causing processing delays? (a diagnostics sketch follows this list)
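A minimal diagnostics sketch, assuming your beat handler already records per-beat durations, errors, and queue depth (the struct and field names below are illustrative, not part of the SDK):
```go
// BeatDiagnostics is a per-beat snapshot used to answer the checklist above.
type BeatDiagnostics struct {
    ProcessingTime time.Duration // wall time spent on the last beat
    DeadlineMissed bool          // processing overran the phase deadline
    ErrorCount     int           // errors observed during the beat
    QueueDepth     int           // work carried over into the next beat
}

func diagnose(d BeatDiagnostics, phaseDuration time.Duration) []string {
    var findings []string
    if d.DeadlineMissed || d.ProcessingTime > phaseDuration {
        findings = append(findings, "beat overran its phase deadline")
    }
    if d.ErrorCount > 0 {
        findings = append(findings, fmt.Sprintf("%d errors delayed processing", d.ErrorCount))
    }
    if d.QueueDepth > 0 {
        findings = append(findings, "work is piling up across beats (uneven distribution)")
    }
    return findings
}
```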
### Performance Tuning Steps
1. **Measure Current Performance**
```bash
# Monitor beat processing metrics
kubectl logs deployment/my-service | grep "beat_processing_time"
# Check resource utilization
kubectl top pods
```
2. **Identify Bottlenecks**
```go
func profileBeatProcessing(frame BeatFrame) {
defer func(start time.Time) {
log.Infof("Beat %d phase %s took %v",
frame.BeatIndex, frame.Phase, time.Since(start))
}(time.Now())
// Your beat processing code here
}
```
3. **Optimize Critical Paths**
- Cache frequently accessed data
- Use connection pooling
- Implement circuit breakers
- Add request timeouts (a minimal sketch follows)
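A minimal sketch of the last two points, combining a per-call timeout with a trivial failure-count circuit breaker; the threshold and the 500ms timeout are illustrative assumptions, and `fetch` stands in for any downstream call:
```go
type circuitBreaker struct {
    failures  int
    threshold int
}

func (cb *circuitBreaker) call(ctx context.Context, fetch func(context.Context) error) error {
    if cb.failures >= cb.threshold {
        return errors.New("circuit open: skipping downstream call this beat")
    }
    ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond)
    defer cancel()
    if err := fetch(ctx); err != nil {
        cb.failures++
        return err
    }
    cb.failures = 0 // any success closes the circuit again
    return nil
}
```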
4. **Scale Resources**
- Increase CPU/memory limits
- Add more replicas
- Use faster storage
- Optimize network configuration
5. **Adjust Tempo**
- Reduce tempo if overloaded
- Increase tempo if under-utilized
- Consider tempo auto-scaling (sketched below)
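A sketch of one possible auto-scaling rule, complementing the adaptive controller shown earlier; the 5% miss-rate and 50% utilization thresholds are illustrative, not recommended defaults:
```go
func suggestTempo(current, missRate, utilization float64) float64 {
    switch {
    case missRate > 0.05: // overloaded: back off by 10%
        return math.Max(0.1, current*0.9)
    case missRate == 0 && utilization < 0.5: // plenty of headroom: speed up by 10%
        return math.Min(1000, current*1.1)
    default:
        return current
    }
}
```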
## Future Enhancements
### Planned Features
1. **Dynamic Tempo Scaling**: Automatic tempo adjustment based on load
2. **Beat Prediction**: ML-based prediction of optimal tempo
3. **Resource-Aware Scheduling**: Beat scheduling based on resource availability
4. **Cross-Service Tempo Negotiation**: Services negotiate optimal cluster tempo
### Experimental Features
1. **Hierarchical Beats**: Different tempo for different service types
2. **Beat Priorities**: Critical beats get processing preference
3. **Temporal Load Balancing**: Distribute work across beat phases
4. **Beat Replay**: Replay missed beats during low-load periods
Understanding and implementing these tempo guidelines will ensure your BACKBEAT-enabled services operate efficiently and reliably across the full range of CHORUS 2.0.0 workloads.

View File

@@ -0,0 +1,267 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://chorus.services/schemas/backbeat/barreport/v1.0.0",
"title": "BACKBEAT BarReport (INT-C)",
"description": "Periodic report from Reverb service summarizing agent activity over a bar (120 beats)",
"version": "1.0.0",
"type": "object",
"required": [
"type",
"window_id",
"from_beat",
"to_beat",
"agents_reporting",
"on_time_reviews",
"help_promises_fulfilled",
"secret_rotations_ok",
"tempo_drift_ms"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "backbeat.barreport.v1",
"description": "Message type identifier for BarReport v1"
},
"window_id": {
"type": "string",
"pattern": "^[0-9a-fA-F]{32}$",
"description": "Unique identifier for this reporting window"
},
"from_beat": {
"type": "integer",
"minimum": 0,
"maximum": 9223372036854775807,
"description": "Starting beat index for this report (inclusive)"
},
"to_beat": {
"type": "integer",
"minimum": 0,
"maximum": 9223372036854775807,
"description": "Ending beat index for this report (inclusive)"
},
"agents_reporting": {
"type": "integer",
"minimum": 0,
"description": "Total number of unique agents that sent status claims during this window"
},
"on_time_reviews": {
"type": "integer",
"minimum": 0,
"description": "Number of agents that completed review phase within deadline"
},
"help_promises_fulfilled": {
"type": "integer",
"minimum": 0,
"description": "Number of successful help/collaboration completions"
},
"secret_rotations_ok": {
"type": "boolean",
"description": "True if all required credential rotations completed successfully"
},
"tempo_drift_ms": {
"type": "number",
"description": "Average timing drift in milliseconds (positive = running behind, negative = ahead)"
},
"issues": {
"type": "array",
"maxItems": 100,
"description": "List of significant issues or anomalies detected during this window",
"items": {
"type": "object",
"required": ["severity", "category", "count"],
"additionalProperties": false,
"properties": {
"severity": {
"type": "string",
"enum": ["info", "warning", "error", "critical"],
"description": "Issue severity level"
},
"category": {
"type": "string",
"enum": [
"timing",
"failed_tasks",
"missing_agents",
"resource_exhaustion",
"network_partition",
"credential_failure",
"data_corruption",
"unknown"
],
"description": "Issue category for automated handling"
},
"count": {
"type": "integer",
"minimum": 1,
"description": "Number of occurrences of this issue type"
},
"description": {
"type": "string",
"maxLength": 512,
"description": "Human-readable description of the issue"
},
"affected_agents": {
"type": "array",
"maxItems": 50,
"description": "List of agent IDs affected by this issue",
"items": {
"type": "string",
"pattern": "^[a-zA-Z0-9_:-]+$",
"maxLength": 128
}
},
"first_seen_beat": {
"type": "integer",
"minimum": 0,
"description": "Beat index when this issue was first detected"
},
"last_seen_beat": {
"type": "integer",
"minimum": 0,
"description": "Beat index when this issue was last seen"
}
}
}
},
"performance": {
"type": "object",
"description": "Performance metrics for this reporting window",
"additionalProperties": false,
"properties": {
"avg_response_time_ms": {
"type": "number",
"minimum": 0,
"description": "Average response time for status claims in milliseconds"
},
"p95_response_time_ms": {
"type": "number",
"minimum": 0,
"description": "95th percentile response time for status claims"
},
"total_tasks_completed": {
"type": "integer",
"minimum": 0,
"description": "Total number of tasks completed during this window"
},
"total_tasks_failed": {
"type": "integer",
"minimum": 0,
"description": "Total number of tasks that failed during this window"
},
"peak_concurrent_agents": {
"type": "integer",
"minimum": 0,
"description": "Maximum number of agents active simultaneously"
},
"network_bytes_transferred": {
"type": "integer",
"minimum": 0,
"description": "Total network bytes transferred by all agents"
}
}
},
"health_indicators": {
"type": "object",
"description": "Cluster health indicators",
"additionalProperties": false,
"properties": {
"cluster_sync_score": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0,
"description": "How well synchronized the cluster is (1.0 = perfect sync)"
},
"resource_utilization": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0,
"description": "Average resource utilization across all agents"
},
"collaboration_efficiency": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0,
"description": "How effectively agents are helping each other"
},
"error_rate": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0,
"description": "Proportion of beats that had errors"
}
}
},
"metadata": {
"type": "object",
"description": "Optional metadata for extensions and debugging",
"additionalProperties": true,
"properties": {
"reverb_version": {
"type": "string",
"description": "Version of the Reverb service generating this report"
},
"report_generation_time_ms": {
"type": "number",
"minimum": 0,
"description": "Time taken to generate this report"
},
"next_window_id": {
"type": "string",
"pattern": "^[0-9a-fA-F]{32}$",
"description": "Window ID for the next reporting period"
}
}
}
},
"examples": [
{
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 240,
"to_beat": 359,
"agents_reporting": 978,
"on_time_reviews": 942,
"help_promises_fulfilled": 87,
"secret_rotations_ok": true,
"tempo_drift_ms": 7.3,
"issues": [
{
"severity": "warning",
"category": "timing",
"count": 12,
"description": "Some agents consistently reporting 50ms+ late",
"affected_agents": ["worker:batch-03", "indexer:shard-7"],
"first_seen_beat": 245,
"last_seen_beat": 358
}
],
"performance": {
"avg_response_time_ms": 45.2,
"p95_response_time_ms": 125.7,
"total_tasks_completed": 15678,
"total_tasks_failed": 23,
"peak_concurrent_agents": 1203,
"network_bytes_transferred": 67890123
},
"health_indicators": {
"cluster_sync_score": 0.94,
"resource_utilization": 0.67,
"collaboration_efficiency": 0.89,
"error_rate": 0.001
}
},
{
"type": "backbeat.barreport.v1",
"window_id": "a1b2c3d4e5f6789012345678901234ab",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": []
}
]
}

View File

@@ -0,0 +1,121 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://chorus.services/schemas/backbeat/beatframe/v1.0.0",
"title": "BACKBEAT BeatFrame (INT-A)",
"description": "Beat synchronization message broadcast from Pulse service to all BACKBEAT-enabled services",
"version": "1.0.0",
"type": "object",
"required": [
"type",
"cluster_id",
"beat_index",
"downbeat",
"phase",
"hlc",
"deadline_at",
"tempo_bpm",
"window_id"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "backbeat.beatframe.v1",
"description": "Message type identifier for BeatFrame v1"
},
"cluster_id": {
"type": "string",
"pattern": "^[a-zA-Z0-9_-]+$",
"minLength": 1,
"maxLength": 64,
"description": "Unique identifier for the BACKBEAT cluster"
},
"beat_index": {
"type": "integer",
"minimum": 0,
"maximum": 9223372036854775807,
"description": "Monotonically increasing beat counter since cluster start"
},
"downbeat": {
"type": "boolean",
"description": "True if this is the first beat of a new bar (every 120 beats by default)"
},
"phase": {
"type": "string",
"enum": ["plan", "execute", "review"],
"description": "Current phase within the beat cycle"
},
"hlc": {
"type": "string",
"pattern": "^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$",
"description": "Hybrid Logical Clock timestamp for causal ordering (format: wall:logical:node)"
},
"deadline_at": {
"type": "string",
"format": "date-time",
"description": "ISO 8601 timestamp when this beat phase must complete"
},
"tempo_bpm": {
"type": "number",
"minimum": 0.1,
"maximum": 1000,
"multipleOf": 0.1,
"description": "Current tempo in beats per minute (default: 2.0 for 30-second beats)"
},
"window_id": {
"type": "string",
"pattern": "^[0-9a-fA-F]{32}$",
"description": "Unique identifier for the current reporting window (changes every bar)"
},
"metadata": {
"type": "object",
"description": "Optional metadata for extensions and debugging",
"additionalProperties": true,
"properties": {
"pulse_version": {
"type": "string",
"description": "Version of the Pulse service generating this beat"
},
"cluster_health": {
"type": "string",
"enum": ["healthy", "degraded", "critical"],
"description": "Overall cluster health status"
},
"expected_agents": {
"type": "integer",
"minimum": 0,
"description": "Number of agents expected to participate in this beat"
}
}
}
},
"examples": [
{
"type": "backbeat.beatframe.v1",
"cluster_id": "chorus-prod",
"beat_index": 1337,
"downbeat": false,
"phase": "execute",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:30:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"metadata": {
"pulse_version": "1.2.3",
"cluster_health": "healthy",
"expected_agents": 150
}
},
{
"type": "backbeat.beatframe.v1",
"cluster_id": "dev-cluster",
"beat_index": 0,
"downbeat": true,
"phase": "plan",
"hlc": "0001:0000:cafe",
"deadline_at": "2025-09-05T12:00:30Z",
"tempo_bpm": 4.0,
"window_id": "a1b2c3d4e5f6789012345678901234ab"
}
]
}

View File

@@ -0,0 +1,181 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://chorus.services/schemas/backbeat/statusclaim/v1.0.0",
"title": "BACKBEAT StatusClaim (INT-B)",
"description": "Status update message sent from agents to Reverb service during beat execution",
"version": "1.0.0",
"type": "object",
"required": [
"type",
"agent_id",
"beat_index",
"state",
"hlc"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "backbeat.statusclaim.v1",
"description": "Message type identifier for StatusClaim v1"
},
"agent_id": {
"type": "string",
"pattern": "^[a-zA-Z0-9_:-]+$",
"minLength": 1,
"maxLength": 128,
"description": "Unique identifier for the reporting agent (format: service:instance or agent:id)"
},
"task_id": {
"type": "string",
"pattern": "^[a-zA-Z0-9_:-]+$",
"minLength": 1,
"maxLength": 128,
"description": "Optional task identifier if agent is working on a specific task"
},
"beat_index": {
"type": "integer",
"minimum": 0,
"maximum": 9223372036854775807,
"description": "Beat index this status claim refers to (must match current or recent BeatFrame)"
},
"state": {
"type": "string",
"enum": [
"idle",
"planning",
"executing",
"reviewing",
"completed",
"failed",
"blocked",
"helping"
],
"description": "Current state of the agent"
},
"beats_left": {
"type": "integer",
"minimum": 0,
"maximum": 1000,
"description": "Estimated number of beats needed to complete current work (0 = done this beat)"
},
"progress": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0,
"description": "Progress percentage for current task/phase (0.0 = not started, 1.0 = complete)"
},
"notes": {
"type": "string",
"maxLength": 256,
"description": "Brief human-readable status description or error message"
},
"hlc": {
"type": "string",
"pattern": "^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$",
"description": "Hybrid Logical Clock timestamp from the agent"
},
"resources": {
"type": "object",
"description": "Optional resource utilization information",
"additionalProperties": false,
"properties": {
"cpu_percent": {
"type": "number",
"minimum": 0.0,
"maximum": 100.0,
"description": "CPU utilization percentage"
},
"memory_mb": {
"type": "integer",
"minimum": 0,
"description": "Memory usage in megabytes"
},
"disk_io_ops": {
"type": "integer",
"minimum": 0,
"description": "Disk I/O operations since last beat"
},
"network_kb": {
"type": "integer",
"minimum": 0,
"description": "Network traffic in kilobytes since last beat"
}
}
},
"dependencies": {
"type": "array",
"maxItems": 50,
"description": "List of agent IDs this agent is waiting on or helping",
"items": {
"type": "string",
"pattern": "^[a-zA-Z0-9_:-]+$",
"maxLength": 128
}
},
"metadata": {
"type": "object",
"description": "Optional metadata for extensions and debugging",
"additionalProperties": true,
"properties": {
"agent_version": {
"type": "string",
"description": "Version of the agent software"
},
"error_code": {
"type": "string",
"description": "Structured error code if state is 'failed'"
},
"retry_count": {
"type": "integer",
"minimum": 0,
"description": "Number of retries attempted for current task"
}
}
}
},
"examples": [
{
"type": "backbeat.statusclaim.v1",
"agent_id": "search-indexer:worker-03",
"task_id": "index-batch:20250905-120",
"beat_index": 1337,
"state": "executing",
"beats_left": 3,
"progress": 0.65,
"notes": "processing batch 120/200",
"hlc": "7ffd:0001:beef",
"resources": {
"cpu_percent": 85.0,
"memory_mb": 2048,
"disk_io_ops": 1250,
"network_kb": 512
}
},
{
"type": "backbeat.statusclaim.v1",
"agent_id": "agent:backup-runner",
"beat_index": 1338,
"state": "failed",
"beats_left": 0,
"progress": 0.0,
"notes": "connection timeout to storage backend",
"hlc": "7ffe:0002:dead",
"metadata": {
"agent_version": "2.1.0",
"error_code": "STORAGE_TIMEOUT",
"retry_count": 3
}
},
{
"type": "backbeat.statusclaim.v1",
"agent_id": "ml-trainer:gpu-node-1",
"beat_index": 1336,
"state": "helping",
"progress": 1.0,
"notes": "completed own work, assisting node-2 with large model",
"hlc": "7ffc:0005:cafe",
"dependencies": ["ml-trainer:gpu-node-2"]
}
]
}

View File

@@ -0,0 +1,533 @@
package tests
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"time"
"github.com/xeipuuv/gojsonschema"
)
// MessageTypes defines the three core BACKBEAT interfaces
const (
BeatFrameType = "backbeat.beatframe.v1"
StatusClaimType = "backbeat.statusclaim.v1"
BarReportType = "backbeat.barreport.v1"
)
// BeatFrame represents INT-A: Pulse → All Services
type BeatFrame struct {
Type string `json:"type"`
ClusterID string `json:"cluster_id"`
BeatIndex int64 `json:"beat_index"`
Downbeat bool `json:"downbeat"`
Phase string `json:"phase"`
HLC string `json:"hlc"`
DeadlineAt time.Time `json:"deadline_at"`
TempoBPM float64 `json:"tempo_bpm"`
WindowID string `json:"window_id"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
// StatusClaim represents INT-B: Agents → Reverb
type StatusClaim struct {
Type string `json:"type"`
AgentID string `json:"agent_id"`
TaskID string `json:"task_id,omitempty"`
BeatIndex int64 `json:"beat_index"`
State string `json:"state"`
BeatsLeft int `json:"beats_left,omitempty"`
Progress float64 `json:"progress,omitempty"`
Notes string `json:"notes,omitempty"`
HLC string `json:"hlc"`
Resources map[string]interface{} `json:"resources,omitempty"`
Dependencies []string `json:"dependencies,omitempty"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
// BarReport represents INT-C: Reverb → All Services
type BarReport struct {
Type string `json:"type"`
WindowID string `json:"window_id"`
FromBeat int64 `json:"from_beat"`
ToBeat int64 `json:"to_beat"`
AgentsReporting int `json:"agents_reporting"`
OnTimeReviews int `json:"on_time_reviews"`
HelpPromisesFulfilled int `json:"help_promises_fulfilled"`
SecretRotationsOK bool `json:"secret_rotations_ok"`
TempoDriftMS float64 `json:"tempo_drift_ms"`
Issues []map[string]interface{} `json:"issues,omitempty"`
Performance map[string]interface{} `json:"performance,omitempty"`
HealthIndicators map[string]interface{} `json:"health_indicators,omitempty"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
// TestSchemaValidation tests that all JSON schemas are valid and messages conform
func TestSchemaValidation(t *testing.T) {
schemaDir := "../schemas"
tests := []struct {
name string
schemaFile string
validMsgs []interface{}
invalidMsgs []map[string]interface{}
}{
{
name: "BeatFrame Schema Validation",
schemaFile: "beatframe-v1.schema.json",
validMsgs: []interface{}{
BeatFrame{
Type: BeatFrameType,
ClusterID: "test-cluster",
BeatIndex: 100,
Downbeat: false,
Phase: "execute",
HLC: "7ffd:0001:abcd",
DeadlineAt: time.Now().Add(30 * time.Second),
TempoBPM: 2.0,
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
},
BeatFrame{
Type: BeatFrameType,
ClusterID: "prod",
BeatIndex: 0,
Downbeat: true,
Phase: "plan",
HLC: "0001:0000:cafe",
DeadlineAt: time.Now().Add(15 * time.Second),
TempoBPM: 4.0,
WindowID: "a1b2c3d4e5f6789012345678901234ab",
Metadata: map[string]interface{}{
"pulse_version": "1.0.0",
"cluster_health": "healthy",
},
},
},
invalidMsgs: []map[string]interface{}{
// Missing required fields
{
"type": BeatFrameType,
"cluster_id": "test",
// missing beat_index, downbeat, phase, etc.
},
// Invalid phase
{
"type": BeatFrameType,
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "invalid_phase",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
},
// Invalid HLC format
{
"type": BeatFrameType,
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "invalid-hlc-format",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
},
},
},
{
name: "StatusClaim Schema Validation",
schemaFile: "statusclaim-v1.schema.json",
validMsgs: []interface{}{
StatusClaim{
Type: StatusClaimType,
AgentID: "worker:test-01",
TaskID: "task:123",
BeatIndex: 100,
State: "executing",
BeatsLeft: 3,
Progress: 0.5,
Notes: "processing batch",
HLC: "7ffd:0001:beef",
},
StatusClaim{
Type: StatusClaimType,
AgentID: "agent:backup",
BeatIndex: 101,
State: "idle",
HLC: "7ffe:0002:dead",
Resources: map[string]interface{}{
"cpu_percent": 25.0,
"memory_mb": 512,
},
},
},
invalidMsgs: []map[string]interface{}{
// Missing required fields
{
"type": StatusClaimType,
"agent_id": "test",
// missing beat_index, state, hlc
},
// Invalid state
{
"type": StatusClaimType,
"agent_id": "test",
"beat_index": 0,
"state": "invalid_state",
"hlc": "7ffd:0001:abcd",
},
// Negative progress
{
"type": StatusClaimType,
"agent_id": "test",
"beat_index": 0,
"state": "executing",
"progress": -0.1,
"hlc": "7ffd:0001:abcd",
},
},
},
{
name: "BarReport Schema Validation",
schemaFile: "barreport-v1.schema.json",
validMsgs: []interface{}{
BarReport{
Type: BarReportType,
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
FromBeat: 0,
ToBeat: 119,
AgentsReporting: 150,
OnTimeReviews: 147,
HelpPromisesFulfilled: 12,
SecretRotationsOK: true,
TempoDriftMS: -2.1,
},
BarReport{
Type: BarReportType,
WindowID: "a1b2c3d4e5f6789012345678901234ab",
FromBeat: 120,
ToBeat: 239,
AgentsReporting: 200,
OnTimeReviews: 195,
HelpPromisesFulfilled: 25,
SecretRotationsOK: false,
TempoDriftMS: 15.7,
Issues: []map[string]interface{}{
{
"severity": "warning",
"category": "timing",
"count": 5,
"description": "Some agents running late",
},
},
},
},
invalidMsgs: []map[string]interface{}{
// Missing required fields
{
"type": BarReportType,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
// missing from_beat, to_beat, etc.
},
// Invalid window_id format
{
"type": BarReportType,
"window_id": "invalid-window-id",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": 0.0,
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Load schema
schemaPath := filepath.Join(schemaDir, tt.schemaFile)
schemaLoader := gojsonschema.NewReferenceLoader("file://" + schemaPath)
// Test valid messages
for i, validMsg := range tt.validMsgs {
t.Run(fmt.Sprintf("Valid_%d", i), func(t *testing.T) {
msgBytes, err := json.Marshal(validMsg)
if err != nil {
t.Fatalf("Failed to marshal valid message: %v", err)
}
docLoader := gojsonschema.NewBytesLoader(msgBytes)
result, err := gojsonschema.Validate(schemaLoader, docLoader)
if err != nil {
t.Fatalf("Schema validation failed: %v", err)
}
if !result.Valid() {
t.Errorf("Valid message failed validation: %v", result.Errors())
}
})
}
// Test invalid messages
for i, invalidMsg := range tt.invalidMsgs {
t.Run(fmt.Sprintf("Invalid_%d", i), func(t *testing.T) {
msgBytes, err := json.Marshal(invalidMsg)
if err != nil {
t.Fatalf("Failed to marshal invalid message: %v", err)
}
docLoader := gojsonschema.NewBytesLoader(msgBytes)
result, err := gojsonschema.Validate(schemaLoader, docLoader)
if err != nil {
t.Fatalf("Schema validation failed: %v", err)
}
if result.Valid() {
t.Errorf("Invalid message passed validation when it should have failed")
}
})
}
})
}
}
// TestMessageParsing tests that messages can be correctly parsed from JSON
func TestMessageParsing(t *testing.T) {
tests := []struct {
name string
jsonStr string
expected interface{}
}{
{
name: "Parse BeatFrame",
jsonStr: `{
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 123,
"downbeat": true,
"phase": "review",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.5,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
}`,
expected: BeatFrame{
Type: BeatFrameType,
ClusterID: "test",
BeatIndex: 123,
Downbeat: true,
Phase: "review",
HLC: "7ffd:0001:abcd",
TempoBPM: 2.5,
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
},
},
{
name: "Parse StatusClaim",
jsonStr: `{
"type": "backbeat.statusclaim.v1",
"agent_id": "worker:01",
"beat_index": 456,
"state": "completed",
"progress": 1.0,
"hlc": "7ffe:0002:beef"
}`,
expected: StatusClaim{
Type: StatusClaimType,
AgentID: "worker:01",
BeatIndex: 456,
State: "completed",
Progress: 1.0,
HLC: "7ffe:0002:beef",
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
switch expected := tt.expected.(type) {
case BeatFrame:
var parsed BeatFrame
err := json.Unmarshal([]byte(tt.jsonStr), &parsed)
if err != nil {
t.Fatalf("Failed to parse BeatFrame: %v", err)
}
if parsed.Type != expected.Type ||
parsed.ClusterID != expected.ClusterID ||
parsed.BeatIndex != expected.BeatIndex {
t.Errorf("Parsed BeatFrame doesn't match expected")
}
case StatusClaim:
var parsed StatusClaim
err := json.Unmarshal([]byte(tt.jsonStr), &parsed)
if err != nil {
t.Fatalf("Failed to parse StatusClaim: %v", err)
}
if parsed.Type != expected.Type ||
parsed.AgentID != expected.AgentID ||
parsed.State != expected.State {
t.Errorf("Parsed StatusClaim doesn't match expected")
}
}
})
}
}
// TestHLCValidation tests Hybrid Logical Clock format validation
func TestHLCValidation(t *testing.T) {
validHLCs := []string{
"0000:0000:0000",
"7ffd:0001:abcd",
"FFFF:FFFF:FFFF",
"1234:5678:90ab",
}
invalidHLCs := []string{
"invalid",
"7ffd:0001", // too short
"7ffd:0001:abcd:ef", // too long
"gggg:0001:abcd", // invalid hex
"7ffd:0001:abcdz", // invalid hex
}
for _, hlc := range validHLCs {
t.Run(fmt.Sprintf("Valid_%s", hlc), func(t *testing.T) {
if !isValidHLC(hlc) {
t.Errorf("Valid HLC %s was rejected", hlc)
}
})
}
for _, hlc := range invalidHLCs {
t.Run(fmt.Sprintf("Invalid_%s", hlc), func(t *testing.T) {
if isValidHLC(hlc) {
t.Errorf("Invalid HLC %s was accepted", hlc)
}
})
}
}
// TestWindowIDValidation tests window ID format validation
func TestWindowIDValidation(t *testing.T) {
validWindowIDs := []string{
"7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"a1b2c3d4e5f6789012345678901234ab",
"00000000000000000000000000000000",
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
}
invalidWindowIDs := []string{
"invalid",
"7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d", // too short
"7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d55", // too long
"7e9b0e6c4c9a4e59b7f2d9a3c1b2e4g5", // invalid hex
}
for _, windowID := range validWindowIDs {
t.Run(fmt.Sprintf("Valid_%s", windowID), func(t *testing.T) {
if !isValidWindowID(windowID) {
t.Errorf("Valid window ID %s was rejected", windowID)
}
})
}
for _, windowID := range invalidWindowIDs {
t.Run(fmt.Sprintf("Invalid_%s", windowID), func(t *testing.T) {
if isValidWindowID(windowID) {
t.Errorf("Invalid window ID %s was accepted", windowID)
}
})
}
}
// Helper functions for validation
func isValidHLC(hlc string) bool {
parts := strings.Split(hlc, ":")
if len(parts) != 3 {
return false
}
for _, part := range parts {
if len(part) != 4 {
return false
}
for _, char := range part {
if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f') || (char >= 'A' && char <= 'F')) {
return false
}
}
}
return true
}
func isValidWindowID(windowID string) bool {
if len(windowID) != 32 {
return false
}
for _, char := range windowID {
if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f') || (char >= 'A' && char <= 'F')) {
return false
}
}
return true
}
// BenchmarkSchemaValidation benchmarks schema validation performance
func BenchmarkSchemaValidation(b *testing.B) {
schemaDir := "../schemas"
schemaPath := filepath.Join(schemaDir, "beatframe-v1.schema.json")
schemaLoader := gojsonschema.NewReferenceLoader("file://" + schemaPath)
beatFrame := BeatFrame{
Type: BeatFrameType,
ClusterID: "benchmark",
BeatIndex: 1000,
Downbeat: false,
Phase: "execute",
HLC: "7ffd:0001:abcd",
DeadlineAt: time.Now().Add(30 * time.Second),
TempoBPM: 2.0,
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
}
msgBytes, _ := json.Marshal(beatFrame)
docLoader := gojsonschema.NewBytesLoader(msgBytes)
b.ResetTimer()
for i := 0; i < b.N; i++ {
result, err := gojsonschema.Validate(schemaLoader, docLoader)
if err != nil || !result.Valid() {
b.Fatal("Validation failed")
}
}
}
// Helper function to check if schema files exist
func TestSchemaFilesExist(t *testing.T) {
schemaDir := "../schemas"
requiredSchemas := []string{
"beatframe-v1.schema.json",
"statusclaim-v1.schema.json",
"barreport-v1.schema.json",
}
for _, schema := range requiredSchemas {
schemaPath := filepath.Join(schemaDir, schema)
if _, err := os.Stat(schemaPath); os.IsNotExist(err) {
t.Errorf("Required schema file %s does not exist", schemaPath)
}
}
}

View File

@@ -0,0 +1,275 @@
[
{
"description": "Missing required field 'from_beat'",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["from_beat is required"]
},
{
"description": "Missing required field 'agents_reporting'",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["agents_reporting is required"]
},
{
"description": "Invalid window_id format (too short)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["window_id must be exactly 32 hex characters"]
},
{
"description": "Invalid window_id format (non-hex characters)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4g5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["window_id must match pattern ^[0-9a-fA-F]{32}$"]
},
{
"description": "Negative from_beat",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": -1,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["from_beat must be >= 0"]
},
{
"description": "Negative agents_reporting",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": -1,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["agents_reporting must be >= 0"]
},
{
"description": "Negative on_time_reviews",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": -1,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["on_time_reviews must be >= 0"]
},
{
"description": "Too many issues (over 100)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": []
},
"note": "This would need 101 issues to properly test, generating dynamically in actual test"
},
{
"description": "Issue with invalid severity",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": [
{
"severity": "invalid_severity",
"category": "timing",
"count": 1,
"description": "Some issue"
}
]
},
"expected_errors": ["issue.severity must be one of: info, warning, error, critical"]
},
{
"description": "Issue with invalid category",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": [
{
"severity": "warning",
"category": "invalid_category",
"count": 1,
"description": "Some issue"
}
]
},
"expected_errors": ["issue.category must be one of: timing, failed_tasks, missing_agents, resource_exhaustion, network_partition, credential_failure, data_corruption, unknown"]
},
{
"description": "Issue with zero count",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": [
{
"severity": "warning",
"category": "timing",
"count": 0,
"description": "Some issue"
}
]
},
"expected_errors": ["issue.count must be >= 1"]
},
{
"description": "Issue with description too long (over 512 chars)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": [
{
"severity": "warning",
"category": "timing",
"count": 1,
"description": "This is a very long description that exceeds the maximum allowed length of 512 characters for issue descriptions in BACKBEAT BarReport messages. This constraint is in place to prevent excessively large messages and ensure that issue descriptions remain concise and actionable. The system should reject this message because the description field contains more than 512 characters and violates the schema validation rules that have been carefully designed to maintain message size limits and system performance characteristics."
}
]
},
"expected_errors": ["issue.description must be at most 512 characters"]
},
{
"description": "Issue with too many affected agents (over 50)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"issues": [
{
"severity": "warning",
"category": "timing",
"count": 1,
"description": "Too many affected agents",
"affected_agents": [
"agent1", "agent2", "agent3", "agent4", "agent5", "agent6", "agent7", "agent8", "agent9", "agent10",
"agent11", "agent12", "agent13", "agent14", "agent15", "agent16", "agent17", "agent18", "agent19", "agent20",
"agent21", "agent22", "agent23", "agent24", "agent25", "agent26", "agent27", "agent28", "agent29", "agent30",
"agent31", "agent32", "agent33", "agent34", "agent35", "agent36", "agent37", "agent38", "agent39", "agent40",
"agent41", "agent42", "agent43", "agent44", "agent45", "agent46", "agent47", "agent48", "agent49", "agent50",
"agent51"
]
}
]
},
"expected_errors": ["issue.affected_agents must have at most 50 items"]
},
{
"description": "Wrong message type",
"message": {
"type": "backbeat.wrongtype.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1
},
"expected_errors": ["type must be 'backbeat.barreport.v1'"]
},
{
"description": "Extra unknown properties (should fail with additionalProperties: false)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 150,
"on_time_reviews": 147,
"help_promises_fulfilled": 12,
"secret_rotations_ok": true,
"tempo_drift_ms": -2.1,
"unknown_field": "should not be allowed"
},
"expected_errors": ["Additional property unknown_field is not allowed"]
}
]

View File

@@ -0,0 +1,190 @@
[
{
"description": "Healthy cluster with good performance",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 240,
"to_beat": 359,
"agents_reporting": 978,
"on_time_reviews": 942,
"help_promises_fulfilled": 87,
"secret_rotations_ok": true,
"tempo_drift_ms": 7.3,
"issues": [
{
"severity": "warning",
"category": "timing",
"count": 12,
"description": "Some agents consistently reporting 50ms+ late",
"affected_agents": ["worker:batch-03", "indexer:shard-7"],
"first_seen_beat": 245,
"last_seen_beat": 358
}
],
"performance": {
"avg_response_time_ms": 45.2,
"p95_response_time_ms": 125.7,
"total_tasks_completed": 15678,
"total_tasks_failed": 23,
"peak_concurrent_agents": 1203,
"network_bytes_transferred": 67890123
},
"health_indicators": {
"cluster_sync_score": 0.94,
"resource_utilization": 0.67,
"collaboration_efficiency": 0.89,
"error_rate": 0.001
}
}
},
{
"description": "Small development cluster with perfect sync",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "a1b2c3d4e5f6789012345678901234ab",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 5,
"on_time_reviews": 5,
"help_promises_fulfilled": 2,
"secret_rotations_ok": true,
"tempo_drift_ms": -0.1,
"issues": []
}
},
{
"description": "Cluster with multiple serious issues",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "fedcba9876543210fedcba9876543210",
"from_beat": 1200,
"to_beat": 1319,
"agents_reporting": 450,
"on_time_reviews": 380,
"help_promises_fulfilled": 15,
"secret_rotations_ok": false,
"tempo_drift_ms": 125.7,
"issues": [
{
"severity": "critical",
"category": "credential_failure",
"count": 3,
"description": "Failed to rotate database credentials",
"affected_agents": ["db-manager:primary", "backup:secondary"],
"first_seen_beat": 1205,
"last_seen_beat": 1318
},
{
"severity": "error",
"category": "network_partition",
"count": 1,
"description": "Lost connection to east coast data center",
"affected_agents": ["worker:east-01", "worker:east-02", "worker:east-03"],
"first_seen_beat": 1210,
"last_seen_beat": 1319
},
{
"severity": "warning",
"category": "resource_exhaustion",
"count": 45,
"description": "High memory usage detected",
"affected_agents": ["ml-trainer:gpu-01"],
"first_seen_beat": 1200,
"last_seen_beat": 1315
}
],
"performance": {
"avg_response_time_ms": 180.5,
"p95_response_time_ms": 450.0,
"total_tasks_completed": 5432,
"total_tasks_failed": 123,
"peak_concurrent_agents": 487,
"network_bytes_transferred": 23456789
},
"health_indicators": {
"cluster_sync_score": 0.72,
"resource_utilization": 0.95,
"collaboration_efficiency": 0.45,
"error_rate": 0.022
}
}
},
{
"description": "High-frequency cluster report (8 BPM tempo)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "0123456789abcdef0123456789abcdef",
"from_beat": 960,
"to_beat": 1079,
"agents_reporting": 2000,
"on_time_reviews": 1985,
"help_promises_fulfilled": 156,
"secret_rotations_ok": true,
"tempo_drift_ms": 3.2,
"issues": [
{
"severity": "info",
"category": "timing",
"count": 15,
"description": "Minor timing variations detected",
"first_seen_beat": 965,
"last_seen_beat": 1078
}
],
"performance": {
"avg_response_time_ms": 25.1,
"p95_response_time_ms": 67.3,
"total_tasks_completed": 45678,
"total_tasks_failed": 12,
"peak_concurrent_agents": 2100,
"network_bytes_transferred": 123456789
},
"health_indicators": {
"cluster_sync_score": 0.98,
"resource_utilization": 0.78,
"collaboration_efficiency": 0.92,
"error_rate": 0.0003
},
"metadata": {
"reverb_version": "1.3.0",
"report_generation_time_ms": 45.7,
"next_window_id": "fedcba0987654321fedcba0987654321"
}
}
},
{
"description": "Minimal valid bar report (only required fields)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "1111222233334444555566667777888",
"from_beat": 600,
"to_beat": 719,
"agents_reporting": 1,
"on_time_reviews": 1,
"help_promises_fulfilled": 0,
"secret_rotations_ok": true,
"tempo_drift_ms": 0.0
}
},
{
"description": "Empty issues array (valid)",
"message": {
"type": "backbeat.barreport.v1",
"window_id": "9999aaaa0000bbbb1111cccc2222dddd",
"from_beat": 480,
"to_beat": 599,
"agents_reporting": 100,
"on_time_reviews": 98,
"help_promises_fulfilled": 25,
"secret_rotations_ok": true,
"tempo_drift_ms": -1.5,
"issues": [],
"performance": {
"avg_response_time_ms": 50.0,
"total_tasks_completed": 1000,
"total_tasks_failed": 2
}
}
}
]

View File

@@ -0,0 +1,152 @@
[
{
"description": "Missing required field 'beat_index'",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"downbeat": false,
"phase": "execute",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["beat_index is required"]
},
{
"description": "Invalid phase value",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "invalid_phase",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["phase must be one of: plan, execute, review"]
},
{
"description": "Invalid HLC format (wrong number of segments)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["hlc must match pattern ^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$"]
},
{
"description": "Invalid HLC format (non-hex characters)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "gggg:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["hlc must match pattern ^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$"]
},
{
"description": "Invalid window_id format (too short)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d"
},
"expected_errors": ["window_id must be exactly 32 hex characters"]
},
{
"description": "Invalid tempo_bpm (too low)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 0.05,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["tempo_bpm must be at least 0.1"]
},
{
"description": "Invalid tempo_bpm (too high)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 1001.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["tempo_bpm must be at most 1000"]
},
{
"description": "Invalid beat_index (negative)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": -1,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["beat_index must be >= 0"]
},
{
"description": "Wrong message type",
"message": {
"type": "backbeat.wrongtype.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
},
"expected_errors": ["type must be 'backbeat.beatframe.v1'"]
},
{
"description": "Extra unknown properties (should fail with additionalProperties: false)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "test",
"beat_index": 0,
"downbeat": false,
"phase": "plan",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:00:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"unknown_field": "should not be allowed"
},
"expected_errors": ["Additional property unknown_field is not allowed"]
}
]

View File

@@ -0,0 +1,82 @@
[
{
"description": "Standard beat frame during execute phase",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "chorus-prod",
"beat_index": 1337,
"downbeat": false,
"phase": "execute",
"hlc": "7ffd:0001:abcd",
"deadline_at": "2025-09-05T12:30:00Z",
"tempo_bpm": 2.0,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5"
}
},
{
"description": "Downbeat starting new bar in plan phase",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "dev-cluster",
"beat_index": 0,
"downbeat": true,
"phase": "plan",
"hlc": "0001:0000:cafe",
"deadline_at": "2025-09-05T12:00:30Z",
"tempo_bpm": 4.0,
"window_id": "a1b2c3d4e5f6789012345678901234ab"
}
},
{
"description": "High-frequency beat with metadata",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "fast-cluster",
"beat_index": 999999,
"downbeat": false,
"phase": "review",
"hlc": "abcd:ef01:2345",
"deadline_at": "2025-09-05T12:00:07.5Z",
"tempo_bpm": 8.0,
"window_id": "fedcba9876543210fedcba9876543210",
"metadata": {
"pulse_version": "1.2.3",
"cluster_health": "healthy",
"expected_agents": 150
}
}
},
{
"description": "Low-frequency beat (1 BPM = 60 second beats)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "slow-batch",
"beat_index": 42,
"downbeat": true,
"phase": "plan",
"hlc": "FFFF:FFFF:FFFF",
"deadline_at": "2025-09-05T13:00:00Z",
"tempo_bpm": 1.0,
"window_id": "0123456789abcdef0123456789abcdef",
"metadata": {
"pulse_version": "2.0.0",
"cluster_health": "degraded",
"expected_agents": 5
}
}
},
{
"description": "Minimal valid beat frame (no optional fields)",
"message": {
"type": "backbeat.beatframe.v1",
"cluster_id": "minimal",
"beat_index": 1,
"downbeat": false,
"phase": "execute",
"hlc": "0000:0001:0002",
"deadline_at": "2025-09-05T12:01:00Z",
"tempo_bpm": 2.0,
"window_id": "1234567890abcdef1234567890abcdef"
}
}
]

View File

@@ -0,0 +1,189 @@
[
{
"description": "Missing required field 'beat_index'",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"state": "executing",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["beat_index is required"]
},
{
"description": "Missing required field 'state'",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["state is required"]
},
{
"description": "Missing required field 'hlc'",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing"
},
"expected_errors": ["hlc is required"]
},
{
"description": "Invalid state value",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "invalid_state",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["state must be one of: idle, planning, executing, reviewing, completed, failed, blocked, helping"]
},
{
"description": "Invalid progress value (negative)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"progress": -0.1,
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["progress must be between 0.0 and 1.0"]
},
{
"description": "Invalid progress value (greater than 1.0)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"progress": 1.1,
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["progress must be between 0.0 and 1.0"]
},
{
"description": "Invalid beats_left (negative)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"beats_left": -1,
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["beats_left must be >= 0"]
},
{
"description": "Invalid beats_left (too high)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"beats_left": 1001,
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["beats_left must be <= 1000"]
},
{
"description": "Invalid beat_index (negative)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": -1,
"state": "executing",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["beat_index must be >= 0"]
},
{
"description": "Invalid HLC format",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"hlc": "invalid-hlc"
},
"expected_errors": ["hlc must match pattern ^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$"]
},
{
"description": "Notes too long (over 256 characters)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"notes": "This is a very long notes field that exceeds the maximum allowed length of 256 characters. This should fail validation because it contains too much text and violates the maxLength constraint that was set to keep status messages concise and prevent excessive message sizes in the BACKBEAT system.",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["notes must be at most 256 characters"]
},
{
"description": "Too many dependencies (over 50)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "blocked",
"hlc": "7ffd:0001:abcd",
"dependencies": [
"dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9", "dep10",
"dep11", "dep12", "dep13", "dep14", "dep15", "dep16", "dep17", "dep18", "dep19", "dep20",
"dep21", "dep22", "dep23", "dep24", "dep25", "dep26", "dep27", "dep28", "dep29", "dep30",
"dep31", "dep32", "dep33", "dep34", "dep35", "dep36", "dep37", "dep38", "dep39", "dep40",
"dep41", "dep42", "dep43", "dep44", "dep45", "dep46", "dep47", "dep48", "dep49", "dep50",
"dep51"
]
},
"expected_errors": ["dependencies must have at most 50 items"]
},
{
"description": "Invalid agent_id format (empty)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "",
"beat_index": 100,
"state": "executing",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["agent_id must be at least 1 character"]
},
{
"description": "Agent_id too long (over 128 characters)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "this_is_a_very_long_agent_id_that_exceeds_the_maximum_allowed_length_of_128_characters_and_should_fail_validation_because_it_is_too_long_for_the_system_to_handle_properly",
"beat_index": 100,
"state": "executing",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["agent_id must be at most 128 characters"]
},
{
"description": "Wrong message type",
"message": {
"type": "backbeat.wrongtype.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"hlc": "7ffd:0001:abcd"
},
"expected_errors": ["type must be 'backbeat.statusclaim.v1'"]
},
{
"description": "Extra unknown properties (should fail with additionalProperties: false)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "test:agent",
"beat_index": 100,
"state": "executing",
"hlc": "7ffd:0001:abcd",
"unknown_field": "should not be allowed"
},
"expected_errors": ["Additional property unknown_field is not allowed"]
}
]

View File

@@ -0,0 +1,135 @@
[
{
"description": "Worker executing a batch processing task",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "search-indexer:worker-03",
"task_id": "index-batch:20250905-120",
"beat_index": 1337,
"state": "executing",
"beats_left": 3,
"progress": 0.65,
"notes": "processing batch 120/200",
"hlc": "7ffd:0001:beef",
"resources": {
"cpu_percent": 85.0,
"memory_mb": 2048,
"disk_io_ops": 1250,
"network_kb": 512
}
}
},
{
"description": "Failed backup agent with error details",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "agent:backup-runner",
"beat_index": 1338,
"state": "failed",
"beats_left": 0,
"progress": 0.0,
"notes": "connection timeout to storage backend",
"hlc": "7ffe:0002:dead",
"metadata": {
"agent_version": "2.1.0",
"error_code": "STORAGE_TIMEOUT",
"retry_count": 3
}
}
},
{
"description": "ML trainer helping another node",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "ml-trainer:gpu-node-1",
"beat_index": 1336,
"state": "helping",
"progress": 1.0,
"notes": "completed own work, assisting node-2 with large model",
"hlc": "7ffc:0005:cafe",
"dependencies": ["ml-trainer:gpu-node-2"]
}
},
{
"description": "Idle agent waiting for work",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "worker:standby-01",
"beat_index": 1339,
"state": "idle",
"progress": 0.0,
"hlc": "8000:0000:1111"
}
},
{
"description": "Agent in planning phase",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "coordinator:main",
"task_id": "deploy:v2.1.0",
"beat_index": 1340,
"state": "planning",
"beats_left": 5,
"progress": 0.2,
"notes": "analyzing dependency graph",
"hlc": "8001:0001:2222",
"resources": {
"cpu_percent": 15.0,
"memory_mb": 512
}
}
},
{
"description": "Reviewing agent with completed task",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "quality-checker:auto",
"task_id": "validate:batch-45",
"beat_index": 1341,
"state": "reviewing",
"beats_left": 1,
"progress": 0.9,
"notes": "final verification of output quality",
"hlc": "8002:0002:3333"
}
},
{
"description": "Completed agent ready for next task",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "processor:fast-01",
"task_id": "process:item-567",
"beat_index": 1342,
"state": "completed",
"beats_left": 0,
"progress": 1.0,
"notes": "item processed successfully",
"hlc": "8003:0003:4444"
}
},
{
"description": "Blocked agent waiting for external dependency",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "data-loader:external",
"task_id": "load:dataset-789",
"beat_index": 1343,
"state": "blocked",
"beats_left": 10,
"progress": 0.1,
"notes": "waiting for external API rate limit reset",
"hlc": "8004:0004:5555",
"dependencies": ["external-api:rate-limiter"]
}
},
{
"description": "Minimal valid status claim (only required fields)",
"message": {
"type": "backbeat.statusclaim.v1",
"agent_id": "simple:agent",
"beat_index": 1344,
"state": "idle",
"hlc": "8005:0005:6666"
}
}
]

View File

@@ -0,0 +1,206 @@
# BACKBEAT Contracts CI Integration Makefile
# Variables
SCHEMA_DIR = ../../schemas
EXAMPLES_DIR = ../examples
CLI_TOOL = ./cmd/backbeat-validate
BINARY_NAME = backbeat-validate
# Default target
.PHONY: all
all: build test
# Build the CLI validation tool
.PHONY: build
build:
@echo "Building BACKBEAT validation CLI tool..."
go build -o $(BINARY_NAME) $(CLI_TOOL)
# Run all tests
.PHONY: test
test: test-schemas test-examples test-integration
# Test schema files are valid
.PHONY: test-schemas
test-schemas:
@echo "Testing JSON schema files..."
@for schema in $(SCHEMA_DIR)/*.schema.json; do \
echo "Validating schema: $$schema"; \
python3 -c "import json; json.load(open('$$schema'))" || exit 1; \
done
# Test all example files
.PHONY: test-examples
test-examples: build
@echo "Testing example messages..."
./$(BINARY_NAME) --schemas $(SCHEMA_DIR) --dir $(EXAMPLES_DIR)
# Run Go integration tests
.PHONY: test-integration
test-integration:
@echo "Running Go integration tests..."
go test -v ./...
# Validate built-in examples
.PHONY: validate-examples
validate-examples: build
@echo "Validating built-in examples..."
./$(BINARY_NAME) --schemas $(SCHEMA_DIR) --examples
# Validate a specific directory (for CI use)
.PHONY: validate-dir
validate-dir: build
@if [ -z "$(DIR)" ]; then \
echo "Usage: make validate-dir DIR=/path/to/messages"; \
exit 1; \
fi
./$(BINARY_NAME) --schemas $(SCHEMA_DIR) --dir $(DIR) --exit-code
# Validate a specific file (for CI use)
.PHONY: validate-file
validate-file: build
@if [ -z "$(FILE)" ]; then \
echo "Usage: make validate-file FILE=/path/to/message.json"; \
exit 1; \
fi
./$(BINARY_NAME) --schemas $(SCHEMA_DIR) --file $(FILE) --exit-code
# Clean build artifacts
.PHONY: clean
clean:
rm -f $(BINARY_NAME)
# Install dependencies
.PHONY: deps
deps:
go mod tidy
go mod download
# Format Go code
.PHONY: fmt
fmt:
go fmt ./...
# Run static analysis
.PHONY: lint
lint:
go vet ./...
# Generate CI configuration examples
.PHONY: examples
examples: generate-github-actions generate-gitlab-ci generate-makefile-example
# Generate GitHub Actions workflow
.PHONY: generate-github-actions
generate-github-actions:
@echo "Generating GitHub Actions workflow..."
@mkdir -p ci-examples
@cat > ci-examples/github-actions.yml << 'EOF'\
name: BACKBEAT Contract Validation\
\
on:\
push:\
branches: [ main, develop ]\
pull_request:\
branches: [ main ]\
\
jobs:\
validate-backbeat-messages:\
runs-on: ubuntu-latest\
\
steps:\
- uses: actions/checkout@v4\
with:\
repository: 'chorus-services/backbeat'\
path: 'backbeat-contracts'\
\
- uses: actions/checkout@v4\
with:\
path: 'current-repo'\
\
- name: Set up Go\
uses: actions/setup-go@v4\
with:\
go-version: '1.22'\
\
- name: Build BACKBEAT validator\
run: |\
cd backbeat-contracts/contracts/tests/integration\
make build\
\
- name: Validate BACKBEAT messages\
run: |\
cd backbeat-contracts/contracts/tests/integration\
./backbeat-validate \\\
--schemas ../../schemas \\\
--dir ../../../current-repo/path/to/messages \\\
--exit-code\
EOF
# Generate GitLab CI configuration
.PHONY: generate-gitlab-ci
generate-gitlab-ci:
@echo "Generating GitLab CI configuration..."
@mkdir -p ci-examples
@cat > ci-examples/gitlab-ci.yml << 'EOF'\
validate-backbeat-contracts:\
stage: test\
image: golang:1.22\
\
before_script:\
- git clone https://github.com/chorus-services/backbeat.git /tmp/backbeat\
- cd /tmp/backbeat/contracts/tests/integration\
- make deps build\
\
script:\
- /tmp/backbeat/contracts/tests/integration/backbeat-validate \\\
--schemas /tmp/backbeat/contracts/schemas \\\
--dir $$CI_PROJECT_DIR/path/to/messages \\\
--exit-code\
\
only:\
- merge_requests\
- main\
- develop\
EOF
# Generate example Makefile for downstream projects
.PHONY: generate-makefile-example
generate-makefile-example:
@echo "Generating example Makefile for downstream projects..."
@mkdir -p ci-examples
@echo "# Example Makefile for BACKBEAT contract validation" > ci-examples/downstream-makefile
@echo "" >> ci-examples/downstream-makefile
@echo "BACKBEAT_REPO = https://github.com/chorus-services/backbeat.git" >> ci-examples/downstream-makefile
@echo "BACKBEAT_DIR = .backbeat-contracts" >> ci-examples/downstream-makefile
@echo "" >> ci-examples/downstream-makefile
@echo "validate-backbeat:" >> ci-examples/downstream-makefile
@echo " git clone \$$(BACKBEAT_REPO) \$$(BACKBEAT_DIR) 2>/dev/null || true" >> ci-examples/downstream-makefile
@echo " cd \$$(BACKBEAT_DIR)/contracts/tests/integration && make build" >> ci-examples/downstream-makefile
@echo " \$$(BACKBEAT_DIR)/contracts/tests/integration/backbeat-validate --schemas \$$(BACKBEAT_DIR)/contracts/schemas --dir messages --exit-code" >> ci-examples/downstream-makefile
# Help target
.PHONY: help
help:
@echo "BACKBEAT Contracts CI Integration Makefile"
@echo ""
@echo "Available targets:"
@echo " all - Build and test everything"
@echo " build - Build the CLI validation tool"
@echo " test - Run all tests"
@echo " test-schemas - Validate JSON schema files"
@echo " test-examples - Test example message files"
@echo " test-integration - Run Go integration tests"
@echo " validate-examples - Validate built-in examples"
@echo " validate-dir DIR=path - Validate messages in directory"
@echo " validate-file FILE=path - Validate single message file"
@echo " clean - Clean build artifacts"
@echo " deps - Install Go dependencies"
@echo " fmt - Format Go code"
@echo " lint - Run static analysis"
@echo " examples - Generate CI configuration examples"
@echo " help - Show this help message"
@echo ""
@echo "Examples:"
@echo " make validate-dir DIR=../../../examples"
@echo " make validate-file FILE=../../../examples/beatframe-valid.json"

View File

@@ -0,0 +1,279 @@
// Package integration provides CI helper functions for BACKBEAT contract testing
package integration
import (
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
)
// CIHelper provides utilities for continuous integration testing
type CIHelper struct {
validator *MessageValidator
}
// NewCIHelper creates a new CI helper with a message validator
func NewCIHelper(schemaDir string) (*CIHelper, error) {
validator, err := NewMessageValidator(schemaDir)
if err != nil {
return nil, fmt.Errorf("failed to create validator: %w", err)
}
return &CIHelper{
validator: validator,
}, nil
}
// ValidateDirectory validates all JSON files in a directory against BACKBEAT schemas
func (ci *CIHelper) ValidateDirectory(dir string) (*DirectoryValidationResult, error) {
result := &DirectoryValidationResult{
Directory: dir,
Files: make(map[string]*FileValidationResult),
}
err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
// Skip non-JSON files
if d.IsDir() || !strings.HasSuffix(strings.ToLower(path), ".json") {
return nil
}
fileResult, validateErr := ci.validateFile(path)
if validateErr != nil {
result.Errors = append(result.Errors, fmt.Sprintf("Failed to validate %s: %v", path, validateErr))
} else {
relPath, _ := filepath.Rel(dir, path)
result.Files[relPath] = fileResult
result.TotalFiles++
if fileResult.AllValid {
result.ValidFiles++
} else {
result.InvalidFiles++
}
}
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to walk directory: %w", err)
}
if result.TotalFiles > 0 {
result.ValidationRate = float64(result.ValidFiles) / float64(result.TotalFiles)
}
return result, nil
}
// validateFile validates a single JSON file
func (ci *CIHelper) validateFile(filePath string) (*FileValidationResult, error) {
data, err := os.ReadFile(filePath)
if err != nil {
return nil, fmt.Errorf("failed to read file: %w", err)
}
result := &FileValidationResult{
FilePath: filePath,
AllValid: true,
}
// Try to parse as single message first
var singleMessage map[string]interface{}
if err := json.Unmarshal(data, &singleMessage); err == nil {
if msgType, hasType := singleMessage["type"].(string); hasType && ci.validator.IsMessageTypeSupported(msgType) {
// Single BACKBEAT message
validationResult, validateErr := ci.validator.ValidateMessage(data)
if validateErr != nil {
return nil, validateErr
}
result.Messages = []*ValidationResult{validationResult}
result.AllValid = validationResult.Valid
return result, nil
}
}
// Try to parse as array of messages
var messageArray []map[string]interface{}
if err := json.Unmarshal(data, &messageArray); err == nil {
for i, msg := range messageArray {
msgBytes, marshalErr := json.Marshal(msg)
if marshalErr != nil {
result.Errors = append(result.Errors, fmt.Sprintf("Message %d: failed to marshal: %v", i, marshalErr))
result.AllValid = false
continue
}
validationResult, validateErr := ci.validator.ValidateMessage(msgBytes)
if validateErr != nil {
result.Errors = append(result.Errors, fmt.Sprintf("Message %d: validation error: %v", i, validateErr))
result.AllValid = false
continue
}
result.Messages = append(result.Messages, validationResult)
if !validationResult.Valid {
result.AllValid = false
}
}
return result, nil
}
// Try to parse as examples format (array with description and message fields)
var examples []ExampleMessage
if err := json.Unmarshal(data, &examples); err == nil {
for i, example := range examples {
msgBytes, marshalErr := json.Marshal(example.Message)
if marshalErr != nil {
result.Errors = append(result.Errors, fmt.Sprintf("Example %d (%s): failed to marshal: %v", i, example.Description, marshalErr))
result.AllValid = false
continue
}
validationResult, validateErr := ci.validator.ValidateMessage(msgBytes)
if validateErr != nil {
result.Errors = append(result.Errors, fmt.Sprintf("Example %d (%s): validation error: %v", i, example.Description, validateErr))
result.AllValid = false
continue
}
result.Messages = append(result.Messages, validationResult)
if !validationResult.Valid {
result.AllValid = false
}
}
return result, nil
}
return nil, fmt.Errorf("file does not contain valid JSON message format")
}
// ExampleMessage represents a message example with description
type ExampleMessage struct {
Description string `json:"description"`
Message map[string]interface{} `json:"message"`
}
// DirectoryValidationResult contains results for validating a directory
type DirectoryValidationResult struct {
Directory string `json:"directory"`
TotalFiles int `json:"total_files"`
ValidFiles int `json:"valid_files"`
InvalidFiles int `json:"invalid_files"`
ValidationRate float64 `json:"validation_rate"`
Files map[string]*FileValidationResult `json:"files"`
Errors []string `json:"errors,omitempty"`
}
// FileValidationResult contains results for validating a single file
type FileValidationResult struct {
FilePath string `json:"file_path"`
AllValid bool `json:"all_valid"`
Messages []*ValidationResult `json:"messages"`
Errors []string `json:"errors,omitempty"`
}
// GenerateCIReport generates a formatted report suitable for CI systems
func (ci *CIHelper) GenerateCIReport(result *DirectoryValidationResult) string {
var sb strings.Builder
sb.WriteString("BACKBEAT Contract Validation Report\n")
sb.WriteString("===================================\n\n")
sb.WriteString(fmt.Sprintf("Directory: %s\n", result.Directory))
sb.WriteString(fmt.Sprintf("Total Files: %d\n", result.TotalFiles))
sb.WriteString(fmt.Sprintf("Valid Files: %d\n", result.ValidFiles))
sb.WriteString(fmt.Sprintf("Invalid Files: %d\n", result.InvalidFiles))
sb.WriteString(fmt.Sprintf("Validation Rate: %.2f%%\n\n", result.ValidationRate*100))
if len(result.Errors) > 0 {
sb.WriteString("Directory-level Errors:\n")
for _, err := range result.Errors {
sb.WriteString(fmt.Sprintf(" - %s\n", err))
}
sb.WriteString("\n")
}
// Group files by validation status
validFiles := make([]string, 0)
invalidFiles := make([]string, 0)
for filePath, fileResult := range result.Files {
if fileResult.AllValid {
validFiles = append(validFiles, filePath)
} else {
invalidFiles = append(invalidFiles, filePath)
}
}
if len(validFiles) > 0 {
sb.WriteString("Valid Files:\n")
for _, file := range validFiles {
sb.WriteString(fmt.Sprintf(" ✓ %s\n", file))
}
sb.WriteString("\n")
}
if len(invalidFiles) > 0 {
sb.WriteString("Invalid Files:\n")
for _, file := range invalidFiles {
fileResult := result.Files[file]
sb.WriteString(fmt.Sprintf(" ✗ %s\n", file))
for _, err := range fileResult.Errors {
sb.WriteString(fmt.Sprintf(" - %s\n", err))
}
for i, msg := range fileResult.Messages {
if !msg.Valid {
sb.WriteString(fmt.Sprintf(" Message %d (%s):\n", i+1, msg.MessageType))
for _, valErr := range msg.Errors {
sb.WriteString(fmt.Sprintf(" - %s: %s\n", valErr.Field, valErr.Message))
}
}
}
sb.WriteString("\n")
}
}
return sb.String()
}
// ExitWithStatus exits the program with appropriate status code for CI
func (ci *CIHelper) ExitWithStatus(result *DirectoryValidationResult) {
if result.InvalidFiles > 0 || len(result.Errors) > 0 {
fmt.Fprint(os.Stderr, ci.GenerateCIReport(result))
os.Exit(1)
} else {
fmt.Print(ci.GenerateCIReport(result))
os.Exit(0)
}
}
// ValidateExamples validates the built-in example messages
func (ci *CIHelper) ValidateExamples() ([]*ValidationResult, error) {
examples := ExampleMessages()
results := make([]*ValidationResult, 0, len(examples))
for name, example := range examples {
result, err := ci.validator.ValidateStruct(example)
if err != nil {
return nil, fmt.Errorf("failed to validate example %s: %w", name, err)
}
results = append(results, result)
}
return results, nil
}
// GetSchemaInfo returns information about loaded schemas
func (ci *CIHelper) GetSchemaInfo() map[string]string {
info := make(map[string]string)
for _, msgType := range ci.validator.GetSupportedMessageTypes() {
info[msgType] = getSchemaVersion(msgType)
}
return info
}

View File

@@ -0,0 +1,283 @@
// Package integration provides CI validation helpers for BACKBEAT conformance testing
package integration
import (
"encoding/json"
"fmt"
"path/filepath"
"strings"
"github.com/xeipuuv/gojsonschema"
)
// MessageValidator provides validation for BACKBEAT messages against JSON schemas
type MessageValidator struct {
schemaLoaders map[string]gojsonschema.JSONLoader
}
// MessageType constants for the three core BACKBEAT interfaces
const (
BeatFrameType = "backbeat.beatframe.v1"
StatusClaimType = "backbeat.statusclaim.v1"
BarReportType = "backbeat.barreport.v1"
)
// ValidationError represents a validation failure with context
type ValidationError struct {
MessageType string `json:"message_type"`
Field string `json:"field"`
Value string `json:"value"`
Message string `json:"message"`
Errors []string `json:"errors"`
}
func (ve ValidationError) Error() string {
return fmt.Sprintf("validation failed for %s: %s", ve.MessageType, strings.Join(ve.Errors, "; "))
}
// ValidationResult contains the outcome of message validation
type ValidationResult struct {
Valid bool `json:"valid"`
MessageType string `json:"message_type"`
Errors []ValidationError `json:"errors,omitempty"`
SchemaVersion string `json:"schema_version"`
}
// NewMessageValidator creates a new validator with schema loaders
func NewMessageValidator(schemaDir string) (*MessageValidator, error) {
validator := &MessageValidator{
schemaLoaders: make(map[string]gojsonschema.JSONLoader),
}
// Load all schema files
schemas := map[string]string{
BeatFrameType: "beatframe-v1.schema.json",
StatusClaimType: "statusclaim-v1.schema.json",
BarReportType: "barreport-v1.schema.json",
}
for msgType, schemaFile := range schemas {
schemaPath := filepath.Join(schemaDir, schemaFile)
loader := gojsonschema.NewReferenceLoader("file://" + schemaPath)
validator.schemaLoaders[msgType] = loader
}
return validator, nil
}
// ValidateMessage validates a JSON message against the appropriate BACKBEAT schema
func (v *MessageValidator) ValidateMessage(messageJSON []byte) (*ValidationResult, error) {
// Parse message to determine type
var msgMap map[string]interface{}
if err := json.Unmarshal(messageJSON, &msgMap); err != nil {
return nil, fmt.Errorf("failed to parse JSON: %w", err)
}
msgType, ok := msgMap["type"].(string)
if !ok {
return &ValidationResult{
Valid: false,
MessageType: "unknown",
Errors: []ValidationError{
{
Field: "type",
Message: "message type field is missing or not a string",
Errors: []string{"type field is required and must be a string"},
},
},
}, nil
}
// Get appropriate schema loader
schemaLoader, exists := v.schemaLoaders[msgType]
if !exists {
return &ValidationResult{
Valid: false,
MessageType: msgType,
Errors: []ValidationError{
{
Field: "type",
Value: msgType,
Message: fmt.Sprintf("unsupported message type: %s", msgType),
Errors: []string{fmt.Sprintf("message type %s is not supported by BACKBEAT contracts", msgType)},
},
},
}, nil
}
// Validate against schema
docLoader := gojsonschema.NewBytesLoader(messageJSON)
result, err := gojsonschema.Validate(schemaLoader, docLoader)
if err != nil {
return nil, fmt.Errorf("schema validation failed: %w", err)
}
validationResult := &ValidationResult{
Valid: result.Valid(),
MessageType: msgType,
SchemaVersion: getSchemaVersion(msgType),
}
if !result.Valid() {
for _, desc := range result.Errors() {
validationResult.Errors = append(validationResult.Errors, ValidationError{
MessageType: msgType,
Field: desc.Field(),
Value: fmt.Sprintf("%v", desc.Value()),
Message: desc.Description(),
Errors: []string{desc.String()},
})
}
}
return validationResult, nil
}
// ValidateMessageString validates a JSON message string
func (v *MessageValidator) ValidateMessageString(messageJSON string) (*ValidationResult, error) {
return v.ValidateMessage([]byte(messageJSON))
}
// ValidateStruct validates a Go struct by marshaling to JSON first
func (v *MessageValidator) ValidateStruct(message interface{}) (*ValidationResult, error) {
jsonBytes, err := json.Marshal(message)
if err != nil {
return nil, fmt.Errorf("failed to marshal struct to JSON: %w", err)
}
return v.ValidateMessage(jsonBytes)
}
// BatchValidate validates multiple messages and returns aggregated results
func (v *MessageValidator) BatchValidate(messages [][]byte) ([]*ValidationResult, error) {
results := make([]*ValidationResult, len(messages))
for i, msg := range messages {
result, err := v.ValidateMessage(msg)
if err != nil {
return nil, fmt.Errorf("failed to validate message %d: %w", i, err)
}
results[i] = result
}
return results, nil
}
// GetSupportedMessageTypes returns the list of supported BACKBEAT message types
func (v *MessageValidator) GetSupportedMessageTypes() []string {
types := make([]string, 0, len(v.schemaLoaders))
for msgType := range v.schemaLoaders {
types = append(types, msgType)
}
return types
}
// IsMessageTypeSupported checks if a message type is supported
func (v *MessageValidator) IsMessageTypeSupported(msgType string) bool {
_, exists := v.schemaLoaders[msgType]
return exists
}
// getSchemaVersion returns the version for a given message type
func getSchemaVersion(msgType string) string {
versions := map[string]string{
BeatFrameType: "1.0.0",
StatusClaimType: "1.0.0",
BarReportType: "1.0.0",
}
return versions[msgType]
}
// ValidationStats provides summary statistics for batch validation
type ValidationStats struct {
TotalMessages int `json:"total_messages"`
ValidMessages int `json:"valid_messages"`
InvalidMessages int `json:"invalid_messages"`
MessageTypes map[string]int `json:"message_types"`
ErrorSummary map[string]int `json:"error_summary"`
ValidationRate float64 `json:"validation_rate"`
}
// GetValidationStats computes statistics from validation results
func GetValidationStats(results []*ValidationResult) *ValidationStats {
stats := &ValidationStats{
TotalMessages: len(results),
MessageTypes: make(map[string]int),
ErrorSummary: make(map[string]int),
}
for _, result := range results {
// Count message types
stats.MessageTypes[result.MessageType]++
if result.Valid {
stats.ValidMessages++
} else {
stats.InvalidMessages++
// Aggregate error types
for _, err := range result.Errors {
stats.ErrorSummary[err.Field]++
}
}
}
if stats.TotalMessages > 0 {
stats.ValidationRate = float64(stats.ValidMessages) / float64(stats.TotalMessages)
}
return stats
}
// ExampleMessages provides sample messages for testing and documentation
func ExampleMessages() map[string]interface{} {
return map[string]interface{}{
"beatframe_minimal": map[string]interface{}{
"type": BeatFrameType,
"cluster_id": "test-cluster",
"beat_index": 0,
"downbeat": true,
"phase": "plan",
"hlc": "0001:0000:cafe",
"deadline_at": "2025-09-05T12:00:30Z",
"tempo_bpm": 2.0,
"window_id": "a1b2c3d4e5f6789012345678901234ab",
},
"statusclaim_minimal": map[string]interface{}{
"type": StatusClaimType,
"agent_id": "test:agent",
"beat_index": 100,
"state": "idle",
"hlc": "7ffd:0001:abcd",
},
"barreport_minimal": map[string]interface{}{
"type": BarReportType,
"window_id": "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
"from_beat": 0,
"to_beat": 119,
"agents_reporting": 1,
"on_time_reviews": 1,
"help_promises_fulfilled": 0,
"secret_rotations_ok": true,
"tempo_drift_ms": 0.0,
},
}
}
// PrettyPrintValidationResult formats validation results for human reading
func PrettyPrintValidationResult(result *ValidationResult) string {
var sb strings.Builder
sb.WriteString(fmt.Sprintf("Message Type: %s\n", result.MessageType))
sb.WriteString(fmt.Sprintf("Schema Version: %s\n", result.SchemaVersion))
sb.WriteString(fmt.Sprintf("Valid: %t\n", result.Valid))
if !result.Valid && len(result.Errors) > 0 {
sb.WriteString("\nValidation Errors:\n")
for i, err := range result.Errors {
sb.WriteString(fmt.Sprintf(" %d. Field: %s\n", i+1, err.Field))
if err.Value != "" {
sb.WriteString(fmt.Sprintf(" Value: %s\n", err.Value))
}
sb.WriteString(fmt.Sprintf(" Error: %s\n", err.Message))
}
}
return sb.String()
}

View File

@@ -0,0 +1,33 @@
# Decision Record — Slow BACKBEAT Pulse to 1 BPM
- **UCXL**: `ucxl://chorus-agent:arbiter@backbeat:tempo-calibration/*^/decisions/tempo-1bpm`
- **Date**: 2025-10-06
- **Status**: Accepted
## Problem
BACKBEAT pulse instances were emitting beats at 2 BPM by default, keeping the cluster cadence at 30-second intervals. Operations requested a slower system-wide pulse so councils and agents have a full minute between beats while stabilising the swarm elections and WHOOSH rebroadcast loops.
## Options Considered
1. **Just change deployment flags** — override `backbeat-pulse` with `-bpm 1` in compose files. *Pros*: no code changes. *Cons*: min/max validation rejects <4 BPM, mock SDK fallback still emits 2 BPM, and documentation would drift from runtime behaviour.
2. **Adjust runtime control messages** — send tempo change commands after startup. *Pros*: no rebuild. *Cons*: the ±10% guard blocks the 2→1 BPM jump, and services would still boot at the faster tempo.
3. **Update defaults and guardrails in code** *(chosen)* — set default/minimum BPM to 1, align the SDK degradation cadence, and refresh guides/tests.
## Decision
Implement option 3. The pulse service now defaults to 1 BPM, accepts values down to 1 BPM, and the SDK's local degradation beats mirror the slower cadence. Reference docs now mark 1 BPM as the default. Regression tests were updated and pass with `GOWORK=off go test ./...`.
## Impact
- Cluster pulse slows to 60-second beats without runtime overrides.
- Operators retain the ability to bump tempo up to 24 BPM within safeguards.
- SDK degradation flows remain consistent with production cadence.
- Deployments that relied on the old 2 BPM default should confirm updated expectations before release.
## Evidence / Links
- Code: `project-queues/active/BACKBEAT/backbeat/prototype/cmd/pulse/main.go`
- SDK fallback: `project-queues/active/BACKBEAT/backbeat/prototype/pkg/sdk/internal.go`
- Docs: `project-queues/active/BACKBEAT/backbeat/prototype/TEMPO-RECOMMENDATIONS.md`, `project-queues/active/BACKBEAT/backbeat/prototype/contracts/docs/tempo-guide.md`
- Tests: `GOWORK=off go test ./...` (module root)

View File

@@ -0,0 +1,31 @@
# Decision Record: Publish BACKBEAT Go Module v0.1.0
- **Date:** 2025-10-16
- **Status:** Accepted
- **UCXL:** ucxl://backbeat:maintainer@BACKBEAT:releases/DRs/2025-10-16-module-release.md
## Problem
WHOOSH, CHORUS, and downstream SDK users depended on BACKBEAT via local `replace` directives because no tagged module version existed. This blocked clean dependency management and external consumption.
## Options
1. Keep using workspace replacements.
2. Fold BACKBEAT into the CHORUS repository.
3. Publish an initial semantic version (`v0.1.0`). *(Chosen)*
## Decision
Cut release tag `v0.1.0` from `master`, update documentation to advertise the module, and repoint consuming services at the tagged version while removing local overrides.
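For a consuming service, the switch is a small `go.mod` edit (sketch only; the removed `replace` path shown is illustrative of the workspace overrides being dropped):

```
require github.com/chorus-services/backbeat v0.1.0

// remove the local override that pointed at the workspace checkout, e.g.:
// replace github.com/chorus-services/backbeat => ../backbeat
```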
## Impact
- Establishes a canonical module reference for SDK consumers.
- Enables removal of `replace` directives in CHORUS/WHOOSH.
- No runtime changes; this is a packaging improvement.
## Follow-Up
1. Maintain a release checklist for future tags (change log, tests, artifact links).
2. Monitor CHORUS/WHOOSH builds to ensure GOPRIVATE/git config allow fetching the module.
3. Mirror the module to public hosting if external consumers require open access.

205
docker-compose.swarm.yml Normal file
View File

@@ -0,0 +1,205 @@
version: '3.8'
services:
# BACKBEAT Pulse Service - Leader-elected tempo broadcaster
# REQ: BACKBEAT-REQ-001 - Single BeatFrame publisher per cluster
# REQ: BACKBEAT-OPS-001 - One replica prefers leadership
backbeat-pulse:
image: anthonyrawlins/backbeat-pulse:v1.0.4
command: >
./pulse
-cluster=chorus-production
-admin-port=8080
-raft-bind=0.0.0.0:9000
-data-dir=/data
-nats=nats://nats:4222
-tempo=2
-bar-length=8
-log-level=info
environment:
# REQ: BACKBEAT-OPS-003 - Configuration via environment variables
- BACKBEAT_CLUSTER_ID=chorus-production
- BACKBEAT_TEMPO_BPM=2 # 30-second beats for production
- BACKBEAT_BAR_LENGTH=8 # 4-minute windows
- BACKBEAT_PHASE_PLAN=plan,work,review
- BACKBEAT_NATS_URL=nats://nats:4222
- BACKBEAT_MIN_BPM=1 # 60-second beats minimum
- BACKBEAT_MAX_BPM=60 # 1-second beats maximum
- BACKBEAT_LOG_LEVEL=info
# REQ: BACKBEAT-OPS-002 - Health probes for liveness/readiness
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
deploy:
replicas: 1 # Single leader with automatic failover
restart_policy:
condition: on-failure
delay: 30s # Wait longer for NATS to be ready
max_attempts: 5
window: 120s
update_config:
parallelism: 1
delay: 30s # Wait for leader election
failure_action: rollback
monitor: 60s
order: start-first
placement:
preferences:
- spread: node.hostname
constraints:
- node.hostname != rosewood # Avoid intermittent gaming PC
resources:
limits:
memory: 256M
cpus: '0.5'
reservations:
memory: 128M
cpus: '0.25'
# Traefik routing for admin API
labels:
- traefik.enable=true
- traefik.http.routers.backbeat-pulse.rule=Host(`backbeat-pulse.chorus.services`)
- traefik.http.routers.backbeat-pulse.tls=true
- traefik.http.routers.backbeat-pulse.tls.certresolver=letsencryptresolver
- traefik.http.services.backbeat-pulse.loadbalancer.server.port=8080
networks:
- backbeat-net
- tengig # External network for Traefik
# Container logging
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
tag: "backbeat-pulse/{{.Name}}/{{.ID}}"
# BACKBEAT Reverb Service - StatusClaim aggregator
# REQ: BACKBEAT-REQ-020 - Subscribe to INT-B and group by window_id
# REQ: BACKBEAT-OPS-001 - Reverb can scale stateless
backbeat-reverb:
image: anthonyrawlins/backbeat-reverb:v1.0.1
command: >
./reverb
-cluster=chorus-production
-nats=nats://nats:4222
-bar-length=8
-log-level=info
environment:
# REQ: BACKBEAT-OPS-003 - Configuration matching pulse service
- BACKBEAT_CLUSTER_ID=chorus-production
- BACKBEAT_NATS_URL=nats://nats:4222
- BACKBEAT_LOG_LEVEL=info
- BACKBEAT_WINDOW_TTL=300s # 5-minute cleanup
- BACKBEAT_MAX_WINDOWS=100 # Memory limit
# REQ: BACKBEAT-OPS-002 - Health probes for orchestration
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
deploy:
replicas: 2 # Stateless, can scale horizontally
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 120s
update_config:
parallelism: 1
delay: 15s
failure_action: rollback
monitor: 45s
order: start-first
placement:
preferences:
- spread: node.hostname
constraints:
- node.hostname != rosewood
resources:
limits:
memory: 512M # Larger for window aggregation
cpus: '1.0'
reservations:
memory: 256M
cpus: '0.5'
# Traefik routing for admin API
labels:
- traefik.enable=true
- traefik.http.routers.backbeat-reverb.rule=Host(`backbeat-reverb.chorus.services`)
- traefik.http.routers.backbeat-reverb.tls=true
- traefik.http.routers.backbeat-reverb.tls.certresolver=letsencryptresolver
- traefik.http.services.backbeat-reverb.loadbalancer.server.port=8080
networks:
- backbeat-net
- tengig # External network for Traefik
# Container logging
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
tag: "backbeat-reverb/{{.Name}}/{{.ID}}"
# NATS Message Broker - Use existing or deploy dedicated instance
# REQ: BACKBEAT-INT-001 - Topics via NATS for at-least-once delivery
nats:
image: nats:2.9-alpine
command: ["--jetstream"]
deploy:
replicas: 1
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 120s
placement:
preferences:
- spread: node.hostname
constraints:
- node.hostname != rosewood
resources:
limits:
memory: 256M
cpus: '0.5'
reservations:
memory: 128M
cpus: '0.25'
networks:
- backbeat-net
# Container logging
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
tag: "nats/{{.Name}}/{{.ID}}"
# Network configuration
networks:
tengig:
external: true # External network for Traefik
backbeat-net:
driver: overlay
attachable: true # Allow external containers to connect
ipam:
config:
- subnet: 10.202.0.0/24
# Persistent storage
# volumes:

181
docker-compose.yml Normal file
View File

@@ -0,0 +1,181 @@
version: '3.8'
services:
# NATS message broker
nats:
image: nats:2.10-alpine
ports:
- "4222:4222"
- "8222:8222"
command: >
nats-server
--jetstream
--store_dir=/data
--http_port=8222
--port=4222
volumes:
- nats_data:/data
healthcheck:
test: ["CMD", "nats-server", "--check"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
# BACKBEAT pulse service (leader election + beat generation)
pulse-1:
build:
context: .
dockerfile: Dockerfile
target: pulse
environment:
- BACKBEAT_ENV=development
command: >
./pulse
-cluster=chorus-dev
-node=pulse-1
-admin-port=8080
-raft-bind=0.0.0.0:9000
-data-dir=/data
-nats=nats://nats:4222
-log-level=info
ports:
- "8080:8080"
- "9000:9000"
volumes:
- pulse1_data:/data
depends_on:
nats:
condition: service_healthy
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s
# Second pulse node for leader election testing
pulse-2:
build:
context: .
dockerfile: Dockerfile
target: pulse
environment:
- BACKBEAT_ENV=development
command: >
./pulse
-cluster=chorus-dev
-node=pulse-2
-admin-port=8080
-raft-bind=0.0.0.0:9000
-data-dir=/data
-nats=nats://nats:4222
-peers=pulse-1:9000
-log-level=info
ports:
- "8081:8080"
- "9001:9000"
volumes:
- pulse2_data:/data
depends_on:
nats:
condition: service_healthy
pulse-1:
condition: service_healthy
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 15s
# BACKBEAT reverb service (status aggregation + bar reports)
reverb:
build:
context: .
dockerfile: Dockerfile
target: reverb
environment:
- BACKBEAT_ENV=development
command: >
./reverb
-cluster=chorus-dev
-node=reverb-1
-nats=nats://nats:4222
-bar-length=120
-log-level=info
ports:
- "8082:8080"
depends_on:
nats:
condition: service_healthy
pulse-1:
condition: service_healthy
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s
# Agent simulator for testing
agent-sim:
build:
context: .
dockerfile: Dockerfile
target: agent-sim
environment:
- BACKBEAT_ENV=development
command: >
./agent-sim
-cluster=chorus-dev
-nats=nats://nats:4222
-agents=10
-rate=2.0
-log-level=info
depends_on:
nats:
condition: service_healthy
pulse-1:
condition: service_healthy
reverb:
condition: service_healthy
scale: 1
# Prometheus for metrics collection
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
depends_on:
- pulse-1
- reverb
# Grafana for metrics visualization
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- grafana_data:/var/lib/grafana
depends_on:
- prometheus
volumes:
nats_data:
pulse1_data:
pulse2_data:
prometheus_data:
grafana_data:

41
go.mod Normal file
View File

@@ -0,0 +1,41 @@
module github.com/chorus-services/backbeat
go 1.22
require (
github.com/google/uuid v1.6.0
github.com/gorilla/mux v1.8.1
github.com/hashicorp/raft v1.6.1
github.com/hashicorp/raft-boltdb/v2 v2.3.0
github.com/nats-io/nats.go v1.36.0
github.com/prometheus/client_golang v1.19.1
github.com/rs/zerolog v1.32.0
gopkg.in/yaml.v3 v3.0.1
)
require (
github.com/armon/go-metrics v0.4.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/boltdb/bolt v1.3.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/fatih/color v1.13.0 // indirect
github.com/hashicorp/go-hclog v1.6.2 // indirect
github.com/hashicorp/go-immutable-radix v1.0.0 // indirect
github.com/hashicorp/go-msgpack/v2 v2.1.1 // indirect
github.com/hashicorp/golang-lru v0.5.0 // indirect
github.com/klauspost/compress v1.17.2 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/nats-io/nkeys v0.4.7 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.48.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
go.etcd.io/bbolt v1.3.5 // indirect
golang.org/x/crypto v0.18.0 // indirect
golang.org/x/sys v0.17.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
)

187
go.sum Normal file
View File

@@ -0,0 +1,187 @@
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA=
github.com/armon/go-metrics v0.4.1/go.mod h1:E6amYzXo6aW1tqzoZGT755KkbgrJsSdpwZ+3JqfkOG4=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag=
github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w=
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
github.com/hashicorp/go-hclog v1.6.2 h1:NOtoftovWkDheyUM/8JW3QMiXyxJK3uHRK7wV04nD2I=
github.com/hashicorp/go-hclog v1.6.2/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxBjXVn/J/3+z5/0=
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI=
github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
github.com/hashicorp/go-msgpack/v2 v2.1.1 h1:xQEY9yB2wnHitoSzk/B9UjXWRQ67QKu5AOm8aFp8N3I=
github.com/hashicorp/go-msgpack/v2 v2.1.1/go.mod h1:upybraOAblm4S7rx0+jeNy+CWWhzywQsSRV5033mMu4=
github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/golang-lru v0.5.0 h1:CL2msUPvZTLb5O648aiLNJw3hnBxN2+1Jq8rCOH9wdo=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/raft v1.6.1 h1:v/jm5fcYHvVkL0akByAp+IDdDSzCNCGhdO6VdB56HIM=
github.com/hashicorp/raft v1.6.1/go.mod h1:N1sKh6Vn47mrWvEArQgILTyng8GoDRNYlgKyK7PMjs0=
github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 h1:RLKEcCuKcZ+qp2VlaaZsYZfLOmIiuJNpEi48Rl8u9cQ=
github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702/go.mod h1:nTakvJ4XYq45UXtn0DbwR4aU9ZdjlnIenpbs6Cd+FM0=
github.com/hashicorp/raft-boltdb/v2 v2.3.0 h1:fPpQR1iGEVYjZ2OELvUHX600VAK5qmdnDEv3eXOwZUA=
github.com/hashicorp/raft-boltdb/v2 v2.3.0/go.mod h1:YHukhB04ChJsLHLJEUD6vjFyLX2L3dsX3wPBZcX4tmc=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU=
github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8=
github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI=
github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU=
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4=
github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE=
github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0=
github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f h1:J9EGpcZtP0E/raorCMxlFGSTBrsSlaDGf3jU/qvAE2c=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74=
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0=
go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc=
golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'backbeat'
orgId: 1
folder: 'BACKBEAT'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards

View File

@@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true

28
monitoring/prometheus.yml Normal file
View File

@@ -0,0 +1,28 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
# BACKBEAT Pulse Services
- job_name: 'backbeat-pulse'
static_configs:
- targets: ['pulse-leader:8080', 'pulse-follower:8080']
metrics_path: '/metrics'
scrape_interval: 10s
scrape_timeout: 5s
# NATS Monitoring
- job_name: 'nats'
static_configs:
- targets: ['nats:8222']
metrics_path: '/metrics'
scrape_interval: 15s
# Prometheus itself
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']

373
pkg/sdk/README.md Normal file
View File

@@ -0,0 +1,373 @@
# BACKBEAT Go SDK
The BACKBEAT Go SDK enables CHORUS services to become "BACKBEAT-aware" by providing client libraries for beat synchronization, status emission, and beat-budget management.
## Features
- **Beat Subscription (BACKBEAT-REQ-040)**: Subscribe to beat and downbeat events with jitter-tolerant scheduling
- **Status Emission (BACKBEAT-REQ-041)**: Emit status claims with automatic agent_id, task_id, and HLC population
- **Beat Budgets (BACKBEAT-REQ-042)**: Execute functions with beat-based timeouts and cancellation
- **Legacy Compatibility (BACKBEAT-REQ-043)**: Support for legacy `{bar,beat}` patterns with migration warnings
- **Security (BACKBEAT-REQ-044)**: Ed25519 signing and required headers for status claims
- **Local Degradation**: Continue operating when pulse service is unavailable
- **Comprehensive Observability**: Metrics, health reporting, and performance monitoring
## Quick Start
```go
package main
import (
"context"
"crypto/ed25519"
"crypto/rand"
"log/slog"
"github.com/chorus-services/backbeat/pkg/sdk"
)
func main() {
// Generate signing key
_, signingKey, _ := ed25519.GenerateKey(rand.Reader)
// Configure SDK
config := sdk.DefaultConfig()
config.ClusterID = "chorus-dev"
config.AgentID = "my-service"
config.NATSUrl = "nats://localhost:4222"
config.SigningKey = signingKey
// Create client
client := sdk.NewClient(config)
// Register beat callback
client.OnBeat(func(beat sdk.BeatFrame) {
slog.Info("Beat received", "beat_index", beat.BeatIndex)
// Emit status
client.EmitStatusClaim(sdk.StatusClaim{
State: "executing",
BeatsLeft: 5,
Progress: 0.3,
Notes: "Processing data",
})
})
// Start client
ctx := context.Background()
if err := client.Start(ctx); err != nil {
panic(err)
}
defer client.Stop()
// Your service logic here...
select {}
}
```
## Configuration
### Basic Configuration
```go
config := &sdk.Config{
ClusterID: "your-cluster", // BACKBEAT cluster ID
AgentID: "your-agent", // Unique agent identifier
NATSUrl: "nats://localhost:4222", // NATS connection URL
}
```
### Advanced Configuration
```go
config := sdk.DefaultConfig()
config.ClusterID = "chorus-prod"
config.AgentID = "web-service-01"
config.NATSUrl = "nats://nats.cluster.local:4222"
config.SigningKey = loadSigningKey() // Ed25519 private key
config.JitterTolerance = 100 * time.Millisecond
config.ReconnectDelay = 2 * time.Second
config.MaxReconnects = 10 // -1 for infinite
config.Logger = slog.New(slog.NewJSONHandler(os.Stdout, nil))
```
## Core Features
### Beat Subscription
```go
// Register beat callback (called every beat)
client.OnBeat(func(beat sdk.BeatFrame) {
// Your beat logic here
fmt.Printf("Beat %d at %s\n", beat.BeatIndex, beat.DeadlineAt)
})
// Register downbeat callback (called at bar starts)
client.OnDownbeat(func(beat sdk.BeatFrame) {
// Your downbeat logic here
fmt.Printf("Bar started: %s\n", beat.WindowID)
})
```
### Status Emission
```go
// Basic status emission
err := client.EmitStatusClaim(sdk.StatusClaim{
State: "executing", // executing|planning|waiting|review|done|failed
BeatsLeft: 10, // estimated beats remaining
Progress: 0.75, // progress ratio (0.0-1.0)
Notes: "Processing batch 5/10",
})
// Advanced status with task tracking
err := client.EmitStatusClaim(sdk.StatusClaim{
TaskID: "task-12345", // auto-generated if empty
State: "waiting",
WaitFor: []string{"hmmm://thread/abc123"}, // dependencies
BeatsLeft: 0,
Progress: 1.0,
Notes: "Waiting for thread completion",
})
```
### Beat Budgets
```go
// Execute with beat-based timeout
err := client.WithBeatBudget(10, func() error {
// This function has 10 beats to complete
return performTask()
})
if err != nil {
// Handle timeout or task error
fmt.Printf("Task failed or exceeded budget: %v\n", err)
}
// Real-world example
err := client.WithBeatBudget(20, func() error {
// Database operation with beat budget
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
return database.ProcessBatch(ctx, batchData)
})
```
## Client Interface
```go
type Client interface {
// Beat subscription
OnBeat(callback func(BeatFrame)) error
OnDownbeat(callback func(BeatFrame)) error
// Status emission
EmitStatusClaim(claim StatusClaim) error
// Beat budgets
WithBeatBudget(n int, fn func() error) error
// Utilities
GetCurrentBeat() int64
GetCurrentWindow() string
IsInWindow(windowID string) bool
// Lifecycle
Start(ctx context.Context) error
Stop() error
Health() HealthStatus
}
```
## Examples
The SDK includes comprehensive examples:
- **[Simple Agent](examples/simple_agent.go)**: Basic beat subscription and status emission
- **[Task Processor](examples/task_processor.go)**: Beat budget usage for task timeout management
- **[Service Monitor](examples/service_monitor.go)**: Health monitoring with beat-aligned reporting
### Running Examples
```bash
# Simple agent example
go run pkg/sdk/examples/simple_agent.go
# Task processor with beat budgets
go run pkg/sdk/examples/task_processor.go
# Service monitor with health reporting
go run pkg/sdk/examples/service_monitor.go
```
## Observability
### Health Monitoring
```go
health := client.Health()
fmt.Printf("Connected: %v\n", health.Connected)
fmt.Printf("Last Beat: %d at %s\n", health.LastBeat, health.LastBeatTime)
fmt.Printf("Time Drift: %s\n", health.TimeDrift)
fmt.Printf("Reconnects: %d\n", health.ReconnectCount)
fmt.Printf("Local Degradation: %v\n", health.LocalDegradation)
```
### Metrics
The SDK exposes metrics via Go's `expvar` package:
- Connection metrics: status, reconnection count, duration
- Beat metrics: received, jitter, callback latency, misses
- Status metrics: claims emitted, errors
- Budget metrics: created, completed, timed out
- Error metrics: total count, last error
Access metrics at `http://localhost:8080/debug/vars` when using `expvar`.
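If your service does not already run an HTTP server on the default mux, a minimal sketch for exposing the `expvar` endpoint looks like this (the `:8080` port is an assumption to match the URL above; any server using `http.DefaultServeMux` works):

```go
package main

import (
	_ "expvar" // registers the /debug/vars handler on http.DefaultServeMux
	"log"
	"net/http"
)

func main() {
	// Serve the default mux; SDK metrics registered via expvar become
	// visible at http://localhost:8080/debug/vars alongside Go runtime stats.
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```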
### Logging
The SDK uses structured logging via `slog`:
```go
config.Logger = slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelDebug, // Set appropriate level
}))
```
## Error Handling
The SDK provides comprehensive error handling:
- **Connection Errors**: Automatic reconnection with exponential backoff
- **Beat Jitter**: Tolerance for network delays and timing variations
- **Callback Panics**: Recovery and logging without affecting other callbacks
- **Validation Errors**: Status claim validation with detailed error messages
- **Timeout Errors**: Beat budget timeouts with context cancellation
## Local Degradation
When the pulse service is unavailable, the SDK automatically enters local degradation mode:
- Generates synthetic beats to maintain callback timing
- Uses fallback 60 BPM tempo
- Marks beat frames with "degraded" phase
- Automatically recovers when pulse service returns
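If a callback should treat synthetic beats differently from real ones, a minimal sketch (assuming the `Phase` field of `sdk.BeatFrame` carries the `"degraded"` marker described above):

```go
client.OnBeat(func(beat sdk.BeatFrame) {
	if beat.Phase == "degraded" {
		// Pulse service unreachable: keep local work going, but defer
		// anything that must stay aligned with the real cluster tempo.
		slog.Warn("running on synthetic beats", "beat_index", beat.BeatIndex)
		return
	}
	// Normal beat handling.
})
```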
## Legacy Compatibility
Support for legacy `{bar,beat}` patterns (BACKBEAT-REQ-043):
```go
// Convert legacy format (logs warning once)
beatIndex := client.ConvertLegacyBeat(bar, beat)
// Get legacy format from current beat
legacy := client.GetLegacyBeatInfo()
fmt.Printf("Bar: %d, Beat: %d\n", legacy.Bar, legacy.Beat)
```
## Security
The SDK implements BACKBEAT security requirements:
- **Ed25519 Signatures**: All status claims are signed when a signing key is provided
- **Required Headers**: Includes `x-window-id` and `x-hlc` headers
- **Agent Identification**: Automatic `x-agent-id` header for routing
```go
// Configure signing
_, signingKey, _ := ed25519.GenerateKey(rand.Reader)
config.SigningKey = signingKey
```
## Performance
The SDK is designed for high performance:
- **Beat Callback Latency**: Target ≤5ms callback execution
- **Timer Drift**: ≤1% drift over 1 hour without a leader
- **Concurrent Safe**: All operations are goroutine-safe
- **Memory Efficient**: Bounded error lists and metric samples
## Integration Patterns
### Web Service Integration
```go
func main() {
    // Initialize BACKBEAT client
    client := sdk.NewClient(config)
    client.OnBeat(func(beat sdk.BeatFrame) {
        // Report web service status on every beat
        client.EmitStatusClaim(sdk.StatusClaim{
            State:    "executing",
            Progress: getRequestSuccessRate(),
            Notes:    fmt.Sprintf("Handling %d req/s", getCurrentRPS()),
        })
    })
    // Start beat synchronization before serving traffic
    ctx := context.Background()
    if err := client.Start(ctx); err != nil {
        log.Fatal(err)
    }
    defer client.Stop()
    // Expose client health and start the HTTP server
    http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
        health := client.Health()
        json.NewEncoder(w).Encode(health)
    })
    log.Fatal(http.ListenAndServe(":8080", nil))
}
```
### Background Job Processor
```go
func processJobs(client sdk.Client) {
for job := range jobQueue {
// Use beat budget for job timeout
err := client.WithBeatBudget(job.MaxBeats, func() error {
return processJob(job)
})
if err != nil {
client.EmitStatusClaim(sdk.StatusClaim{
TaskID: job.ID,
State: "failed",
Notes: err.Error(),
})
}
}
}
```
## Testing
The SDK ships with a full test suite and benchmarks:
```bash
# Run all tests
go test ./pkg/sdk/...
# Run with race detection
go test -race ./pkg/sdk/...
# Run benchmarks
go test -bench=. ./pkg/sdk/examples/
```
## Requirements
- Go 1.22 or later
- NATS server for messaging
- BACKBEAT pulse service running
- Network connectivity to cluster
## Contributing
1. Follow standard Go conventions
2. Include comprehensive tests
3. Update documentation for API changes
4. Ensure examples remain working
5. Maintain backward compatibility
## License
This SDK is part of the BACKBEAT project and follows the same licensing terms.

480
pkg/sdk/client.go Normal file
View File

@@ -0,0 +1,480 @@
// Package sdk provides the BACKBEAT Go SDK for enabling CHORUS services
// to become BACKBEAT-aware with beat synchronization and status emission.
package sdk
import (
"context"
"crypto/ed25519"
"encoding/json"
"fmt"
"log/slog"
"sync"
"time"
"github.com/google/uuid"
"github.com/nats-io/nats.go"
)
// Client interface defines the core BACKBEAT SDK functionality
// Implements BACKBEAT-REQ-040, 041, 042, 043, 044
type Client interface {
// Beat subscription (BACKBEAT-REQ-040)
OnBeat(callback func(BeatFrame)) error
OnDownbeat(callback func(BeatFrame)) error
// Status emission (BACKBEAT-REQ-041)
EmitStatusClaim(claim StatusClaim) error
// Beat budgets (BACKBEAT-REQ-042)
WithBeatBudget(n int, fn func() error) error
// Utilities
GetCurrentBeat() int64
GetCurrentWindow() string
IsInWindow(windowID string) bool
GetCurrentTempo() int
GetTempoDrift() time.Duration
// Lifecycle management
Start(ctx context.Context) error
Stop() error
Health() HealthStatus
}
// Config represents the SDK configuration
type Config struct {
ClusterID string // BACKBEAT cluster identifier
AgentID string // Unique agent identifier
NATSUrl string // NATS connection URL
SigningKey ed25519.PrivateKey // Ed25519 private key for signing (BACKBEAT-REQ-044)
Logger *slog.Logger // Structured logger
JitterTolerance time.Duration // Maximum jitter tolerance (default: 50ms)
ReconnectDelay time.Duration // NATS reconnection delay (default: 1s)
MaxReconnects int // Maximum reconnection attempts (default: -1 for infinite)
}
// DefaultConfig returns a Config with sensible defaults
func DefaultConfig() *Config {
return &Config{
JitterTolerance: 50 * time.Millisecond,
ReconnectDelay: 1 * time.Second,
MaxReconnects: -1, // Infinite reconnects
Logger: slog.Default(),
}
}
// BeatFrame represents a beat frame with timing information
type BeatFrame struct {
Type string `json:"type"`
ClusterID string `json:"cluster_id"`
BeatIndex int64 `json:"beat_index"`
Downbeat bool `json:"downbeat"`
Phase string `json:"phase"`
HLC string `json:"hlc"`
DeadlineAt time.Time `json:"deadline_at"`
TempoBPM int `json:"tempo_bpm"`
WindowID string `json:"window_id"`
}
// StatusClaim represents a status claim emission
type StatusClaim struct {
// Auto-populated by SDK
Type string `json:"type"` // Always "backbeat.statusclaim.v1"
AgentID string `json:"agent_id"` // Auto-populated from config
TaskID string `json:"task_id"` // Auto-generated if not provided
BeatIndex int64 `json:"beat_index"` // Auto-populated from current beat
HLC string `json:"hlc"` // Auto-populated from current HLC
// User-provided
State string `json:"state"` // executing|planning|waiting|review|done|failed
WaitFor []string `json:"wait_for,omitempty"` // refs (e.g., hmmm://thread/...)
BeatsLeft int `json:"beats_left"` // estimated beats remaining
Progress float64 `json:"progress"` // progress ratio (0.0-1.0)
Notes string `json:"notes"` // status description
}
// HealthStatus represents the current health of the SDK client
type HealthStatus struct {
Connected bool `json:"connected"`
LastBeat int64 `json:"last_beat"`
LastBeatTime time.Time `json:"last_beat_time"`
TimeDrift time.Duration `json:"time_drift"`
ReconnectCount int `json:"reconnect_count"`
LocalDegradation bool `json:"local_degradation"`
CurrentTempo int `json:"current_tempo"`
TempoDrift time.Duration `json:"tempo_drift"`
MeasuredBPM float64 `json:"measured_bpm"`
Errors []string `json:"errors,omitempty"`
}
// LegacyBeatInfo represents legacy {bar,beat} information
// For BACKBEAT-REQ-043 compatibility
type LegacyBeatInfo struct {
Bar int `json:"bar"`
Beat int `json:"beat"`
}
// tempoSample represents a tempo measurement for drift calculation
type tempoSample struct {
BeatIndex int64
Tempo int
MeasuredTime time.Time
ActualBPM float64 // Measured BPM based on inter-beat timing
}
// client implements the Client interface
type client struct {
config *Config
nc *nats.Conn
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
// Beat tracking
currentBeat int64
currentWindow string
currentHLC string
lastBeatTime time.Time
currentTempo int // Current tempo in BPM
lastTempo int // Last known tempo for drift calculation
tempoHistory []tempoSample // History for drift calculation
beatMutex sync.RWMutex
// Callbacks
beatCallbacks []func(BeatFrame)
downbeatCallbacks []func(BeatFrame)
callbackMutex sync.RWMutex
// Health and metrics
reconnectCount int
localDegradation bool
errors []string
errorMutex sync.RWMutex
metrics *Metrics
// Beat budget tracking
budgetContexts map[string]context.CancelFunc
budgetMutex sync.Mutex
// Legacy compatibility
legacyWarned bool
legacyMutex sync.Mutex
}
// NewClient creates a new BACKBEAT SDK client
func NewClient(config *Config) Client {
if config.Logger == nil {
config.Logger = slog.Default()
}
c := &client{
config: config,
beatCallbacks: make([]func(BeatFrame), 0),
downbeatCallbacks: make([]func(BeatFrame), 0),
budgetContexts: make(map[string]context.CancelFunc),
errors: make([]string, 0),
tempoHistory: make([]tempoSample, 0, 100),
currentTempo: 60, // Default to 60 BPM
}
// Initialize metrics
prefix := fmt.Sprintf("backbeat.sdk.%s", config.AgentID)
c.metrics = NewMetrics(prefix)
return c
}
// Start initializes the client and begins beat synchronization
func (c *client) Start(ctx context.Context) error {
c.ctx, c.cancel = context.WithCancel(ctx)
if err := c.connect(); err != nil {
return fmt.Errorf("failed to connect to NATS: %w", err)
}
c.wg.Add(1)
go c.beatSubscriptionLoop()
c.config.Logger.Info("BACKBEAT SDK client started",
slog.String("cluster_id", c.config.ClusterID),
slog.String("agent_id", c.config.AgentID))
return nil
}
// Stop gracefully stops the client
func (c *client) Stop() error {
if c.cancel != nil {
c.cancel()
}
// Cancel all active beat budgets
c.budgetMutex.Lock()
for id, cancel := range c.budgetContexts {
cancel()
delete(c.budgetContexts, id)
}
c.budgetMutex.Unlock()
if c.nc != nil {
c.nc.Close()
}
c.wg.Wait()
c.config.Logger.Info("BACKBEAT SDK client stopped")
return nil
}
// OnBeat registers a callback for beat events (BACKBEAT-REQ-040)
func (c *client) OnBeat(callback func(BeatFrame)) error {
if callback == nil {
return fmt.Errorf("callback cannot be nil")
}
c.callbackMutex.Lock()
defer c.callbackMutex.Unlock()
c.beatCallbacks = append(c.beatCallbacks, callback)
return nil
}
// OnDownbeat registers a callback for downbeat events (BACKBEAT-REQ-040)
func (c *client) OnDownbeat(callback func(BeatFrame)) error {
if callback == nil {
return fmt.Errorf("callback cannot be nil")
}
c.callbackMutex.Lock()
defer c.callbackMutex.Unlock()
c.downbeatCallbacks = append(c.downbeatCallbacks, callback)
return nil
}
// EmitStatusClaim emits a status claim (BACKBEAT-REQ-041)
func (c *client) EmitStatusClaim(claim StatusClaim) error {
// Auto-populate required fields
claim.Type = "backbeat.statusclaim.v1"
claim.AgentID = c.config.AgentID
claim.BeatIndex = c.GetCurrentBeat()
claim.HLC = c.getCurrentHLC()
// Auto-generate task ID if not provided
if claim.TaskID == "" {
claim.TaskID = fmt.Sprintf("task:%s", uuid.New().String()[:8])
}
// Validate the claim
if err := c.validateStatusClaim(&claim); err != nil {
return fmt.Errorf("invalid status claim: %w", err)
}
// Sign the claim if signing key is available (BACKBEAT-REQ-044)
if c.config.SigningKey != nil {
if err := c.signStatusClaim(&claim); err != nil {
return fmt.Errorf("failed to sign status claim: %w", err)
}
}
// Publish to NATS
data, err := json.Marshal(claim)
if err != nil {
return fmt.Errorf("failed to marshal status claim: %w", err)
}
subject := fmt.Sprintf("backbeat.status.%s", c.config.ClusterID)
headers := c.createHeaders()
msg := &nats.Msg{
Subject: subject,
Data: data,
Header: headers,
}
if err := c.nc.PublishMsg(msg); err != nil {
c.addError(fmt.Sprintf("failed to publish status claim: %v", err))
c.metrics.RecordStatusClaim(false)
return fmt.Errorf("failed to publish status claim: %w", err)
}
c.metrics.RecordStatusClaim(true)
c.config.Logger.Debug("Status claim emitted",
slog.String("agent_id", claim.AgentID),
slog.String("task_id", claim.TaskID),
slog.String("state", claim.State),
slog.Int64("beat_index", claim.BeatIndex))
return nil
}
// WithBeatBudget executes a function with a beat-based timeout (BACKBEAT-REQ-042)
func (c *client) WithBeatBudget(n int, fn func() error) error {
if n <= 0 {
return fmt.Errorf("beat budget must be positive, got %d", n)
}
// Calculate timeout based on current tempo
currentBeat := c.GetCurrentBeat()
beatDuration := c.getBeatDuration()
timeout := time.Duration(n) * beatDuration
// Use background context if client context is not set (for testing)
baseCtx := c.ctx
if baseCtx == nil {
baseCtx = context.Background()
}
ctx, cancel := context.WithTimeout(baseCtx, timeout)
defer cancel()
// Track the budget context for cancellation
budgetID := uuid.New().String()
c.budgetMutex.Lock()
c.budgetContexts[budgetID] = cancel
c.budgetMutex.Unlock()
// Record budget creation
c.metrics.RecordBudgetCreated()
defer func() {
c.budgetMutex.Lock()
delete(c.budgetContexts, budgetID)
c.budgetMutex.Unlock()
}()
// Execute function with timeout
done := make(chan error, 1)
go func() {
done <- fn()
}()
select {
case err := <-done:
c.metrics.RecordBudgetCompleted(false) // Not timed out
if err != nil {
c.config.Logger.Debug("Beat budget function completed with error",
slog.Int("budget", n),
slog.Int64("start_beat", currentBeat),
slog.String("error", err.Error()))
} else {
c.config.Logger.Debug("Beat budget function completed successfully",
slog.Int("budget", n),
slog.Int64("start_beat", currentBeat))
}
return err
case <-ctx.Done():
c.metrics.RecordBudgetCompleted(true) // Timed out
c.config.Logger.Warn("Beat budget exceeded",
slog.Int("budget", n),
slog.Int64("start_beat", currentBeat),
slog.Duration("timeout", timeout))
return fmt.Errorf("beat budget of %d beats exceeded", n)
}
}
// GetCurrentBeat returns the current beat index
func (c *client) GetCurrentBeat() int64 {
c.beatMutex.RLock()
defer c.beatMutex.RUnlock()
return c.currentBeat
}
// GetCurrentWindow returns the current window ID
func (c *client) GetCurrentWindow() string {
c.beatMutex.RLock()
defer c.beatMutex.RUnlock()
return c.currentWindow
}
// IsInWindow checks if we're currently in the specified window
func (c *client) IsInWindow(windowID string) bool {
return c.GetCurrentWindow() == windowID
}
// GetCurrentTempo returns the current tempo in BPM
func (c *client) GetCurrentTempo() int {
c.beatMutex.RLock()
defer c.beatMutex.RUnlock()
return c.currentTempo
}
// GetTempoDrift calculates the drift between expected and actual tempo
func (c *client) GetTempoDrift() time.Duration {
c.beatMutex.RLock()
defer c.beatMutex.RUnlock()
if len(c.tempoHistory) < 2 {
return 0
}
// Calculate average measured BPM from recent samples
historyLen := len(c.tempoHistory)
recentCount := 10
if historyLen < recentCount {
recentCount = historyLen
}
recent := c.tempoHistory[historyLen-recentCount:]
if len(recent) < 2 {
recent = c.tempoHistory
}
totalBPM := 0.0
for _, sample := range recent {
totalBPM += sample.ActualBPM
}
avgMeasuredBPM := totalBPM / float64(len(recent))
// Calculate drift
expectedBeatDuration := 60.0 / float64(c.currentTempo)
actualBeatDuration := 60.0 / avgMeasuredBPM
drift := actualBeatDuration - expectedBeatDuration
return time.Duration(drift * float64(time.Second))
}
// Health returns the current health status
func (c *client) Health() HealthStatus {
c.errorMutex.RLock()
errors := make([]string, len(c.errors))
copy(errors, c.errors)
c.errorMutex.RUnlock()
c.beatMutex.RLock()
timeDrift := time.Since(c.lastBeatTime)
currentTempo := c.currentTempo
// Calculate measured BPM from recent tempo history
measuredBPM := 60.0 // Default
if len(c.tempoHistory) > 0 {
historyLen := len(c.tempoHistory)
recentCount := 5
if historyLen < recentCount {
recentCount = historyLen
}
recent := c.tempoHistory[historyLen-recentCount:]
totalBPM := 0.0
for _, sample := range recent {
totalBPM += sample.ActualBPM
}
measuredBPM = totalBPM / float64(len(recent))
}
c.beatMutex.RUnlock()
tempoDrift := c.GetTempoDrift()
return HealthStatus{
Connected: c.nc != nil && c.nc.IsConnected(),
LastBeat: c.GetCurrentBeat(),
LastBeatTime: c.lastBeatTime,
TimeDrift: timeDrift,
ReconnectCount: c.reconnectCount,
LocalDegradation: c.localDegradation,
CurrentTempo: currentTempo,
TempoDrift: tempoDrift,
MeasuredBPM: measuredBPM,
Errors: errors,
}
}

573
pkg/sdk/client_test.go Normal file
View File

@@ -0,0 +1,573 @@
package sdk
import (
"context"
"crypto/ed25519"
"crypto/rand"
"fmt"
"testing"
"time"
"log/slog"
"os"
"github.com/nats-io/nats.go"
)
var testCounter int
// generateUniqueAgentID generates unique agent IDs for tests to avoid expvar conflicts
func generateUniqueAgentID(prefix string) string {
testCounter++
return fmt.Sprintf("%s-%d", prefix, testCounter)
}
// TestClient tests basic client creation and configuration
func TestClient(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
config.NATSUrl = "nats://localhost:4222"
client := NewClient(config)
if client == nil {
t.Fatal("Expected client to be created")
}
// Test health before start
health := client.Health()
if health.Connected {
t.Error("Expected client to be disconnected before start")
}
}
// TestBeatCallbacks tests beat and downbeat callback registration
func TestBeatCallbacks(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent-callbacks")
client := NewClient(config)
var beatCalled, downbeatCalled bool
// Register callbacks
err := client.OnBeat(func(beat BeatFrame) {
beatCalled = true
})
if err != nil {
t.Fatalf("Failed to register beat callback: %v", err)
}
err = client.OnDownbeat(func(beat BeatFrame) {
downbeatCalled = true
})
if err != nil {
t.Fatalf("Failed to register downbeat callback: %v", err)
}
// Test nil callback rejection
err = client.OnBeat(nil)
if err == nil {
t.Error("Expected error when registering nil beat callback")
}
err = client.OnDownbeat(nil)
if err == nil {
t.Error("Expected error when registering nil downbeat callback")
}
// Use variables to prevent unused warnings
_ = beatCalled
_ = downbeatCalled
}
// TestStatusClaim tests status claim validation and emission
func TestStatusClaim(t *testing.T) {
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
t.Fatalf("Failed to generate signing key: %v", err)
}
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
config.SigningKey = signingKey
client := NewClient(config).(*client)
// Test valid status claim
claim := StatusClaim{
State: "executing",
BeatsLeft: 5,
Progress: 0.5,
Notes: "Test status",
}
// Test validation without connection (should work for validation)
client.currentBeat = 1
client.currentHLC = "test-hlc"
// Test auto-population
if claim.AgentID != "" {
t.Error("Expected AgentID to be empty before emission")
}
// Since we can't actually emit without NATS connection, test validation directly
claim.Type = "backbeat.statusclaim.v1"
claim.AgentID = config.AgentID
claim.TaskID = "test-task"
claim.BeatIndex = 1
claim.HLC = "test-hlc"
err = client.validateStatusClaim(&claim)
if err != nil {
t.Errorf("Expected valid status claim to pass validation: %v", err)
}
// Test invalid states
invalidClaim := claim
invalidClaim.State = "invalid-state"
err = client.validateStatusClaim(&invalidClaim)
if err == nil {
t.Error("Expected invalid state to fail validation")
}
// Test invalid progress
invalidClaim = claim
invalidClaim.Progress = 1.5
err = client.validateStatusClaim(&invalidClaim)
if err == nil {
t.Error("Expected invalid progress to fail validation")
}
// Test negative beats left
invalidClaim = claim
invalidClaim.BeatsLeft = -1
err = client.validateStatusClaim(&invalidClaim)
if err == nil {
t.Error("Expected negative beats_left to fail validation")
}
}
// TestBeatBudget tests beat budget functionality
func TestBeatBudget(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
client.currentTempo = 120 // 120 BPM = 0.5 seconds per beat
ctx := context.Background()
client.ctx = ctx
// Test successful execution within budget
executed := false
err := client.WithBeatBudget(2, func() error {
executed = true
time.Sleep(100 * time.Millisecond) // Much less than 2 beats (1 second)
return nil
})
if err != nil {
t.Errorf("Expected function to complete successfully: %v", err)
}
if !executed {
t.Error("Expected function to be executed")
}
// Test timeout (need to be careful with timing)
timeoutErr := client.WithBeatBudget(1, func() error {
time.Sleep(2 * time.Second) // More than 1 beat at 120 BPM (0.5s)
return nil
})
if timeoutErr == nil {
t.Error("Expected function to timeout")
}
if timeoutErr.Error() != "beat budget of 1 beats exceeded" {
t.Errorf("Expected timeout error message, got: %v", timeoutErr)
}
// Test invalid budget
err = client.WithBeatBudget(0, func() error { return nil })
if err == nil {
t.Error("Expected error for zero beat budget")
}
err = client.WithBeatBudget(-1, func() error { return nil })
if err == nil {
t.Error("Expected error for negative beat budget")
}
}
// TestTempoTracking tests tempo tracking and drift calculation
func TestTempoTracking(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
// Test initial values
if client.GetCurrentTempo() != 60 {
t.Errorf("Expected default tempo to be 60, got %d", client.GetCurrentTempo())
}
if client.GetTempoDrift() != 0 {
t.Errorf("Expected initial tempo drift to be 0, got %v", client.GetTempoDrift())
}
// Simulate tempo changes
client.beatMutex.Lock()
client.currentTempo = 120
client.tempoHistory = append(client.tempoHistory, tempoSample{
BeatIndex: 1,
Tempo: 120,
MeasuredTime: time.Now(),
ActualBPM: 118.0, // Slightly slower than expected
})
client.tempoHistory = append(client.tempoHistory, tempoSample{
BeatIndex: 2,
Tempo: 120,
MeasuredTime: time.Now().Add(500 * time.Millisecond),
ActualBPM: 119.0, // Still slightly slower
})
client.beatMutex.Unlock()
if client.GetCurrentTempo() != 120 {
t.Errorf("Expected current tempo to remain at 120 BPM, got %d", client.GetCurrentTempo())
}
// Test drift calculation (should be non-zero due to difference between 120 and measured BPM)
drift := client.GetTempoDrift()
if drift == 0 {
t.Error("Expected non-zero tempo drift")
}
}
// TestLegacyCompatibility tests legacy beat conversion
func TestLegacyCompatibility(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
// Test legacy beat conversion
beatIndex := client.ConvertLegacyBeat(2, 3) // Bar 2, Beat 3
expectedBeatIndex := int64(7) // (2-1)*4 + 3 = 7
if beatIndex != expectedBeatIndex {
t.Errorf("Expected beat index %d, got %d", expectedBeatIndex, beatIndex)
}
// Test reverse conversion
client.beatMutex.Lock()
client.currentBeat = 7
client.beatMutex.Unlock()
legacyInfo := client.GetLegacyBeatInfo()
if legacyInfo.Bar != 2 || legacyInfo.Beat != 3 {
t.Errorf("Expected bar=2, beat=3, got bar=%d, beat=%d", legacyInfo.Bar, legacyInfo.Beat)
}
// Test edge cases
beatIndex = client.ConvertLegacyBeat(1, 1) // First beat
if beatIndex != 1 {
t.Errorf("Expected beat index 1 for first beat, got %d", beatIndex)
}
client.beatMutex.Lock()
client.currentBeat = 0 // Edge case
client.beatMutex.Unlock()
legacyInfo = client.GetLegacyBeatInfo()
if legacyInfo.Bar != 1 || legacyInfo.Beat != 1 {
t.Errorf("Expected bar=1, beat=1 for zero beat, got bar=%d, beat=%d", legacyInfo.Bar, legacyInfo.Beat)
}
}
// TestHealthStatus tests health status reporting
func TestHealthStatus(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
// Test initial health
health := client.Health()
if health.Connected {
t.Error("Expected client to be disconnected initially")
}
if health.LastBeat != 0 {
t.Error("Expected last beat to be 0 initially")
}
if health.CurrentTempo != 60 {
t.Errorf("Expected default tempo 60, got %d", health.CurrentTempo)
}
// Simulate some activity
client.beatMutex.Lock()
client.currentBeat = 10
client.currentTempo = 90
client.lastBeatTime = time.Now().Add(-100 * time.Millisecond)
client.beatMutex.Unlock()
client.addError("test error")
health = client.Health()
if health.LastBeat != 10 {
t.Errorf("Expected last beat to be 10, got %d", health.LastBeat)
}
if health.CurrentTempo != 90 {
t.Errorf("Expected current tempo to be 90, got %d", health.CurrentTempo)
}
if len(health.Errors) != 1 {
t.Errorf("Expected 1 error, got %d", len(health.Errors))
}
if health.TimeDrift <= 0 {
t.Error("Expected positive time drift")
}
}
// TestMetrics tests metrics integration
func TestMetrics(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
if client.metrics == nil {
t.Fatal("Expected metrics to be initialized")
}
// Test metrics snapshot
snapshot := client.metrics.GetMetricsSnapshot()
if snapshot == nil {
t.Error("Expected metrics snapshot to be available")
}
// Check for expected metric keys
expectedKeys := []string{
"connection_status",
"reconnect_count",
"beats_received",
"status_claims_emitted",
"budgets_created",
"total_errors",
}
for _, key := range expectedKeys {
if _, exists := snapshot[key]; !exists {
t.Errorf("Expected metric key '%s' to exist in snapshot", key)
}
}
}
// TestConfig tests configuration validation and defaults
func TestConfig(t *testing.T) {
// Test default config
config := DefaultConfig()
if config.JitterTolerance != 50*time.Millisecond {
t.Errorf("Expected default jitter tolerance 50ms, got %v", config.JitterTolerance)
}
if config.ReconnectDelay != 1*time.Second {
t.Errorf("Expected default reconnect delay 1s, got %v", config.ReconnectDelay)
}
if config.MaxReconnects != -1 {
t.Errorf("Expected default max reconnects -1, got %d", config.MaxReconnects)
}
// Test logger initialization
config.Logger = nil
client := NewClient(config)
if client == nil {
t.Error("Expected client to be created even with nil logger")
}
// Test with custom config
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
t.Fatalf("Failed to generate signing key: %v", err)
}
config.ClusterID = "custom-cluster"
config.AgentID = "custom-agent"
config.SigningKey = signingKey
config.JitterTolerance = 100 * time.Millisecond
config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
client = NewClient(config)
if client == nil {
t.Error("Expected client to be created with custom config")
}
}
// TestBeatDurationCalculation tests beat duration calculation
func TestBeatDurationCalculation(t *testing.T) {
config := DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID("test-agent")
client := NewClient(config).(*client)
// Test default 60 BPM (1 second per beat)
duration := client.getBeatDuration()
expected := 1000 * time.Millisecond
if duration != expected {
t.Errorf("Expected beat duration %v for 60 BPM, got %v", expected, duration)
}
// Test 120 BPM (0.5 seconds per beat)
client.beatMutex.Lock()
client.currentTempo = 120
client.beatMutex.Unlock()
duration = client.getBeatDuration()
expected = 500 * time.Millisecond
if duration != expected {
t.Errorf("Expected beat duration %v for 120 BPM, got %v", expected, duration)
}
// Test 30 BPM (2 seconds per beat)
client.beatMutex.Lock()
client.currentTempo = 30
client.beatMutex.Unlock()
duration = client.getBeatDuration()
expected = 2000 * time.Millisecond
if duration != expected {
t.Errorf("Expected beat duration %v for 30 BPM, got %v", expected, duration)
}
// Test edge case: zero tempo (should default to 60 BPM)
client.beatMutex.Lock()
client.currentTempo = 0
client.beatMutex.Unlock()
duration = client.getBeatDuration()
expected = 1000 * time.Millisecond
if duration != expected {
t.Errorf("Expected beat duration %v for 0 BPM (default 60), got %v", expected, duration)
}
}
// BenchmarkBeatCallback benchmarks beat callback execution
func BenchmarkBeatCallback(b *testing.B) {
config := DefaultConfig()
config.ClusterID = "bench-cluster"
config.AgentID = "bench-agent"
client := NewClient(config).(*client)
beatFrame := BeatFrame{
Type: "backbeat.beatframe.v1",
ClusterID: "bench-cluster",
BeatIndex: 1,
Downbeat: false,
Phase: "test",
HLC: "test-hlc",
DeadlineAt: time.Now().Add(time.Second),
TempoBPM: 60,
WindowID: "test-window",
}
callbackCount := 0
client.OnBeat(func(beat BeatFrame) {
callbackCount++
})
b.ResetTimer()
for i := 0; i < b.N; i++ {
client.safeExecuteCallback(client.beatCallbacks[0], beatFrame, "beat")
}
if callbackCount != b.N {
b.Errorf("Expected callback to be called %d times, got %d", b.N, callbackCount)
}
}
// BenchmarkStatusClaimValidation benchmarks status claim validation
func BenchmarkStatusClaimValidation(b *testing.B) {
config := DefaultConfig()
config.ClusterID = "bench-cluster"
config.AgentID = "bench-agent"
client := NewClient(config).(*client)
claim := StatusClaim{
Type: "backbeat.statusclaim.v1",
AgentID: "bench-agent",
TaskID: "bench-task",
BeatIndex: 1,
State: "executing",
BeatsLeft: 5,
Progress: 0.5,
Notes: "Benchmark test",
HLC: "bench-hlc",
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
err := client.validateStatusClaim(&claim)
if err != nil {
b.Fatal(err)
}
}
}
// Mock NATS server for integration tests (if needed)
func setupTestNATSServer(t *testing.T) *nats.Conn {
// This would start an embedded NATS server for testing
// For now, we'll skip tests that require NATS if it's not available
nc, err := nats.Connect(nats.DefaultURL)
if err != nil {
t.Skipf("NATS server not available: %v", err)
return nil
}
return nc
}
func TestIntegrationWithNATS(t *testing.T) {
nc := setupTestNATSServer(t)
if nc == nil {
return // Skipped
}
defer nc.Close()
config := DefaultConfig()
config.ClusterID = "integration-test"
config.AgentID = generateUniqueAgentID("test-agent")
config.NATSUrl = nats.DefaultURL
client := NewClient(config)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// Test start/stop cycle
err := client.Start(ctx)
if err != nil {
t.Fatalf("Failed to start client: %v", err)
}
// Check health after start
health := client.Health()
if !health.Connected {
t.Error("Expected client to be connected after start")
}
// Test stop
err = client.Stop()
if err != nil {
t.Errorf("Failed to stop client: %v", err)
}
// Check health after stop
health = client.Health()
if health.Connected {
t.Error("Expected client to be disconnected after stop")
}
}

110
pkg/sdk/doc.go Normal file
View File

@@ -0,0 +1,110 @@
// Package sdk provides the BACKBEAT Go SDK for enabling CHORUS services
// to become BACKBEAT-aware with beat synchronization and status emission.
//
// The BACKBEAT SDK enables services to:
// - Subscribe to cluster-wide beat events with jitter tolerance
// - Emit status claims with automatic metadata population
// - Use beat budgets for timeout management
// - Operate in local degradation mode when pulse unavailable
// - Integrate comprehensive observability and health reporting
//
// # Quick Start
//
// config := sdk.DefaultConfig()
// config.ClusterID = "chorus-dev"
// config.AgentID = "my-service"
// config.NATSUrl = "nats://localhost:4222"
//
// client := sdk.NewClient(config)
//
// client.OnBeat(func(beat sdk.BeatFrame) {
// // Called every beat
// client.EmitStatusClaim(sdk.StatusClaim{
// State: "executing",
// Progress: 0.5,
// Notes: "Processing data",
// })
// })
//
// ctx := context.Background()
// client.Start(ctx)
// defer client.Stop()
//
// # Beat Subscription
//
// Register callbacks for beat and downbeat events:
//
// client.OnBeat(func(beat sdk.BeatFrame) {
// // Called every beat (~1-4 times per second depending on tempo)
// fmt.Printf("Beat %d\n", beat.BeatIndex)
// })
//
// client.OnDownbeat(func(beat sdk.BeatFrame) {
// // Called at the start of each bar (every 4 beats typically)
// fmt.Printf("Bar started: %s\n", beat.WindowID)
// })
//
// # Status Emission
//
// Emit status claims to report current state and progress:
//
// err := client.EmitStatusClaim(sdk.StatusClaim{
// State: "executing", // executing|planning|waiting|review|done|failed
// BeatsLeft: 10, // estimated beats remaining
// Progress: 0.75, // progress ratio (0.0-1.0)
// Notes: "Processing batch 5/10",
// })
//
// # Beat Budgets
//
// Execute functions with beat-based timeouts:
//
// err := client.WithBeatBudget(10, func() error {
// // This function has 10 beats to complete
// return performLongRunningTask()
// })
//
// if err != nil {
// // Handle timeout or task error
// log.Printf("Task failed or exceeded budget: %v", err)
// }
//
// # Health and Observability
//
// Monitor client health and metrics:
//
// health := client.Health()
// fmt.Printf("Connected: %v\n", health.Connected)
// fmt.Printf("Last Beat: %d\n", health.LastBeat)
// fmt.Printf("Reconnects: %d\n", health.ReconnectCount)
//
// # Local Degradation
//
// The SDK automatically handles network issues by entering local degradation mode:
// - Generates synthetic beats when pulse service unavailable
// - Uses fallback timing to maintain callback schedules
// - Automatically recovers when pulse service returns
// - Provides seamless operation during network partitions
//
// # Security
//
// The SDK implements BACKBEAT security requirements:
// - Ed25519 signing of all status claims when key provided
// - Required x-window-id and x-hlc headers
// - Agent identification for proper message routing
//
// # Performance
//
// Designed for production use with:
// - Beat callback latency target ≤5ms
// - Timer drift ≤1% over 1 hour without a leader
// - Goroutine-safe concurrent operations
// - Bounded memory usage for metrics and errors
//
// # Examples
//
// See the examples subdirectory for complete usage patterns:
// - examples/simple_agent.go: Basic integration
// - examples/task_processor.go: Beat budget usage
// - examples/service_monitor.go: Health monitoring
package sdk

View File

@@ -0,0 +1,520 @@
package examples
import (
"context"
"crypto/ed25519"
"crypto/rand"
"fmt"
"testing"
"time"
"github.com/chorus-services/backbeat/pkg/sdk"
)
var testCounter int
// generateUniqueAgentID generates unique agent IDs for tests to avoid expvar conflicts
func generateUniqueAgentID(prefix string) string {
testCounter++
return fmt.Sprintf("%s-%d", prefix, testCounter)
}
// Test helper interface for both *testing.T and *testing.B
type testHelper interface {
Fatalf(format string, args ...interface{})
}
// Test helper to create a test client configuration
func createTestConfig(t testHelper, agentIDPrefix string) *sdk.Config {
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
t.Fatalf("Failed to generate signing key: %v", err)
}
config := sdk.DefaultConfig()
config.ClusterID = "test-cluster"
config.AgentID = generateUniqueAgentID(agentIDPrefix)
config.NATSUrl = "nats://localhost:4222" // Assumes NATS is running for tests
config.SigningKey = signingKey
return config
}
// TestSimpleAgentPattern tests the simple agent usage pattern
func TestSimpleAgentPattern(t *testing.T) {
config := createTestConfig(t, "test-simple-agent")
client := sdk.NewClient(config)
// Context for timeout control (used in full integration tests)
_ = context.Background()
// Track callback invocations
var beatCount, downbeatCount int
// Register callbacks
err := client.OnBeat(func(beat sdk.BeatFrame) {
beatCount++
t.Logf("Beat received: %d (downbeat: %v)", beat.BeatIndex, beat.Downbeat)
})
if err != nil {
t.Fatalf("Failed to register beat callback: %v", err)
}
err = client.OnDownbeat(func(beat sdk.BeatFrame) {
downbeatCount++
t.Logf("Downbeat received: %d", beat.BeatIndex)
})
if err != nil {
t.Fatalf("Failed to register downbeat callback: %v", err)
}
// Use variables to prevent unused warnings
_ = beatCount
_ = downbeatCount
// This test only checks if the client can be configured and started
// without errors. Full integration tests would require running services.
// Test health status before starting
health := client.Health()
if health.Connected {
t.Error("Client should not be connected before Start()")
}
// Test that we can create status claims
err = client.EmitStatusClaim(sdk.StatusClaim{
State: "planning",
BeatsLeft: 10,
Progress: 0.0,
Notes: "Test status claim",
})
// This should fail because client isn't started
if err == nil {
t.Error("EmitStatusClaim should fail when client not started")
}
}
// TestBeatBudgetPattern tests the beat budget usage pattern
func TestBeatBudgetPattern(t *testing.T) {
config := createTestConfig(t, "test-budget-agent")
client := sdk.NewClient(config)
// Test beat budget without starting client (should work for timeout logic)
err := client.WithBeatBudget(2, func() error {
time.Sleep(100 * time.Millisecond) // Quick task
return nil
})
// This may fail due to no beat timing available, but shouldn't panic
if err != nil {
t.Logf("Beat budget failed as expected (no timing): %v", err)
}
// Test invalid budget
err = client.WithBeatBudget(0, func() error {
return nil
})
if err == nil {
t.Error("WithBeatBudget should fail with zero budget")
}
err = client.WithBeatBudget(-1, func() error {
return nil
})
if err == nil {
t.Error("WithBeatBudget should fail with negative budget")
}
}
// TestClientConfiguration tests various client configuration scenarios
func TestClientConfiguration(t *testing.T) {
// Test with minimal config
config := &sdk.Config{
ClusterID: "test",
AgentID: "test-agent",
NATSUrl: "nats://localhost:4222",
}
client := sdk.NewClient(config)
if client == nil {
t.Fatal("NewClient should not return nil")
}
// Test health before start
health := client.Health()
if health.Connected {
t.Error("New client should not be connected")
}
// Test utilities with no beat data
beat := client.GetCurrentBeat()
if beat != 0 {
t.Errorf("GetCurrentBeat should return 0 initially, got %d", beat)
}
window := client.GetCurrentWindow()
if window != "" {
t.Errorf("GetCurrentWindow should return empty string initially, got %s", window)
}
// Test IsInWindow
if client.IsInWindow("any-window") {
t.Error("IsInWindow should return false with no current window")
}
}
// TestStatusClaimValidation tests status claim validation
func TestStatusClaimValidation(t *testing.T) {
config := createTestConfig(t, "test-validation")
client := sdk.NewClient(config)
// Test various invalid status claims
testCases := []struct {
name string
claim sdk.StatusClaim
wantErr bool
}{
{
name: "valid claim",
claim: sdk.StatusClaim{
State: "executing",
BeatsLeft: 5,
Progress: 0.5,
Notes: "Test note",
},
wantErr: false, // Will still error due to no connection, but validation should pass
},
{
name: "invalid state",
claim: sdk.StatusClaim{
State: "invalid",
BeatsLeft: 5,
Progress: 0.5,
Notes: "Test note",
},
wantErr: true,
},
{
name: "negative progress",
claim: sdk.StatusClaim{
State: "executing",
BeatsLeft: 5,
Progress: -0.1,
Notes: "Test note",
},
wantErr: true,
},
{
name: "progress too high",
claim: sdk.StatusClaim{
State: "executing",
BeatsLeft: 5,
Progress: 1.1,
Notes: "Test note",
},
wantErr: true,
},
{
name: "negative beats left",
claim: sdk.StatusClaim{
State: "executing",
BeatsLeft: -1,
Progress: 0.5,
Notes: "Test note",
},
wantErr: true,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
err := client.EmitStatusClaim(tc.claim)
if tc.wantErr && err == nil {
t.Error("Expected error but got none")
}
// Note: All will error due to no connection, but we're testing validation
if err != nil {
t.Logf("Error (expected): %v", err)
}
})
}
}
// BenchmarkStatusClaimEmission benchmarks status claim creation and validation
func BenchmarkStatusClaimEmission(b *testing.B) {
config := createTestConfig(b, "benchmark-agent")
client := sdk.NewClient(config)
claim := sdk.StatusClaim{
State: "executing",
BeatsLeft: 10,
Progress: 0.75,
Notes: "Benchmark test claim",
}
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
// This will fail due to no connection, but measures validation overhead
client.EmitStatusClaim(claim)
}
})
}
// BenchmarkBeatCallbacks benchmarks callback execution
func BenchmarkBeatCallbacks(b *testing.B) {
config := createTestConfig(b, "callback-benchmark")
client := sdk.NewClient(config)
// Register a simple callback
client.OnBeat(func(beat sdk.BeatFrame) {
// Minimal processing
_ = beat.BeatIndex
})
// Create a mock beat frame
beatFrame := sdk.BeatFrame{
Type: "backbeat.beatframe.v1",
ClusterID: "test",
BeatIndex: 1,
Downbeat: false,
Phase: "test",
HLC: "123-0",
WindowID: "test-window",
TempoBPM: 2, // 30-second beats - much more reasonable for testing
}
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
// Simulate callback execution
// Note: This doesn't actually invoke callbacks since client isn't started
_ = beatFrame
}
})
}
// TestDetermineState tests the state determination logic from simple_agent.go
func TestDetermineState(t *testing.T) {
tests := []struct {
total int64
completed int64
expected string
}{
{0, 0, "waiting"},
{5, 5, "done"},
{5, 3, "executing"},
{5, 0, "planning"},
{10, 8, "executing"},
{1, 1, "done"},
}
for _, test := range tests {
result := determineState(test.total, test.completed)
if result != test.expected {
t.Errorf("determineState(%d, %d) = %s; expected %s",
test.total, test.completed, result, test.expected)
}
}
}
// TestCalculateBeatsLeft tests the beats remaining calculation from simple_agent.go
func TestCalculateBeatsLeft(t *testing.T) {
tests := []struct {
total int64
completed int64
expected int
}{
{0, 0, 0},
{5, 5, 0},
{5, 3, 10}, // (5-3) * 5 = 10
{10, 0, 50}, // 10 * 5 = 50
{1, 0, 5}, // 1 * 5 = 5
}
for _, test := range tests {
result := calculateBeatsLeft(test.total, test.completed)
if result != test.expected {
t.Errorf("calculateBeatsLeft(%d, %d) = %d; expected %d",
test.total, test.completed, result, test.expected)
}
}
}
// TestTaskStructure tests Task struct from task_processor.go
func TestTaskStructure(t *testing.T) {
task := &Task{
ID: "test-task-123",
Description: "Test processing task",
BeatBudget: 8,
WorkTime: 3 * time.Second,
Created: time.Now(),
}
if task.ID == "" {
t.Error("Expected task ID to be set")
}
if task.Description == "" {
t.Error("Expected task description to be set")
}
if task.BeatBudget <= 0 {
t.Error("Expected positive beat budget")
}
if task.WorkTime <= 0 {
t.Error("Expected positive work time")
}
if task.Created.IsZero() {
t.Error("Expected creation time to be set")
}
}
// TestServiceHealthStructure tests ServiceHealth struct from service_monitor.go
func TestServiceHealthStructure(t *testing.T) {
health := &ServiceHealth{
ServiceName: "test-service",
Status: "healthy",
LastCheck: time.Now(),
ResponseTime: 150 * time.Millisecond,
ErrorCount: 0,
Uptime: 5 * time.Minute,
}
if health.ServiceName == "" {
t.Error("Expected service name to be set")
}
validStatuses := []string{"healthy", "degraded", "unhealthy", "unknown"}
validStatus := false
for _, status := range validStatuses {
if health.Status == status {
validStatus = true
break
}
}
if !validStatus {
t.Errorf("Expected valid status, got: %s", health.Status)
}
if health.ResponseTime < 0 {
t.Error("Expected non-negative response time")
}
if health.ErrorCount < 0 {
t.Error("Expected non-negative error count")
}
}
// TestSystemMetricsStructure tests SystemMetrics struct from service_monitor.go
func TestSystemMetricsStructure(t *testing.T) {
metrics := &SystemMetrics{
CPUPercent: 25.5,
MemoryPercent: 67.8,
GoroutineCount: 42,
HeapSizeMB: 128.5,
}
if metrics.CPUPercent < 0 || metrics.CPUPercent > 100 {
t.Error("Expected CPU percentage between 0 and 100")
}
if metrics.MemoryPercent < 0 || metrics.MemoryPercent > 100 {
t.Error("Expected memory percentage between 0 and 100")
}
if metrics.GoroutineCount < 0 {
t.Error("Expected non-negative goroutine count")
}
if metrics.HeapSizeMB < 0 {
t.Error("Expected non-negative heap size")
}
}
// TestHealthScoreCalculation tests calculateHealthScore from service_monitor.go
func TestHealthScoreCalculation(t *testing.T) {
tests := []struct {
summary map[string]int
expected float64
}{
{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 0, "unknown": 0}, 0.0},
{map[string]int{"healthy": 4, "degraded": 0, "unhealthy": 0, "unknown": 0}, 1.0},
{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 4, "unknown": 0}, 0.0},
{map[string]int{"healthy": 2, "degraded": 2, "unhealthy": 0, "unknown": 0}, 0.75},
{map[string]int{"healthy": 1, "degraded": 1, "unhealthy": 1, "unknown": 1}, 0.4375},
}
for i, test := range tests {
result := calculateHealthScore(test.summary)
if result != test.expected {
t.Errorf("Test %d: calculateHealthScore(%v) = %.4f; expected %.4f",
i, test.summary, result, test.expected)
}
}
}
// TestDetermineOverallState tests determineOverallState from service_monitor.go
func TestDetermineOverallState(t *testing.T) {
tests := []struct {
summary map[string]int
expected string
}{
{map[string]int{"healthy": 3, "degraded": 0, "unhealthy": 0, "unknown": 0}, "done"},
{map[string]int{"healthy": 2, "degraded": 1, "unhealthy": 0, "unknown": 0}, "executing"},
{map[string]int{"healthy": 1, "degraded": 1, "unhealthy": 1, "unknown": 0}, "failed"},
{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 0, "unknown": 3}, "waiting"},
{map[string]int{"healthy": 0, "degraded": 0, "unhealthy": 1, "unknown": 0}, "failed"},
}
for i, test := range tests {
result := determineOverallState(test.summary)
if result != test.expected {
t.Errorf("Test %d: determineOverallState(%v) = %s; expected %s",
i, test.summary, result, test.expected)
}
}
}
// TestFormatHealthSummary tests formatHealthSummary from service_monitor.go
func TestFormatHealthSummary(t *testing.T) {
summary := map[string]int{
"healthy": 3,
"degraded": 2,
"unhealthy": 1,
"unknown": 0,
}
result := formatHealthSummary(summary)
expected := "H:3 D:2 U:1 ?:0"
if result != expected {
t.Errorf("formatHealthSummary() = %s; expected %s", result, expected)
}
}
// TestCollectSystemMetrics tests collectSystemMetrics from service_monitor.go
func TestCollectSystemMetrics(t *testing.T) {
metrics := collectSystemMetrics()
if metrics.GoroutineCount <= 0 {
t.Error("Expected positive goroutine count")
}
if metrics.HeapSizeMB < 0 {
t.Error("Expected non-negative heap size")
}
// Note: CPU and Memory percentages are simplified in the example implementation
if metrics.CPUPercent < 0 {
t.Error("Expected non-negative CPU percentage")
}
if metrics.MemoryPercent < 0 {
t.Error("Expected non-negative memory percentage")
}
}

View File

@@ -0,0 +1,326 @@
package examples
import (
"context"
"crypto/ed25519"
"crypto/rand"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"os/signal"
"runtime"
"sync"
"syscall"
"time"
"github.com/chorus-services/backbeat/pkg/sdk"
)
// ServiceHealth represents the health status of a monitored service
type ServiceHealth struct {
ServiceName string `json:"service_name"`
Status string `json:"status"` // healthy, degraded, unhealthy
LastCheck time.Time `json:"last_check"`
ResponseTime time.Duration `json:"response_time"`
ErrorCount int `json:"error_count"`
Uptime time.Duration `json:"uptime"`
}
// SystemMetrics represents system-level metrics
type SystemMetrics struct {
CPUPercent float64 `json:"cpu_percent"`
MemoryPercent float64 `json:"memory_percent"`
GoroutineCount int `json:"goroutine_count"`
HeapSizeMB float64 `json:"heap_size_mb"`
}
// ServiceMonitor demonstrates health monitoring with beat-aligned reporting
// This example shows how to integrate BACKBEAT with service monitoring
func ServiceMonitor() {
// Generate a signing key for this example
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
slog.Error("Failed to generate signing key", "error", err)
return
}
// Create SDK configuration
config := sdk.DefaultConfig()
config.ClusterID = "chorus-dev"
config.AgentID = "service-monitor"
config.NATSUrl = "nats://localhost:4222"
config.SigningKey = signingKey
config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelInfo,
}))
// Create BACKBEAT client
client := sdk.NewClient(config)
// Services to monitor (example endpoints)
monitoredServices := map[string]string{
"pulse-service": "http://localhost:8080/health",
"reverb-service": "http://localhost:8081/health",
"nats-server": "http://localhost:8222/varz", // NATS monitoring endpoint
}
// Health tracking
var (
healthStatus = make(map[string]*ServiceHealth)
healthMutex sync.RWMutex
startTime = time.Now()
)
// Initialize health status
for serviceName := range monitoredServices {
healthStatus[serviceName] = &ServiceHealth{
ServiceName: serviceName,
Status: "unknown",
LastCheck: time.Time{},
}
}
// Register beat callback for frequent health checks
client.OnBeat(func(beat sdk.BeatFrame) {
// Perform health checks every 4 beats (reduce frequency)
if beat.BeatIndex%4 == 0 {
performHealthChecks(monitoredServices, healthStatus, &healthMutex)
}
// Emit status claim with current health summary
if beat.BeatIndex%2 == 0 {
healthSummary := generateHealthSummary(healthStatus, &healthMutex)
systemMetrics := collectSystemMetrics()
state := determineOverallState(healthSummary)
notes := fmt.Sprintf("Services: %s | CPU: %.1f%% | Mem: %.1f%% | Goroutines: %d",
formatHealthSummary(healthSummary),
systemMetrics.CPUPercent,
systemMetrics.MemoryPercent,
systemMetrics.GoroutineCount)
err := client.EmitStatusClaim(sdk.StatusClaim{
State: state,
BeatsLeft: 0, // Monitoring is continuous
Progress: calculateHealthScore(healthSummary),
Notes: notes,
})
if err != nil {
slog.Error("Failed to emit status claim", "error", err)
}
}
})
// Register downbeat callback for detailed reporting
client.OnDownbeat(func(beat sdk.BeatFrame) {
healthMutex.RLock()
healthData, _ := json.MarshalIndent(healthStatus, "", " ")
healthMutex.RUnlock()
systemMetrics := collectSystemMetrics()
uptime := time.Since(startTime)
slog.Info("Service health report",
"beat_index", beat.BeatIndex,
"window_id", beat.WindowID,
"uptime", uptime.String(),
"cpu_percent", systemMetrics.CPUPercent,
"memory_percent", systemMetrics.MemoryPercent,
"heap_mb", systemMetrics.HeapSizeMB,
"goroutines", systemMetrics.GoroutineCount,
)
// Log health details
slog.Debug("Detailed health status", "health_data", string(healthData))
// Emit comprehensive status for the bar
healthSummary := generateHealthSummary(healthStatus, &healthMutex)
err := client.EmitStatusClaim(sdk.StatusClaim{
State: "review", // Downbeat is review time
BeatsLeft: 0,
Progress: calculateHealthScore(healthSummary),
Notes: fmt.Sprintf("Bar %d health review: %s", beat.BeatIndex/4, formatDetailedHealth(healthSummary, systemMetrics)),
})
if err != nil {
slog.Error("Failed to emit downbeat status", "error", err)
}
})
// Setup graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Handle shutdown signals
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
slog.Info("Shutdown signal received")
cancel()
}()
// Start the client
if err := client.Start(ctx); err != nil {
slog.Error("Failed to start BACKBEAT client", "error", err)
return
}
defer client.Stop()
slog.Info("Service monitor started - use Ctrl+C to stop",
"monitored_services", len(monitoredServices))
// Expose metrics endpoint
go func() {
http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
healthMutex.RLock()
data := make(map[string]interface{})
data["health"] = healthStatus
data["system"] = collectSystemMetrics()
data["backbeat"] = client.Health()
healthMutex.RUnlock()
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(data)
})
slog.Info("Metrics endpoint available", "url", "http://localhost:9090/metrics")
if err := http.ListenAndServe(":9090", nil); err != nil {
slog.Error("Metrics server failed", "error", err)
}
}()
// Wait for shutdown
<-ctx.Done()
slog.Info("Service monitor shutting down")
}
// performHealthChecks checks the health of all monitored services
func performHealthChecks(services map[string]string, healthStatus map[string]*ServiceHealth, mutex *sync.RWMutex) {
for serviceName, endpoint := range services {
go func(name, url string) {
start := time.Now()
client := &http.Client{Timeout: 5 * time.Second}
resp, err := client.Get(url)
responseTime := time.Since(start)
mutex.Lock()
health := healthStatus[name]
health.LastCheck = time.Now()
health.ResponseTime = responseTime
if err != nil {
health.ErrorCount++
health.Status = "unhealthy"
slog.Warn("Health check failed",
"service", name,
"endpoint", url,
"error", err,
"response_time", responseTime)
} else {
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
health.Status = "healthy"
} else if resp.StatusCode >= 300 && resp.StatusCode < 500 {
health.Status = "degraded"
} else {
health.Status = "unhealthy"
health.ErrorCount++
}
resp.Body.Close()
if responseTime > 2*time.Second {
health.Status = "degraded" // Slow response
}
slog.Debug("Health check completed",
"service", name,
"status", health.Status,
"response_time", responseTime,
"status_code", resp.StatusCode)
}
mutex.Unlock()
}(serviceName, endpoint)
}
}
// generateHealthSummary creates a summary of service health
func generateHealthSummary(healthStatus map[string]*ServiceHealth, mutex *sync.RWMutex) map[string]int {
mutex.RLock()
defer mutex.RUnlock()
summary := map[string]int{
"healthy": 0,
"degraded": 0,
"unhealthy": 0,
"unknown": 0,
}
for _, health := range healthStatus {
summary[health.Status]++
}
return summary
}
// determineOverallState determines the overall system state
func determineOverallState(healthSummary map[string]int) string {
if healthSummary["unhealthy"] > 0 {
return "failed"
}
if healthSummary["degraded"] > 0 {
return "executing" // Degraded but still working
}
if healthSummary["healthy"] > 0 {
return "done"
}
return "waiting" // All unknown
}
// calculateHealthScore calculates a health score (0.0-1.0)
func calculateHealthScore(healthSummary map[string]int) float64 {
total := healthSummary["healthy"] + healthSummary["degraded"] + healthSummary["unhealthy"] + healthSummary["unknown"]
if total == 0 {
return 0.0
}
// Weight the scores: healthy=1.0, degraded=0.5, unhealthy=0.0, unknown=0.25
score := float64(healthSummary["healthy"])*1.0 +
float64(healthSummary["degraded"])*0.5 +
float64(healthSummary["unknown"])*0.25
return score / float64(total)
}
// formatHealthSummary creates a compact string representation
func formatHealthSummary(healthSummary map[string]int) string {
return fmt.Sprintf("H:%d D:%d U:%d ?:%d",
healthSummary["healthy"],
healthSummary["degraded"],
healthSummary["unhealthy"],
healthSummary["unknown"])
}
// formatDetailedHealth creates detailed health information
func formatDetailedHealth(healthSummary map[string]int, systemMetrics SystemMetrics) string {
return fmt.Sprintf("Health: %s, CPU: %.1f%%, Mem: %.1f%%, Heap: %.1fMB",
formatHealthSummary(healthSummary),
systemMetrics.CPUPercent,
systemMetrics.MemoryPercent,
systemMetrics.HeapSizeMB)
}
// collectSystemMetrics collects basic system metrics
func collectSystemMetrics() SystemMetrics {
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
return SystemMetrics{
CPUPercent: 0.0, // Would need external package like gopsutil for real CPU metrics
MemoryPercent: float64(mem.Sys) / (1024 * 1024 * 1024) * 100, // Rough approximation
GoroutineCount: runtime.NumGoroutine(),
HeapSizeMB: float64(mem.HeapSys) / (1024 * 1024),
}
}

View File

@@ -0,0 +1,150 @@
// Package examples demonstrates BACKBEAT SDK usage patterns
package examples
import (
"context"
"crypto/ed25519"
"crypto/rand"
"fmt"
"log/slog"
"os"
"os/signal"
"sync/atomic"
"syscall"
"time"
"github.com/chorus-services/backbeat/pkg/sdk"
)
// SimpleAgent demonstrates basic BACKBEAT SDK usage
// This example shows the minimal integration pattern for CHORUS services
func SimpleAgent() {
// Generate a signing key for this example
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
slog.Error("Failed to generate signing key", "error", err)
return
}
// Create SDK configuration
config := sdk.DefaultConfig()
config.ClusterID = "chorus-dev"
config.AgentID = "simple-agent"
config.NATSUrl = "nats://localhost:4222" // Adjust for your setup
config.SigningKey = signingKey
config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelInfo,
}))
// Create BACKBEAT client
client := sdk.NewClient(config)
// Track some simple state
var taskCounter int64
var completedTasks int64
// Register beat callback - this runs on every beat
client.OnBeat(func(beat sdk.BeatFrame) {
currentTasks := atomic.LoadInt64(&taskCounter)
completed := atomic.LoadInt64(&completedTasks)
// Emit status every few beats
if beat.BeatIndex%3 == 0 {
progress := 0.0
if currentTasks > 0 {
progress = float64(completed) / float64(currentTasks)
}
err := client.EmitStatusClaim(sdk.StatusClaim{
State: determineState(currentTasks, completed),
BeatsLeft: calculateBeatsLeft(currentTasks, completed),
Progress: progress,
Notes: fmt.Sprintf("Processing tasks: %d/%d", completed, currentTasks),
})
if err != nil {
slog.Error("Failed to emit status claim", "error", err)
}
}
})
// Register downbeat callback - this runs at the start of each bar
client.OnDownbeat(func(beat sdk.BeatFrame) {
slog.Info("Bar started",
"beat_index", beat.BeatIndex,
"window_id", beat.WindowID,
"phase", beat.Phase)
// Start new tasks at the beginning of bars
atomic.AddInt64(&taskCounter, 2) // Add 2 new tasks per bar
})
// Setup graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Handle shutdown signals
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
slog.Info("Shutdown signal received")
cancel()
}()
// Start the client
if err := client.Start(ctx); err != nil {
slog.Error("Failed to start BACKBEAT client", "error", err)
return
}
defer client.Stop()
slog.Info("Simple agent started - use Ctrl+C to stop")
// Simulate some work - complete tasks periodically
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
slog.Info("Shutting down simple agent")
return
case <-ticker.C:
// Complete a task if we have any pending
current := atomic.LoadInt64(&taskCounter)
completed := atomic.LoadInt64(&completedTasks)
if completed < current {
atomic.AddInt64(&completedTasks, 1)
slog.Debug("Completed a task",
"completed", completed+1,
"total", current)
}
}
}
}
// determineState calculates the current state based on task progress
func determineState(total, completed int64) string {
if total == 0 {
return "waiting"
}
if completed == total {
return "done"
}
if completed > 0 {
return "executing"
}
return "planning"
}
// calculateBeatsLeft estimates beats remaining based on current progress
func calculateBeatsLeft(total, completed int64) int {
if total == 0 || completed >= total {
return 0
}
remaining := total - completed
// Assume each task takes about 5 beats to complete
return int(remaining * 5)
}
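
For context, a minimal sketch of how this example might be launched from a binary; the main wrapper and the examples import path are assumptions, not part of this commit.

package main

import "github.com/chorus-services/backbeat/examples"

// Runs the simple agent until Ctrl+C; assumes a NATS server is reachable
// at nats://localhost:4222 as configured in SimpleAgent above.
func main() {
    examples.SimpleAgent()
}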

View File

@@ -0,0 +1,259 @@
package examples
import (
"context"
"crypto/ed25519"
"crypto/rand"
"fmt"
"log/slog"
"math"
mathRand "math/rand"
"os"
"os/signal"
"sync"
"syscall"
"time"
"github.com/chorus-services/backbeat/pkg/sdk"
)
// Task represents a work item with beat budget requirements
type Task struct {
ID string
Description string
BeatBudget int // Maximum beats allowed for completion
WorkTime time.Duration // Simulated work duration
Created time.Time
}
// TaskProcessor demonstrates beat budget usage and timeout management
// This example shows how to use beat budgets for reliable task execution
func TaskProcessor() {
// Generate a signing key for this example
_, signingKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
slog.Error("Failed to generate signing key", "error", err)
return
}
// Create SDK configuration
config := sdk.DefaultConfig()
config.ClusterID = "chorus-dev"
config.AgentID = "task-processor"
config.NATSUrl = "nats://localhost:4222"
config.SigningKey = signingKey
config.Logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelDebug,
}))
// Create BACKBEAT client
client := sdk.NewClient(config)
// Task management
var (
taskQueue = make(chan *Task, 100)
activeTasks = make(map[string]*Task)
completedTasks = 0
failedTasks = 0
taskMutex sync.RWMutex
)
// Register beat callback for status reporting
client.OnBeat(func(beat sdk.BeatFrame) {
// Snapshot shared counters under the read lock to avoid racing with the workers
taskMutex.RLock()
activeCount := len(activeTasks)
completed := completedTasks
failed := failedTasks
taskMutex.RUnlock()
// Emit status every 2 beats
if beat.BeatIndex%2 == 0 {
state := "waiting"
if activeCount > 0 {
state = "executing"
}
progress := float64(completed) / float64(completed+failed+activeCount+len(taskQueue))
if math.IsNaN(progress) {
progress = 0.0
}
err := client.EmitStatusClaim(sdk.StatusClaim{
State: state,
BeatsLeft: activeCount * 5, // Estimate 5 beats per active task
Progress: progress,
Notes: fmt.Sprintf("Active: %d, Completed: %d, Failed: %d, Queue: %d",
activeCount, completed, failed, len(taskQueue)),
})
if err != nil {
slog.Error("Failed to emit status claim", "error", err)
}
}
})
// Register downbeat callback to create new tasks
client.OnDownbeat(func(beat sdk.BeatFrame) {
slog.Info("New bar - creating tasks",
"beat_index", beat.BeatIndex,
"window_id", beat.WindowID)
// Create 1-3 new tasks each bar
numTasks := mathRand.Intn(3) + 1
for i := 0; i < numTasks; i++ {
task := &Task{
ID: fmt.Sprintf("task-%d-%d", beat.BeatIndex, i),
Description: fmt.Sprintf("Process data batch %d", i),
BeatBudget: mathRand.Intn(8) + 2, // 2-9 beat budget
WorkTime: time.Duration(mathRand.Intn(3)+1) * time.Second, // 1-3 seconds of work
Created: time.Now(),
}
select {
case taskQueue <- task:
slog.Debug("Task created", "task_id", task.ID, "budget", task.BeatBudget)
default:
slog.Warn("Task queue full, dropping task", "task_id", task.ID)
}
}
})
// Setup graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Handle shutdown signals
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
slog.Info("Shutdown signal received")
cancel()
}()
// Start the client
if err := client.Start(ctx); err != nil {
slog.Error("Failed to start BACKBEAT client", "error", err)
return
}
defer client.Stop()
slog.Info("Task processor started - use Ctrl+C to stop")
// Start task workers
const numWorkers = 3
for i := 0; i < numWorkers; i++ {
go func(workerID int) {
for {
select {
case <-ctx.Done():
return
case task := <-taskQueue:
processTaskWithBudget(ctx, client, task, workerID, &taskMutex, activeTasks, &completedTasks, &failedTasks)
}
}
}(i)
}
// Wait for shutdown
<-ctx.Done()
slog.Info("Task processor shutting down")
}
// processTaskWithBudget processes a task using BACKBEAT beat budgets
func processTaskWithBudget(
ctx context.Context,
client sdk.Client,
task *Task,
workerID int,
taskMutex *sync.RWMutex,
activeTasks map[string]*Task,
completedTasks *int,
failedTasks *int,
) {
// Add task to active tasks
taskMutex.Lock()
activeTasks[task.ID] = task
taskMutex.Unlock()
// Remove from active tasks when done
defer func() {
taskMutex.Lock()
delete(activeTasks, task.ID)
taskMutex.Unlock()
}()
slog.Info("Processing task",
"worker", workerID,
"task_id", task.ID,
"budget", task.BeatBudget,
"work_time", task.WorkTime)
// Use beat budget to execute the task
err := client.WithBeatBudget(task.BeatBudget, func() error {
// Emit starting status
client.EmitStatusClaim(sdk.StatusClaim{
TaskID: task.ID,
State: "executing",
BeatsLeft: task.BeatBudget,
Progress: 0.0,
Notes: fmt.Sprintf("Worker %d processing %s", workerID, task.Description),
})
// Simulate work with progress updates
steps := 5
stepDuration := task.WorkTime / time.Duration(steps)
for step := 0; step < steps; step++ {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(stepDuration):
progress := float64(step+1) / float64(steps)
client.EmitStatusClaim(sdk.StatusClaim{
TaskID: task.ID,
State: "executing",
BeatsLeft: int(float64(task.BeatBudget) * (1.0 - progress)),
Progress: progress,
Notes: fmt.Sprintf("Worker %d step %d/%d", workerID, step+1, steps),
})
}
}
return nil
})
// Handle completion or timeout
if err != nil {
slog.Warn("Task failed or timed out",
"worker", workerID,
"task_id", task.ID,
"error", err)
taskMutex.Lock()
*failedTasks++
taskMutex.Unlock()
// Emit failure status
client.EmitStatusClaim(sdk.StatusClaim{
TaskID: task.ID,
State: "failed",
BeatsLeft: 0,
Progress: 0.0,
Notes: fmt.Sprintf("Worker %d failed: %s", workerID, err.Error()),
})
} else {
slog.Info("Task completed successfully",
"worker", workerID,
"task_id", task.ID,
"duration", time.Since(task.Created))
taskMutex.Lock()
*completedTasks++
taskMutex.Unlock()
// Emit completion status
client.EmitStatusClaim(sdk.StatusClaim{
TaskID: task.ID,
State: "done",
BeatsLeft: 0,
Progress: 1.0,
Notes: fmt.Sprintf("Worker %d completed %s", workerID, task.Description),
})
}
}
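
The beat-budget call above generalizes beyond this simulator. Below is a minimal sketch of the same pattern in isolation, placed in the same examples package and reusing its imports; budgetedFetch and the 4-beat budget are illustrative assumptions, not part of this commit.

// budgetedFetch wraps a single operation in a beat budget and treats expiry as failure,
// mirroring how processTaskWithBudget uses client.WithBeatBudget above.
func budgetedFetch(client sdk.Client, fetch func() error) error {
    // 4 beats is an assumed budget; tune it to the operation's expected duration.
    if err := client.WithBeatBudget(4, fetch); err != nil {
        return fmt.Errorf("fetch failed or exceeded beat budget: %w", err)
    }
    return nil
}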

446
pkg/sdk/internal.go Normal file
View File

@@ -0,0 +1,446 @@
package sdk
import (
"crypto/ed25519"
"crypto/sha256"
"encoding/json"
"fmt"
"time"
"github.com/nats-io/nats.go"
)
// connect establishes connection to NATS with retry logic
func (c *client) connect() error {
opts := []nats.Option{
nats.ReconnectWait(c.config.ReconnectDelay),
nats.MaxReconnects(c.config.MaxReconnects),
nats.ReconnectHandler(func(nc *nats.Conn) {
c.reconnectCount++
c.metrics.RecordConnection()
c.config.Logger.Info("NATS reconnected",
"reconnect_count", c.reconnectCount,
"url", nc.ConnectedUrl())
}),
nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
if err != nil {
c.metrics.RecordDisconnection()
c.addError(fmt.Sprintf("NATS disconnected: %v", err))
c.config.Logger.Warn("NATS disconnected", "error", err)
}
}),
nats.ClosedHandler(func(nc *nats.Conn) {
c.metrics.RecordDisconnection()
c.config.Logger.Info("NATS connection closed")
}),
}
nc, err := nats.Connect(c.config.NATSUrl, opts...)
if err != nil {
c.metrics.RecordError(fmt.Sprintf("NATS connection failed: %v", err))
return fmt.Errorf("failed to connect to NATS: %w", err)
}
c.nc = nc
c.metrics.RecordConnection()
c.config.Logger.Info("Connected to NATS", "url", nc.ConnectedUrl())
return nil
}
// beatSubscriptionLoop handles beat frame subscription with jitter tolerance
func (c *client) beatSubscriptionLoop() {
defer c.wg.Done()
subject := fmt.Sprintf("backbeat.beat.%s", c.config.ClusterID)
// Subscribe to beat frames
sub, err := c.nc.Subscribe(subject, c.handleBeatFrame)
if err != nil {
c.addError(fmt.Sprintf("failed to subscribe to beats: %v", err))
c.config.Logger.Error("Failed to subscribe to beats", "error", err)
return
}
defer sub.Unsubscribe()
c.config.Logger.Info("Beat subscription active", "subject", subject)
// Start local degradation timer for fallback timing
localTicker := time.NewTicker(1 * time.Second) // Default 60 BPM fallback
defer localTicker.Stop()
for {
select {
case <-c.ctx.Done():
return
case <-localTicker.C:
// Local degradation mode - generate synthetic beats if no recent beats
c.beatMutex.RLock()
lastBeatTime := c.lastBeatTime
tempo := c.currentTempo
c.beatMutex.RUnlock()
if lastBeatTime.IsZero() {
continue
}
timeSinceLastBeat := time.Since(lastBeatTime)
if tempo <= 0 {
tempo = 60 // Default to 60 BPM if no tempo information available
}
expectedBeatDuration := time.Duration(float64(time.Minute) / float64(tempo))
if expectedBeatDuration < time.Second {
expectedBeatDuration = time.Second
}
grace := expectedBeatDuration / 2
if grace < 2*time.Second {
grace = 2 * time.Second
}
degradationThreshold := expectedBeatDuration + grace
if timeSinceLastBeat > degradationThreshold {
if !c.localDegradation {
// Entering degradation mode after extended silence.
c.localDegradation = true
}
c.handleLocalDegradationBeat()
c.metrics.RecordLocalDegradation(timeSinceLastBeat)
} else if c.localDegradation && timeSinceLastBeat <= expectedBeatDuration {
// Quietly exit degradation mode once beats resume within expected window.
c.localDegradation = false
}
}
}
}
// handleBeatFrame processes incoming beat frames with jitter tolerance
func (c *client) handleBeatFrame(msg *nats.Msg) {
var beatFrame BeatFrame
if err := json.Unmarshal(msg.Data, &beatFrame); err != nil {
c.addError(fmt.Sprintf("failed to unmarshal beat frame: %v", err))
return
}
// Validate beat frame
if beatFrame.Type != "backbeat.beatframe.v1" {
c.addError(fmt.Sprintf("invalid beat frame type: %s", beatFrame.Type))
return
}
// Check for jitter tolerance
now := time.Now()
expectedTime := beatFrame.DeadlineAt.Add(-c.getBeatDuration()) // Beat should arrive one duration before deadline
jitter := now.Sub(expectedTime)
if jitter.Abs() > c.config.JitterTolerance {
c.config.Logger.Debug("Beat jitter detected",
"jitter", jitter,
"tolerance", c.config.JitterTolerance,
"beat_index", beatFrame.BeatIndex)
}
// Update internal state
c.beatMutex.Lock()
c.currentBeat = beatFrame.BeatIndex
c.currentWindow = beatFrame.WindowID
c.currentHLC = beatFrame.HLC
// Track tempo changes and calculate actual BPM
if c.currentTempo != beatFrame.TempoBPM {
c.lastTempo = c.currentTempo
c.currentTempo = beatFrame.TempoBPM
}
// Calculate actual BPM from inter-beat timing
actualBPM := 60.0 // Default
if !c.lastBeatTime.IsZero() {
interBeatDuration := now.Sub(c.lastBeatTime)
if interBeatDuration > 0 {
actualBPM = 60.0 / interBeatDuration.Seconds()
}
}
// Record tempo sample for drift analysis
sample := tempoSample{
BeatIndex: beatFrame.BeatIndex,
Tempo: beatFrame.TempoBPM,
MeasuredTime: now,
ActualBPM: actualBPM,
}
c.tempoHistory = append(c.tempoHistory, sample)
// Keep only last 100 samples
if len(c.tempoHistory) > 100 {
c.tempoHistory = c.tempoHistory[1:]
}
c.lastBeatTime = now
c.beatMutex.Unlock()
// Record beat metrics
c.metrics.RecordBeat(beatFrame.DeadlineAt.Add(-c.getBeatDuration()), now, beatFrame.Downbeat)
// If we were in local degradation mode, exit it
if c.localDegradation {
// Reset without logging to keep synthetic beats silent.
c.localDegradation = false
}
// Execute beat callbacks with error handling
c.callbackMutex.RLock()
beatCallbacks := make([]func(BeatFrame), len(c.beatCallbacks))
copy(beatCallbacks, c.beatCallbacks)
var downbeatCallbacks []func(BeatFrame)
if beatFrame.Downbeat {
downbeatCallbacks = make([]func(BeatFrame), len(c.downbeatCallbacks))
copy(downbeatCallbacks, c.downbeatCallbacks)
}
c.callbackMutex.RUnlock()
// Execute callbacks in separate goroutines to prevent blocking
for _, callback := range beatCallbacks {
go c.safeExecuteCallback(callback, beatFrame, "beat")
}
if beatFrame.Downbeat {
for _, callback := range downbeatCallbacks {
go c.safeExecuteCallback(callback, beatFrame, "downbeat")
}
}
c.config.Logger.Debug("Beat processed",
"beat_index", beatFrame.BeatIndex,
"downbeat", beatFrame.Downbeat,
"phase", beatFrame.Phase,
"window_id", beatFrame.WindowID)
}
// handleLocalDegradationBeat generates synthetic beats during network issues
func (c *client) handleLocalDegradationBeat() {
c.beatMutex.Lock()
c.currentBeat++
// Generate synthetic beat frame
now := time.Now()
beatFrame := BeatFrame{
Type: "backbeat.beatframe.v1",
ClusterID: c.config.ClusterID,
BeatIndex: c.currentBeat,
Downbeat: (c.currentBeat-1)%4 == 0, // Assume 4/4 time signature
Phase: "degraded",
HLC: fmt.Sprintf("%d-0", now.UnixNano()),
DeadlineAt: now.Add(time.Second), // 1 second deadline in degradation
TempoBPM: 1, // Default 1 BPM (60-second beats) for safe recovery cadence
WindowID: c.generateDegradedWindowID(c.currentBeat),
}
c.currentWindow = beatFrame.WindowID
c.currentHLC = beatFrame.HLC
c.lastBeatTime = now
c.beatMutex.Unlock()
// Execute callbacks same as normal beats
c.callbackMutex.RLock()
beatCallbacks := make([]func(BeatFrame), len(c.beatCallbacks))
copy(beatCallbacks, c.beatCallbacks)
var downbeatCallbacks []func(BeatFrame)
if beatFrame.Downbeat {
downbeatCallbacks = make([]func(BeatFrame), len(c.downbeatCallbacks))
copy(downbeatCallbacks, c.downbeatCallbacks)
}
c.callbackMutex.RUnlock()
for _, callback := range beatCallbacks {
go c.safeExecuteCallback(callback, beatFrame, "degraded-beat")
}
if beatFrame.Downbeat {
for _, callback := range downbeatCallbacks {
go c.safeExecuteCallback(callback, beatFrame, "degraded-downbeat")
}
}
}
// safeExecuteCallback executes a callback with panic recovery
func (c *client) safeExecuteCallback(callback func(BeatFrame), beat BeatFrame, callbackType string) {
defer func() {
if r := recover(); r != nil {
errMsg := fmt.Sprintf("panic in %s callback: %v", callbackType, r)
c.addError(errMsg)
c.metrics.RecordError(errMsg)
c.config.Logger.Error("Callback panic recovered",
"type", callbackType,
"panic", r,
"beat_index", beat.BeatIndex)
}
}()
start := time.Now()
callback(beat)
duration := time.Since(start)
// Record callback latency metrics
c.metrics.RecordCallbackLatency(duration, callbackType)
// Warn about slow callbacks
if duration > 5*time.Millisecond {
c.config.Logger.Warn("Slow callback detected",
"type", callbackType,
"duration", duration,
"beat_index", beat.BeatIndex)
}
}
// validateStatusClaim validates a status claim
func (c *client) validateStatusClaim(claim *StatusClaim) error {
if claim.State == "" {
return fmt.Errorf("state is required")
}
validStates := map[string]bool{
"executing": true,
"planning": true,
"waiting": true,
"review": true,
"done": true,
"failed": true,
}
if !validStates[claim.State] {
return fmt.Errorf("invalid state: must be one of [executing, planning, waiting, review, done, failed], got '%s'", claim.State)
}
if claim.Progress < 0.0 || claim.Progress > 1.0 {
return fmt.Errorf("progress must be between 0.0 and 1.0, got %f", claim.Progress)
}
if claim.BeatsLeft < 0 {
return fmt.Errorf("beats_left must be non-negative, got %d", claim.BeatsLeft)
}
return nil
}
// signStatusClaim signs a status claim using Ed25519 (BACKBEAT-REQ-044)
func (c *client) signStatusClaim(claim *StatusClaim) error {
if c.config.SigningKey == nil {
return fmt.Errorf("signing key not configured")
}
// Create canonical representation for signing
canonical, err := json.Marshal(claim)
if err != nil {
return fmt.Errorf("failed to marshal claim for signing: %w", err)
}
// Sign the canonical representation
signature := ed25519.Sign(c.config.SigningKey, canonical)
// Add signature to notes (temporary until proper signature field added)
claim.Notes += fmt.Sprintf(" [sig:%x]", signature)
return nil
}
// createHeaders creates NATS headers with required security information
func (c *client) createHeaders() nats.Header {
headers := make(nats.Header)
// Add window ID header (BACKBEAT-REQ-044)
headers.Add("x-window-id", c.GetCurrentWindow())
// Add HLC header (BACKBEAT-REQ-044)
headers.Add("x-hlc", c.getCurrentHLC())
// Add agent ID for routing
headers.Add("x-agent-id", c.config.AgentID)
return headers
}
// getCurrentHLC returns the current HLC timestamp
func (c *client) getCurrentHLC() string {
c.beatMutex.RLock()
defer c.beatMutex.RUnlock()
if c.currentHLC != "" {
return c.currentHLC
}
// Generate fallback HLC
return fmt.Sprintf("%d-0", time.Now().UnixNano())
}
// getBeatDuration calculates the duration of a beat based on current tempo
func (c *client) getBeatDuration() time.Duration {
c.beatMutex.RLock()
tempo := c.currentTempo
c.beatMutex.RUnlock()
if tempo <= 0 {
tempo = 60 // Default to 60 BPM if no tempo information available
}
// Calculate beat duration: 60 seconds / BPM = seconds per beat
return time.Duration(60.0/float64(tempo)*1000) * time.Millisecond
}
// generateDegradedWindowID generates a window ID for degraded mode
func (c *client) generateDegradedWindowID(beatIndex int64) string {
// Use similar algorithm to regular window ID but mark as degraded
input := fmt.Sprintf("%s:degraded:%d", c.config.ClusterID, beatIndex/4) // Assume 4-beat bars
hash := sha256.Sum256([]byte(input))
return fmt.Sprintf("deg-%x", hash)[:32]
}
// addError adds an error to the error list with deduplication
func (c *client) addError(err string) {
c.errorMutex.Lock()
defer c.errorMutex.Unlock()
// Keep only the last 10 errors to prevent memory leaks
if len(c.errors) >= 10 {
c.errors = c.errors[1:]
}
timestampedErr := fmt.Sprintf("[%s] %s", time.Now().Format("15:04:05"), err)
c.errors = append(c.errors, timestampedErr)
// Record error in metrics
c.metrics.RecordError(timestampedErr)
}
// Legacy compatibility functions for BACKBEAT-REQ-043
// ConvertLegacyBeat converts legacy {bar,beat} to beat_index with warning
func (c *client) ConvertLegacyBeat(bar, beat int) int64 {
c.legacyMutex.Lock()
if !c.legacyWarned {
c.config.Logger.Warn("Legacy {bar,beat} format detected - please migrate to beat_index",
"bar", bar, "beat", beat)
c.legacyWarned = true
}
c.legacyMutex.Unlock()
// Convert assuming 4 beats per bar (standard)
return int64((bar-1)*4 + beat)
}
// GetLegacyBeatInfo converts current beat_index to legacy {bar,beat} format
func (c *client) GetLegacyBeatInfo() LegacyBeatInfo {
beatIndex := c.GetCurrentBeat()
if beatIndex <= 0 {
return LegacyBeatInfo{Bar: 1, Beat: 1}
}
// Convert assuming 4 beats per bar
bar := int((beatIndex-1)/4) + 1
beat := int((beatIndex-1)%4) + 1
return LegacyBeatInfo{Bar: bar, Beat: beat}
}
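
Because signStatusClaim embeds the Ed25519 signature in Notes rather than a dedicated field, receivers have to strip it before verifying. The helper below is a hypothetical sketch of that verification, not part of this commit; it assumes sdk.StatusClaim round-trips through json.Marshal byte-for-byte, which only holds while the struct has no non-deterministic fields.

package verify

import (
    "crypto/ed25519"
    "encoding/hex"
    "encoding/json"
    "strings"

    "github.com/chorus-services/backbeat/pkg/sdk"
)

// VerifyStatusClaim checks the " [sig:...]" suffix appended by signStatusClaim.
func VerifyStatusClaim(pub ed25519.PublicKey, claim sdk.StatusClaim) bool {
    const marker = " [sig:"
    idx := strings.LastIndex(claim.Notes, marker)
    if idx < 0 || !strings.HasSuffix(claim.Notes, "]") {
        return false // no embedded signature
    }
    sig, err := hex.DecodeString(strings.TrimSuffix(claim.Notes[idx+len(marker):], "]"))
    if err != nil {
        return false
    }
    claim.Notes = claim.Notes[:idx] // restore the Notes value that was signed
    canonical, err := json.Marshal(claim)
    if err != nil {
        return false
    }
    return ed25519.Verify(pub, canonical, sig)
}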

277
pkg/sdk/metrics.go Normal file
View File

@@ -0,0 +1,277 @@
package sdk
import (
"expvar"
"fmt"
"sync"
"time"
)
// Metrics provides comprehensive observability for the SDK
type Metrics struct {
// Connection metrics
ConnectionStatus *expvar.Int
ReconnectCount *expvar.Int
ConnectionDuration *expvar.Int
// Beat metrics
BeatsReceived *expvar.Int
DownbeatsReceived *expvar.Int
BeatJitterMS *expvar.Map
BeatCallbackLatency *expvar.Map
BeatMisses *expvar.Int
LocalDegradationTime *expvar.Int
// Status emission metrics
StatusClaimsEmitted *expvar.Int
StatusClaimErrors *expvar.Int
// Budget metrics
BudgetsCreated *expvar.Int
BudgetsCompleted *expvar.Int
BudgetsTimedOut *expvar.Int
// Error metrics
TotalErrors *expvar.Int
LastError *expvar.String
// Internal counters
beatJitterSamples []float64
jitterMutex sync.Mutex
callbackLatencies []float64
latencyMutex sync.Mutex
}
// NewMetrics creates a new metrics instance with expvar integration
func NewMetrics(prefix string) *Metrics {
m := &Metrics{
ConnectionStatus: expvar.NewInt(prefix + ".connection.status"),
ReconnectCount: expvar.NewInt(prefix + ".connection.reconnects"),
ConnectionDuration: expvar.NewInt(prefix + ".connection.duration_ms"),
BeatsReceived: expvar.NewInt(prefix + ".beats.received"),
DownbeatsReceived: expvar.NewInt(prefix + ".beats.downbeats"),
BeatJitterMS: expvar.NewMap(prefix + ".beats.jitter_ms"),
BeatCallbackLatency: expvar.NewMap(prefix + ".beats.callback_latency_ms"),
BeatMisses: expvar.NewInt(prefix + ".beats.misses"),
LocalDegradationTime: expvar.NewInt(prefix + ".beats.degradation_ms"),
StatusClaimsEmitted: expvar.NewInt(prefix + ".status.claims_emitted"),
StatusClaimErrors: expvar.NewInt(prefix + ".status.claim_errors"),
BudgetsCreated: expvar.NewInt(prefix + ".budgets.created"),
BudgetsCompleted: expvar.NewInt(prefix + ".budgets.completed"),
BudgetsTimedOut: expvar.NewInt(prefix + ".budgets.timed_out"),
TotalErrors: expvar.NewInt(prefix + ".errors.total"),
LastError: expvar.NewString(prefix + ".errors.last"),
beatJitterSamples: make([]float64, 0, 100),
callbackLatencies: make([]float64, 0, 100),
}
// Initialize connection status to disconnected
m.ConnectionStatus.Set(0)
return m
}
// RecordConnection records connection establishment
func (m *Metrics) RecordConnection() {
m.ConnectionStatus.Set(1)
m.ReconnectCount.Add(1)
}
// RecordDisconnection records connection loss
func (m *Metrics) RecordDisconnection() {
m.ConnectionStatus.Set(0)
}
// RecordBeat records a beat reception with jitter measurement
func (m *Metrics) RecordBeat(expectedTime, actualTime time.Time, isDownbeat bool) {
m.BeatsReceived.Add(1)
if isDownbeat {
m.DownbeatsReceived.Add(1)
}
// Calculate and record jitter
jitter := actualTime.Sub(expectedTime)
jitterMS := float64(jitter.Nanoseconds()) / 1e6
m.jitterMutex.Lock()
m.beatJitterSamples = append(m.beatJitterSamples, jitterMS)
if len(m.beatJitterSamples) > 100 {
m.beatJitterSamples = m.beatJitterSamples[1:]
}
// Update jitter statistics
if len(m.beatJitterSamples) > 0 {
avg, p95, p99 := m.calculatePercentiles(m.beatJitterSamples)
m.BeatJitterMS.Set("avg", &expvar.Float{})
m.BeatJitterMS.Get("avg").(*expvar.Float).Set(avg)
m.BeatJitterMS.Set("p95", &expvar.Float{})
m.BeatJitterMS.Get("p95").(*expvar.Float).Set(p95)
m.BeatJitterMS.Set("p99", &expvar.Float{})
m.BeatJitterMS.Get("p99").(*expvar.Float).Set(p99)
}
m.jitterMutex.Unlock()
}
// RecordBeatMiss records a missed beat
func (m *Metrics) RecordBeatMiss() {
m.BeatMisses.Add(1)
}
// RecordCallbackLatency records callback execution latency
func (m *Metrics) RecordCallbackLatency(duration time.Duration, callbackType string) {
latencyMS := float64(duration.Nanoseconds()) / 1e6
m.latencyMutex.Lock()
m.callbackLatencies = append(m.callbackLatencies, latencyMS)
if len(m.callbackLatencies) > 100 {
m.callbackLatencies = m.callbackLatencies[1:]
}
// Update latency statistics
if len(m.callbackLatencies) > 0 {
avg, p95, p99 := m.calculatePercentiles(m.callbackLatencies)
key := callbackType + "_avg"
m.BeatCallbackLatency.Set(key, &expvar.Float{})
m.BeatCallbackLatency.Get(key).(*expvar.Float).Set(avg)
key = callbackType + "_p95"
m.BeatCallbackLatency.Set(key, &expvar.Float{})
m.BeatCallbackLatency.Get(key).(*expvar.Float).Set(p95)
key = callbackType + "_p99"
m.BeatCallbackLatency.Set(key, &expvar.Float{})
m.BeatCallbackLatency.Get(key).(*expvar.Float).Set(p99)
}
m.latencyMutex.Unlock()
}
// RecordLocalDegradation records time spent in local degradation mode
func (m *Metrics) RecordLocalDegradation(duration time.Duration) {
durationMS := duration.Nanoseconds() / 1e6
m.LocalDegradationTime.Add(durationMS)
}
// RecordStatusClaim records a status claim emission
func (m *Metrics) RecordStatusClaim(success bool) {
if success {
m.StatusClaimsEmitted.Add(1)
} else {
m.StatusClaimErrors.Add(1)
}
}
// RecordBudget records budget creation and completion
func (m *Metrics) RecordBudgetCreated() {
m.BudgetsCreated.Add(1)
}
func (m *Metrics) RecordBudgetCompleted(timedOut bool) {
if timedOut {
m.BudgetsTimedOut.Add(1)
} else {
m.BudgetsCompleted.Add(1)
}
}
// RecordError records an error
func (m *Metrics) RecordError(err string) {
m.TotalErrors.Add(1)
m.LastError.Set(err)
}
// calculatePercentiles calculates avg, p95, p99 for a slice of samples
func (m *Metrics) calculatePercentiles(samples []float64) (avg, p95, p99 float64) {
if len(samples) == 0 {
return 0, 0, 0
}
// Calculate average
sum := 0.0
for _, s := range samples {
sum += s
}
avg = sum / float64(len(samples))
// Sort for percentiles (simple bubble sort for small slices)
sorted := make([]float64, len(samples))
copy(sorted, samples)
for i := 0; i < len(sorted); i++ {
for j := 0; j < len(sorted)-i-1; j++ {
if sorted[j] > sorted[j+1] {
sorted[j], sorted[j+1] = sorted[j+1], sorted[j]
}
}
}
// Calculate percentiles
p95Index := int(float64(len(sorted)) * 0.95)
if p95Index >= len(sorted) {
p95Index = len(sorted) - 1
}
p95 = sorted[p95Index]
p99Index := int(float64(len(sorted)) * 0.99)
if p99Index >= len(sorted) {
p99Index = len(sorted) - 1
}
p99 = sorted[p99Index]
return avg, p95, p99
}
// Enhanced client with metrics integration
func (c *client) initMetrics() {
prefix := fmt.Sprintf("backbeat.sdk.%s", c.config.AgentID)
c.metrics = NewMetrics(prefix)
}
// Add metrics field to client struct (this would go in client.go)
type clientWithMetrics struct {
*client
metrics *Metrics
}
// Prometheus integration helper
type PrometheusMetrics struct {
// This would integrate with prometheus/client_golang
// For now, we'll just use expvar which can be scraped
}
// GetMetricsSnapshot returns a snapshot of all current metrics
func (m *Metrics) GetMetricsSnapshot() map[string]interface{} {
snapshot := make(map[string]interface{})
snapshot["connection_status"] = m.ConnectionStatus.Value()
snapshot["reconnect_count"] = m.ReconnectCount.Value()
snapshot["beats_received"] = m.BeatsReceived.Value()
snapshot["downbeats_received"] = m.DownbeatsReceived.Value()
snapshot["beat_misses"] = m.BeatMisses.Value()
snapshot["status_claims_emitted"] = m.StatusClaimsEmitted.Value()
snapshot["status_claim_errors"] = m.StatusClaimErrors.Value()
snapshot["budgets_created"] = m.BudgetsCreated.Value()
snapshot["budgets_completed"] = m.BudgetsCompleted.Value()
snapshot["budgets_timed_out"] = m.BudgetsTimedOut.Value()
snapshot["total_errors"] = m.TotalErrors.Value()
snapshot["last_error"] = m.LastError.Value()
return snapshot
}
// Health check with metrics
func (c *client) GetHealthWithMetrics() map[string]interface{} {
health := map[string]interface{}{
"status": c.Health(),
}
if c.metrics != nil {
health["metrics"] = c.metrics.GetMetricsSnapshot()
}
return health
}
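
Since these metrics are standard expvar variables, they can be inspected without extra dependencies. A minimal sketch of exposing them over HTTP follows; the listen address is an assumption, and the output is expvar's JSON at /debug/vars rather than Prometheus text format (a separate exporter would be needed to feed the scrape jobs below).

package main

import (
    _ "expvar" // importing expvar registers the /debug/vars handler on http.DefaultServeMux
    "log"
    "net/http"
)

// Serves all registered expvar metrics, including the backbeat.sdk.* variables,
// as JSON at http://localhost:8081/debug/vars.
func main() {
    log.Fatal(http.ListenAndServe(":8081", nil))
}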

38
prometheus.yml Normal file
View File

@@ -0,0 +1,38 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
# BACKBEAT pulse service metrics
- job_name: 'backbeat-pulse'
static_configs:
- targets: ['pulse-1:8080', 'pulse-2:8080']
metrics_path: /metrics
scrape_interval: 10s
scrape_timeout: 5s
honor_labels: true
# BACKBEAT reverb service metrics
- job_name: 'backbeat-reverb'
static_configs:
- targets: ['reverb:8080']
metrics_path: /metrics
scrape_interval: 10s
scrape_timeout: 5s
honor_labels: true
# NATS monitoring
- job_name: 'nats'
static_configs:
- targets: ['nats:8222']
metrics_path: /
scrape_interval: 15s
# Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']