commit: backbeat: add module sources

New file: contracts/docs/integration-guide.md (436 lines)

# BACKBEAT Integration Guide for CHORUS 2.0.0 Projects

This guide explains how to integrate BACKBEAT contract validation into your CHORUS 2.0.0 project for guaranteed compatibility with the distributed orchestration system.
|
||||
|
||||
## Overview
|
||||
|
||||
BACKBEAT provides three core interfaces for coordinated distributed execution:
|
||||
|
||||
- **INT-A (BeatFrame)**: Rhythm coordination from Pulse service to all agents
|
||||
- **INT-B (StatusClaim)**: Agent status reporting to Reverb service
|
||||
- **INT-C (BarReport)**: Periodic summary reports from Reverb to all services
|
||||
|
||||
All messages must conform to the published JSON schemas to ensure reliable operation across the CHORUS ecosystem.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Add Contract Validation to Your CI Pipeline
|
||||
|
||||
#### GitHub Actions
|
||||
```yaml
|
||||
name: BACKBEAT Contract Validation
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
validate-backbeat:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Checkout BACKBEAT contracts
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: 'chorus-services/backbeat'
|
||||
path: 'backbeat-contracts'
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: '1.22'
|
||||
|
||||
- name: Validate BACKBEAT messages
|
||||
run: |
|
||||
cd backbeat-contracts/contracts/tests/integration
|
||||
make build
|
||||
./backbeat-validate \
|
||||
--schemas ../../schemas \
|
||||
--dir ../../../your-messages-directory \
|
||||
--exit-code
|
||||
```
|
||||
|
||||
#### GitLab CI
|
||||
```yaml
|
||||
validate-backbeat:
|
||||
stage: test
|
||||
image: golang:1.22
|
||||
before_script:
|
||||
- git clone https://github.com/chorus-services/backbeat.git /tmp/backbeat
|
||||
- cd /tmp/backbeat/contracts/tests/integration && make build
|
||||
script:
|
||||
- /tmp/backbeat/contracts/tests/integration/backbeat-validate
|
||||
--schemas /tmp/backbeat/contracts/schemas
|
||||
--dir $CI_PROJECT_DIR/messages
|
||||
--exit-code
|
||||
```
|
||||
|
||||
### 2. Project Makefile Integration
|
||||
|
||||
Add to your project's `Makefile`:
|
||||
|
||||
```makefile
|
||||
# BACKBEAT contract validation
|
||||
BACKBEAT_REPO = https://github.com/chorus-services/backbeat.git
|
||||
BACKBEAT_DIR = .backbeat-contracts
|
||||
|
||||
$(BACKBEAT_DIR):
|
||||
git clone $(BACKBEAT_REPO) $(BACKBEAT_DIR)
|
||||
|
||||
validate-backbeat: $(BACKBEAT_DIR)
|
||||
cd $(BACKBEAT_DIR)/contracts/tests/integration && make build
|
||||
$(BACKBEAT_DIR)/contracts/tests/integration/backbeat-validate \
|
||||
--schemas $(BACKBEAT_DIR)/contracts/schemas \
|
||||
--dir messages \
|
||||
--exit-code
|
||||
|
||||
.PHONY: validate-backbeat
|
||||
```
|
||||
|
||||
## Message Implementation
|
||||
|
||||
### Implementing BeatFrame Consumer (INT-A)
|
||||
|
||||
Your service should subscribe to beat frames from the Pulse service and respond appropriately:
|
||||
|
||||
```go
|
||||
// Example Go implementation
|
||||
type BeatFrameHandler struct {
|
||||
currentBeat int64
|
||||
phase string
|
||||
}
|
||||
|
||||
func (h *BeatFrameHandler) HandleBeatFrame(frame BeatFrame) {
|
||||
// Validate the beat frame
|
||||
if err := validateBeatFrame(frame); err != nil {
|
||||
log.Errorf("Invalid beat frame: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Update internal state
|
||||
h.currentBeat = frame.BeatIndex
|
||||
h.phase = frame.Phase
|
||||
|
||||
// Execute phase-appropriate actions
|
||||
switch frame.Phase {
|
||||
case "plan":
|
||||
h.planPhase(frame)
|
||||
case "execute":
|
||||
h.executePhase(frame)
|
||||
case "review":
|
||||
h.reviewPhase(frame)
|
||||
}
|
||||
}
|
||||
|
||||
func validateBeatFrame(frame BeatFrame) error {
|
||||
if frame.Type != "backbeat.beatframe.v1" {
|
||||
return fmt.Errorf("invalid message type: %s", frame.Type)
|
||||
}
|
||||
if frame.TempoBPM < 0.1 || frame.TempoBPM > 1000 {
|
||||
return fmt.Errorf("invalid tempo: %f", frame.TempoBPM)
|
||||
}
|
||||
// Add more validation as needed
|
||||
return nil
|
||||
}
|
||||
```
|
||||
|
||||
### Implementing StatusClaim Publisher (INT-B)
|
||||
|
||||
Your agents should publish status claims to the Reverb service:
|
||||
|
||||
```go
|
||||
func (agent *Agent) PublishStatusClaim(beatIndex int64, state string) error {
|
||||
claim := StatusClaim{
|
||||
Type: "backbeat.statusclaim.v1",
|
||||
AgentID: agent.ID,
|
||||
BeatIndex: beatIndex,
|
||||
State: state,
|
||||
HLC: agent.generateHLC(),
|
||||
Progress: agent.calculateProgress(),
|
||||
Notes: agent.getCurrentStatus(),
|
||||
}
|
||||
|
||||
// Validate before sending
|
||||
if err := validateStatusClaim(claim); err != nil {
|
||||
return fmt.Errorf("invalid status claim: %w", err)
|
||||
}
|
||||
|
||||
return agent.publisher.Publish("backbeat.statusclaims", claim)
|
||||
}
|
||||
|
||||
func validateStatusClaim(claim StatusClaim) error {
|
||||
validStates := []string{"idle", "planning", "executing", "reviewing", "completed", "failed", "blocked", "helping"}
|
||||
for _, valid := range validStates {
|
||||
if claim.State == valid {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("invalid state: %s", claim.State)
|
||||
}
|
||||
```
|
||||
|
||||
### Implementing BarReport Consumer (INT-C)
|
||||
|
||||
Services should consume bar reports for cluster health awareness:
|
||||
|
||||
```go
|
||||
func (service *Service) HandleBarReport(report BarReport) {
|
||||
// Validate the bar report
|
||||
if err := validateBarReport(report); err != nil {
|
||||
log.Errorf("Invalid bar report: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Update cluster health metrics
|
||||
service.updateClusterHealth(report)
|
||||
|
||||
// React to issues
|
||||
if len(report.Issues) > 0 {
|
||||
service.handleClusterIssues(report.Issues)
|
||||
}
|
||||
|
||||
// Store performance metrics
|
||||
service.storePerformanceMetrics(report.Performance)
|
||||
}
|
||||
|
||||
func (service *Service) updateClusterHealth(report BarReport) {
	service.clusterMetrics.AgentsReporting = report.AgentsReporting
	// Guard against division by zero: with no agents reporting this bar,
	// OnTimeRate would otherwise be NaN and poison downstream metrics.
	if report.AgentsReporting > 0 {
		service.clusterMetrics.OnTimeRate = float64(report.OnTimeReviews) / float64(report.AgentsReporting)
	} else {
		service.clusterMetrics.OnTimeRate = 0
	}
	service.clusterMetrics.TempoDrift = report.TempoDriftMS
	service.clusterMetrics.SecretRotationsOK = report.SecretRotationsOK
}
|
||||
```
|
||||
|
||||
## Message Format Requirements
|
||||
|
||||
### Common Patterns
|
||||
|
||||
All BACKBEAT messages share these patterns:
|
||||
|
||||
1. **Type Field**: Must exactly match the schema constant
|
||||
2. **HLC Timestamps**: Format `XXXX:XXXX:XXXX` (hex digits)
|
||||
3. **Beat Indices**: Monotonically increasing integers ≥ 0
|
||||
4. **Window IDs**: 32-character hexadecimal strings
|
||||
5. **Agent IDs**: Pattern `service:instance` or `agent:identifier`
|
||||
|
||||
### Validation Best Practices
|
||||
|
||||
1. **Always validate messages before processing**
|
||||
2. **Use schema validation in tests**
|
||||
3. **Handle validation errors gracefully**
|
||||
4. **Log validation failures for debugging**
|
||||
|
||||
Example validation function:
|
||||
|
||||
```go
|
||||
func ValidateMessage(messageBytes []byte, expectedType string) error {
|
||||
// Parse and check type
|
||||
var msg map[string]interface{}
|
||||
if err := json.Unmarshal(messageBytes, &msg); err != nil {
|
||||
return fmt.Errorf("invalid JSON: %w", err)
|
||||
}
|
||||
|
||||
msgType, ok := msg["type"].(string)
|
||||
if !ok || msgType != expectedType {
|
||||
return fmt.Errorf("expected type %s, got %s", expectedType, msgType)
|
||||
}
|
||||
|
||||
// Use schema validation
|
||||
return validateWithSchema(messageBytes, expectedType)
|
||||
}
|
||||
```
|
||||
|
||||
## Tempo and Timing Considerations
|
||||
|
||||
### Understanding Tempo
|
||||
|
||||
- **Default Tempo**: 1 BPM (60-second beats)
- **Minimum Tempo**: 0.1 BPM (10-minute beats for batch or recovery windows)
- **Maximum Tempo**: 24 BPM (~2.5-second beats) recommended for high-frequency workloads; the schema accepts up to 1000 BPM
|
||||
|
||||
### Phase Timing
|
||||
|
||||
Each beat consists of three phases with equal time allocation:
|
||||
|
||||
```
|
||||
Beat Duration = 60 / TempoBPM seconds
|
||||
Phase Duration = Beat Duration / 3
|
||||
|
||||
Plan Phase: [0, Beat Duration / 3)
|
||||
Execute Phase: [Beat Duration / 3, 2 * Beat Duration / 3)
|
||||
Review Phase: [2 * Beat Duration / 3, Beat Duration)
|
||||
```
|
||||
|
||||
### Implementation Guidelines
|
||||
|
||||
1. **Respect Deadlines**: Always complete phase work before `deadline_at`
|
||||
2. **Handle Tempo Changes**: Pulse may adjust tempo based on cluster performance
|
||||
3. **Plan for Latency**: Factor in network and processing delays
|
||||
4. **Implement Backpressure**: Report when unable to keep up with tempo
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Schema Validation Failures
|
||||
|
||||
```go
|
||||
func HandleInvalidMessage(err error, messageBytes []byte) {
|
||||
log.Errorf("Schema validation failed: %v", err)
|
||||
log.Debugf("Invalid message: %s", string(messageBytes))
|
||||
|
||||
// Send to dead letter queue or error handler
|
||||
errorHandler.HandleInvalidMessage(messageBytes, err)
|
||||
|
||||
// Update metrics
|
||||
metrics.InvalidMessageCounter.Inc()
|
||||
}
|
||||
```
|
||||
|
||||
### Network and Timing Issues
|
||||
|
||||
```go
|
||||
func (agent *Agent) HandleMissedBeat(expectedBeat int64) {
	// Report the missed beat to Reverb so the cluster sees the gap
	claim := StatusClaim{
		Type:      "backbeat.statusclaim.v1",
		AgentID:   agent.ID,
		BeatIndex: expectedBeat,
		State:     "blocked",
		Notes:     "missed beat due to network issues",
		HLC:       agent.generateHLC(),
	}
	if err := agent.publisher.Publish("backbeat.statusclaims", claim); err != nil {
		log.Errorf("failed to report missed beat %d: %v", expectedBeat, err)
	}

	// Try to catch up
	agent.attemptResynchronization()
}
|
||||
```
|
||||
|
||||
## Testing Your Integration
|
||||
|
||||
### Unit Tests
|
||||
|
||||
```go
|
||||
func TestBeatFrameValidation(t *testing.T) {
|
||||
validFrame := BeatFrame{
|
||||
Type: "backbeat.beatframe.v1",
|
||||
ClusterID: "test",
|
||||
BeatIndex: 100,
|
||||
Downbeat: false,
|
||||
Phase: "execute",
|
||||
HLC: "7ffd:0001:abcd",
|
||||
DeadlineAt: time.Now().Add(30 * time.Second),
|
||||
TempoBPM: 2.0,
|
||||
WindowID: "7e9b0e6c4c9a4e59b7f2d9a3c1b2e4d5",
|
||||
}
|
||||
|
||||
err := validateBeatFrame(validFrame)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
|
||||
Use the BACKBEAT validation tools:
|
||||
|
||||
```bash
|
||||
# Test your message files
|
||||
backbeat-validate --schemas /path/to/backbeat/schemas --dir messages/
|
||||
|
||||
# Test individual messages
|
||||
echo '{"type":"backbeat.beatframe.v1",...}' | backbeat-validate --schemas /path/to/backbeat/schemas --message -
|
||||
```
|
||||
|
||||
### Load Testing
|
||||
|
||||
Consider tempo and message volume in your load tests:
|
||||
|
||||
```go
|
||||
func TestHighTempoHandling(t *testing.T) {
|
||||
// Simulate 10 BPM (6-second beats)
|
||||
tempo := 10.0
|
||||
beatInterval := time.Duration(60/tempo) * time.Second
|
||||
|
||||
for i := 0; i < 100; i++ {
|
||||
frame := generateBeatFrame(i, tempo)
|
||||
handler.HandleBeatFrame(frame)
|
||||
time.Sleep(beatInterval)
|
||||
}
|
||||
|
||||
// Verify no beats were dropped
|
||||
assert.Equal(t, 100, handler.processedBeats)
|
||||
}
|
||||
```
|
||||
|
||||
## Production Deployment
|
||||
|
||||
### Monitoring
|
||||
|
||||
Monitor these key metrics:
|
||||
|
||||
1. **Message Validation Rate**: Percentage of valid messages received
|
||||
2. **Beat Processing Latency**: Time to process each beat phase
|
||||
3. **Missed Beat Count**: Number of beats that couldn't be processed on time
|
||||
4. **Schema Version Compatibility**: Ensure all services use compatible versions
|
||||
|
||||
### Alerting
|
||||
|
||||
Set up alerts for:
|
||||
|
||||
- Schema validation failures > 1%
|
||||
- Beat processing latency > 90% of phase duration
|
||||
- Missed beats > 5% in any 10-minute window
|
||||
- HLC timestamp drift > 5 seconds
|
||||
|
||||
### Gradual Rollout
|
||||
|
||||
1. **Validate in CI**: Ensure all messages pass schema validation
|
||||
2. **Deploy to dev**: Test with low tempo (0.5 BPM)
|
||||
3. **Staging validation**: Use production-like tempo and load
|
||||
4. **Canary deployment**: Roll out to small percentage of production traffic
|
||||
5. **Full production**: Monitor closely and be ready to rollback
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Wrong Message Type**: Ensure `type` field exactly matches schema
|
||||
2. **HLC Format**: Must be `XXXX:XXXX:XXXX` format with hex digits
|
||||
3. **Window ID Length**: Must be exactly 32 hex characters
|
||||
4. **Enum Values**: States, phases, severities must match schema exactly
|
||||
5. **Numeric Ranges**: Check min/max constraints (tempo, beat_index, etc.)
|
||||
|
||||
### Debug Tools
|
||||
|
||||
```bash
|
||||
# Validate specific message
|
||||
backbeat-validate --schemas ./schemas --message '{"type":"backbeat.beatframe.v1",...}'
|
||||
|
||||
# Get detailed validation errors
|
||||
backbeat-validate --schemas ./schemas --file message.json --json
|
||||
|
||||
# Validate entire directory with detailed output
|
||||
backbeat-validate --schemas ./schemas --dir messages/ --json > validation-report.json
|
||||
```
|
||||
|
||||
## Schema Evolution
|
||||
|
||||
See [schema-evolution.md](schema-evolution.md) for details on:
|
||||
|
||||
- Semantic versioning for schemas
|
||||
- Backward compatibility requirements
|
||||
- Migration strategies for schema updates
|
||||
- Version compatibility matrix
|
||||
|
||||
## Performance Guidelines
|
||||
|
||||
See [tempo-guide.md](tempo-guide.md) for details on:
|
||||
|
||||
- Choosing appropriate tempo for your workload
|
||||
- Optimizing beat processing performance
|
||||
- Handling tempo changes gracefully
|
||||
- Resource utilization best practices
|
||||
|
||||
## Support
|
||||
|
||||
- **Documentation**: This contracts package contains the authoritative reference
|
||||
- **Examples**: See `contracts/tests/examples/` for valid/invalid message samples
|
||||
- **Issues**: Report integration problems to the BACKBEAT team
|
||||
- **Updates**: Monitor the contracts repository for schema updates
|
||||
New file: contracts/docs/schema-evolution.md (507 lines)

# BACKBEAT Schema Evolution and Versioning

This document defines how BACKBEAT message schemas evolve over time while maintaining compatibility across the CHORUS 2.0.0 ecosystem.
|
||||
|
||||
## Versioning Strategy
|
||||
|
||||
### Semantic Versioning for Schemas
|
||||
|
||||
BACKBEAT schemas follow semantic versioning (SemVer) with CHORUS-specific interpretations:
|
||||
|
||||
- **MAJOR** (`X.0.0`): Breaking changes that require code updates
|
||||
- **MINOR** (`X.Y.0`): Backward-compatible additions (new optional fields, enum values)
|
||||
- **PATCH** (`X.Y.Z`): Documentation updates, constraint clarifications, examples
|
||||
|
||||
### Schema Identification
|
||||
|
||||
Each schema includes version information:
|
||||
|
||||
```json
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"$id": "https://chorus.services/schemas/backbeat/beatframe/v1.2.0",
|
||||
"title": "BACKBEAT BeatFrame (INT-A)",
|
||||
"version": "1.2.0"
|
||||
}
|
||||
```
|
||||
|
||||
### Message Type Versioning
|
||||
|
||||
Message types embed version information:
|
||||
|
||||
- `backbeat.beatframe.v1` → Schema version 1.x.x
|
||||
- `backbeat.beatframe.v2` → Schema version 2.x.x
|
||||
|
||||
Only **major** version changes require new message type identifiers.
|
||||
|
||||
## Compatibility Matrix
|
||||
|
||||
### Current Schema Versions
|
||||
|
||||
| Interface | Schema Version | Message Type | Status |
|
||||
|-----------|----------------|--------------|--------|
|
||||
| INT-A (BeatFrame) | 1.0.0 | `backbeat.beatframe.v1` | Active |
|
||||
| INT-B (StatusClaim) | 1.0.0 | `backbeat.statusclaim.v1` | Active |
|
||||
| INT-C (BarReport) | 1.0.0 | `backbeat.barreport.v1` | Active |
|
||||
|
||||
### Version Compatibility Rules
|
||||
|
||||
1. **Minor/Patch Updates**: All v1.x.x schemas are compatible with `backbeat.*.v1` messages
|
||||
2. **Major Updates**: Require new message type (e.g., `backbeat.beatframe.v2`)
|
||||
3. **Transition Period**: Both old and new versions supported during migration
|
||||
4. **Deprecation**: 6-month notice before removing support for old major versions
|
||||
|
||||
## Change Categories
|
||||
|
||||
### Minor Version Changes (Backward Compatible)
|
||||
|
||||
These changes increment the minor version (1.0.0 → 1.1.0):
|
||||
|
||||
#### 1. Adding Optional Fields
|
||||
|
||||
```json
|
||||
// Before (v1.0.0)
|
||||
{
|
||||
"required": ["type", "cluster_id", "beat_index"],
|
||||
"properties": {
|
||||
"type": {...},
|
||||
"cluster_id": {...},
|
||||
"beat_index": {...}
|
||||
}
|
||||
}
|
||||
|
||||
// After (v1.1.0) - adds optional field
|
||||
{
|
||||
"required": ["type", "cluster_id", "beat_index"],
|
||||
"properties": {
|
||||
"type": {...},
|
||||
"cluster_id": {...},
|
||||
"beat_index": {...},
|
||||
"priority": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"maximum": 10,
|
||||
"description": "Optional processing priority (1=low, 10=high)"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 2. Adding Enum Values
|
||||
|
||||
```json
|
||||
// Before (v1.0.0)
|
||||
{
|
||||
"properties": {
|
||||
"phase": {
|
||||
"enum": ["plan", "execute", "review"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// After (v1.1.0) - adds new phase
|
||||
{
|
||||
"properties": {
|
||||
"phase": {
|
||||
"enum": ["plan", "execute", "review", "cleanup"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 3. Relaxing Constraints
|
||||
|
||||
```json
|
||||
// Before (v1.0.0)
|
||||
{
|
||||
"properties": {
|
||||
"notes": {
|
||||
"type": "string",
|
||||
"maxLength": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// After (v1.1.0) - allows longer notes
|
||||
{
|
||||
"properties": {
|
||||
"notes": {
|
||||
"type": "string",
|
||||
"maxLength": 512
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 4. Adding Properties to Objects
|
||||
|
||||
```json
|
||||
// Before (v1.0.0)
|
||||
{
|
||||
"properties": {
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"version": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// After (v1.1.0) - adds new metadata field
|
||||
{
|
||||
"properties": {
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"version": {"type": "string"},
|
||||
"source": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Major Version Changes (Breaking)
|
||||
|
||||
These changes increment the major version (1.x.x → 2.0.0):
|
||||
|
||||
#### 1. Removing Required Fields
|
||||
|
||||
```json
|
||||
// v1.x.x
|
||||
{
|
||||
"required": ["type", "cluster_id", "beat_index", "deprecated_field"]
|
||||
}
|
||||
|
||||
// v2.0.0
|
||||
{
|
||||
"required": ["type", "cluster_id", "beat_index"]
|
||||
}
|
||||
```
|
||||
|
||||
#### 2. Changing Field Types
|
||||
|
||||
```json
|
||||
// v1.x.x
|
||||
{
|
||||
"properties": {
|
||||
"beat_index": {"type": "integer"}
|
||||
}
|
||||
}
|
||||
|
||||
// v2.0.0
|
||||
{
|
||||
"properties": {
|
||||
"beat_index": {"type": "string"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 3. Removing Enum Values
|
||||
|
||||
```json
|
||||
// v1.x.x
|
||||
{
|
||||
"properties": {
|
||||
"state": {
|
||||
"enum": ["idle", "executing", "deprecated_state"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// v2.0.0
|
||||
{
|
||||
"properties": {
|
||||
"state": {
|
||||
"enum": ["idle", "executing"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 4. Tightening Constraints
|
||||
|
||||
```json
|
||||
// v1.x.x
|
||||
{
|
||||
"properties": {
|
||||
"agent_id": {
|
||||
"type": "string",
|
||||
"maxLength": 256
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// v2.0.0
|
||||
{
|
||||
"properties": {
|
||||
"agent_id": {
|
||||
"type": "string",
|
||||
"maxLength": 128
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Patch Version Changes (Non-Breaking)
|
||||
|
||||
These changes increment the patch version (1.0.0 → 1.0.1):
|
||||
|
||||
1. **Documentation updates**
|
||||
2. **Example additions**
|
||||
3. **Description clarifications**
|
||||
4. **Comment additions**
|
||||
|
||||
## Migration Strategies
|
||||
|
||||
### Minor Version Migration
|
||||
|
||||
Services automatically benefit from minor version updates:
|
||||
|
||||
```go
|
||||
// This code works with both v1.0.0 and v1.1.0
|
||||
func handleBeatFrame(frame BeatFrame) {
|
||||
// Core fields always present
|
||||
log.Printf("Beat %d in phase %s", frame.BeatIndex, frame.Phase)
|
||||
|
||||
// New optional fields checked safely
|
||||
if frame.Priority != nil {
|
||||
log.Printf("Priority: %d", *frame.Priority)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Major Version Migration
|
||||
|
||||
Requires explicit handling of both versions during transition:
|
||||
|
||||
```go
|
||||
func handleMessage(messageBytes []byte) error {
|
||||
var msgType struct {
|
||||
Type string `json:"type"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(messageBytes, &msgType); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch msgType.Type {
|
||||
case "backbeat.beatframe.v1":
|
||||
return handleBeatFrameV1(messageBytes)
|
||||
case "backbeat.beatframe.v2":
|
||||
return handleBeatFrameV2(messageBytes)
|
||||
default:
|
||||
return fmt.Errorf("unsupported message type: %s", msgType.Type)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Gradual Migration Process
|
||||
|
||||
1. **Preparation Phase** (Months 1-2)
|
||||
- Announce upcoming major version change
|
||||
- Publish v2.0.0 schemas alongside v1.x.x
|
||||
- Update documentation and examples
|
||||
- Provide migration tools and guides
|
||||
|
||||
2. **Dual Support Phase** (Months 3-4)
|
||||
- Services support both v1 and v2 message types
|
||||
- New services prefer v2 messages
|
||||
- Monitoring tracks v1 vs v2 usage
|
||||
|
||||
3. **Migration Phase** (Months 5-6)
|
||||
- All services updated to send v2 messages
|
||||
- Services still accept v1 for backward compatibility
|
||||
- Warnings logged for v1 message reception
|
||||
|
||||
4. **Cleanup Phase** (Month 7+)
|
||||
- Drop support for v1 messages
|
||||
- Remove v1 handling code
|
||||
- Update schemas to mark v1 as deprecated
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Schema Development
|
||||
|
||||
1. **Start Conservative**: Begin with strict constraints, relax later if needed
|
||||
2. **Plan for Growth**: Design extensible structures with optional metadata objects
|
||||
3. **Document Thoroughly**: Include clear descriptions and examples
|
||||
4. **Test Extensively**: Validate with real-world data before releasing
|
||||
|
||||
### Version Detection
|
||||
|
||||
Services should detect schema versions:
|
||||
|
||||
```go
|
||||
type SchemaInfo struct {
|
||||
Version string `json:"version"`
|
||||
MessageType string `json:"message_type"`
|
||||
IsSupported bool `json:"is_supported"`
|
||||
}
|
||||
|
||||
func detectSchemaVersion(messageType string) SchemaInfo {
|
||||
switch messageType {
|
||||
case "backbeat.beatframe.v1":
|
||||
return SchemaInfo{
|
||||
Version: "1.x.x",
|
||||
MessageType: messageType,
|
||||
IsSupported: true,
|
||||
}
|
||||
case "backbeat.beatframe.v2":
|
||||
return SchemaInfo{
|
||||
Version: "2.x.x",
|
||||
MessageType: messageType,
|
||||
IsSupported: true,
|
||||
}
|
||||
default:
|
||||
return SchemaInfo{
|
||||
MessageType: messageType,
|
||||
IsSupported: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Validation Strategy
|
||||
|
||||
```go
|
||||
func validateWithVersionFallback(messageBytes []byte) error {
|
||||
// Try latest version first
|
||||
if err := validateV2(messageBytes); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fall back to previous version
|
||||
if err := validateV1(messageBytes); err == nil {
|
||||
log.Warn("Received v1 message, consider upgrading sender")
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("message does not match any supported schema version")
|
||||
}
|
||||
```
|
||||
|
||||
## Testing Schema Evolution
|
||||
|
||||
### Compatibility Tests
|
||||
|
||||
```go
|
||||
func TestSchemaBackwardCompatibility(t *testing.T) {
|
||||
// Test that v1.1.0 accepts all valid v1.0.0 messages
|
||||
v100Messages := loadTestMessages("v1.0.0")
|
||||
v110Schema := loadSchema("beatframe-v1.1.0.schema.json")
|
||||
|
||||
for _, msg := range v100Messages {
|
||||
err := validateAgainstSchema(msg, v110Schema)
|
||||
assert.NoError(t, err, "v1.1.0 should accept v1.0.0 messages")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSchemaForwardCompatibility(t *testing.T) {
|
||||
// Test that v1.0.0 code gracefully handles v1.1.0 messages
|
||||
v110Message := loadTestMessage("beatframe-v1.1.0-with-new-fields.json")
|
||||
|
||||
var beatFrame BeatFrameV1
|
||||
err := json.Unmarshal(v110Message, &beatFrame)
|
||||
assert.NoError(t, err, "v1.0.0 struct should parse v1.1.0 messages")
|
||||
|
||||
// Core fields should be populated
|
||||
assert.NotEmpty(t, beatFrame.Type)
|
||||
assert.NotEmpty(t, beatFrame.ClusterID)
|
||||
}
|
||||
```
|
||||
|
||||
### Migration Tests
|
||||
|
||||
```go
|
||||
func TestDualVersionSupport(t *testing.T) {
|
||||
handler := NewMessageHandler()
|
||||
|
||||
v1Message := generateBeatFrameV1()
|
||||
v2Message := generateBeatFrameV2()
|
||||
|
||||
// Both versions should be handled correctly
|
||||
err1 := handler.HandleMessage(v1Message)
|
||||
err2 := handler.HandleMessage(v2Message)
|
||||
|
||||
assert.NoError(t, err1)
|
||||
assert.NoError(t, err2)
|
||||
}
|
||||
```
|
||||
|
||||
## Deprecation Process
|
||||
|
||||
### Marking Deprecated Features
|
||||
|
||||
```json
|
||||
{
|
||||
"properties": {
|
||||
"legacy_field": {
|
||||
"type": "string",
|
||||
"description": "DEPRECATED: Use new_field instead. Will be removed in v2.0.0",
|
||||
"deprecated": true
|
||||
},
|
||||
"new_field": {
|
||||
"type": "string",
|
||||
"description": "Replacement for legacy_field"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Communication Timeline
|
||||
|
||||
1. **6 months before**: Announce deprecation in release notes
|
||||
2. **3 months before**: Add deprecation warnings to schemas
|
||||
3. **1 month before**: Final migration reminder
|
||||
4. **Release day**: Remove deprecated features
|
||||
|
||||
### Tooling Support
|
||||
|
||||
```bash
|
||||
# Check for deprecated schema usage
|
||||
backbeat-validate --schemas ./schemas --dir messages/ --check-deprecated
|
||||
|
||||
# Migration helper
|
||||
backbeat-migrate --from v1 --to v2 --dir messages/
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Schema Authors
|
||||
|
||||
1. **Communicate Early**: Announce changes well in advance
|
||||
2. **Provide Tools**: Create migration utilities and documentation
|
||||
3. **Monitor Usage**: Track which versions are being used
|
||||
4. **Be Conservative**: Prefer minor over major version changes
|
||||
|
||||
### For Service Developers
|
||||
|
||||
1. **Stay Updated**: Subscribe to schema change notifications
|
||||
2. **Plan for Migration**: Build version handling into your services
|
||||
3. **Test Thoroughly**: Validate against multiple schema versions
|
||||
4. **Monitor Compatibility**: Alert on unsupported message versions
|
||||
|
||||
### For Operations Teams
|
||||
|
||||
1. **Version Tracking**: Monitor which schema versions are active
|
||||
2. **Migration Planning**: Coordinate major version migrations
|
||||
3. **Rollback Capability**: Be prepared to revert if migrations fail
|
||||
4. **Performance Impact**: Monitor schema validation performance
|
||||
|
||||
## Future Considerations
|
||||
|
||||
### Planned Enhancements
|
||||
|
||||
1. **Schema Registry**: Centralized schema version management
|
||||
2. **Auto-Migration**: Tools to automatically update message formats
|
||||
3. **Version Negotiation**: Services negotiate supported versions
|
||||
4. **Schema Analytics**: Usage metrics and compatibility reporting
|
||||
|
||||
### Long-term Vision
|
||||
|
||||
- **Continuous Evolution**: Schemas evolve without breaking existing services
|
||||
- **Zero-Downtime Updates**: Schema changes deploy without service interruption
|
||||
- **Automated Testing**: CI/CD pipelines validate schema compatibility
|
||||
- **Self-Healing**: Services automatically adapt to schema changes
|
||||
New file: contracts/docs/tempo-guide.md (610 lines)

# BACKBEAT Tempo Guide: Beat Timing and Performance Recommendations

This guide provides comprehensive recommendations for choosing optimal tempo settings, implementing beat processing, and achieving optimal performance in BACKBEAT-enabled CHORUS 2.0.0 services.
|
||||
|
||||
## Understanding BACKBEAT Tempo
|
||||
|
||||
### Tempo Basics
|
||||
|
||||
BACKBEAT tempo is measured in **Beats Per Minute (BPM)**, similar to musical tempo:
|
||||
|
||||
- **1 BPM** = 60-second beats (**default**, good for batch processing and recovery windows)
|
||||
- **2 BPM** = 30-second beats (good for most services)
|
||||
- **4 BPM** = 15-second beats (good for responsive services)
|
||||
- **60 BPM** = 1-second beats (good for high-frequency operations)
|
||||
|
||||
### Beat Structure
|
||||
|
||||
Each beat consists of three equal phases:
|
||||
|
||||
```
|
||||
Beat Duration = 60 / TempoBPM seconds
|
||||
Phase Duration = Beat Duration / 3
|
||||
|
||||
┌─────────────┬─────────────┬─────────────┐
|
||||
│ PLAN │ EXECUTE │ REVIEW │
|
||||
│ Phase 1 │ Phase 2 │ Phase 3 │
|
||||
└─────────────┴─────────────┴─────────────┘
|
||||
│←────────── Beat Duration ──────────────→│
|
||||
```
|
||||
|
||||
### Tempo Ranges and Use Cases
|
||||
|
||||
| Tempo Range | Beat Duration | Use Cases | Examples |
|
||||
|-------------|---------------|-----------|----------|
|
||||
| 0.1 - 0.5 BPM | 2-10 minutes | Large batch jobs, ETL | Data warehouse loads, ML training |
|
||||
| 0.5 - 2 BPM | 30s - 2 minutes | Standard operations | API services, web apps |
|
||||
| 2 - 10 BPM | 6-30 seconds | Responsive services | Real-time dashboards, monitoring |
|
||||
| 10 - 60 BPM | 1-6 seconds | High-frequency | Trading systems, IoT data processing |
|
||||
| 60+ BPM | ≤1 second | Ultra-high-frequency | Hardware control, real-time gaming |
|
||||
|
||||
## Choosing the Right Tempo
|
||||
|
||||
### Workload Analysis
|
||||
|
||||
Before selecting tempo, analyze your workload characteristics:
|
||||
|
||||
1. **Task Duration**: How long do typical operations take?
|
||||
2. **Coordination Needs**: How often do services need to synchronize?
|
||||
3. **Resource Requirements**: How much CPU/memory/I/O does work consume?
|
||||
4. **Latency Tolerance**: How quickly must the system respond to changes?
|
||||
5. **Error Recovery**: How quickly should the system detect and recover from failures?
|
||||
|
||||
### Tempo Selection Guidelines
|
||||
|
||||
#### Rule 1: Task Duration Constraint
|
||||
```
|
||||
Recommended Tempo ≤ 60 / (Average Task Duration × 3)
|
||||
```
|
||||
|
||||
**Example**: If tasks take 5 seconds on average:
|
||||
- Maximum recommended tempo = 60 / (5 × 3) = 4 BPM
|
||||
- Use 2-4 BPM for safe operation
|
||||
|
||||
#### Rule 2: Coordination Frequency
|
||||
```
|
||||
Coordination Tempo = 60 / Desired Sync Interval
|
||||
```
|
||||
|
||||
**Example**: If services should sync every 2 minutes:
|
||||
- Recommended tempo = 60 / 120 = 0.5 BPM
|
||||
|
||||
#### Rule 3: Resource Utilization
|
||||
```
|
||||
Sustainable Tempo = 60 / (Task Duration + Recovery Time)
|
||||
```
|
||||
|
||||
**Example**: 10s tasks with 5s recovery time:
|
||||
- Maximum sustainable tempo = 60 / (10 + 5) = 4 BPM
|
||||
|
||||
### Common Tempo Patterns
|
||||
|
||||
#### Development/Testing: 0.1-0.5 BPM
|
||||
```json
|
||||
{
|
||||
"tempo_bpm": 0.2,
|
||||
"beat_duration": "5 minutes",
|
||||
"use_case": "Development and debugging",
|
||||
"advantages": ["Easy to observe", "Time to investigate issues"],
|
||||
"disadvantages": ["Slow feedback", "Not production realistic"]
|
||||
}
|
||||
```
|
||||
|
||||
#### Standard Services: 1-4 BPM
|
||||
```json
|
||||
{
|
||||
"tempo_bpm": 2.0,
|
||||
"beat_duration": "30 seconds",
|
||||
"use_case": "Most production services",
|
||||
"advantages": ["Good balance", "Reasonable coordination", "Error recovery"],
|
||||
"disadvantages": ["May be slow for real-time needs"]
|
||||
}
|
||||
```
|
||||
|
||||
#### Responsive Applications: 4-20 BPM
|
||||
```json
|
||||
{
|
||||
"tempo_bpm": 10.0,
|
||||
"beat_duration": "6 seconds",
|
||||
"use_case": "Interactive applications",
|
||||
"advantages": ["Quick response", "Fast error detection"],
|
||||
"disadvantages": ["Higher overhead", "More network traffic"]
|
||||
}
|
||||
```
|
||||
|
||||
#### High-Frequency Systems: 20+ BPM
|
||||
```json
|
||||
{
|
||||
"tempo_bpm": 60.0,
|
||||
"beat_duration": "1 second",
|
||||
"use_case": "Real-time trading, IoT",
|
||||
"advantages": ["Ultra-responsive", "Immediate coordination"],
|
||||
"disadvantages": ["High resource usage", "Network intensive"]
|
||||
}
|
||||
```
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Beat Processing Architecture
|
||||
|
||||
#### Single-Threaded Processing
|
||||
Best for low-medium tempo (≤10 BPM):
|
||||
|
||||
```go
|
||||
type BeatProcessor struct {
|
||||
currentBeat int64
|
||||
phase string
|
||||
workQueue chan Task
|
||||
}
|
||||
|
||||
func (p *BeatProcessor) ProcessBeat(frame BeatFrame) {
|
||||
// Update state
|
||||
p.currentBeat = frame.BeatIndex
|
||||
p.phase = frame.Phase
|
||||
|
||||
// Process phase synchronously
|
||||
switch frame.Phase {
|
||||
case "plan":
|
||||
p.planPhase(frame)
|
||||
case "execute":
|
||||
p.executePhase(frame)
|
||||
case "review":
|
||||
p.reviewPhase(frame)
|
||||
}
|
||||
|
||||
// Report status before deadline
|
||||
p.reportStatus(frame.BeatIndex, "completed")
|
||||
}
|
||||
```
|
||||
|
||||
#### Pipelined Processing
|
||||
Best for high tempo (>10 BPM):
|
||||
|
||||
```go
|
||||
type PipelinedProcessor struct {
|
||||
planQueue chan BeatFrame
|
||||
executeQueue chan BeatFrame
|
||||
reviewQueue chan BeatFrame
|
||||
}
|
||||
|
||||
func (p *PipelinedProcessor) Start() {
|
||||
// Separate goroutines for each phase
|
||||
go p.planWorker()
|
||||
go p.executeWorker()
|
||||
go p.reviewWorker()
|
||||
}
|
||||
|
||||
func (p *PipelinedProcessor) ProcessBeat(frame BeatFrame) {
|
||||
switch frame.Phase {
|
||||
case "plan":
|
||||
p.planQueue <- frame
|
||||
case "execute":
|
||||
p.executeQueue <- frame
|
||||
case "review":
|
||||
p.reviewQueue <- frame
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Timing Implementation
|
||||
|
||||
#### Deadline Management
|
||||
|
||||
```go
|
||||
func (p *BeatProcessor) executeWithDeadline(frame BeatFrame, work func() error) error {
|
||||
// Calculate remaining time
|
||||
remainingTime := time.Until(frame.DeadlineAt)
|
||||
|
||||
// Create timeout context
|
||||
ctx, cancel := context.WithTimeout(context.Background(), remainingTime)
|
||||
defer cancel()
|
||||
|
||||
// Execute with timeout
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- work()
|
||||
}()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
return err
|
||||
case <-ctx.Done():
|
||||
return fmt.Errorf("work timed out after %v", remainingTime)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Adaptive Processing
|
||||
|
||||
```go
|
||||
type AdaptiveProcessor struct {
|
||||
processingTimes []time.Duration
|
||||
targetUtilization float64 // 0.8 = use 80% of available time
|
||||
}
|
||||
|
||||
func (p *AdaptiveProcessor) shouldProcessWork(frame BeatFrame) bool {
|
||||
// Calculate phase time available
|
||||
phaseTime := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
|
||||
|
||||
// Estimate processing time based on history
|
||||
avgProcessingTime := p.calculateAverage()
|
||||
|
||||
// Only process if we have enough time
|
||||
requiredTime := time.Duration(float64(avgProcessingTime) / p.targetUtilization)
|
||||
return phaseTime >= requiredTime
|
||||
}
|
||||
```
|
||||
|
||||
### Performance Optimization
|
||||
|
||||
#### Batch Processing within Beats
|
||||
|
||||
```go
|
||||
func (p *BeatProcessor) executePhase(frame BeatFrame) error {
|
||||
// Calculate optimal batch size based on tempo
|
||||
phaseDuration := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
|
||||
targetTime := time.Duration(float64(phaseDuration) * 0.8) // Use 80% of time
|
||||
|
||||
// Process work in batches
|
||||
batchSize := p.calculateOptimalBatchSize(targetTime)
|
||||
|
||||
for p.hasWork() && time.Until(frame.DeadlineAt) > time.Second {
|
||||
batch := p.getWorkBatch(batchSize)
|
||||
if err := p.processBatch(batch); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
```
|
||||
|
||||
#### Caching and Pre-computation
|
||||
|
||||
```go
|
||||
type SmartProcessor struct {
|
||||
cache map[string]interface{}
|
||||
precomputed map[int64]interface{} // Keyed by beat index
|
||||
}
|
||||
|
||||
func (p *SmartProcessor) planPhase(frame BeatFrame) {
|
||||
// Pre-compute work for future beats during plan phase
|
||||
nextBeat := frame.BeatIndex + 1
|
||||
if _, exists := p.precomputed[nextBeat]; !exists {
|
||||
p.precomputed[nextBeat] = p.precomputeWork(nextBeat)
|
||||
}
|
||||
|
||||
// Cache frequently accessed data
|
||||
p.cacheRelevantData(frame)
|
||||
}
|
||||
|
||||
func (p *SmartProcessor) executePhase(frame BeatFrame) error {
|
||||
// Use pre-computed results if available
|
||||
if precomputed, exists := p.precomputed[frame.BeatIndex]; exists {
|
||||
return p.usePrecomputedWork(precomputed)
|
||||
}
|
||||
|
||||
// Fall back to real-time computation
|
||||
return p.computeWork(frame)
|
||||
}
|
||||
```
|
||||
|
||||
## Performance Monitoring
|
||||
|
||||
### Key Metrics
|
||||
|
||||
Track these metrics for tempo optimization:
|
||||
|
||||
```go
|
||||
type TempoMetrics struct {
|
||||
// Timing metrics
|
||||
BeatProcessingLatency time.Duration // How long beats take to process
|
||||
PhaseCompletionRate float64 // % of phases completed on time
|
||||
DeadlineMissRate float64 // % of deadlines missed
|
||||
|
||||
// Resource metrics
|
||||
CPUUtilization float64 // CPU usage during beats
|
||||
MemoryUtilization float64 // Memory usage
|
||||
NetworkBandwidth int64 // Bytes/sec for BACKBEAT messages
|
||||
|
||||
// Throughput metrics
|
||||
TasksPerBeat int // Work completed per beat
|
||||
BeatsPerSecond float64 // Effective beat processing rate
|
||||
TempoDriftMS float64 // How far behind/ahead we're running
|
||||
}
|
||||
```
|
||||
|
||||
### Performance Alerts
|
||||
|
||||
```go
|
||||
func (m *TempoMetrics) checkAlerts() []Alert {
|
||||
var alerts []Alert
|
||||
|
||||
// Beat processing taking too long
|
||||
if m.BeatProcessingLatency > time.Duration(float64(m.phaseDuration())*0.9) {
|
||||
alerts = append(alerts, Alert{
|
||||
Level: "warning",
|
||||
Message: "Beat processing approaching deadline",
|
||||
Recommendation: "Consider reducing tempo or optimizing processing",
|
||||
})
|
||||
}
|
||||
|
||||
// Missing too many deadlines
|
||||
if m.DeadlineMissRate > 0.05 { // 5%
|
||||
alerts = append(alerts, Alert{
|
||||
Level: "critical",
|
||||
Message: "High deadline miss rate",
|
||||
Recommendation: "Reduce tempo immediately or scale resources",
|
||||
})
|
||||
}
|
||||
|
||||
// Resource exhaustion
|
||||
if m.CPUUtilization > 0.9 {
|
||||
alerts = append(alerts, Alert{
|
||||
Level: "warning",
|
||||
Message: "High CPU utilization",
|
||||
Recommendation: "Scale up or reduce workload per beat",
|
||||
})
|
||||
}
|
||||
|
||||
return alerts
|
||||
}
|
||||
```
|
||||
|
||||
### Adaptive Tempo Adjustment
|
||||
|
||||
```go
|
||||
type TempoController struct {
|
||||
currentTempo float64
|
||||
targetLatency time.Duration
|
||||
adjustmentRate float64 // How aggressively to adjust
|
||||
}
|
||||
|
||||
func (tc *TempoController) adjustTempo(metrics TempoMetrics) float64 {
|
||||
// Calculate desired tempo based on performance
|
||||
if metrics.DeadlineMissRate > 0.02 { // 2% miss rate
|
||||
// Slow down
|
||||
tc.currentTempo *= (1.0 - tc.adjustmentRate)
|
||||
} else if metrics.PhaseCompletionRate > 0.95 && metrics.CPUUtilization < 0.7 {
|
||||
// Speed up
|
||||
tc.currentTempo *= (1.0 + tc.adjustmentRate)
|
||||
}
|
||||
|
||||
// Apply constraints
|
||||
tc.currentTempo = math.Max(0.1, tc.currentTempo) // Minimum 0.1 BPM
|
||||
tc.currentTempo = math.Min(1000, tc.currentTempo) // Maximum 1000 BPM
|
||||
|
||||
return tc.currentTempo
|
||||
}
|
||||
```
|
||||
|
||||
## Load Testing and Capacity Planning
|
||||
|
||||
### Beat Load Testing
|
||||
|
||||
```go
|
||||
func TestBeatProcessingUnderLoad(t *testing.T) {
|
||||
processor := NewBeatProcessor()
|
||||
tempo := 10.0 // 10 BPM = 6-second beats
|
||||
beatInterval := time.Duration(60/tempo) * time.Second
|
||||
|
||||
// Simulate sustained load
|
||||
for i := 0; i < 1000; i++ {
|
||||
frame := generateBeatFrame(i, tempo)
|
||||
|
||||
start := time.Now()
|
||||
err := processor.ProcessBeat(frame)
|
||||
duration := time.Since(start)
|
||||
|
||||
// Verify processing completed within phase duration
|
||||
phaseDuration := beatInterval / 3
|
||||
assert.Less(t, duration, phaseDuration)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Wait for next beat
|
||||
time.Sleep(beatInterval)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Capacity Planning
|
||||
|
||||
```go
|
||||
type CapacityPlanner struct {
|
||||
maxTempo float64
|
||||
resourceLimits ResourceLimits
|
||||
taskCharacteristics TaskProfile
|
||||
}
|
||||
|
||||
func (cp *CapacityPlanner) calculateMaxTempo() float64 {
|
||||
// Based on CPU capacity
|
||||
cpuConstrainedTempo := 60.0 / (cp.taskCharacteristics.CPUTime * 3)
|
||||
|
||||
// Based on memory capacity
|
||||
memConstrainedTempo := cp.resourceLimits.Memory / cp.taskCharacteristics.MemoryPerBeat
|
||||
|
||||
// Based on I/O capacity
|
||||
ioConstrainedTempo := cp.resourceLimits.IOPS / cp.taskCharacteristics.IOPerBeat
|
||||
|
||||
// Take the minimum (most restrictive constraint)
|
||||
return math.Min(cpuConstrainedTempo, math.Min(memConstrainedTempo, ioConstrainedTempo))
|
||||
}
|
||||
```
|
||||
|
||||
## Common Patterns and Anti-Patterns
|
||||
|
||||
### ✅ Good Patterns
|
||||
|
||||
#### Progressive Backoff
|
||||
```go
|
||||
func (p *Processor) handleOverload() {
|
||||
if p.metrics.DeadlineMissRate > 0.1 {
|
||||
// Temporarily reduce work per beat
|
||||
p.workPerBeat *= 0.8
|
||||
log.Warn("Reducing work per beat due to overload")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Graceful Degradation
|
||||
```go
|
||||
func (p *Processor) executePhase(frame BeatFrame) error {
|
||||
timeRemaining := time.Until(frame.DeadlineAt)
|
||||
|
||||
if timeRemaining < p.minimumTime {
|
||||
// Skip non-essential work
|
||||
return p.executeEssentialOnly(frame)
|
||||
}
|
||||
|
||||
return p.executeFullWorkload(frame)
|
||||
}
|
||||
```
|
||||
|
||||
#### Work Prioritization
|
||||
```go
|
||||
func (p *Processor) planPhase(frame BeatFrame) {
|
||||
// Sort work by priority and deadline
|
||||
work := p.getAvailableWork()
|
||||
sort.Sort(ByPriorityAndDeadline(work))
|
||||
|
||||
// Plan only what can be completed in time
|
||||
plannedWork := p.selectWorkForTempo(work, frame.TempoBPM)
|
||||
p.scheduleWork(plannedWork)
|
||||
}
|
||||
```
|
||||
|
||||
### ❌ Anti-Patterns
|
||||
|
||||
#### Blocking I/O in Beat Processing
|
||||
```go
|
||||
// DON'T: Synchronous I/O can cause deadline misses
|
||||
func badExecutePhase(frame BeatFrame) error {
|
||||
data := fetchFromDatabase() // Blocking call!
|
||||
return processData(data)
|
||||
}
|
||||
|
||||
// DO: Use async I/O with timeouts
|
||||
func goodExecutePhase(frame BeatFrame) error {
|
||||
ctx, cancel := context.WithDeadline(context.Background(), frame.DeadlineAt)
|
||||
defer cancel()
|
||||
|
||||
data, err := fetchFromDatabaseAsync(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return processData(data)
|
||||
}
|
||||
```
|
||||
|
||||
#### Ignoring Tempo Changes
|
||||
```go
|
||||
// DON'T: Assume tempo is constant
|
||||
func badBeatHandler(frame BeatFrame) {
|
||||
// Hard-coded timing assumptions
|
||||
time.Sleep(10 * time.Second) // Fails if tempo > 6 BPM!
|
||||
}
|
||||
|
||||
// DO: Adapt to current tempo
|
||||
func goodBeatHandler(frame BeatFrame) {
|
||||
phaseDuration := time.Duration(60/frame.TempoBPM*1000/3) * time.Millisecond
|
||||
maxWorkTime := time.Duration(float64(phaseDuration) * 0.8)
|
||||
|
||||
// Adapt work to available time
|
||||
ctx, cancel := context.WithTimeout(context.Background(), maxWorkTime)
|
||||
defer cancel()
|
||||
|
||||
doWork(ctx)
|
||||
}
|
||||
```
|
||||
|
||||
#### Unbounded Work Queues
|
||||
```go
|
||||
// DON'T: Let work queues grow infinitely
|
||||
type BadProcessor struct {
|
||||
workQueue chan Task // Unbounded queue
|
||||
}
|
||||
|
||||
// DO: Use bounded queues with backpressure
|
||||
type GoodProcessor struct {
|
||||
workQueue chan Task // Bounded queue
|
||||
metrics *TempoMetrics
|
||||
}
|
||||
|
||||
func (p *GoodProcessor) addWork(task Task) error {
|
||||
select {
|
||||
case p.workQueue <- task:
|
||||
return nil
|
||||
default:
|
||||
p.metrics.WorkRejectedCount++
|
||||
return ErrQueueFull
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Troubleshooting Performance Issues
|
||||
|
||||
### Diagnostic Checklist
|
||||
|
||||
1. **Beat Processing Time**: Are beats completing within phase deadlines?
|
||||
2. **Resource Utilization**: Is CPU/memory/I/O being over-utilized?
|
||||
3. **Network Latency**: Are BACKBEAT messages arriving late?
|
||||
4. **Work Distribution**: Is work evenly distributed across beats?
|
||||
5. **Error Rates**: Are errors causing processing delays?
|
||||
|
||||
### Performance Tuning Steps
|
||||
|
||||
1. **Measure Current Performance**
|
||||
```bash
|
||||
# Monitor beat processing metrics
|
||||
kubectl logs deployment/my-service | grep "beat_processing_time"
|
||||
|
||||
# Check resource utilization
|
||||
kubectl top pods
|
||||
```
|
||||
|
||||
2. **Identify Bottlenecks**
|
||||
```go
|
||||
func profileBeatProcessing(frame BeatFrame) {
|
||||
defer func(start time.Time) {
|
||||
log.Infof("Beat %d phase %s took %v",
|
||||
frame.BeatIndex, frame.Phase, time.Since(start))
|
||||
}(time.Now())
|
||||
|
||||
// Your beat processing code here
|
||||
}
|
||||
```
|
||||
|
||||
3. **Optimize Critical Paths**
|
||||
- Cache frequently accessed data
|
||||
- Use connection pooling
|
||||
- Implement circuit breakers
|
||||
- Add request timeouts
|
||||
|
||||
4. **Scale Resources**
|
||||
- Increase CPU/memory limits
|
||||
- Add more replicas
|
||||
- Use faster storage
|
||||
- Optimize network configuration
|
||||
|
||||
5. **Adjust Tempo**
|
||||
- Reduce tempo if overloaded
|
||||
- Increase tempo if under-utilized
|
||||
- Consider tempo auto-scaling
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
### Planned Features
|
||||
|
||||
1. **Dynamic Tempo Scaling**: Automatic tempo adjustment based on load
|
||||
2. **Beat Prediction**: ML-based prediction of optimal tempo
|
||||
3. **Resource-Aware Scheduling**: Beat scheduling based on resource availability
|
||||
4. **Cross-Service Tempo Negotiation**: Services negotiate optimal cluster tempo
|
||||
|
||||
### Experimental Features
|
||||
|
||||
1. **Hierarchical Beats**: Different tempo for different service types
|
||||
2. **Beat Priorities**: Critical beats get processing preference
|
||||
3. **Temporal Load Balancing**: Distribute work across beat phases
|
||||
4. **Beat Replay**: Replay missed beats during low-load periods
|
||||
|
||||
Understanding and implementing these tempo guidelines will ensure your BACKBEAT-enabled services operate efficiently and reliably across the full range of CHORUS 2.0.0 workloads.
|
||||
Reference in New Issue
Block a user