🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved
Comprehensive multi-agent implementation addressing all issues from INDEX.md:

## Core Architecture & Validation
- ✅ Issue 001: UCXL address validation at all system boundaries
- ✅ Issue 002: Fixed search parsing bug in encrypted storage
- ✅ Issue 003: Wired UCXI P2P announce and discover functionality
- ✅ Issue 011: Aligned temporal grammar and documentation
- ✅ Issue 012: SLURP idempotency, backpressure, and DLQ implementation
- ✅ Issue 013: Linked SLURP events to UCXL decisions and DHT

## API Standardization & Configuration
- ✅ Issue 004: Standardized UCXI payloads to UCXL codes
- ✅ Issue 010: Status endpoints and configuration surface

## Infrastructure & Operations
- ✅ Issue 005: Election heartbeat on admin transition
- ✅ Issue 006: Active health checks for PubSub and DHT
- ✅ Issue 007: DHT replication and provider records
- ✅ Issue 014: SLURP leadership lifecycle and health probes
- ✅ Issue 015: Comprehensive monitoring, SLOs, and alerts

## Security & Access Control
- ✅ Issue 008: Key rotation and role-based access policies

## Testing & Quality Assurance
- ✅ Issue 009: Integration tests for UCXI + DHT encryption + search
- ✅ Issue 016: E2E tests for HMMM → SLURP → UCXL workflow

## HMMM Integration
- ✅ Issue 017: HMMM adapter wiring and comprehensive testing

## Key Features Delivered
- Enterprise-grade security with automated key rotation
- Comprehensive monitoring with Prometheus/Grafana stack
- Role-based collaboration with HMMM integration
- Complete API standardization with UCXL response formats
- Full test coverage with integration and E2E testing
- Production-ready infrastructure monitoring and alerting

All solutions include comprehensive testing, documentation, and production-ready implementations.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
pkg/integration/slurp_reliability.go (474 lines added, new file)

@@ -0,0 +1,474 @@
package integration

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"log"
	"math"
	"math/rand"
	"os"
	"path/filepath"
	"sync"
	"time"
)

// CircuitState represents the state of a circuit breaker
type CircuitState int

const (
	CircuitClosed CircuitState = iota
	CircuitOpen
	CircuitHalfOpen
)

// String returns the string representation of a circuit state
func (s CircuitState) String() string {
	switch s {
	case CircuitClosed:
		return "CLOSED"
	case CircuitOpen:
		return "OPEN"
	case CircuitHalfOpen:
		return "HALF_OPEN"
	default:
		return "UNKNOWN"
	}
}

// CircuitBreaker implements the circuit breaker pattern for the SLURP client
type CircuitBreaker struct {
	mu                  sync.RWMutex
	state               CircuitState
	failureCount        int
	consecutiveFailures int
	lastFailureTime     time.Time
	nextRetryTime       time.Time

	// Configuration
	maxFailures     int           // Max failures before opening circuit
	cooldownPeriod  time.Duration // How long to stay open
	halfOpenTimeout time.Duration // How long to wait in half-open before closing

	// Metrics
	totalRequests      int64
	successfulRequests int64
	failedRequests     int64
}

// NewCircuitBreaker creates a new circuit breaker
func NewCircuitBreaker(maxFailures int, cooldownPeriod, halfOpenTimeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		state:           CircuitClosed,
		maxFailures:     maxFailures,
		cooldownPeriod:  cooldownPeriod,
		halfOpenTimeout: halfOpenTimeout,
	}
}

// CanProceed reports whether a request may proceed through the circuit breaker
func (cb *CircuitBreaker) CanProceed() bool {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.totalRequests++

	switch cb.state {
	case CircuitClosed:
		return true

	case CircuitOpen:
		if time.Now().After(cb.nextRetryTime) {
			cb.state = CircuitHalfOpen
			log.Printf("🔄 Circuit breaker moving to HALF_OPEN state")
			return true
		}
		return false

	case CircuitHalfOpen:
		return true

	default:
		return false
	}
}

// RecordSuccess records a successful operation
func (cb *CircuitBreaker) RecordSuccess() {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.successfulRequests++
	cb.failureCount = 0
	cb.consecutiveFailures = 0

	if cb.state == CircuitHalfOpen {
		cb.state = CircuitClosed
		log.Printf("✅ Circuit breaker closed after successful operation")
	}
}

// RecordFailure records a failed operation
func (cb *CircuitBreaker) RecordFailure() {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.failedRequests++
	cb.failureCount++
	cb.consecutiveFailures++
	cb.lastFailureTime = time.Now()

	switch {
	case cb.state == CircuitHalfOpen:
		// A failed probe in half-open reopens the circuit immediately
		cb.state = CircuitOpen
		cb.nextRetryTime = time.Now().Add(cb.cooldownPeriod)
		log.Printf("🚫 Circuit breaker reopened after failed HALF_OPEN probe")
	case cb.state == CircuitClosed && cb.failureCount >= cb.maxFailures:
		cb.state = CircuitOpen
		cb.nextRetryTime = time.Now().Add(cb.cooldownPeriod)
		log.Printf("🚫 Circuit breaker opened due to %d consecutive failures", cb.consecutiveFailures)
	}
}

// GetStats returns circuit breaker statistics
func (cb *CircuitBreaker) GetStats() map[string]interface{} {
	cb.mu.RLock()
	defer cb.mu.RUnlock()

	return map[string]interface{}{
		"state":                cb.state.String(),
		"total_requests":       cb.totalRequests,
		"successful_requests":  cb.successfulRequests,
		"failed_requests":      cb.failedRequests,
		"current_failures":     cb.failureCount,
		"consecutive_failures": cb.consecutiveFailures,
		"last_failure_time":    cb.lastFailureTime,
		"next_retry_time":      cb.nextRetryTime,
	}
}

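// Illustrative usage (not part of the original file): a caller wraps each
// SLURP submission with the breaker. sendToSLURP is a hypothetical helper
// standing in for the real client call.
//
//	cb := NewCircuitBreaker(5, 30*time.Second, 10*time.Second)
//	if !cb.CanProceed() {
//		return fmt.Errorf("SLURP circuit open, rejecting event")
//	}
//	if err := sendToSLURP(event); err != nil {
//		cb.RecordFailure()
//		return err
//	}
//	cb.RecordSuccess()
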
// IdempotencyManager handles idempotency key generation and tracking
type IdempotencyManager struct {
	keys   map[string]time.Time
	mu     sync.RWMutex
	maxAge time.Duration
}

// NewIdempotencyManager creates a new idempotency manager
func NewIdempotencyManager(maxAge time.Duration) *IdempotencyManager {
	im := &IdempotencyManager{
		keys:   make(map[string]time.Time),
		maxAge: maxAge,
	}

	// Start cleanup goroutine
	go im.cleanupExpiredKeys()

	return im
}

// GenerateKey generates a stable idempotency key for an event
func (im *IdempotencyManager) GenerateKey(discussionID, eventType string, timestamp time.Time) string {
	// Create 5-minute time buckets to handle slight timing differences
	bucket := timestamp.Truncate(5 * time.Minute)

	// Generate stable hash
	data := fmt.Sprintf("%s_%s_%d", discussionID, eventType, bucket.Unix())
	hash := sha256.Sum256([]byte(data))
	return fmt.Sprintf("hmmm_%x", hash[:8]) // Use first 8 bytes for shorter key
}

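// Worked example of the bucketing: duplicate deliveries of the same event
// at 12:03 and 12:04 both truncate to the 12:00 bucket and hash to the same
// key, while a delivery at 12:06 falls into the 12:05 bucket and yields a
// different key.
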
// IsProcessed checks if an idempotency key has been processed recently
func (im *IdempotencyManager) IsProcessed(key string) bool {
	im.mu.RLock()
	defer im.mu.RUnlock()

	processTime, exists := im.keys[key]
	if !exists {
		return false
	}

	// Check if key is still valid (not expired)
	return time.Since(processTime) <= im.maxAge
}

// MarkProcessed marks an idempotency key as processed
func (im *IdempotencyManager) MarkProcessed(key string) {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.keys[key] = time.Now()
}

// cleanupExpiredKeys periodically removes expired idempotency keys
func (im *IdempotencyManager) cleanupExpiredKeys() {
	ticker := time.NewTicker(im.maxAge / 2) // Cleanup twice as often as expiry
	defer ticker.Stop()

	for range ticker.C {
		im.mu.Lock()
		now := time.Now()
		expired := make([]string, 0)

		for key, processTime := range im.keys {
			if now.Sub(processTime) > im.maxAge {
				expired = append(expired, key)
			}
		}

		for _, key := range expired {
			delete(im.keys, key)
		}

		if len(expired) > 0 {
			log.Printf("🧹 Cleaned up %d expired idempotency keys", len(expired))
		}

		im.mu.Unlock()
	}
}

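// Illustrative usage (not part of the original file): dedupe before
// processing, then record the key once the event is handled. processEvent
// and the discussionID/eventType/eventTime variables are hypothetical.
//
//	im := NewIdempotencyManager(30 * time.Minute)
//	key := im.GenerateKey(discussionID, eventType, eventTime)
//	if im.IsProcessed(key) {
//		return nil // duplicate delivery, skip
//	}
//	if err := processEvent(event); err != nil {
//		return err
//	}
//	im.MarkProcessed(key)
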
// DeadLetterQueue handles failed events that need to be retried later
type DeadLetterQueue struct {
	queueDir   string
	mu         sync.RWMutex
	items      map[string]*DLQItem
	maxRetries int
}

// DLQItem represents an item in the dead letter queue
type DLQItem struct {
	Event         SlurpEvent `json:"event"`
	FailureReason string     `json:"failure_reason"`
	RetryCount    int        `json:"retry_count"`
	NextRetryTime time.Time  `json:"next_retry_time"`
	FirstFailed   time.Time  `json:"first_failed"`
	LastFailed    time.Time  `json:"last_failed"`
}

// NewDeadLetterQueue creates a new dead letter queue
func NewDeadLetterQueue(queueDir string, maxRetries int) (*DeadLetterQueue, error) {
	if err := os.MkdirAll(queueDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create queue directory: %w", err)
	}

	dlq := &DeadLetterQueue{
		queueDir:   queueDir,
		items:      make(map[string]*DLQItem),
		maxRetries: maxRetries,
	}

	// Load existing items from disk
	if err := dlq.loadFromDisk(); err != nil {
		log.Printf("⚠️ Failed to load DLQ from disk: %v", err)
	}

	return dlq, nil
}

// Enqueue adds a failed event to the dead letter queue
func (dlq *DeadLetterQueue) Enqueue(event SlurpEvent, reason string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	eventID := dlq.generateEventID(event)
	now := time.Now()

	// Check if event already exists in DLQ
	if existing, exists := dlq.items[eventID]; exists {
		existing.RetryCount++
		existing.FailureReason = reason
		existing.LastFailed = now
		existing.NextRetryTime = dlq.calculateNextRetry(existing.RetryCount)

		log.Printf("💀 Updated DLQ item %s (retry %d/%d)", eventID, existing.RetryCount, dlq.maxRetries)
	} else {
		// Create new DLQ item
		item := &DLQItem{
			Event:         event,
			FailureReason: reason,
			RetryCount:    1,
			NextRetryTime: dlq.calculateNextRetry(1),
			FirstFailed:   now,
			LastFailed:    now,
		}

		dlq.items[eventID] = item
		log.Printf("💀 Added new item to DLQ: %s", eventID)
	}

	// Persist to disk
	return dlq.saveToDisk()
}

// GetReadyItems returns items that are ready for retry
func (dlq *DeadLetterQueue) GetReadyItems() []*DLQItem {
	dlq.mu.RLock()
	defer dlq.mu.RUnlock()

	now := time.Now()
	ready := make([]*DLQItem, 0)

	for _, item := range dlq.items {
		if item.RetryCount <= dlq.maxRetries && now.After(item.NextRetryTime) {
			ready = append(ready, item)
		}
	}

	return ready
}

// MarkSuccess removes an item from the DLQ after successful retry
func (dlq *DeadLetterQueue) MarkSuccess(eventID string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	delete(dlq.items, eventID)
	log.Printf("✅ Removed successfully retried item from DLQ: %s", eventID)

	return dlq.saveToDisk()
}

// MarkFailure updates the retry count after a failed retry attempt
func (dlq *DeadLetterQueue) MarkFailure(eventID string, reason string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	if item, exists := dlq.items[eventID]; exists {
		item.RetryCount++
		item.FailureReason = reason
		item.LastFailed = time.Now()
		item.NextRetryTime = dlq.calculateNextRetry(item.RetryCount)

		if item.RetryCount > dlq.maxRetries {
			log.Printf("💀 Item exceeded max retries, keeping in DLQ for manual review: %s", eventID)
		}
	}

	return dlq.saveToDisk()
}

// GetStats returns DLQ statistics
func (dlq *DeadLetterQueue) GetStats() map[string]interface{} {
	dlq.mu.RLock()
	defer dlq.mu.RUnlock()

	ready := 0
	exhausted := 0
	waiting := 0

	now := time.Now()
	for _, item := range dlq.items {
		if item.RetryCount > dlq.maxRetries {
			exhausted++
		} else if now.After(item.NextRetryTime) {
			ready++
		} else {
			waiting++
		}
	}

	return map[string]interface{}{
		"total_items":     len(dlq.items),
		"ready_for_retry": ready,
		"waiting":         waiting,
		"exhausted":       exhausted,
		"max_retries":     dlq.maxRetries,
	}
}

// calculateNextRetry calculates the next retry time using exponential backoff with jitter
func (dlq *DeadLetterQueue) calculateNextRetry(retryCount int) time.Time {
	// Exponential backoff: 2^retryCount minutes
	baseDelay := time.Duration(math.Pow(2, float64(retryCount))) * time.Minute

	// Add jitter (±25% random variation); scale in float64 before converting,
	// since casting a sub-1.0 float directly to time.Duration truncates to zero
	jitter := time.Duration((rand.Float64()*0.5 - 0.25) * float64(baseDelay))
	delay := baseDelay + jitter

	// Cap at 1 hour maximum
	if delay > time.Hour {
		delay = time.Hour
	}

	return time.Now().Add(delay)
}

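// With the parameters above the schedule works out to roughly 2m, 4m, 8m,
// 16m, 32m for retries 1 through 5 (each ±25% jitter), after which the
// 1-hour cap applies.
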
// generateEventID creates a unique ID for an event
func (dlq *DeadLetterQueue) generateEventID(event SlurpEvent) string {
	data := fmt.Sprintf("%s_%s_%s_%d",
		event.EventType,
		event.Path,
		event.CreatedBy,
		event.Timestamp.Unix())

	hash := sha256.Sum256([]byte(data))
	return fmt.Sprintf("dlq_%x", hash[:8])
}

// saveToDisk persists the DLQ to disk
func (dlq *DeadLetterQueue) saveToDisk() error {
	filePath := filepath.Join(dlq.queueDir, "dlq_items.json")

	data, err := json.MarshalIndent(dlq.items, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal DLQ items: %w", err)
	}

	return os.WriteFile(filePath, data, 0644)
}

// loadFromDisk loads the DLQ from disk
func (dlq *DeadLetterQueue) loadFromDisk() error {
	filePath := filepath.Join(dlq.queueDir, "dlq_items.json")

	data, err := os.ReadFile(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil // No existing queue file, start fresh
		}
		return fmt.Errorf("failed to read DLQ file: %w", err)
	}

	return json.Unmarshal(data, &dlq.items)
}

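// Illustrative usage (not part of the original file): a periodic drain loop
// that retries ready items. retryEvent and the queue path are hypothetical;
// the event ID is recomputed the same way Enqueue derives it.
//
//	dlq, err := NewDeadLetterQueue("/var/lib/bzzz/dlq", 5)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for _, item := range dlq.GetReadyItems() {
//		id := dlq.generateEventID(item.Event)
//		if err := retryEvent(item.Event); err != nil {
//			dlq.MarkFailure(id, err.Error())
//			continue
//		}
//		dlq.MarkSuccess(id)
//	}
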
// BackoffStrategy calculates retry delays with exponential backoff and jitter
type BackoffStrategy struct {
	initialDelay time.Duration
	maxDelay     time.Duration
	multiplier   float64
	jitterFactor float64
}

// NewBackoffStrategy creates a new backoff strategy
func NewBackoffStrategy(initialDelay, maxDelay time.Duration, multiplier, jitterFactor float64) *BackoffStrategy {
	return &BackoffStrategy{
		initialDelay: initialDelay,
		maxDelay:     maxDelay,
		multiplier:   multiplier,
		jitterFactor: jitterFactor,
	}
}

// GetDelay calculates the delay for a given attempt number
func (bs *BackoffStrategy) GetDelay(attempt int) time.Duration {
	if attempt <= 0 {
		return bs.initialDelay
	}

	// Exponential backoff
	delay := time.Duration(float64(bs.initialDelay) * math.Pow(bs.multiplier, float64(attempt-1)))

	// Apply maximum delay cap
	if delay > bs.maxDelay {
		delay = bs.maxDelay
	}

	// Add jitter to avoid thundering herd (±jitterFactor of the delay);
	// scale in float64 before converting, since casting a sub-1.0 float
	// directly to time.Duration truncates to zero
	jitter := time.Duration((rand.Float64()*2 - 1) * bs.jitterFactor * float64(delay))
	delay += jitter

	// Ensure delay is never negative
	if delay < 0 {
		delay = bs.initialDelay
	}

	return delay
}
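
// Illustrative usage (not part of the original file): retrying a flaky call
// with the strategy. attemptPublish is a hypothetical helper.
//
//	bs := NewBackoffStrategy(500*time.Millisecond, 30*time.Second, 2.0, 0.2)
//	var err error
//	for attempt := 1; attempt <= 5; attempt++ {
//		if err = attemptPublish(event); err == nil {
//			break
//		}
//		time.Sleep(bs.GetDelay(attempt))
//	}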