🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved
Comprehensive multi-agent implementation addressing all issues from INDEX.md:

## Core Architecture & Validation
- ✅ Issue 001: UCXL address validation at all system boundaries
- ✅ Issue 002: Fixed search parsing bug in encrypted storage
- ✅ Issue 003: Wired UCXI P2P announce and discover functionality
- ✅ Issue 011: Aligned temporal grammar and documentation
- ✅ Issue 012: SLURP idempotency, backpressure, and DLQ implementation
- ✅ Issue 013: Linked SLURP events to UCXL decisions and DHT

## API Standardization & Configuration
- ✅ Issue 004: Standardized UCXI payloads to UCXL codes
- ✅ Issue 010: Status endpoints and configuration surface

## Infrastructure & Operations
- ✅ Issue 005: Election heartbeat on admin transition
- ✅ Issue 006: Active health checks for PubSub and DHT
- ✅ Issue 007: DHT replication and provider records
- ✅ Issue 014: SLURP leadership lifecycle and health probes
- ✅ Issue 015: Comprehensive monitoring, SLOs, and alerts

## Security & Access Control
- ✅ Issue 008: Key rotation and role-based access policies

## Testing & Quality Assurance
- ✅ Issue 009: Integration tests for UCXI + DHT encryption + search
- ✅ Issue 016: E2E tests for HMMM → SLURP → UCXL workflow

## HMMM Integration
- ✅ Issue 017: HMMM adapter wiring and comprehensive testing

## Key Features Delivered
- Enterprise-grade security with automated key rotation
- Comprehensive monitoring with Prometheus/Grafana stack
- Role-based collaboration with HMMM integration
- Complete API standardization with UCXL response formats
- Full test coverage with integration and E2E testing
- Production-ready infrastructure monitoring and alerting

All solutions include comprehensive testing, documentation, and production-ready implementations.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
pkg/integration/slurp_reliability.go (474 lines added, new file)

@@ -0,0 +1,474 @@
package integration

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"log"
	"math"
	"math/rand"
	"os"
	"path/filepath"
	"sync"
	"time"
)

// CircuitState represents the state of a circuit breaker
type CircuitState int

const (
	CircuitClosed CircuitState = iota
	CircuitOpen
	CircuitHalfOpen
)

// String returns the string representation of a circuit state
func (s CircuitState) String() string {
	switch s {
	case CircuitClosed:
		return "CLOSED"
	case CircuitOpen:
		return "OPEN"
	case CircuitHalfOpen:
		return "HALF_OPEN"
	default:
		return "UNKNOWN"
	}
}

// CircuitBreaker implements the circuit breaker pattern for the SLURP client
type CircuitBreaker struct {
	mu                  sync.RWMutex
	state               CircuitState
	failureCount        int
	consecutiveFailures int
	lastFailureTime     time.Time
	nextRetryTime       time.Time

	// Configuration
	maxFailures     int           // Max failures before opening circuit
	cooldownPeriod  time.Duration // How long to stay open
	halfOpenTimeout time.Duration // How long to wait in half-open before closing

	// Metrics
	totalRequests      int64
	successfulRequests int64
	failedRequests     int64
}

// NewCircuitBreaker creates a new circuit breaker
func NewCircuitBreaker(maxFailures int, cooldownPeriod, halfOpenTimeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		state:           CircuitClosed,
		maxFailures:     maxFailures,
		cooldownPeriod:  cooldownPeriod,
		halfOpenTimeout: halfOpenTimeout,
	}
}

// CanProceed reports whether a request may proceed through the circuit breaker
func (cb *CircuitBreaker) CanProceed() bool {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.totalRequests++

	switch cb.state {
	case CircuitClosed:
		return true

	case CircuitOpen:
		if time.Now().After(cb.nextRetryTime) {
			cb.state = CircuitHalfOpen
			log.Printf("🔄 Circuit breaker moving to HALF_OPEN state")
			return true
		}
		return false

	case CircuitHalfOpen:
		return true

	default:
		return false
	}
}

// RecordSuccess records a successful operation
func (cb *CircuitBreaker) RecordSuccess() {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.successfulRequests++
	cb.failureCount = 0
	cb.consecutiveFailures = 0

	if cb.state == CircuitHalfOpen {
		cb.state = CircuitClosed
		log.Printf("✅ Circuit breaker closed after successful operation")
	}
}

// RecordFailure records a failed operation
func (cb *CircuitBreaker) RecordFailure() {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	cb.failedRequests++
	cb.failureCount++
	cb.consecutiveFailures++
	cb.lastFailureTime = time.Now()

	switch {
	case cb.state == CircuitHalfOpen:
		// A failed probe in half-open reopens the circuit immediately
		cb.state = CircuitOpen
		cb.nextRetryTime = time.Now().Add(cb.cooldownPeriod)
		log.Printf("🚫 Circuit breaker reopened after failed HALF_OPEN probe")
	case cb.state == CircuitClosed && cb.failureCount >= cb.maxFailures:
		cb.state = CircuitOpen
		cb.nextRetryTime = time.Now().Add(cb.cooldownPeriod)
		log.Printf("🚫 Circuit breaker opened due to %d consecutive failures", cb.consecutiveFailures)
	}
}

// GetStats returns circuit breaker statistics
func (cb *CircuitBreaker) GetStats() map[string]interface{} {
	cb.mu.RLock()
	defer cb.mu.RUnlock()

	return map[string]interface{}{
		"state":                cb.state.String(),
		"total_requests":       cb.totalRequests,
		"successful_requests":  cb.successfulRequests,
		"failed_requests":      cb.failedRequests,
		"current_failures":     cb.failureCount,
		"consecutive_failures": cb.consecutiveFailures,
		"last_failure_time":    cb.lastFailureTime,
		"next_retry_time":      cb.nextRetryTime,
	}
}

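// Illustrative usage (not part of the original file): a caller wraps each
// SLURP submission with the breaker. sendToSLURP is a hypothetical helper
// standing in for the real client call.
//
//	cb := NewCircuitBreaker(5, 30*time.Second, 10*time.Second)
//	if !cb.CanProceed() {
//		return fmt.Errorf("SLURP circuit open, rejecting event")
//	}
//	if err := sendToSLURP(event); err != nil {
//		cb.RecordFailure()
//		return err
//	}
//	cb.RecordSuccess()
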
// IdempotencyManager handles idempotency key generation and tracking
type IdempotencyManager struct {
	keys   map[string]time.Time
	mu     sync.RWMutex
	maxAge time.Duration
}

// NewIdempotencyManager creates a new idempotency manager
func NewIdempotencyManager(maxAge time.Duration) *IdempotencyManager {
	im := &IdempotencyManager{
		keys:   make(map[string]time.Time),
		maxAge: maxAge,
	}

	// Start cleanup goroutine
	go im.cleanupExpiredKeys()

	return im
}

// GenerateKey generates a stable idempotency key for an event
func (im *IdempotencyManager) GenerateKey(discussionID, eventType string, timestamp time.Time) string {
	// Create 5-minute time buckets to handle slight timing differences
	bucket := timestamp.Truncate(5 * time.Minute)

	// Generate stable hash
	data := fmt.Sprintf("%s_%s_%d", discussionID, eventType, bucket.Unix())
	hash := sha256.Sum256([]byte(data))
	return fmt.Sprintf("hmmm_%x", hash[:8]) // Use first 8 bytes for shorter key
}

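// Worked example of the bucketing: duplicate deliveries of the same event
// at 12:03 and 12:04 both truncate to the 12:00 bucket and hash to the same
// key, while a delivery at 12:06 falls into the 12:05 bucket and yields a
// different key.
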
// IsProcessed checks if an idempotency key has been processed recently
func (im *IdempotencyManager) IsProcessed(key string) bool {
	im.mu.RLock()
	defer im.mu.RUnlock()

	processTime, exists := im.keys[key]
	if !exists {
		return false
	}

	// Check if key is still valid (not expired)
	return time.Since(processTime) <= im.maxAge
}

// MarkProcessed marks an idempotency key as processed
func (im *IdempotencyManager) MarkProcessed(key string) {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.keys[key] = time.Now()
}

// cleanupExpiredKeys periodically removes expired idempotency keys
func (im *IdempotencyManager) cleanupExpiredKeys() {
	ticker := time.NewTicker(im.maxAge / 2) // Cleanup twice as often as expiry
	defer ticker.Stop()

	for range ticker.C {
		im.mu.Lock()
		now := time.Now()
		expired := make([]string, 0)

		for key, processTime := range im.keys {
			if now.Sub(processTime) > im.maxAge {
				expired = append(expired, key)
			}
		}

		for _, key := range expired {
			delete(im.keys, key)
		}

		if len(expired) > 0 {
			log.Printf("🧹 Cleaned up %d expired idempotency keys", len(expired))
		}

		im.mu.Unlock()
	}
}

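// Illustrative usage (not part of the original file): dedupe before
// processing, then record the key once the event is handled. processEvent
// and the discussionID/eventType/eventTime variables are hypothetical.
//
//	im := NewIdempotencyManager(30 * time.Minute)
//	key := im.GenerateKey(discussionID, eventType, eventTime)
//	if im.IsProcessed(key) {
//		return nil // duplicate delivery, skip
//	}
//	if err := processEvent(event); err != nil {
//		return err
//	}
//	im.MarkProcessed(key)
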
// DeadLetterQueue handles failed events that need to be retried later
type DeadLetterQueue struct {
	queueDir   string
	mu         sync.RWMutex
	items      map[string]*DLQItem
	maxRetries int
}

// DLQItem represents an item in the dead letter queue
type DLQItem struct {
	Event         SlurpEvent `json:"event"`
	FailureReason string     `json:"failure_reason"`
	RetryCount    int        `json:"retry_count"`
	NextRetryTime time.Time  `json:"next_retry_time"`
	FirstFailed   time.Time  `json:"first_failed"`
	LastFailed    time.Time  `json:"last_failed"`
}

// NewDeadLetterQueue creates a new dead letter queue
func NewDeadLetterQueue(queueDir string, maxRetries int) (*DeadLetterQueue, error) {
	if err := os.MkdirAll(queueDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create queue directory: %w", err)
	}

	dlq := &DeadLetterQueue{
		queueDir:   queueDir,
		items:      make(map[string]*DLQItem),
		maxRetries: maxRetries,
	}

	// Load existing items from disk
	if err := dlq.loadFromDisk(); err != nil {
		log.Printf("⚠️ Failed to load DLQ from disk: %v", err)
	}

	return dlq, nil
}

// Enqueue adds a failed event to the dead letter queue
func (dlq *DeadLetterQueue) Enqueue(event SlurpEvent, reason string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	eventID := dlq.generateEventID(event)
	now := time.Now()

	// Check if event already exists in DLQ
	if existing, exists := dlq.items[eventID]; exists {
		existing.RetryCount++
		existing.FailureReason = reason
		existing.LastFailed = now
		existing.NextRetryTime = dlq.calculateNextRetry(existing.RetryCount)

		log.Printf("💀 Updated DLQ item %s (retry %d/%d)", eventID, existing.RetryCount, dlq.maxRetries)
	} else {
		// Create new DLQ item
		item := &DLQItem{
			Event:         event,
			FailureReason: reason,
			RetryCount:    1,
			NextRetryTime: dlq.calculateNextRetry(1),
			FirstFailed:   now,
			LastFailed:    now,
		}

		dlq.items[eventID] = item
		log.Printf("💀 Added new item to DLQ: %s", eventID)
	}

	// Persist to disk
	return dlq.saveToDisk()
}

// GetReadyItems returns items that are ready for retry
func (dlq *DeadLetterQueue) GetReadyItems() []*DLQItem {
	dlq.mu.RLock()
	defer dlq.mu.RUnlock()

	now := time.Now()
	ready := make([]*DLQItem, 0)

	for _, item := range dlq.items {
		if item.RetryCount <= dlq.maxRetries && now.After(item.NextRetryTime) {
			ready = append(ready, item)
		}
	}

	return ready
}

// MarkSuccess removes an item from the DLQ after successful retry
func (dlq *DeadLetterQueue) MarkSuccess(eventID string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	delete(dlq.items, eventID)
	log.Printf("✅ Removed successfully retried item from DLQ: %s", eventID)

	return dlq.saveToDisk()
}

// MarkFailure updates the retry count after a failed retry attempt
func (dlq *DeadLetterQueue) MarkFailure(eventID string, reason string) error {
	dlq.mu.Lock()
	defer dlq.mu.Unlock()

	if item, exists := dlq.items[eventID]; exists {
		item.RetryCount++
		item.FailureReason = reason
		item.LastFailed = time.Now()
		item.NextRetryTime = dlq.calculateNextRetry(item.RetryCount)

		if item.RetryCount > dlq.maxRetries {
			log.Printf("💀 Item exceeded max retries, keeping in DLQ for manual review: %s", eventID)
		}
	}

	return dlq.saveToDisk()
}

// GetStats returns DLQ statistics
func (dlq *DeadLetterQueue) GetStats() map[string]interface{} {
	dlq.mu.RLock()
	defer dlq.mu.RUnlock()

	ready := 0
	exhausted := 0
	waiting := 0

	now := time.Now()
	for _, item := range dlq.items {
		if item.RetryCount > dlq.maxRetries {
			exhausted++
		} else if now.After(item.NextRetryTime) {
			ready++
		} else {
			waiting++
		}
	}

	return map[string]interface{}{
		"total_items":     len(dlq.items),
		"ready_for_retry": ready,
		"waiting":         waiting,
		"exhausted":       exhausted,
		"max_retries":     dlq.maxRetries,
	}
}

// calculateNextRetry calculates the next retry time using exponential backoff with jitter
func (dlq *DeadLetterQueue) calculateNextRetry(retryCount int) time.Time {
	// Exponential backoff: 2^retryCount minutes
	baseDelay := time.Duration(math.Pow(2, float64(retryCount))) * time.Minute

	// Add jitter (±25% random variation); scale in float64 before converting,
	// since casting a sub-1.0 float directly to time.Duration truncates to zero
	jitter := time.Duration((rand.Float64()*0.5 - 0.25) * float64(baseDelay))
	delay := baseDelay + jitter

	// Cap at 1 hour maximum
	if delay > time.Hour {
		delay = time.Hour
	}

	return time.Now().Add(delay)
}

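// With the parameters above the schedule works out to roughly 2m, 4m, 8m,
// 16m, 32m for retries 1 through 5 (each ±25% jitter), after which the
// 1-hour cap applies.
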
// generateEventID creates a unique ID for an event
func (dlq *DeadLetterQueue) generateEventID(event SlurpEvent) string {
	data := fmt.Sprintf("%s_%s_%s_%d",
		event.EventType,
		event.Path,
		event.CreatedBy,
		event.Timestamp.Unix())

	hash := sha256.Sum256([]byte(data))
	return fmt.Sprintf("dlq_%x", hash[:8])
}

// saveToDisk persists the DLQ to disk
func (dlq *DeadLetterQueue) saveToDisk() error {
	filePath := filepath.Join(dlq.queueDir, "dlq_items.json")

	data, err := json.MarshalIndent(dlq.items, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal DLQ items: %w", err)
	}

	return os.WriteFile(filePath, data, 0644)
}

// loadFromDisk loads the DLQ from disk
func (dlq *DeadLetterQueue) loadFromDisk() error {
	filePath := filepath.Join(dlq.queueDir, "dlq_items.json")

	data, err := os.ReadFile(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil // No existing queue file, start fresh
		}
		return fmt.Errorf("failed to read DLQ file: %w", err)
	}

	return json.Unmarshal(data, &dlq.items)
}

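// Illustrative usage (not part of the original file): a periodic drain loop
// that retries ready items. retryEvent and the queue path are hypothetical;
// the event ID is recomputed the same way Enqueue derives it.
//
//	dlq, err := NewDeadLetterQueue("/var/lib/bzzz/dlq", 5)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for _, item := range dlq.GetReadyItems() {
//		id := dlq.generateEventID(item.Event)
//		if err := retryEvent(item.Event); err != nil {
//			dlq.MarkFailure(id, err.Error())
//			continue
//		}
//		dlq.MarkSuccess(id)
//	}
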
// BackoffStrategy calculates retry delays with exponential backoff and jitter
type BackoffStrategy struct {
	initialDelay time.Duration
	maxDelay     time.Duration
	multiplier   float64
	jitterFactor float64
}

// NewBackoffStrategy creates a new backoff strategy
func NewBackoffStrategy(initialDelay, maxDelay time.Duration, multiplier, jitterFactor float64) *BackoffStrategy {
	return &BackoffStrategy{
		initialDelay: initialDelay,
		maxDelay:     maxDelay,
		multiplier:   multiplier,
		jitterFactor: jitterFactor,
	}
}

// GetDelay calculates the delay for a given attempt number
func (bs *BackoffStrategy) GetDelay(attempt int) time.Duration {
	if attempt <= 0 {
		return bs.initialDelay
	}

	// Exponential backoff
	delay := time.Duration(float64(bs.initialDelay) * math.Pow(bs.multiplier, float64(attempt-1)))

	// Apply maximum delay cap
	if delay > bs.maxDelay {
		delay = bs.maxDelay
	}

	// Add jitter to avoid thundering herd (±jitterFactor of the delay);
	// scale in float64 before converting, since casting a sub-1.0 float
	// directly to time.Duration truncates to zero
	jitter := time.Duration((rand.Float64()*2 - 1) * bs.jitterFactor * float64(delay))
	delay += jitter

	// Ensure delay is never negative
	if delay < 0 {
		delay = bs.initialDelay
	}

	return delay
}
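
// Illustrative usage (not part of the original file): retrying a flaky call
// with the strategy. attemptPublish is a hypothetical helper.
//
//	bs := NewBackoffStrategy(500*time.Millisecond, 30*time.Second, 2.0, 0.2)
//	var err error
//	for attempt := 1; attempt <= 5; attempt++ {
//		if err = attemptPublish(event); err == nil {
//			break
//		}
//		time.Sleep(bs.GetDelay(attempt))
//	}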