---
# Enhanced Alert Rules for BZZZ v2 Infrastructure
# Service Level Objectives and Critical System Alerts
#
# Conventions used in this file:
#   * Success-rate ratios wrap both numerator and denominator in
#     `sum without (status) (...)`. PromQL binary operators match series on
#     identical label sets, so dividing a `status="success"` selector by an
#     unfiltered selector would only ever pair the success series with itself
#     and yield 1.
#   * `increase()`/`rate()` are used on counters only; gauges use `delta()` or
#     `changes()`.
#   * Long PromQL expressions use block scalars so they stay readable.

groups:
  # === System Health and SLO Alerts ===
  - name: bzzz_system_health
    rules:
      # Overall system health score
      - alert: BZZZSystemHealthCritical
        expr: bzzz_system_health_score < 0.5
        for: 2m
        labels:
          severity: critical
          service: bzzz
          slo: availability
        annotations:
          summary: "BZZZ system health is critically low"
          description: "System health score {{ $value }} is below critical threshold (0.5)"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-health-critical"

      - alert: BZZZSystemHealthDegraded
        expr: bzzz_system_health_score < 0.8
        for: 5m
        labels:
          severity: warning
          service: bzzz
          slo: availability
        annotations:
          summary: "BZZZ system health is degraded"
          description: "System health score {{ $value }} is below warning threshold (0.8)"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-health-degraded"

      # Component health monitoring
      - alert: BZZZComponentUnhealthy
        expr: bzzz_component_health_score < 0.7
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: "{{ $labels.component }}"
        annotations:
          summary: "BZZZ component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} health score {{ $value }} is below threshold"

  # === P2P Network Alerts ===
  - name: bzzz_p2p_network
    rules:
      # Peer connectivity SLO: Maintain at least 3 connected peers
      - alert: BZZZInsufficientPeers
        expr: bzzz_p2p_connected_peers < 3
        for: 1m
        labels:
          severity: critical
          service: bzzz
          component: p2p
          slo: connectivity
        annotations:
          summary: "BZZZ has insufficient P2P peers"
          description: "Only {{ $value }} peers connected, minimum required is 3"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-peer-connectivity"

      # Message latency SLO: 95th percentile < 500ms
      - alert: BZZZP2PHighLatency
        expr: histogram_quantile(0.95, rate(bzzz_p2p_message_latency_seconds_bucket[5m])) > 0.5
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: p2p
          slo: latency
        annotations:
          summary: "BZZZ P2P message latency is high"
          description: "95th percentile latency {{ $value }}s exceeds 500ms SLO"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-p2p-latency"

      # Message loss detection: sent rate exceeds received rate by > 0.1 msg/s
      - alert: BZZZP2PMessageLoss
        expr: |-
          rate(bzzz_p2p_messages_sent_total[5m])
          - rate(bzzz_p2p_messages_received_total[5m])
          > 0.1
        for: 2m
        labels:
          severity: warning
          service: bzzz
          component: p2p
        annotations:
          summary: "BZZZ P2P message loss detected"
          description: "Message send/receive imbalance: {{ $value }} messages/sec"

  # === DHT Performance and Reliability ===
  - name: bzzz_dht
    rules:
      # DHT operation success rate SLO: > 99%
      # (successful puts + gets) / (all puts + gets), aggregated over `status`.
      - alert: BZZZDHTLowSuccessRate
        expr: |-
          (
            sum without (status) (rate(bzzz_dht_put_operations_total{status="success"}[5m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total{status="success"}[5m]))
          )
          /
          (
            sum without (status) (rate(bzzz_dht_put_operations_total[5m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total[5m]))
          )
          < 0.99
        for: 2m
        labels:
          severity: warning
          service: bzzz
          component: dht
          slo: success_rate
        annotations:
          summary: "BZZZ DHT operation success rate is low"
          description: "DHT success rate {{ $value | humanizePercentage }} is below 99% SLO"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-dht-success-rate"

      # DHT operation latency SLO: 95th percentile < 300ms for gets
      - alert: BZZZDHTHighGetLatency
        expr: |-
          histogram_quantile(
            0.95,
            rate(bzzz_dht_operation_latency_seconds_bucket{operation="get"}[5m])
          ) > 0.3
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: dht
          slo: latency
        annotations:
          summary: "BZZZ DHT get operations are slow"
          description: "95th percentile get latency {{ $value }}s exceeds 300ms SLO"

      # DHT replication health: fires when the average drops below 2
      # (the stated target is 3).
      - alert: BZZZDHTReplicationDegraded
        expr: avg(bzzz_dht_replication_factor) < 2
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: dht
          slo: durability
        annotations:
          summary: "BZZZ DHT replication is degraded"
          description: "Average replication factor {{ $value }} is below target of 3"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-dht-replication"

      # Provider record staleness: the record count has not changed for 1h
      # while content keys exist. `changes()` is used instead of `increase()`
      # because the metric name has no `_total` suffix and is presumably a
      # gauge (TODO confirm); `changes() == 0` is correct for either type.
      - alert: BZZZDHTStaleProviders
        expr: changes(bzzz_dht_provider_records[1h]) == 0 and bzzz_dht_content_keys > 0
        for: 10m
        labels:
          severity: warning
          service: bzzz
          component: dht
        annotations:
          summary: "BZZZ DHT provider records are not updating"
          description: "No provider record updates in the last hour despite having content"

  # === Election System Stability ===
  - name: bzzz_election
    rules:
      # Leadership stability: Avoid frequent leadership changes
      - alert: BZZZFrequentLeadershipChanges
        expr: increase(bzzz_leadership_changes_total[1h]) > 3
        for: 0m
        labels:
          severity: warning
          service: bzzz
          component: election
        annotations:
          summary: "BZZZ leadership is unstable"
          description: "{{ $value }} leadership changes in the last hour"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-leadership-instability"

      # Election timeout
      - alert: BZZZElectionInProgress
        expr: bzzz_election_state{state="electing"} == 1
        for: 2m
        labels:
          severity: warning
          service: bzzz
          component: election
        annotations:
          summary: "BZZZ election taking too long"
          description: "Election has been in progress for more than 2 minutes"

      # No admin elected
      # `absent()` returns a series with an empty label set, so `on()` is
      # required for the `and` to match the labelled election-state series.
      - alert: BZZZNoAdminElected
        expr: |-
          bzzz_election_state{state="idle"} == 1
          and on()
          absent(bzzz_heartbeats_received_total)
        for: 1m
        labels:
          severity: critical
          service: bzzz
          component: election
        annotations:
          summary: "BZZZ has no elected admin"
          description: "System is idle but no heartbeats are being received"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-no-admin"

      # Heartbeat monitoring
      - alert: BZZZHeartbeatMissing
        expr: increase(bzzz_heartbeats_received_total[2m]) == 0
        for: 1m
        labels:
          severity: critical
          service: bzzz
          component: election
        annotations:
          summary: "BZZZ admin heartbeat missing"
          description: "No heartbeats received from admin in the last 2 minutes"

  # === PubSub Messaging System ===
  - name: bzzz_pubsub
    rules:
      # Message processing rate
      - alert: BZZZPubSubHighMessageRate
        expr: rate(bzzz_pubsub_messages_total[1m]) > 1000
        for: 2m
        labels:
          severity: warning
          service: bzzz
          component: pubsub
        annotations:
          summary: "BZZZ PubSub message rate is very high"
          description: "Processing {{ $value }} messages/sec, may indicate spam or DoS"

      # Message latency
      - alert: BZZZPubSubHighLatency
        expr: histogram_quantile(0.95, rate(bzzz_pubsub_message_latency_seconds_bucket[5m])) > 1.0
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: pubsub
          slo: latency
        annotations:
          summary: "BZZZ PubSub message latency is high"
          description: "95th percentile latency {{ $value }}s exceeds 1s threshold"

      # Topic monitoring
      - alert: BZZZPubSubNoTopics
        expr: bzzz_pubsub_topics == 0
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: pubsub
        annotations:
          summary: "BZZZ PubSub has no active topics"
          description: "No PubSub topics are active, system may be isolated"

  # === Task Management and Processing ===
  - name: bzzz_tasks
    rules:
      # Task queue backup
      - alert: BZZZTaskQueueBackup
        expr: bzzz_tasks_queued > 100
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: tasks
        annotations:
          summary: "BZZZ task queue is backing up"
          description: "{{ $value }} tasks are queued, may indicate processing issues"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-task-queue"

      # Task success rate SLO: > 95%
      - alert: BZZZTaskLowSuccessRate
        expr: |-
          sum without (status) (rate(bzzz_tasks_completed_total{status="success"}[10m]))
          /
          sum without (status) (rate(bzzz_tasks_completed_total[10m]))
          < 0.95
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: tasks
          slo: success_rate
        annotations:
          summary: "BZZZ task success rate is low"
          description: "Task success rate {{ $value | humanizePercentage }} is below 95% SLO"

      # Task processing latency
      - alert: BZZZTaskHighProcessingTime
        expr: histogram_quantile(0.95, rate(bzzz_task_duration_seconds_bucket[5m])) > 300
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: tasks
        annotations:
          summary: "BZZZ task processing time is high"
          description: "95th percentile task duration {{ $value }}s exceeds 5 minutes"

  # === SLURP Context Generation ===
  - name: bzzz_slurp
    rules:
      # Context generation success rate
      - alert: BZZZSLURPLowSuccessRate
        expr: |-
          sum without (status) (rate(bzzz_slurp_contexts_generated_total{status="success"}[10m]))
          /
          sum without (status) (rate(bzzz_slurp_contexts_generated_total[10m]))
          < 0.90
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: slurp
        annotations:
          summary: "SLURP context generation success rate is low"
          description: "Success rate {{ $value | humanizePercentage }} is below 90%"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-slurp-generation"

      # Generation queue backup
      - alert: BZZZSLURPQueueBackup
        expr: bzzz_slurp_queue_length > 50
        for: 10m
        labels:
          severity: warning
          service: bzzz
          component: slurp
        annotations:
          summary: "SLURP generation queue is backing up"
          description: "{{ $value }} contexts are queued for generation"

      # Generation time SLO: 95th percentile < 2 minutes
      - alert: BZZZSLURPSlowGeneration
        expr: histogram_quantile(0.95, rate(bzzz_slurp_generation_time_seconds_bucket[10m])) > 120
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: slurp
          slo: latency
        annotations:
          summary: "SLURP context generation is slow"
          description: "95th percentile generation time {{ $value }}s exceeds 2 minutes"

  # === UCXI Protocol Resolution ===
  - name: bzzz_ucxi
    rules:
      # Resolution success rate SLO: > 99%
      # Several 2xx codes may exist, so they are summed before dividing.
      - alert: BZZZUCXILowSuccessRate
        expr: |-
          sum without (status) (rate(bzzz_ucxi_requests_total{status=~"2.."}[5m]))
          /
          sum without (status) (rate(bzzz_ucxi_requests_total[5m]))
          < 0.99
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: ucxi
          slo: success_rate
        annotations:
          summary: "UCXI resolution success rate is low"
          description: "Success rate {{ $value | humanizePercentage }} is below 99% SLO"

      # Resolution latency SLO: 95th percentile < 100ms
      - alert: BZZZUCXIHighLatency
        expr: histogram_quantile(0.95, rate(bzzz_ucxi_resolution_latency_seconds_bucket[5m])) > 0.1
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: ucxi
          slo: latency
        annotations:
          summary: "UCXI resolution latency is high"
          description: "95th percentile latency {{ $value }}s exceeds 100ms SLO"

  # === Resource Utilization ===
  - name: bzzz_resources
    rules:
      # CPU utilization
      - alert: BZZZHighCPUUsage
        expr: bzzz_cpu_usage_ratio > 0.85
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: system
        annotations:
          summary: "BZZZ CPU usage is high"
          description: "CPU usage {{ $value | humanizePercentage }} exceeds 85%"

      # Memory utilization: threshold is 8 GiB.
      # The comparison is done in bytes so that `$value` is a byte count and
      # `humanize1024` in the description renders e.g. "8.5GiB".
      - alert: BZZZHighMemoryUsage
        expr: bzzz_memory_usage_bytes > 8 * 1024 * 1024 * 1024
        for: 3m
        labels:
          severity: warning
          service: bzzz
          component: system
        annotations:
          summary: "BZZZ memory usage is high"
          description: "Memory usage {{ $value | humanize1024 }}B is high"

      # Disk utilization
      - alert: BZZZHighDiskUsage
        expr: bzzz_disk_usage_ratio > 0.90
        for: 5m
        labels:
          severity: critical
          service: bzzz
          component: system
        annotations:
          summary: "BZZZ disk usage is critical"
          description: "Disk usage {{ $value | humanizePercentage }} on {{ $labels.mount_point }} exceeds 90%"

      # Goroutine leak detection
      # The goroutine count is a gauge, so `delta()` is used; `increase()`
      # would treat every drop in the count as a counter reset.
      - alert: BZZZGoroutineLeak
        expr: delta(bzzz_goroutines[30m]) > 1000
        for: 5m
        labels:
          severity: warning
          service: bzzz
          component: system
        annotations:
          summary: "Possible BZZZ goroutine leak"
          description: "Goroutine count increased by {{ $value }} in 30 minutes"

  # === Error Rate Monitoring ===
  - name: bzzz_errors
    rules:
      # General error rate
      - alert: BZZZHighErrorRate
        expr: rate(bzzz_errors_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
          service: bzzz
        annotations:
          summary: "BZZZ error rate is high"
          description: "Error rate {{ $value }} errors/sec in component {{ $labels.component }}"

      # Panic detection
      - alert: BZZZPanicsDetected
        expr: increase(bzzz_panics_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
          service: bzzz
        annotations:
          summary: "BZZZ panic detected"
          description: "{{ $value }} panic(s) occurred in the last 5 minutes"
          runbook_url: "https://wiki.chorus.services/runbooks/bzzz-panic-recovery"

  # === Health Check Monitoring ===
  - name: bzzz_health_checks
    rules:
      # Health check failure rate
      - alert: BZZZHealthCheckFailures
        expr: rate(bzzz_health_checks_failed_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: bzzz
          component: health
        annotations:
          summary: "BZZZ health check failures detected"
          description: "Health check {{ $labels.check_name }} failing at {{ $value }} failures/sec"

      # Critical health check failure
      - alert: BZZZCriticalHealthCheckFailed
        expr: |-
          increase(
            bzzz_health_checks_failed_total{check_name=~".*-enhanced|p2p-connectivity"}[2m]
          ) > 0
        for: 0m
        labels:
          severity: critical
          service: bzzz
          component: health
        annotations:
          summary: "Critical BZZZ health check failed"
          description: "Critical health check {{ $labels.check_name }} failed: {{ $labels.reason }}"

  # === Service Level Indicator Recording Rules ===
  - name: bzzz_sli_recording
    interval: 30s
    rules:
      # DHT operation SLI (5m window).
      # Parenthesised so the sum of successes is divided by the sum of all
      # operations; without the parentheses `/` binds tighter than `+`.
      - record: bzzz:dht_success_rate
        expr: |-
          (
            sum without (status) (rate(bzzz_dht_put_operations_total{status="success"}[5m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total{status="success"}[5m]))
          )
          /
          (
            sum without (status) (rate(bzzz_dht_put_operations_total[5m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total[5m]))
          )

      # DHT operation SLI over the additional windows needed by the
      # multi-window burn-rate alerts in the bzzz_slo_alerts group.
      - record: bzzz:dht_success_rate:rate30m
        expr: |-
          (
            sum without (status) (rate(bzzz_dht_put_operations_total{status="success"}[30m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total{status="success"}[30m]))
          )
          /
          (
            sum without (status) (rate(bzzz_dht_put_operations_total[30m]))
            + sum without (status) (rate(bzzz_dht_get_operations_total[30m]))
          )

      - record: bzzz:dht_success_rate:rate1h
        expr: |-
          (
            sum without (status) (rate(bzzz_dht_put_operations_total{status="success"}[1h]))
            + sum without (status) (rate(bzzz_dht_get_operations_total{status="success"}[1h]))
          )
          /
          (
            sum without (status) (rate(bzzz_dht_put_operations_total[1h]))
            + sum without (status) (rate(bzzz_dht_get_operations_total[1h]))
          )

      - record: bzzz:dht_success_rate:rate6h
        expr: |-
          (
            sum without (status) (rate(bzzz_dht_put_operations_total{status="success"}[6h]))
            + sum without (status) (rate(bzzz_dht_get_operations_total{status="success"}[6h]))
          )
          /
          (
            sum without (status) (rate(bzzz_dht_put_operations_total[6h]))
            + sum without (status) (rate(bzzz_dht_get_operations_total[6h]))
          )

      # P2P connectivity SLI
      - record: bzzz:p2p_connectivity_ratio
        expr: bzzz_p2p_connected_peers / 10  # Target of 10 peers

      # UCXI success rate SLI
      - record: bzzz:ucxi_success_rate
        expr: |-
          sum without (status) (rate(bzzz_ucxi_requests_total{status=~"2.."}[5m]))
          /
          sum without (status) (rate(bzzz_ucxi_requests_total[5m]))

      # Task success rate SLI
      - record: bzzz:task_success_rate
        expr: |-
          sum without (status) (rate(bzzz_tasks_completed_total{status="success"}[5m]))
          /
          sum without (status) (rate(bzzz_tasks_completed_total[5m]))

      # Overall availability SLI
      - record: bzzz:overall_availability
        expr: bzzz_system_health_score

  # === Multi-Window Multi-Burn-Rate Alerts ===
  # Error budget for the 99% DHT SLO is 0.01. Each alert requires BOTH a long
  # and a short window to exceed the burn-rate threshold, so it fires on a
  # sustained burn and resolves quickly once the burn stops.
  # NOTE(review): the budget percentages below assume a 30-day SLO window,
  # which is what the 14.4x factor implies - confirm.
  - name: bzzz_slo_alerts
    rules:
      # Fast burn rate (2% of error budget in 1 hour): 14.4x over 1h and 5m
      - alert: BZZZErrorBudgetBurnHigh
        expr: |-
          (1 - bzzz:dht_success_rate:rate1h) > (14.4 * 0.01)
          and
          (1 - bzzz:dht_success_rate) > (14.4 * 0.01)
        for: 2m
        labels:
          severity: critical
          service: bzzz
          burnrate: fast
          slo: dht_success_rate
        annotations:
          summary: "BZZZ DHT error budget burning fast"
          # Estimate: seconds a full 30-day budget lasts at the 1h error ratio.
          description: >-
            DHT error budget will be exhausted in
            {{ with query "0.01 * 30 * 24 * 3600 / (1 - bzzz:dht_success_rate:rate1h)" }}{{
            . | first | value | humanizeDuration }}{{ end }}

      # Slow burn rate (5% of error budget in 6 hours): 6x over 6h and 30m
      - alert: BZZZErrorBudgetBurnSlow
        expr: |-
          (1 - bzzz:dht_success_rate:rate6h) > (6 * 0.01)
          and
          (1 - bzzz:dht_success_rate:rate30m) > (6 * 0.01)
        for: 15m
        labels:
          severity: warning
          service: bzzz
          burnrate: slow
          slo: dht_success_rate
        annotations:
          summary: "BZZZ DHT error budget burning slowly"
          description: "DHT error budget depletion rate is concerning"