---
# BZZZ v2 Prometheus Alert Rules
#
# Conventions used in this file:
#   * Labels are static only. The current sample value ({{ $value }}) is
#     exposed through annotations, because labels are part of an alert's
#     identity: a label that changes on every evaluation creates a new alert
#     each time and keeps resetting the `for:` pending timer.
#   * `humanizePercentage` is applied only to 0-1 ratios. Expressions already
#     scaled to 0-100 are rendered with `humanize` followed by a literal "%".
#   * `histogram_quantile` is always fed `rate()` of the `_bucket` series,
#     aggregated with the `le` label preserved.

groups:
  # P2P Network Health Rules
  - name: p2p-network
    rules:
      - alert: P2PNetworkPartition
        expr: bzzz_p2p_connected_peers < 2
        for: 5m
        labels:
          severity: critical
          component: p2p
        annotations:
          summary: "P2P network partition detected"
          description: "Node {{ $labels.instance }} has less than 2 peers connected for more than 5 minutes"

      # NOTE(review): assumes bzzz_p2p_message_duration_seconds is exported as
      # a Prometheus histogram (i.e. a `_bucket` series exists) - confirm.
      - alert: P2PHighLatency
        expr: >-
          histogram_quantile(
            0.95,
            sum by (le, instance) (rate(bzzz_p2p_message_duration_seconds_bucket[5m]))
          ) > 5
        for: 2m
        labels:
          severity: warning
          component: p2p
        annotations:
          summary: "High P2P message latency"
          description: "95th percentile P2P message latency is {{ $value }}s on {{ $labels.instance }}"

      # The expression is a per-second rate of dropped messages (threshold
      # 0.1 msg/s), not a ratio, so it is reported in messages/s.
      - alert: P2PMessageDropRate
        expr: rate(bzzz_p2p_messages_dropped_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          component: p2p
        annotations:
          summary: "High P2P message drop rate"
          description: "P2P message drop rate is {{ $value | humanize }} messages/s on {{ $labels.instance }}"

  # DHT Network Rules
  - name: dht-network
    rules:
      - alert: DHTBootstrapNodeDown
        expr: up{job="dht-bootstrap"} == 0
        for: 1m
        labels:
          severity: critical
          component: dht
        annotations:
          summary: "DHT bootstrap node is down"
          description: "DHT bootstrap node {{ $labels.instance }} has been down for more than 1 minute"

      - alert: DHTRoutingTableSize
        expr: bzzz_dht_routing_table_size < 10
        for: 5m
        labels:
          severity: warning
          component: dht
        annotations:
          summary: "DHT routing table is small"
          description: "DHT routing table size is {{ $value }} on {{ $labels.instance }}, indicating poor network connectivity"

      # Failure ratio (0-1): failed lookups divided by all lookups.
      - alert: DHTLookupFailureRate
        expr: >-
          rate(bzzz_dht_lookup_failures_total[5m])
          / rate(bzzz_dht_lookups_total[5m]) > 0.2
        for: 2m
        labels:
          severity: warning
          component: dht
        annotations:
          summary: "High DHT lookup failure rate"
          description: "DHT lookup failure rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

  # Content Store Rules
  - name: content-store
    rules:
      # Value is a percentage in the 0-100 range (ratio * 100).
      - alert: ContentStoreDiskUsage
        expr: >-
          (bzzz_content_store_disk_used_bytes
          / bzzz_content_store_disk_total_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
          component: content-store
        annotations:
          summary: "Content store disk usage is high"
          description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
          disk_usage: "{{ $value | humanize }}"

      # Value is a percentage in the 0-100 range (ratio * 100).
      - alert: ContentStoreDiskFull
        expr: >-
          (bzzz_content_store_disk_used_bytes
          / bzzz_content_store_disk_total_bytes) * 100 > 95
        for: 1m
        labels:
          severity: critical
          component: content-store
        annotations:
          summary: "Content store disk is nearly full"
          description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
          disk_usage: "{{ $value | humanize }}"

      - alert: ContentReplicationFailed
        expr: increase(bzzz_content_replication_failures_total[10m]) > 5
        for: 5m
        labels:
          severity: warning
          component: content-store
        annotations:
          summary: "Content replication failures detected"
          description: "{{ $value }} content replication failures in the last 10 minutes on {{ $labels.instance }}"

      # Fires immediately (for: 0m) on any increase of the collision counter.
      - alert: BLAKE3HashCollision
        expr: increase(bzzz_blake3_hash_collisions_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          component: content-store
        annotations:
          summary: "BLAKE3 hash collision detected"
          description: "BLAKE3 hash collision detected on {{ $labels.instance }} - immediate investigation required"

  # OpenAI Integration Rules
  - name: openai-integration
    rules:
      - alert: OpenAIHighCost
        expr: bzzz_openai_cost_daily_usd > 100
        for: 0m
        labels:
          severity: warning
          component: openai-cost
          cost_threshold: "100"
          cost_period: "daily"
        annotations:
          summary: "OpenAI daily cost exceeds threshold"
          description: "Daily OpenAI cost is ${{ $value }}, exceeding the $100 threshold"
          current_cost: "{{ $value }}"

      - alert: OpenAICriticalCost
        expr: bzzz_openai_cost_daily_usd > 500
        for: 0m
        labels:
          severity: critical
          component: openai-cost
          cost_threshold: "500"
          cost_period: "daily"
        annotations:
          summary: "OpenAI daily cost critically high"
          description: "Daily OpenAI cost is ${{ $value }}, which is critically high - consider rate limiting"
          current_cost: "{{ $value }}"

      - alert: OpenAIRateLimitHit
        expr: increase(bzzz_openai_rate_limit_hits_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
          component: openai-cost
        annotations:
          summary: "OpenAI rate limit frequently hit"
          description: "OpenAI rate limit hit {{ $value }} times in the last 5 minutes"

      - alert: OpenAIProxyDown
        expr: up{job="openai-proxy"} == 0
        for: 2m
        labels:
          severity: critical
          component: service-health
        annotations:
          summary: "OpenAI proxy is down"
          description: "OpenAI proxy service is down on {{ $labels.instance }}"

  # MCP Server Rules
  - name: mcp-server
    rules:
      - alert: MCPServerDown
        expr: up{job="mcp-server"} == 0
        for: 2m
        labels:
          severity: critical
          component: service-health
        annotations:
          summary: "MCP server is down"
          description: "MCP server is down on {{ $labels.instance }}"

      # NOTE(review): assumes bzzz_mcp_request_duration_seconds is exported as
      # a Prometheus histogram (i.e. a `_bucket` series exists) - confirm.
      - alert: MCPHighResponseTime
        expr: >-
          histogram_quantile(
            0.95,
            sum by (le, instance) (rate(bzzz_mcp_request_duration_seconds_bucket[5m]))
          ) > 10
        for: 5m
        labels:
          severity: warning
          component: service-health
        annotations:
          summary: "MCP server high response time"
          description: "95th percentile MCP response time is {{ $value }}s on {{ $labels.instance }}"

      # Connection usage ratio (0-1): active connections over the maximum.
      - alert: MCPConnectionLimit
        expr: bzzz_mcp_active_connections / bzzz_mcp_max_connections > 0.8
        for: 2m
        labels:
          severity: warning
          component: service-health
        annotations:
          summary: "MCP server connection limit approaching"
          description: "MCP server connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

  # Conversation Threading Rules
  - name: conversation-threading
    rules:
      - alert: ConversationThreadLag
        expr: bzzz_conversation_lamport_clock_lag_seconds > 30
        for: 2m
        labels:
          severity: warning
          component: conversation
        annotations:
          summary: "Conversation thread lag detected"
          description: "Lamport clock lag is {{ $value }}s on {{ $labels.instance }}, indicating thread synchronization issues"

      - alert: ConversationStorageFailure
        expr: increase(bzzz_conversation_storage_failures_total[5m]) > 3
        for: 1m
        labels:
          severity: critical
          component: conversation
        annotations:
          summary: "Conversation storage failures"
          description: "{{ $value }} conversation storage failures in the last 5 minutes on {{ $labels.instance }}"

  # System Resource Rules
  - name: system-resources
    rules:
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Node is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute"

      # Busy percentage (0-100): 100 minus the per-instance average idle share.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          component: resources
          resource_type: "cpu"
          threshold: "80"
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}"
          usage_percent: "{{ $value | humanize }}"

      # Used-memory percentage (0-100): (total - available) / total * 100.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
          component: resources
          resource_type: "memory"
          threshold: "85"
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}"
          usage_percent: "{{ $value | humanize }}"

      # Expressed as used-percent > 85, which is the same condition as
      # available-percent < 15. Go templates have no infix arithmetic, so the
      # "100 - x" step is done in PromQL and $value is already the
      # percentage that is full.
      - alert: DiskSpaceLow
        expr: >-
          (1 - node_filesystem_avail_bytes{fstype!="tmpfs"}
          / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
          component: resources
          resource_type: "disk"
          threshold: "85"
        annotations:
          summary: "Low disk space"
          description: "Disk space is {{ $value | humanize }}% full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
          usage_percent: "{{ $value | humanize }}"

  # Database Rules
  - name: database
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          component: service-health
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database is down on {{ $labels.instance }}"

      # Backends are summed per instance and matched on `instance` only.
      # NOTE(review): assumes the usual postgres_exporter shape, where
      # pg_stat_database_numbackends carries per-database labels that
      # pg_settings_max_connections lacks, and one max_connections series
      # exists per instance - confirm against the exporter in use.
      - alert: PostgreSQLHighConnections
        expr: >-
          sum by (instance) (pg_stat_database_numbackends)
          / on (instance) max by (instance) (pg_settings_max_connections) > 0.8
        for: 2m
        labels:
          severity: warning
          component: service-health
        annotations:
          summary: "PostgreSQL connection limit approaching"
          description: "PostgreSQL connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
          component: service-health
        annotations:
          summary: "Redis is down"
          description: "Redis cache is down on {{ $labels.instance }}"

  # Security Rules
  - name: security
    rules:
      - alert: UnauthorizedP2PConnection
        expr: increase(bzzz_p2p_unauthorized_connections_total[5m]) > 5
        for: 1m
        labels:
          severity: warning
          component: security
          security_type: "unauthorized_connection"
        annotations:
          summary: "Unauthorized P2P connection attempts"
          description: "{{ $value }} unauthorized P2P connection attempts in the last 5 minutes on {{ $labels.instance }}"

      - alert: SuspiciousContentRequest
        expr: increase(bzzz_content_suspicious_requests_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
          component: security
          security_type: "suspicious_content"
        annotations:
          summary: "Suspicious content requests detected"
          description: "{{ $value }} suspicious content requests in the last 5 minutes on {{ $labels.instance }}"

      - alert: FailedAuthentication
        expr: increase(bzzz_auth_failures_total[5m]) > 20
        for: 1m
        labels:
          severity: warning
          component: security
          security_type: "authentication_failure"
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value }} authentication failures in the last 5 minutes on {{ $labels.instance }}"