- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification - Add MCP integration design and implementation foundation - Add infrastructure and deployment configurations - Update system architecture for v2 evolution 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
339 lines
12 KiB
YAML
339 lines
12 KiB
YAML
# BZZZ v2 Prometheus Alert Rules
|
|
|
|
groups:
|
|
# P2P Network Health Rules
# Peer connectivity, message latency and message drops for the P2P layer.
# Every rule in this group is tagged `component: p2p`.
- name: p2p-network
  rules:
  # Node reports fewer than 2 connected peers continuously for 5 minutes.
  - alert: P2PNetworkPartition
    expr: bzzz_p2p_connected_peers < 2
    for: 5m
    labels:
      severity: critical
      component: p2p
    annotations:
      summary: "P2P network partition detected"
      description: "Node {{ $labels.instance }} has less than 2 peers connected for more than 5 minutes"

  # histogram_quantile() must be fed per-bucket rates (series carrying an
  # `le` label). The quantile is therefore taken over the 5m rate of the
  # `_bucket` series, aggregated per instance so $labels.instance survives.
  # NOTE(review): assumes this metric is exported as a classic histogram
  # with a `_bucket` series - confirm against the exporter.
  - alert: P2PHighLatency
    expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_p2p_message_duration_seconds_bucket[5m]))) > 5
    for: 2m
    labels:
      severity: warning
      component: p2p
    annotations:
      summary: "High P2P message latency"
      description: "95th percentile P2P message latency is {{ $value }}s on {{ $labels.instance }}"

  # The expression is a per-second drop rate (threshold 0.1 drops/s), not a
  # ratio, so the value is rendered as messages/s instead of a percentage.
  - alert: P2PMessageDropRate
    expr: rate(bzzz_p2p_messages_dropped_total[5m]) > 0.1
    for: 2m
    labels:
      severity: warning
      component: p2p
    annotations:
      summary: "High P2P message drop rate"
      description: "P2P message drop rate is {{ $value | humanize }} messages/s on {{ $labels.instance }}"
|
|
|
|
# DHT Network Rules
# Bootstrap availability, routing-table size and lookup failure ratio.
# Every rule in this group is tagged `component: dht`.
- name: dht-network
  rules:
  # Scrape target of job "dht-bootstrap" has reported up == 0 for 1 minute.
  - alert: DHTBootstrapNodeDown
    expr: 'up{job="dht-bootstrap"} == 0'
    for: 1m
    labels:
      component: dht
      severity: critical
    annotations:
      description: >-
        DHT bootstrap node {{ $labels.instance }} has been down
        for more than 1 minute
      summary: 'DHT bootstrap node is down'

  # Routing table holds fewer than 10 entries for 5 minutes.
  - alert: DHTRoutingTableSize
    expr: 'bzzz_dht_routing_table_size < 10'
    for: 5m
    labels:
      component: dht
      severity: warning
    annotations:
      description: >-
        DHT routing table size is {{ $value }} on {{ $labels.instance }},
        indicating poor network connectivity
      summary: 'DHT routing table is small'

  # Failed lookups divided by all lookups (both as 5m rates) exceeds 0.2.
  # The value is a 0-1 ratio, which is what humanizePercentage expects.
  - alert: DHTLookupFailureRate
    expr: >-
      rate(bzzz_dht_lookup_failures_total[5m])
      / rate(bzzz_dht_lookups_total[5m]) > 0.2
    for: 2m
    labels:
      component: dht
      severity: warning
    annotations:
      description: >-
        DHT lookup failure rate is {{ $value | humanizePercentage }}
        on {{ $labels.instance }}
      summary: 'High DHT lookup failure rate'
|
|
|
|
# Content Store Rules
# Disk utilisation, replication failures and hash collisions for the content
# store. Every rule in this group is tagged `component: content-store`.
#
# The disk rules compute a 0-100 percentage, so descriptions render it with
# `humanize` plus a literal "%" (humanizePercentage expects a 0-1 ratio and
# would multiply by 100 again). The current value is exposed as the
# `disk_usage` annotation rather than a label: a label templated from $value
# changes the alert's identity whenever the value changes, which resets the
# `for:` timer.
- name: content-store
  rules:
  # Used/total disk bytes above 85% for 5 minutes.
  - alert: ContentStoreDiskUsage
    expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: content-store
    annotations:
      summary: "Content store disk usage is high"
      description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
      disk_usage: "{{ $value | humanize }}"

  # Used/total disk bytes above 95% for 1 minute.
  - alert: ContentStoreDiskFull
    expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 95
    for: 1m
    labels:
      severity: critical
      component: content-store
    annotations:
      summary: "Content store disk is nearly full"
      description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
      disk_usage: "{{ $value | humanize }}"

  # More than 5 replication failures within a 10m window, sustained for 5m.
  - alert: ContentReplicationFailed
    expr: increase(bzzz_content_replication_failures_total[10m]) > 5
    for: 5m
    labels:
      severity: warning
      component: content-store
    annotations:
      summary: "Content replication failures detected"
      description: "{{ $value }} content replication failures in the last 10 minutes on {{ $labels.instance }}"

  # Any increase of the collision counter within the last hour fires
  # immediately (for: 0m).
  - alert: BLAKE3HashCollision
    expr: increase(bzzz_blake3_hash_collisions_total[1h]) > 0
    for: 0m
    labels:
      severity: critical
      component: content-store
    annotations:
      summary: "BLAKE3 hash collision detected"
      description: "BLAKE3 hash collision detected on {{ $labels.instance }} - immediate investigation required"
|
|
|
|
# OpenAI Integration Rules
# Daily spend thresholds, rate-limit hits and proxy availability.
#
# The current spend is exposed as the `current_cost` annotation, not a label:
# a label templated from $value gives the alert a new identity every time the
# cost changes, producing a fresh alert (and notification) per sample. The
# static `cost_threshold` / `cost_period` labels are kept for routing.
- name: openai-integration
  rules:
  # Daily cost gauge above 100 USD; fires immediately (for: 0m).
  - alert: OpenAIHighCost
    expr: bzzz_openai_cost_daily_usd > 100
    for: 0m
    labels:
      severity: warning
      component: openai-cost
      cost_threshold: "100"
      cost_period: "daily"
    annotations:
      summary: "OpenAI daily cost exceeds threshold"
      description: "Daily OpenAI cost is ${{ $value }}, exceeding the $100 threshold"
      current_cost: "{{ $value }}"

  # Daily cost gauge above 500 USD; fires immediately (for: 0m).
  - alert: OpenAICriticalCost
    expr: bzzz_openai_cost_daily_usd > 500
    for: 0m
    labels:
      severity: critical
      component: openai-cost
      cost_threshold: "500"
      cost_period: "daily"
    annotations:
      summary: "OpenAI daily cost critically high"
      description: "Daily OpenAI cost is ${{ $value }}, which is critically high - consider rate limiting"
      current_cost: "{{ $value }}"

  # More than 10 rate-limit hits within a 5m window, sustained for 1m.
  - alert: OpenAIRateLimitHit
    expr: increase(bzzz_openai_rate_limit_hits_total[5m]) > 10
    for: 1m
    labels:
      severity: warning
      component: openai-cost
    annotations:
      summary: "OpenAI rate limit frequently hit"
      description: "OpenAI rate limit hit {{ $value }} times in the last 5 minutes"

  # Scrape target of job "openai-proxy" has reported up == 0 for 2 minutes.
  - alert: OpenAIProxyDown
    expr: up{job="openai-proxy"} == 0
    for: 2m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "OpenAI proxy is down"
      description: "OpenAI proxy service is down on {{ $labels.instance }}"
|
|
|
|
# MCP Server Rules
# Availability, request latency and connection saturation of the MCP server.
# Every rule in this group is tagged `component: service-health`.
- name: mcp-server
  rules:
  # Scrape target of job "mcp-server" has reported up == 0 for 2 minutes.
  - alert: MCPServerDown
    expr: up{job="mcp-server"} == 0
    for: 2m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "MCP server is down"
      description: "MCP server is down on {{ $labels.instance }}"

  # histogram_quantile() must be fed per-bucket rates (series carrying an
  # `le` label). The quantile is therefore taken over the 5m rate of the
  # `_bucket` series, aggregated per instance so $labels.instance survives.
  # NOTE(review): assumes this metric is exported as a classic histogram
  # with a `_bucket` series - confirm against the exporter.
  - alert: MCPHighResponseTime
    expr: histogram_quantile(0.95, sum by (le, instance) (rate(bzzz_mcp_request_duration_seconds_bucket[5m]))) > 10
    for: 5m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "MCP server high response time"
      description: "95th percentile MCP response time is {{ $value }}s on {{ $labels.instance }}"

  # Active/max connections is a 0-1 ratio, which is what humanizePercentage
  # expects; fires above 0.8 for 2 minutes.
  - alert: MCPConnectionLimit
    expr: bzzz_mcp_active_connections / bzzz_mcp_max_connections > 0.8
    for: 2m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "MCP server connection limit approaching"
      description: "MCP server connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
|
|
|
|
# Conversation Threading Rules
# Thread synchronisation lag and conversation storage failures.
# Every rule in this group is tagged `component: conversation`.
- name: conversation-threading
  rules:
  # Lamport clock lag gauge above 30 for 2 minutes.
  - alert: ConversationThreadLag
    expr: 'bzzz_conversation_lamport_clock_lag_seconds > 30'
    for: 2m
    labels:
      component: conversation
      severity: warning
    annotations:
      description: >-
        Lamport clock lag is {{ $value }}s on {{ $labels.instance }},
        indicating thread synchronization issues
      summary: 'Conversation thread lag detected'

  # More than 3 storage failures within a 5m window, sustained for 1m.
  - alert: ConversationStorageFailure
    expr: 'increase(bzzz_conversation_storage_failures_total[5m]) > 3'
    for: 1m
    labels:
      component: conversation
      severity: critical
    annotations:
      description: >-
        {{ $value }} conversation storage failures in the last 5 minutes
        on {{ $labels.instance }}
      summary: 'Conversation storage failures'
|
|
|
|
# System Resource Rules
# Host availability plus CPU, memory and disk utilisation from node-exporter
# metrics.
#
# The utilisation rules compute 0-100 percentages, so descriptions render
# them with `humanize` plus a literal "%" (humanizePercentage expects a 0-1
# ratio and would multiply by 100 again). The current value is exposed as the
# `usage_percent` annotation rather than a label: a label templated from
# $value changes the alert's identity whenever the value changes, which
# resets the `for:` timer. The static `resource_type` / `threshold` labels
# are kept for routing.
- name: system-resources
  rules:
  # Scrape target of job "node-exporter" has reported up == 0 for 1 minute.
  - alert: NodeDown
    expr: up{job="node-exporter"} == 0
    for: 1m
    labels:
      severity: critical
      component: system
    annotations:
      summary: "Node is down"
      description: "Node {{ $labels.instance }} has been down for more than 1 minute"

  # 100 minus the per-instance average idle percentage, above 80 for 5m.
  - alert: HighCPUUsage
    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "cpu"
      threshold: "80"
    annotations:
      summary: "High CPU usage"
      description: "CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}"
      usage_percent: "{{ $value | humanize }}"

  # (total - available) / total memory as a percentage, above 85 for 5m.
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "memory"
      threshold: "85"
    annotations:
      summary: "High memory usage"
      description: "Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}"
      usage_percent: "{{ $value | humanize }}"

  # Expressed as "percent used > 85", which selects the same series as
  # "percent available < 15" but makes $value the used percentage directly.
  # Go templates have no arithmetic operators, so "100 - $value" cannot be
  # computed inside the annotation.
  - alert: DiskSpaceLow
    expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "disk"
      threshold: "85"
    annotations:
      summary: "Low disk space"
      description: "Disk space is {{ $value | humanize }}% full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
      usage_percent: "{{ $value | humanize }}"
|
|
|
|
# Database Rules
# Availability of PostgreSQL and Redis plus PostgreSQL connection saturation.
# Every rule in this group is tagged `component: service-health`.
- name: database
  rules:
  # Scrape target of job "postgres" has reported up == 0 for 1 minute.
  - alert: PostgreSQLDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL database is down on {{ $labels.instance }}"

  # Both sides are aggregated to the `instance` label before dividing so the
  # two vectors have identical label sets and match one-to-one. Backends are
  # summed across series, giving total connections per instance.
  # NOTE(review): presumably numbackends is exported per database (extra
  # labels such as datname) while max_connections is server-wide; confirm
  # against the exporter in use.
  - alert: PostgreSQLHighConnections
    expr: sum by (instance) (pg_stat_database_numbackends) / max by (instance) (pg_settings_max_connections) > 0.8
    for: 2m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "PostgreSQL connection limit approaching"
      description: "PostgreSQL connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

  # Scrape target of job "redis" has reported up == 0 for 1 minute.
  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "Redis is down"
      description: "Redis cache is down on {{ $labels.instance }}"
|
|
|
|
# Security Rules
# Counters for unauthorized connections, suspicious content requests and
# authentication failures. Every rule is tagged `component: security` and
# carries a static `security_type` label.
- name: security
  rules:
  # More than 5 unauthorized connection attempts in 5m, sustained for 1m.
  - alert: UnauthorizedP2PConnection
    expr: 'increase(bzzz_p2p_unauthorized_connections_total[5m]) > 5'
    for: 1m
    labels:
      component: security
      security_type: 'unauthorized_connection'
      severity: warning
    annotations:
      description: >-
        {{ $value }} unauthorized P2P connection attempts in the last
        5 minutes on {{ $labels.instance }}
      summary: 'Unauthorized P2P connection attempts'

  # More than 10 suspicious content requests in 5m, sustained for 2m.
  - alert: SuspiciousContentRequest
    expr: 'increase(bzzz_content_suspicious_requests_total[5m]) > 10'
    for: 2m
    labels:
      component: security
      security_type: 'suspicious_content'
      severity: warning
    annotations:
      description: >-
        {{ $value }} suspicious content requests in the last 5 minutes
        on {{ $labels.instance }}
      summary: 'Suspicious content requests detected'

  # More than 20 authentication failures in 5m, sustained for 1m.
  - alert: FailedAuthentication
    expr: 'increase(bzzz_auth_failures_total[5m]) > 20'
    for: 1m
    labels:
      component: security
      security_type: 'authentication_failure'
      severity: warning
    annotations:
      description: >-
        {{ $value }} authentication failures in the last 5 minutes
        on {{ $labels.instance }}
      summary: 'High authentication failure rate'