Prepare for v2 development: Add MCP integration and future development planning
- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification - Add MCP integration design and implementation foundation - Add infrastructure and deployment configurations - Update system architecture for v2 evolution 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
339
infrastructure/monitoring/configs/alert-rules.yml
Normal file
339
infrastructure/monitoring/configs/alert-rules.yml
Normal file
@@ -0,0 +1,339 @@
|
||||
# BZZZ v2 Prometheus Alert Rules
|
||||
|
||||
groups:
# P2P Network Health Rules
- name: p2p-network
  rules:
  # A node that reports fewer than two connected peers for 5 minutes.
  - alert: P2PNetworkPartition
    expr: bzzz_p2p_connected_peers < 2
    for: 5m
    labels:
      severity: critical
      component: p2p
    annotations:
      summary: "P2P network partition detected"
      description: "Node {{ $labels.instance }} has less than 2 peers connected for more than 5 minutes"

  # 95th-percentile message latency above 5 seconds.
  # histogram_quantile() works on the per-bucket series (the ones carrying the
  # `le` label), normally as a rate over a window; the bare metric name does not
  # provide those series.
  # NOTE(review): assumes the exporter publishes this histogram with the standard
  # `_bucket` suffix - confirm against the exporter.
  - alert: P2PHighLatency
    expr: >-
      histogram_quantile(0.95,
      sum by (le, instance) (rate(bzzz_p2p_message_duration_seconds_bucket[5m]))) > 5
    for: 2m
    labels:
      severity: warning
      component: p2p
    annotations:
      summary: "High P2P message latency"
      description: "95th percentile P2P message latency is {{ $value }}s on {{ $labels.instance }}"

  # More than 0.1 dropped messages per second, sustained for 2 minutes.
  # rate() yields drops per second, so the value is reported as a rate and not
  # as a percentage.
  - alert: P2PMessageDropRate
    expr: rate(bzzz_p2p_messages_dropped_total[5m]) > 0.1
    for: 2m
    labels:
      severity: warning
      component: p2p
    annotations:
      summary: "High P2P message drop rate"
      description: "P2P messages are being dropped at {{ $value | humanize }}/s on {{ $labels.instance }}"
|
||||
|
||||
# DHT Network Rules
- name: dht-network
  rules:
  # A scrape target of the dht-bootstrap job has been unreachable for 1 minute.
  - alert: DHTBootstrapNodeDown
    expr: 'up{job="dht-bootstrap"} == 0'
    for: 1m
    labels:
      component: dht
      severity: critical
    annotations:
      summary: 'DHT bootstrap node is down'
      description: 'DHT bootstrap node {{ $labels.instance }} has been down for more than 1 minute'

  # Routing table holds fewer than 10 entries for 5 minutes.
  - alert: DHTRoutingTableSize
    expr: 'bzzz_dht_routing_table_size < 10'
    for: 5m
    labels:
      component: dht
      severity: warning
    annotations:
      summary: 'DHT routing table is small'
      description: 'DHT routing table size is {{ $value }} on {{ $labels.instance }}, indicating poor network connectivity'

  # Failed lookups exceed 20% of all lookups (both measured as 5m rates).
  - alert: DHTLookupFailureRate
    expr: 'rate(bzzz_dht_lookup_failures_total[5m]) / rate(bzzz_dht_lookups_total[5m]) > 0.2'
    for: 2m
    labels:
      component: dht
      severity: warning
    annotations:
      summary: 'High DHT lookup failure rate'
      description: 'DHT lookup failure rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}'
|
||||
|
||||
# Content Store Rules
- name: content-store
  rules:
  # Disk usage above 85% for 5 minutes.
  # The expression is already scaled to 0-100, so it is printed with printf;
  # humanizePercentage would multiply by 100 a second time.
  # The current value lives in an annotation: a label whose value changes
  # between evaluations changes the identity of the alert.
  - alert: ContentStoreDiskUsage
    expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: content-store
    annotations:
      summary: "Content store disk usage is high"
      description: 'Content store disk usage is {{ printf "%.1f" $value }}% on {{ $labels.instance }}'
      disk_usage: "{{ $value | humanize }}"

  # Disk usage above 95% for 1 minute.
  - alert: ContentStoreDiskFull
    expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 95
    for: 1m
    labels:
      severity: critical
      component: content-store
    annotations:
      summary: "Content store disk is nearly full"
      description: 'Content store disk usage is {{ printf "%.1f" $value }}% on {{ $labels.instance }}'
      disk_usage: "{{ $value | humanize }}"

  # More than 5 replication failures within a 10-minute window.
  - alert: ContentReplicationFailed
    expr: increase(bzzz_content_replication_failures_total[10m]) > 5
    for: 5m
    labels:
      severity: warning
      component: content-store
    annotations:
      summary: "Content replication failures detected"
      description: "{{ $value }} content replication failures in the last 10 minutes on {{ $labels.instance }}"

  # Any increase of the collision counter within the last hour fires immediately.
  - alert: BLAKE3HashCollision
    expr: increase(bzzz_blake3_hash_collisions_total[1h]) > 0
    for: 0m
    labels:
      severity: critical
      component: content-store
    annotations:
      summary: "BLAKE3 hash collision detected"
      description: "BLAKE3 hash collision detected on {{ $labels.instance }} - immediate investigation required"
|
||||
|
||||
# OpenAI Integration Rules
- name: openai-integration
  rules:
  # Daily cost above 100 USD fires immediately.
  # The current cost is exposed as an annotation: putting $value in a label
  # would create a new alert every time the cost changes.
  - alert: OpenAIHighCost
    expr: bzzz_openai_cost_daily_usd > 100
    for: 0m
    labels:
      severity: warning
      component: openai-cost
      cost_threshold: "100"
      cost_period: "daily"
    annotations:
      summary: "OpenAI daily cost exceeds threshold"
      description: "Daily OpenAI cost is ${{ $value }}, exceeding the $100 threshold"
      current_cost: "{{ $value }}"

  # Daily cost above 500 USD fires immediately.
  - alert: OpenAICriticalCost
    expr: bzzz_openai_cost_daily_usd > 500
    for: 0m
    labels:
      severity: critical
      component: openai-cost
      cost_threshold: "500"
      cost_period: "daily"
    annotations:
      summary: "OpenAI daily cost critically high"
      description: "Daily OpenAI cost is ${{ $value }}, which is critically high - consider rate limiting"
      current_cost: "{{ $value }}"

  # More than 10 rate-limit hits within 5 minutes.
  - alert: OpenAIRateLimitHit
    expr: increase(bzzz_openai_rate_limit_hits_total[5m]) > 10
    for: 1m
    labels:
      severity: warning
      component: openai-cost
    annotations:
      summary: "OpenAI rate limit frequently hit"
      description: "OpenAI rate limit hit {{ $value }} times in the last 5 minutes"

  # The openai-proxy scrape target has been unreachable for 2 minutes.
  - alert: OpenAIProxyDown
    expr: up{job="openai-proxy"} == 0
    for: 2m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "OpenAI proxy is down"
      description: "OpenAI proxy service is down on {{ $labels.instance }}"
|
||||
|
||||
# MCP Server Rules
- name: mcp-server
  rules:
  # The mcp-server scrape target has been unreachable for 2 minutes.
  - alert: MCPServerDown
    expr: up{job="mcp-server"} == 0
    for: 2m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "MCP server is down"
      description: "MCP server is down on {{ $labels.instance }}"

  # 95th-percentile request duration above 10 seconds.
  # histogram_quantile() needs the rated per-bucket series (with the `le`
  # label) rather than the bare metric name.
  # NOTE(review): assumes the histogram is exported with the standard `_bucket`
  # suffix - confirm against the MCP server's metrics.
  - alert: MCPHighResponseTime
    expr: >-
      histogram_quantile(0.95,
      sum by (le, instance) (rate(bzzz_mcp_request_duration_seconds_bucket[5m]))) > 10
    for: 5m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "MCP server high response time"
      description: "95th percentile MCP response time is {{ $value }}s on {{ $labels.instance }}"

  # Active connections exceed 80% of the configured maximum.
  - alert: MCPConnectionLimit
    expr: bzzz_mcp_active_connections / bzzz_mcp_max_connections > 0.8
    for: 2m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "MCP server connection limit approaching"
      description: "MCP server connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
|
||||
|
||||
# Conversation Threading Rules
- name: conversation-threading
  rules:
  # Lamport clock lag above 30 seconds for 2 minutes.
  - alert: ConversationThreadLag
    expr: 'bzzz_conversation_lamport_clock_lag_seconds > 30'
    for: 2m
    labels:
      component: conversation
      severity: warning
    annotations:
      summary: 'Conversation thread lag detected'
      description: 'Lamport clock lag is {{ $value }}s on {{ $labels.instance }}, indicating thread synchronization issues'

  # More than 3 storage failures within 5 minutes.
  - alert: ConversationStorageFailure
    expr: 'increase(bzzz_conversation_storage_failures_total[5m]) > 3'
    for: 1m
    labels:
      component: conversation
      severity: critical
    annotations:
      summary: 'Conversation storage failures'
      description: '{{ $value }} conversation storage failures in the last 5 minutes on {{ $labels.instance }}'
|
||||
|
||||
# System Resource Rules
- name: system-resources
  rules:
  # A node-exporter target has been unreachable for 1 minute.
  - alert: NodeDown
    expr: up{job="node-exporter"} == 0
    for: 1m
    labels:
      severity: critical
      component: system
    annotations:
      summary: "Node is down"
      description: "Node {{ $labels.instance }} has been down for more than 1 minute"

  # Average non-idle CPU above 80% per instance.
  # The expression is already a 0-100 percentage, so it is printed with printf
  # instead of humanizePercentage (which would multiply by 100 again).
  # The current value is an annotation so that the alert's label set stays
  # stable between evaluations.
  - alert: HighCPUUsage
    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "cpu"
      threshold: "80"
    annotations:
      summary: "High CPU usage"
      description: 'CPU usage is {{ printf "%.1f" $value }}% on {{ $labels.instance }}'
      usage_percent: "{{ $value | humanize }}"

  # Used memory (total minus available) above 85%.
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "memory"
      threshold: "85"
    annotations:
      summary: "High memory usage"
      description: 'Memory usage is {{ printf "%.1f" $value }}% on {{ $labels.instance }}'
      usage_percent: "{{ $value | humanize }}"

  # Less than 15% free space on a non-tmpfs filesystem.
  # Go templates cannot evaluate `100 - $value`, so the expression itself
  # computes the used percentage (used > 85% is the same condition as
  # available < 15%).
  - alert: DiskSpaceLow
    expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 > 85
    for: 5m
    labels:
      severity: warning
      component: resources
      resource_type: "disk"
      threshold: "85"
    annotations:
      summary: "Low disk space"
      description: 'Disk space is {{ printf "%.1f" $value }}% full on {{ $labels.instance }} ({{ $labels.mountpoint }})'
      usage_percent: "{{ $value | humanize }}"
|
||||
|
||||
# Database Rules
- name: database
  rules:
  # The postgres scrape target has been unreachable for 1 minute.
  - alert: PostgreSQLDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL database is down on {{ $labels.instance }}"

  # Total backends across all databases exceed 80% of max_connections.
  # Binary operators only pair series whose label sets match, so the
  # per-database backend counts are summed per instance and matched on
  # `instance` alone.
  # NOTE(review): assumes pg_stat_database_numbackends carries per-database
  # labels that pg_settings_max_connections lacks - confirm with the exporter.
  - alert: PostgreSQLHighConnections
    expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections > 0.8
    for: 2m
    labels:
      severity: warning
      component: service-health
    annotations:
      summary: "PostgreSQL connection limit approaching"
      description: "PostgreSQL connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

  # The redis scrape target has been unreachable for 1 minute.
  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
      component: service-health
    annotations:
      summary: "Redis is down"
      description: "Redis cache is down on {{ $labels.instance }}"
|
||||
|
||||
# Security Rules
- name: security
  rules:
  # More than 5 unauthorized connection attempts within 5 minutes.
  - alert: UnauthorizedP2PConnection
    expr: 'increase(bzzz_p2p_unauthorized_connections_total[5m]) > 5'
    for: 1m
    labels:
      component: security
      security_type: 'unauthorized_connection'
      severity: warning
    annotations:
      summary: 'Unauthorized P2P connection attempts'
      description: '{{ $value }} unauthorized P2P connection attempts in the last 5 minutes on {{ $labels.instance }}'

  # More than 10 suspicious content requests within 5 minutes.
  - alert: SuspiciousContentRequest
    expr: 'increase(bzzz_content_suspicious_requests_total[5m]) > 10'
    for: 2m
    labels:
      component: security
      security_type: 'suspicious_content'
      severity: warning
    annotations:
      summary: 'Suspicious content requests detected'
      description: '{{ $value }} suspicious content requests in the last 5 minutes on {{ $labels.instance }}'

  # More than 20 authentication failures within 5 minutes.
  - alert: FailedAuthentication
    expr: 'increase(bzzz_auth_failures_total[5m]) > 20'
    for: 1m
    labels:
      component: security
      security_type: 'authentication_failure'
      severity: warning
    annotations:
      summary: 'High authentication failure rate'
      description: '{{ $value }} authentication failures in the last 5 minutes on {{ $labels.instance }}'
|
||||
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
@@ -0,0 +1,255 @@
|
||||
# AlertManager Configuration for BZZZ v2

# Settings shared by every receiver: outgoing mail relay and resolve timeout.
global:
  resolve_timeout: 5m
  smtp_smarthost: "localhost:587"
  smtp_from: "alerts@deepblack.cloud"
  smtp_require_tls: true

# Template files
templates:
- "/etc/alertmanager/templates/*.tmpl"
|
||||
|
||||
# Route configuration
# Alerts that match none of the child routes are delivered to `default`.
route:
  receiver: default
  group_by:
  - cluster
  - alertname
  - service
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  routes:
  # Critical P2P network issues
  - receiver: p2p-critical
    match:
      severity: critical
      component: p2p
    group_wait: 10s
    repeat_interval: 5m

  # DHT network issues
  - receiver: dht-alerts
    match:
      component: dht
    group_wait: 1m
    repeat_interval: 30m

  # Content store issues
  - receiver: storage-alerts
    match:
      component: content-store
    group_wait: 2m
    repeat_interval: 1h

  # OpenAI cost alerts
  - receiver: cost-alerts
    match:
      component: openai-cost
    group_wait: 5m
    repeat_interval: 6h

  # Service health alerts
  - receiver: service-alerts
    match:
      component: service-health
    group_wait: 1m
    repeat_interval: 15m

  # Resource exhaustion
  - receiver: resource-alerts
    match:
      severity: warning
      component: resources
    group_wait: 5m
    repeat_interval: 2h

  # Security alerts
  - receiver: security-alerts
    match:
      component: security
    group_wait: 30s
    repeat_interval: 1h
|
||||
|
||||
# Inhibition rules
inhibit_rules:
# Silence warning if critical alert is firing
- source_match:
    severity: critical
  target_match:
    severity: warning
  equal:
  - cluster
  - service
  - instance

# Silence service alerts if node is down
# NOTE(review): this only takes effect when the NodeDown alert and the
# service-health alert carry the same `instance` value - confirm that the
# scrape jobs producing them label instances identically.
- source_match:
    alertname: NodeDown
  target_match:
    component: service-health
  equal:
  - instance
|
||||
|
||||
# Receiver configurations
receivers:
# Default receiver
# Posts one entry per alert in the group to the general monitoring channel.
- name: "default"
  slack_configs:
  - channel: "#bzzz-monitoring"
    api_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
    send_resolved: true
    title: "BZZZ v2 Alert"
    text: |
      {{ range .Alerts }}
      *Alert:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Severity:* {{ .Labels.severity }}
      *Instance:* {{ .Labels.instance }}
      *Service:* {{ .Labels.service }}
      {{ end }}
|
||||
|
||||
# Critical P2P network alerts
# Notifies the critical Slack channel and pages through PagerDuty.
- name: 'p2p-critical'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-critical'
    title: '🚨 CRITICAL P2P Network Issue'
    text: |
      {{ range .Alerts }}
      *CRITICAL P2P ALERT*

      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Node:* {{ .Labels.instance }}
      *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}

      *Immediate Action Required*
      {{ end }}
    send_resolved: true
  pagerduty_configs:
  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
    # This field is rendered against the whole notification group, not a single
    # alert, so the annotations shared by the group are read from
    # CommonAnnotations; there is no `.Annotations` at this level.
    description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
|
||||
|
||||
# DHT network alerts
- name: 'dht-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-dht'
    title: '🔗 DHT Network Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument and provides the same fallback.
    text: |
      {{ range .Alerts }}
      *DHT Network Issue*

      *Alert:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Bootstrap Node:* {{ .Labels.instance }}
      *Peers Connected:* {{ or .Labels.peer_count "unknown" }}
      {{ end }}
    send_resolved: true
|
||||
|
||||
# Storage alerts
- name: 'storage-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-storage'
    title: '💾 Content Store Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument. The usage figure is looked up as an
    # annotation first and as a label second, so either placement works.
    text: |
      {{ range .Alerts }}
      *Storage Alert*

      *Issue:* {{ .Annotations.summary }}
      *Details:* {{ .Annotations.description }}
      *Node:* {{ .Labels.instance }}
      *Usage:* {{ or .Annotations.disk_usage .Labels.disk_usage "unknown" }}%
      {{ end }}
    send_resolved: true
|
||||
|
||||
# OpenAI cost alerts
# Posts to the cost channel and e-mails finance.
- name: 'cost-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-costs'
    title: '💰 OpenAI Cost Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument. The current cost is looked up as an
    # annotation first and as a label second, so either placement works.
    text: |
      {{ range .Alerts }}
      *Cost Alert*

      *Alert:* {{ .Annotations.summary }}
      *Current Cost:* ${{ or .Annotations.current_cost .Labels.current_cost "unknown" }}
      *Threshold:* ${{ or .Labels.cost_threshold "unknown" }}
      *Period:* {{ or .Labels.cost_period "daily" }}
      *Action:* {{ .Annotations.description }}
      {{ end }}
    send_resolved: true
  email_configs:
  - to: 'finance@deepblack.cloud'
    # email_config has no `subject` or `body` keys: the subject is set through
    # `headers` and the plain-text body through `text`.
    headers:
      Subject: 'BZZZ v2 OpenAI Cost Alert'
    text: |
      OpenAI usage has exceeded cost thresholds.

      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Current Cost: ${{ or .Annotations.current_cost .Labels.current_cost }}
      Threshold: ${{ .Labels.cost_threshold }}
      {{ end }}
|
||||
|
||||
# Service health alerts
- name: 'service-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-services'
    title: '🔧 Service Health Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument and provides the same fallback.
    text: |
      {{ range .Alerts }}
      *Service Health Issue*

      *Service:* {{ .Labels.service }}
      *Alert:* {{ .Annotations.summary }}
      *Node:* {{ .Labels.instance }}
      *Status:* {{ or .Labels.status "unknown" }}
      *Description:* {{ .Annotations.description }}
      {{ end }}
    send_resolved: true
|
||||
|
||||
# Resource alerts
- name: 'resource-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-resources'
    title: '⚡ Resource Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument. The usage figure is looked up as an
    # annotation first and as a label second, so either placement works.
    text: |
      {{ range .Alerts }}
      *Resource Warning*

      *Resource:* {{ or .Labels.resource_type "unknown" }}
      *Node:* {{ .Labels.instance }}
      *Alert:* {{ .Annotations.summary }}
      *Current Usage:* {{ or .Annotations.usage_percent .Labels.usage_percent "unknown" }}%
      *Threshold:* {{ or .Labels.threshold "unknown" }}%
      {{ end }}
    send_resolved: true
|
||||
|
||||
# Security alerts
# Posts to the security channel and e-mails the security team.
- name: 'security-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-security'
    title: '🔒 Security Alert'
    # Alertmanager templates have no `default` function; the built-in `or`
    # returns its first non-empty argument and provides the same fallback.
    text: |
      {{ range .Alerts }}
      *SECURITY ALERT*

      *Type:* {{ or .Labels.security_type "unknown" }}
      *Alert:* {{ .Annotations.summary }}
      *Source:* {{ .Labels.instance }}
      *Details:* {{ .Annotations.description }}
      *Severity:* {{ .Labels.severity }}
      {{ end }}
    send_resolved: true
  email_configs:
  - to: 'security@deepblack.cloud'
    # email_config has no `subject` or `body` keys: the subject is set through
    # `headers` and the plain-text body through `text`.
    headers:
      Subject: 'BZZZ v2 Security Alert'
    text: |
      Security alert triggered in BZZZ v2 cluster.

      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Severity: {{ .Labels.severity }}
      Source: {{ .Labels.instance }}
      Details: {{ .Annotations.description }}
      {{ end }}
|
||||
216
infrastructure/monitoring/configs/prometheus.yml
Normal file
216
infrastructure/monitoring/configs/prometheus.yml
Normal file
@@ -0,0 +1,216 @@
|
||||
# Prometheus Configuration for BZZZ v2 Monitoring

# Defaults applied to every scrape job and rule evaluation; the external
# labels are attached to data leaving this server (including alerts).
global:
  scrape_interval: 30s
  scrape_timeout: 10s
  evaluation_interval: 30s
  external_labels:
    cluster: "deepblack-cloud"
    environment: "production"

# Alerting rule files to load.
rule_files:
- '/etc/prometheus/rules.yml'

# Alertmanager instances that receive fired alerts.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'
|
||||
|
||||
scrape_configs:
# Prometheus self-monitoring
- job_name: "prometheus"
  scrape_interval: 15s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "localhost:9090"
|
||||
|
||||
# System metrics from node exporters
- job_name: "node-exporter"
  scrape_interval: 15s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "walnut:9100"
    - "ironwood:9100"
    - "acacia:9100"

# Container metrics from cAdvisor
- job_name: "cadvisor"
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "walnut:8080"
    - "ironwood:8080"
    - "acacia:8080"
|
||||
|
||||
# BZZZ v2 Application Services
# Discovers containers through the local Docker socket and keeps only those
# whose swarm service name is bzzz-v2_bzzz-agent.
- job_name: "bzzz-agent"
  scrape_interval: 15s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 9000
  relabel_configs:
  # Copy the swarm service name into a temporary label, then filter on it.
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: __tmp_service_name
  - source_labels:
    - __tmp_service_name
    regex: bzzz-v2_bzzz-agent
    action: keep
  # Expose the swarm node id and the service name as target labels.
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_node_id
    target_label: node_id
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service
|
||||
|
||||
# MCP Server Metrics
# Keeps only containers of the bzzz-v2_mcp-server swarm service.
- job_name: "mcp-server"
  scrape_interval: 30s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 3001
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_mcp-server
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service

# OpenAI Proxy Metrics
# Keeps only containers of the bzzz-v2_openai-proxy swarm service.
- job_name: "openai-proxy"
  scrape_interval: 30s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 3002
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_openai-proxy
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service

# Content Resolver Metrics
# Keeps only containers of the bzzz-v2_content-resolver swarm service.
- job_name: "content-resolver"
  scrape_interval: 30s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 3003
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_content-resolver
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service
|
||||
|
||||
# DHT Bootstrap Nodes
# Every target of this job is given the static label service=dht-bootstrap.
- job_name: "dht-bootstrap"
  scrape_interval: 15s
  metrics_path: /metrics
  static_configs:
  - labels:
      service: "dht-bootstrap"
    targets:
    - "walnut:9101"
    - "ironwood:9102"
    - "acacia:9103"
|
||||
|
||||
# P2P Network Metrics
- job_name: "bzzz-p2p-exporter"
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "bzzz-p2p-exporter:9200"

# DHT Network Monitoring
- job_name: "dht-monitor"
  scrape_interval: 60s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "dht-monitor:9201"
|
||||
|
||||
# Content Store Monitoring
- job_name: 'content-monitor'
  static_configs:
  - targets: ['content-monitor:9202']
  metrics_path: /metrics
  # Kept at 2 minutes: Prometheus treats a series as stale once its newest
  # sample is older than the 5-minute lookback window, so a 300s interval
  # leaves no margin and queries/alerts on these metrics see intermittent gaps.
  scrape_interval: 120s
|
||||
|
||||
# OpenAI Cost Monitoring
- job_name: "openai-cost-monitor"
  scrape_interval: 60s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "openai-cost-monitor:9203"
|
||||
|
||||
# Database Metrics (PostgreSQL)
# Keeps only containers of the bzzz-v2_postgres swarm service and passes
# dbname=bzzz_v2 as a URL parameter on every scrape.
# NOTE(review): 5432 is PostgreSQL's native protocol port; presumably a metrics
# exporter is expected to answer /metrics here - confirm the port and service.
- job_name: "postgres"
  scrape_interval: 30s
  metrics_path: /metrics
  params:
    dbname:
    - bzzz_v2
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 5432
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_postgres
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service
|
||||
|
||||
# Cache Metrics (Redis)
# Keeps only containers of the bzzz-v2_redis swarm service.
# NOTE(review): 6379 is Redis' native protocol port; presumably a metrics
# exporter is expected to answer /metrics here - confirm the port and service.
- job_name: "redis"
  scrape_interval: 30s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 6379
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_redis
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service
|
||||
|
||||
# Traefik Load Balancer Metrics
- job_name: "traefik"
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "traefik:8080"
|
||||
|
||||
# Conversation Management Metrics
# Keeps only containers of the bzzz-v2_conversation-manager swarm service.
- job_name: "conversation-manager"
  scrape_interval: 30s
  metrics_path: /metrics
  docker_sd_configs:
  - host: unix:///var/run/docker.sock
    port: 8090
  relabel_configs:
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    regex: bzzz-v2_conversation-manager
    action: keep
  - source_labels:
    - __meta_docker_container_label_com_docker_swarm_service_name
    target_label: service
|
||||
|
||||
# External Service Monitoring (Webhook endpoints)
# Requests /health on each public hostname once a minute.
# NOTE(review): a scrape only succeeds when the response is in the Prometheus
# exposition format, and no scheme or port is set here - confirm what these
# /health endpoints return and whether HTTPS is required.
- job_name: "external-health"
  scrape_interval: 60s
  scrape_timeout: 10s
  metrics_path: /health
  static_configs:
  - targets:
    - "bzzz.deepblack.cloud"
    - "mcp.deepblack.cloud"
    - "resolve.deepblack.cloud"
    - "openai.deepblack.cloud"
|
||||
|
||||
# Remote write configuration for long-term storage (optional)
|
||||
# remote_write:
|
||||
# - url: "https://prometheus-remote-write.example.com/api/v1/write"
|
||||
# basic_auth:
|
||||
# username: "bzzz-cluster"
|
||||
# password_file: "/etc/prometheus/remote-write-password"
|
||||
Reference in New Issue
Block a user