Files
bzzz/infrastructure/monitoring/configs/alertmanager.yml
anthonyrawlins 065dddf8d5 Prepare for v2 development: Add MCP integration and future development planning
- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification
- Add MCP integration design and implementation foundation
- Add infrastructure and deployment configurations
- Update system architecture for v2 evolution

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-07 14:38:22 +10:00

255 lines
7.4 KiB
YAML

# AlertManager Configuration for BZZZ v2
#
# Global defaults shared by every receiver defined in this file.
global:
  # Resolve timeout applied to alerts.
  resolve_timeout: 5m
  # Outbound mail settings used by the email_configs of the receivers.
  smtp_from: "alerts@deepblack.cloud"
  smtp_smarthost: "localhost:587"
  smtp_require_tls: true
# Notification template files, selected by glob pattern.
templates:
- "/etc/alertmanager/templates/*.tmpl"
# Routing tree: the top-level route sets the defaults, and each child route
# overrides the receiver and timing for one component. Child routes are listed
# in evaluation order, so their sequence must not be changed casually.
route:
  # Fallback receiver for alerts that match none of the child routes.
  receiver: 'default'
  group_by:
  - 'cluster'
  - 'alertname'
  - 'service'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  routes:
  # Critical P2P network issues
  - receiver: 'p2p-critical'
    match:
      severity: critical
      component: p2p
    group_wait: 10s
    repeat_interval: 5m
  # DHT network issues
  - receiver: 'dht-alerts'
    match:
      component: dht
    group_wait: 1m
    repeat_interval: 30m
  # Content store issues
  - receiver: 'storage-alerts'
    match:
      component: content-store
    group_wait: 2m
    repeat_interval: 1h
  # OpenAI cost alerts
  - receiver: 'cost-alerts'
    match:
      component: openai-cost
    group_wait: 5m
    repeat_interval: 6h
  # Service health alerts
  - receiver: 'service-alerts'
    match:
      component: service-health
    group_wait: 1m
    repeat_interval: 15m
  # Resource exhaustion (warning severity only)
  - receiver: 'resource-alerts'
    match:
      severity: warning
      component: resources
    group_wait: 5m
    repeat_interval: 2h
  # Security alerts
  - receiver: 'security-alerts'
    match:
      component: security
    group_wait: 30s
    repeat_interval: 1h
# Inhibition rules: a firing "source" alert mutes matching "target" alerts
# that carry the same values for every label listed under `equal`.
inhibit_rules:
# A critical alert mutes warnings for the same cluster/service/instance.
- source_match:
    severity: critical
  target_match:
    severity: warning
  equal:
  - 'cluster'
  - 'service'
  - 'instance'
# A NodeDown alert mutes service-health alerts for the same instance.
- source_match:
    alertname: NodeDown
  target_match:
    component: service-health
  equal:
  - 'instance'
# Receiver configurations: one entry per receiver name used by the routes.
receivers:
# Default receiver: catch-all Slack notification.
- name: 'default'
  slack_configs:
  # The webhook URL below is a placeholder value.
  - channel: "#bzzz-monitoring"
    api_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
    send_resolved: true
    title: "BZZZ v2 Alert"
    # One entry is rendered per alert in the notification group.
    text: |
      {{ range .Alerts }}
      *Alert:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Severity:* {{ .Labels.severity }}
      *Instance:* {{ .Labels.instance }}
      *Service:* {{ .Labels.service }}
      {{ end }}
# Critical P2P network alerts: Slack message plus a PagerDuty incident.
- name: 'p2p-critical'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-critical'
    title: '🚨 CRITICAL P2P Network Issue'
    # One entry is rendered per alert in the notification group.
    text: |
      {{ range .Alerts }}
      *CRITICAL P2P ALERT*
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Node:* {{ .Labels.instance }}
      *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
      *Immediate Action Required*
      {{ end }}
    send_resolved: true
  pagerduty_configs:
  # The service key below is a placeholder and must be replaced.
  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
    # Outside of `range .Alerts` the template root is the notification group,
    # which has no `.Annotations` field; only individual alerts do. Use the
    # annotations shared by all alerts in the group instead.
    description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
# DHT network alerts: Slack notification.
- name: 'dht-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-dht'
    title: '🔗 DHT Network Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise "unknown".
    text: |
      {{ range .Alerts }}
      *DHT Network Issue*
      *Alert:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Bootstrap Node:* {{ .Labels.instance }}
      *Peers Connected:* {{ or .Labels.peer_count "unknown" }}
      {{ end }}
    send_resolved: true
# Storage alerts: Slack notification for content store issues.
- name: 'storage-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-storage'
    title: '💾 Content Store Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise "unknown".
    text: |
      {{ range .Alerts }}
      *Storage Alert*
      *Issue:* {{ .Annotations.summary }}
      *Details:* {{ .Annotations.description }}
      *Node:* {{ .Labels.instance }}
      *Usage:* {{ or .Labels.disk_usage "unknown" }}%
      {{ end }}
    send_resolved: true
# OpenAI cost alerts: Slack notification plus an email to finance.
- name: 'cost-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-costs'
    title: '💰 OpenAI Cost Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise the literal.
    text: |
      {{ range .Alerts }}
      *Cost Alert*
      *Alert:* {{ .Annotations.summary }}
      *Current Cost:* ${{ or .Labels.current_cost "unknown" }}
      *Threshold:* ${{ or .Labels.cost_threshold "unknown" }}
      *Period:* {{ or .Labels.cost_period "daily" }}
      *Action:* {{ .Annotations.description }}
      {{ end }}
    send_resolved: true
  email_configs:
  - to: 'finance@deepblack.cloud'
    # email_config has no `subject` or `body` fields; unknown fields make
    # Alertmanager reject the configuration. The subject is set through
    # `headers` and the plain-text body through `text`.
    # NOTE(review): the built-in HTML body is presumably still attached
    # unless `html` is overridden as well - confirm the desired mail format.
    headers:
      Subject: 'BZZZ v2 OpenAI Cost Alert'
    text: |
      OpenAI usage has exceeded cost thresholds.
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Current Cost: ${{ .Labels.current_cost }}
      Threshold: ${{ .Labels.cost_threshold }}
      {{ end }}
# Service health alerts: Slack notification.
- name: 'service-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-services'
    title: '🔧 Service Health Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise "unknown".
    text: |
      {{ range .Alerts }}
      *Service Health Issue*
      *Service:* {{ .Labels.service }}
      *Alert:* {{ .Annotations.summary }}
      *Node:* {{ .Labels.instance }}
      *Status:* {{ or .Labels.status "unknown" }}
      *Description:* {{ .Annotations.description }}
      {{ end }}
    send_resolved: true
# Resource alerts: Slack notification for resource exhaustion warnings.
- name: 'resource-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-resources'
    title: '⚡ Resource Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise "unknown".
    text: |
      {{ range .Alerts }}
      *Resource Warning*
      *Resource:* {{ or .Labels.resource_type "unknown" }}
      *Node:* {{ .Labels.instance }}
      *Alert:* {{ .Annotations.summary }}
      *Current Usage:* {{ or .Labels.usage_percent "unknown" }}%
      *Threshold:* {{ or .Labels.threshold "unknown" }}%
      {{ end }}
    send_resolved: true
# Security alerts: Slack notification plus an email to the security team.
- name: 'security-alerts'
  slack_configs:
  # The webhook URL below is a placeholder and must be replaced.
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#bzzz-security'
    title: '🔒 Security Alert'
    # Alertmanager templates have no `default` function (that is a
    # Sprig/Helm helper). The Go template builtin `or` provides the fallback:
    # it yields the label value when non-empty, otherwise "unknown".
    text: |
      {{ range .Alerts }}
      *SECURITY ALERT*
      *Type:* {{ or .Labels.security_type "unknown" }}
      *Alert:* {{ .Annotations.summary }}
      *Source:* {{ .Labels.instance }}
      *Details:* {{ .Annotations.description }}
      *Severity:* {{ .Labels.severity }}
      {{ end }}
    send_resolved: true
  email_configs:
  - to: 'security@deepblack.cloud'
    # email_config has no `subject` or `body` fields; unknown fields make
    # Alertmanager reject the configuration. The subject is set through
    # `headers` and the plain-text body through `text`.
    # NOTE(review): the built-in HTML body is presumably still attached
    # unless `html` is overridden as well - confirm the desired mail format.
    headers:
      Subject: 'BZZZ v2 Security Alert'
    text: |
      Security alert triggered in BZZZ v2 cluster.
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Severity: {{ .Labels.severity }}
      Source: {{ .Labels.instance }}
      Details: {{ .Annotations.description }}
      {{ end }}