Prepare for v2 development: Add MCP integration and future development planning
- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification
- Add MCP integration design and implementation foundation
- Add infrastructure and deployment configurations
- Update system architecture for v2 evolution

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
@@ -0,0 +1,255 @@
|
||||
# AlertManager Configuration for BZZZ v2
|
||||
|
||||
global:
  # Settings shared by every receiver unless a receiver overrides them.
  resolve_timeout: 5m
  # Outgoing mail: sender address, relay host:port, and mandatory TLS.
  smtp_from: "alerts@deepblack.cloud"
  smtp_smarthost: "localhost:587"
  smtp_require_tls: true
|
||||
|
||||
# Template files
|
||||
templates:
  # Glob of custom notification template files to load.
  - "/etc/alertmanager/templates/*.tmpl"
|
||||
|
||||
# Route configuration
|
||||
route:
  # Fallback receiver for any alert that no child route below claims.
  receiver: "default"
  group_by:
    - "cluster"
    - "alertname"
    - "service"
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h

  # Child routes are evaluated top to bottom; keep this order.
  routes:
    # P2P alerts, but only those labelled severity=critical.
    - receiver: "p2p-critical"
      match:
        component: p2p
        severity: critical
      group_wait: 10s
      repeat_interval: 5m

    # Anything labelled component=dht.
    - receiver: "dht-alerts"
      match:
        component: dht
      group_wait: 1m
      repeat_interval: 30m

    # Anything labelled component=content-store.
    - receiver: "storage-alerts"
      match:
        component: content-store
      group_wait: 2m
      repeat_interval: 1h

    # Anything labelled component=openai-cost.
    - receiver: "cost-alerts"
      match:
        component: openai-cost
      group_wait: 5m
      repeat_interval: 6h

    # Anything labelled component=service-health.
    - receiver: "service-alerts"
      match:
        component: service-health
      group_wait: 1m
      repeat_interval: 15m

    # Resource alerts, but only those labelled severity=warning.
    - receiver: "resource-alerts"
      match:
        component: resources
        severity: warning
      group_wait: 5m
      repeat_interval: 2h

    # Anything labelled component=security.
    - receiver: "security-alerts"
      match:
        component: security
      group_wait: 30s
      repeat_interval: 1h
|
||||
|
||||
# Inhibition rules
|
||||
inhibit_rules:
  # A firing critical alert mutes warning alerts that share the same
  # cluster, service and instance labels.
  - equal:
      - "cluster"
      - "service"
      - "instance"
    source_match:
      severity: "critical"
    target_match:
      severity: "warning"

  # A firing NodeDown alert mutes service-health alerts on the same instance.
  - equal:
      - "instance"
    source_match:
      alertname: "NodeDown"
    target_match:
      component: "service-health"
|
||||
|
||||
# Receiver configurations
|
||||
receivers:
|
||||
# Default receiver
|
||||
- name: "default"
  # Posts one Slack message per alert group to the general monitoring channel,
  # listing summary, description, severity, instance and service per alert.
  slack_configs:
    - send_resolved: true
      channel: "#bzzz-monitoring"
      api_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
      title: "BZZZ v2 Alert"
      text: |
        {{ range .Alerts }}
        *Alert:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        *Severity:* {{ .Labels.severity }}
        *Instance:* {{ .Labels.instance }}
        *Service:* {{ .Labels.service }}
        {{ end }}
|
||||
|
||||
# Critical P2P network alerts
|
||||
- name: 'p2p-critical'
  # Critical P2P alerts go to Slack and additionally page via PagerDuty.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-critical'
      title: '🚨 CRITICAL P2P Network Issue'
      text: |
        {{ range .Alerts }}
        *CRITICAL P2P ALERT*

        *Summary:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        *Node:* {{ .Labels.instance }}
        *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}

        *Immediate Action Required*
        {{ end }}
      send_resolved: true
  pagerduty_configs:
    - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
      # This template runs against the whole alert group, which exposes
      # `.CommonAnnotations` but has no `.Annotations` field (that exists
      # only on individual alerts inside `range .Alerts`).
      description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
|
||||
|
||||
# DHT network alerts
|
||||
- name: 'dht-alerts'
  # Slack notification for DHT alerts.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-dht'
      title: '🔗 DHT Network Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *DHT Network Issue*

        *Alert:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        *Bootstrap Node:* {{ .Labels.instance }}
        *Peers Connected:* {{ or .Labels.peer_count "unknown" }}
        {{ end }}
      send_resolved: true
|
||||
|
||||
# Storage alerts
|
||||
- name: 'storage-alerts'
  # Slack notification for content-store alerts.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-storage'
      title: '💾 Content Store Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *Storage Alert*

        *Issue:* {{ .Annotations.summary }}
        *Details:* {{ .Annotations.description }}
        *Node:* {{ .Labels.instance }}
        *Usage:* {{ or .Labels.disk_usage "unknown" }}%
        {{ end }}
      send_resolved: true
|
||||
|
||||
# OpenAI cost alerts
|
||||
- name: 'cost-alerts'
  # OpenAI cost alerts go to Slack and are also emailed to finance.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-costs'
      title: '💰 OpenAI Cost Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *Cost Alert*

        *Alert:* {{ .Annotations.summary }}
        *Current Cost:* ${{ or .Labels.current_cost "unknown" }}
        *Threshold:* ${{ or .Labels.cost_threshold "unknown" }}
        *Period:* {{ or .Labels.cost_period "daily" }}
        *Action:* {{ .Annotations.description }}
        {{ end }}
      send_resolved: true
  email_configs:
    - to: 'finance@deepblack.cloud'
      # email_configs has no `subject` or `body` keys: the subject is set
      # through `headers`, and the plain-text body through `text`.
      headers:
        Subject: 'BZZZ v2 OpenAI Cost Alert'
      # Clear the default HTML part so the plain-text body below is what
      # recipients see.
      html: ''
      text: |
        OpenAI usage has exceeded cost thresholds.

        {{ range .Alerts }}
        Alert: {{ .Annotations.summary }}
        Current Cost: ${{ .Labels.current_cost }}
        Threshold: ${{ .Labels.cost_threshold }}
        {{ end }}
|
||||
|
||||
# Service health alerts
|
||||
- name: 'service-alerts'
  # Slack notification for service-health alerts.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-services'
      title: '🔧 Service Health Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *Service Health Issue*

        *Service:* {{ .Labels.service }}
        *Alert:* {{ .Annotations.summary }}
        *Node:* {{ .Labels.instance }}
        *Status:* {{ or .Labels.status "unknown" }}
        *Description:* {{ .Annotations.description }}
        {{ end }}
      send_resolved: true
|
||||
|
||||
# Resource alerts
|
||||
- name: 'resource-alerts'
  # Slack notification for resource-usage warnings.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-resources'
      title: '⚡ Resource Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *Resource Warning*

        *Resource:* {{ or .Labels.resource_type "unknown" }}
        *Node:* {{ .Labels.instance }}
        *Alert:* {{ .Annotations.summary }}
        *Current Usage:* {{ or .Labels.usage_percent "unknown" }}%
        *Threshold:* {{ or .Labels.threshold "unknown" }}%
        {{ end }}
      send_resolved: true
|
||||
|
||||
# Security alerts
|
||||
- name: 'security-alerts'
  # Security alerts go to Slack and are also emailed to the security team.
  slack_configs:
    - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
      channel: '#bzzz-security'
      title: '🔒 Security Alert'
      # `or` is the Go template built-in used for fallbacks here; the
      # Sprig/Helm `default` function is not available in Alertmanager
      # templates and makes the notification fail to render.
      text: |
        {{ range .Alerts }}
        *SECURITY ALERT*

        *Type:* {{ or .Labels.security_type "unknown" }}
        *Alert:* {{ .Annotations.summary }}
        *Source:* {{ .Labels.instance }}
        *Details:* {{ .Annotations.description }}
        *Severity:* {{ .Labels.severity }}
        {{ end }}
      send_resolved: true
  email_configs:
    - to: 'security@deepblack.cloud'
      # email_configs has no `subject` or `body` keys: the subject is set
      # through `headers`, and the plain-text body through `text`.
      headers:
        Subject: 'BZZZ v2 Security Alert'
      # Clear the default HTML part so the plain-text body below is what
      # recipients see.
      html: ''
      text: |
        Security alert triggered in BZZZ v2 cluster.

        {{ range .Alerts }}
        Alert: {{ .Annotations.summary }}
        Severity: {{ .Labels.severity }}
        Source: {{ .Labels.instance }}
        Details: {{ .Annotations.description }}
        {{ end }}
|
||||
Reference in New Issue
Block a user