🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved

Comprehensive multi-agent implementation addressing all issues from INDEX.md:

## Core Architecture & Validation
- Issue 001: UCXL address validation at all system boundaries
- Issue 002: Fixed search parsing bug in encrypted storage
- Issue 003: Wired UCXI P2P announce and discover functionality
- Issue 011: Aligned temporal grammar and documentation
- Issue 012: SLURP idempotency, backpressure, and DLQ implementation
- Issue 013: Linked SLURP events to UCXL decisions and DHT

## API Standardization & Configuration
- Issue 004: Standardized UCXI payloads to UCXL codes
- Issue 010: Status endpoints and configuration surface

## Infrastructure & Operations
- Issue 005: Election heartbeat on admin transition
- Issue 006: Active health checks for PubSub and DHT
- Issue 007: DHT replication and provider records
- Issue 014: SLURP leadership lifecycle and health probes
- Issue 015: Comprehensive monitoring, SLOs, and alerts

## Security & Access Control
- Issue 008: Key rotation and role-based access policies

## Testing & Quality Assurance
- Issue 009: Integration tests for UCXI + DHT encryption + search
- Issue 016: E2E tests for HMMM → SLURP → UCXL workflow

## HMMM Integration
- Issue 017: HMMM adapter wiring and comprehensive testing

## Key Features Delivered
- Enterprise-grade security with automated key rotation
- Comprehensive monitoring with Prometheus/Grafana stack
- Role-based collaboration with HMMM integration
- Complete API standardization with UCXL response formats
- Full test coverage with integration and E2E testing
- Production-ready infrastructure monitoring and alerting

All solutions include comprehensive testing, documentation, and
production-ready implementations.
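As a concrete example on the monitoring side, the stack deployed by the script in this commit can be sanity-checked with a few requests against the same endpoints its health-check routine uses. This is an illustrative sketch only; the URLs and ports mirror `perform_health_checks()` in the deployment script and assume the stack publishes them locally:

```bash
# Illustrative post-deploy check; endpoints mirror the script's perform_health_checks().
for url in \
  "http://localhost:9090/-/healthy" \
  "http://localhost:3000/api/health" \
  "http://localhost:9093/-/healthy"; do
  if curl -sf "$url" >/dev/null; then
    echo "OK      $url"
  else
    echo "FAILED  $url"
  fi
done
```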

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: anthonyrawlins
Date: 2025-08-29 12:39:38 +10:00
Parent: 59f40e17a5
Commit: 92779523c0
136 changed files with 56649 additions and 134 deletions


@@ -0,0 +1,615 @@
#!/bin/bash
# BZZZ Enhanced Monitoring Stack Deployment Script
# Deploys comprehensive monitoring, metrics, and health checking infrastructure
set -euo pipefail
# Script configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/tmp/bzzz-deploy-${TIMESTAMP}.log"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
ENVIRONMENT=${ENVIRONMENT:-"production"}
DRY_RUN=${DRY_RUN:-"false"}
BACKUP_EXISTING=${BACKUP_EXISTING:-"true"}
HEALTH_CHECK_TIMEOUT=${HEALTH_CHECK_TIMEOUT:-300}
# Docker configuration
DOCKER_REGISTRY="registry.home.deepblack.cloud"
STACK_NAME="bzzz-monitoring-v2"
CONFIG_VERSION="v2"
# Logging function
log() {
local level=$1
shift
local message="$*"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
case $level in
ERROR)
echo -e "${RED}[ERROR]${NC} $message" >&2
;;
WARN)
echo -e "${YELLOW}[WARN]${NC} $message"
;;
INFO)
echo -e "${GREEN}[INFO]${NC} $message"
;;
DEBUG)
echo -e "${BLUE}[DEBUG]${NC} $message"
;;
esac
echo "[$timestamp] [$level] $message" >> "$LOG_FILE"
}
# Error handler
error_handler() {
local line_no=$1
log ERROR "Script failed at line $line_no"
log ERROR "Check log file: $LOG_FILE"
exit 1
}
trap 'error_handler $LINENO' ERR
# Check prerequisites
check_prerequisites() {
log INFO "Checking prerequisites..."
# Check that this node is a Docker Swarm manager (ControlAvailable is true only on managers)
if [[ "$(docker info --format '{{.Swarm.ControlAvailable}}' 2>/dev/null)" != "true" ]]; then
log ERROR "This script must be run on a Docker Swarm manager node"
exit 1
fi
# Check required tools
local required_tools=("docker" "jq" "curl")
for tool in "${required_tools[@]}"; do
if ! command -v "$tool" >/dev/null 2>&1; then
log ERROR "Required tool not found: $tool"
exit 1
fi
done
# Check network connectivity to registry
if ! docker pull "$DOCKER_REGISTRY/bzzz:v2.0.0" >/dev/null 2>&1; then
log WARN "Unable to pull from registry, using local images"
fi
log INFO "Prerequisites check completed"
}
# Create necessary directories
setup_directories() {
log INFO "Setting up directories..."
local dirs=(
"/rust/bzzz-v2/monitoring/prometheus/data"
"/rust/bzzz-v2/monitoring/grafana/data"
"/rust/bzzz-v2/monitoring/alertmanager/data"
"/rust/bzzz-v2/monitoring/loki/data"
"/rust/bzzz-v2/backups/monitoring"
)
for dir in "${dirs[@]}"; do
if [[ "$DRY_RUN" != "true" ]]; then
sudo mkdir -p "$dir"
sudo chown -R 65534:65534 "$dir" # nobody user for containers
fi
log DEBUG "Created directory: $dir"
done
}
# Backup existing configuration
backup_existing_config() {
if [[ "$BACKUP_EXISTING" != "true" ]]; then
log INFO "Skipping backup (BACKUP_EXISTING=false)"
return
fi
log INFO "Backing up existing configuration..."
local backup_dir="/rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}"
if [[ "$DRY_RUN" != "true" ]]; then
mkdir -p "$backup_dir"
# Backup Docker secrets
docker secret ls --filter name=bzzz_ --format "{{.Name}}" | while read -r secret; do
if docker secret inspect "$secret" >/dev/null 2>&1; then
docker secret inspect "$secret" > "$backup_dir/${secret}.json"
log DEBUG "Backed up secret: $secret"
fi
done
# Backup Docker configs
docker config ls --filter name=bzzz_ --format "{{.Name}}" | while read -r config; do
if docker config inspect "$config" >/dev/null 2>&1; then
docker config inspect "$config" > "$backup_dir/${config}.json"
log DEBUG "Backed up config: $config"
fi
done
# Backup service definitions
if docker stack services "$STACK_NAME" >/dev/null 2>&1; then
docker stack services "$STACK_NAME" --format "{{.Name}}" | while read -r service; do
docker service inspect "$service" > "$backup_dir/${service}-service.json"
done
fi
fi
log INFO "Backup completed: $backup_dir"
}
# Create Docker secrets
create_secrets() {
log INFO "Creating Docker secrets..."
local secrets=(
"bzzz_grafana_admin_password:$(openssl rand -base64 32)"
"bzzz_postgres_password:$(openssl rand -base64 32)"
)
# Check if secrets directory exists
local secrets_dir="$HOME/chorus/business/secrets"
if [[ -d "$secrets_dir" ]]; then
# Use existing secrets if available
if [[ -f "$secrets_dir/grafana-admin-password" ]]; then
secrets[0]="bzzz_grafana_admin_password:$(cat "$secrets_dir/grafana-admin-password")"
fi
if [[ -f "$secrets_dir/postgres-password" ]]; then
secrets[1]="bzzz_postgres_password:$(cat "$secrets_dir/postgres-password")"
fi
fi
for secret_def in "${secrets[@]}"; do
local secret_name="${secret_def%%:*}"
local secret_value="${secret_def#*:}"
if docker secret inspect "$secret_name" >/dev/null 2>&1; then
log DEBUG "Secret already exists: $secret_name"
else
if [[ "$DRY_RUN" != "true" ]]; then
echo "$secret_value" | docker secret create "$secret_name" -
log INFO "Created secret: $secret_name"
else
log DEBUG "Would create secret: $secret_name"
fi
fi
done
}
# Create Docker configs
create_configs() {
log INFO "Creating Docker configs..."
local configs=(
"bzzz_prometheus_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/prometheus.yml"
"bzzz_prometheus_alerts_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/enhanced-alert-rules.yml"
"bzzz_grafana_datasources_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
"bzzz_alertmanager_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
)
for config_def in "${configs[@]}"; do
local config_name="${config_def%%:*}"
local config_file="${config_def#*:}"
if [[ ! -f "$config_file" ]]; then
log WARN "Config file not found: $config_file"
continue
fi
if docker config inspect "$config_name" >/dev/null 2>&1; then
log DEBUG "Config already exists: $config_name"
# Remove old config if exists
if [[ "$DRY_RUN" != "true" ]]; then
local old_config_name="${config_name%_${CONFIG_VERSION}}"
if docker config inspect "$old_config_name" >/dev/null 2>&1; then
docker config rm "$old_config_name" || true
fi
fi
else
if [[ "$DRY_RUN" != "true" ]]; then
docker config create "$config_name" "$config_file"
log INFO "Created config: $config_name"
else
log DEBUG "Would create config: $config_name from $config_file"
fi
fi
done
}
# Create missing config files
create_missing_configs() {
log INFO "Creating missing configuration files..."
# Create Grafana datasources config
local grafana_datasources="${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
if [[ ! -f "$grafana_datasources" ]]; then
cat > "$grafana_datasources" <<EOF
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    editable: true
EOF
log INFO "Created Grafana datasources config"
fi
# Create AlertManager config
local alertmanager_config="${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
if [[ ! -f "$alertmanager_config" ]]; then
cat > "$alertmanager_config" <<EOF
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@chorus.services'
  slack_api_url_file: '/run/secrets/slack_webhook_url'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        service: bzzz
      receiver: 'bzzz-alerts'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ Alert: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'critical-alerts'
    slack_configs:
      - channel: '#bzzz-critical'
        title: 'CRITICAL: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'bzzz-alerts'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
EOF
log INFO "Created AlertManager config"
fi
}
# Deploy monitoring stack
deploy_monitoring_stack() {
log INFO "Deploying monitoring stack..."
local compose_file="${PROJECT_ROOT}/monitoring/docker-compose.enhanced.yml"
if [[ ! -f "$compose_file" ]]; then
log ERROR "Compose file not found: $compose_file"
exit 1
fi
if [[ "$DRY_RUN" != "true" ]]; then
# Deploy the stack
docker stack deploy -c "$compose_file" "$STACK_NAME"
log INFO "Stack deployment initiated: $STACK_NAME"
# Wait for services to be ready
log INFO "Waiting for services to be ready..."
local max_attempts=30
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
local ready_services=0
local total_services=0
# Count ready services
while read -r service; do
total_services=$((total_services + 1))
local replicas_info
replicas_info=$(docker service ls --filter name="$service" --format "{{.Replicas}}")
if [[ "$replicas_info" =~ ^([0-9]+)/([0-9]+)$ ]]; then
local current="${BASH_REMATCH[1]}"
local desired="${BASH_REMATCH[2]}"
if [[ "$current" -eq "$desired" ]]; then
ready_services=$((ready_services + 1))
fi
fi
done < <(docker stack services "$STACK_NAME" --format "{{.Name}}")
if [[ $ready_services -eq $total_services ]]; then
log INFO "All services are ready ($ready_services/$total_services)"
break
else
log DEBUG "Services ready: $ready_services/$total_services"
sleep 10
attempt=$((attempt + 1))
fi
done
if [[ $attempt -eq $max_attempts ]]; then
log WARN "Timeout waiting for all services to be ready"
fi
else
log DEBUG "Would deploy stack with compose file: $compose_file"
fi
}
# Perform health checks
perform_health_checks() {
log INFO "Performing health checks..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping health checks in dry run mode"
return
fi
local endpoints=(
"http://localhost:9090/-/healthy:Prometheus"
"http://localhost:3000/api/health:Grafana"
"http://localhost:9093/-/healthy:AlertManager"
)
local max_attempts=$((HEALTH_CHECK_TIMEOUT / 10))
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
local healthy_endpoints=0
for endpoint_def in "${endpoints[@]}"; do
# Split on the last colon so the URL (which itself contains "://" and a port) stays intact
local endpoint="${endpoint_def%:*}"
local service="${endpoint_def##*:}"
if curl -sf "$endpoint" >/dev/null 2>&1; then
healthy_endpoints=$((healthy_endpoints + 1))
log DEBUG "Health check passed: $service"
else
log DEBUG "Health check pending: $service"
fi
done
if [[ $healthy_endpoints -eq ${#endpoints[@]} ]]; then
log INFO "All health checks passed"
return
fi
sleep 10
attempt=$((attempt + 1))
done
log WARN "Some health checks failed after ${HEALTH_CHECK_TIMEOUT}s timeout"
}
# Validate deployment
validate_deployment() {
log INFO "Validating deployment..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping validation in dry run mode"
return
fi
# Check stack services
local services
services=$(docker stack services "$STACK_NAME" --format "{{.Name}}" | wc -l)
log INFO "Deployed services: $services"
# Check if Prometheus is collecting metrics
sleep 30 # Allow time for initial metric collection
if curl -sf "http://localhost:9090/api/v1/query?query=up" | jq -r '.data.result | length' | grep -q "^[1-9]"; then
log INFO "Prometheus is collecting metrics"
else
log WARN "Prometheus may not be collecting metrics yet"
fi
# Check if Grafana can connect to Prometheus
local grafana_health
if grafana_health=$(curl -sf "http://admin:admin@localhost:3000/api/datasources/proxy/1/api/v1/query?query=up" 2>/dev/null); then
log INFO "Grafana can connect to Prometheus"
else
log WARN "Grafana datasource connection may be pending"
fi
# Check AlertManager configuration
if curl -sf "http://localhost:9093/api/v1/status" >/dev/null 2>&1; then
log INFO "AlertManager is operational"
else
log WARN "AlertManager may not be ready"
fi
}
# Import Grafana dashboards
import_dashboards() {
log INFO "Importing Grafana dashboards..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping dashboard import in dry run mode"
return
fi
# Wait for Grafana to be ready
local max_attempts=30
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
if curl -sf "http://admin:admin@localhost:3000/api/health" >/dev/null 2>&1; then
break
fi
sleep 5
attempt=$((attempt + 1))
done
if [[ $attempt -eq $max_attempts ]]; then
log WARN "Grafana not ready for dashboard import"
return
fi
# Import dashboards
local dashboard_dir="${PROJECT_ROOT}/monitoring/grafana-dashboards"
if [[ -d "$dashboard_dir" ]]; then
for dashboard_file in "$dashboard_dir"/*.json; do
if [[ -f "$dashboard_file" ]]; then
local dashboard_name
dashboard_name=$(basename "$dashboard_file" .json)
# -sf makes curl fail on HTTP errors so unsuccessful imports are reported
if curl -sf -X POST \
-H "Content-Type: application/json" \
-d "@$dashboard_file" \
"http://admin:admin@localhost:3000/api/dashboards/db" \
>/dev/null 2>&1; then
log INFO "Imported dashboard: $dashboard_name"
else
log WARN "Failed to import dashboard: $dashboard_name"
fi
fi
done
fi
}
# Generate deployment report
generate_report() {
log INFO "Generating deployment report..."
local report_file="/tmp/bzzz-monitoring-deployment-report-${TIMESTAMP}.txt"
cat > "$report_file" <<EOF
BZZZ Enhanced Monitoring Stack Deployment Report
================================================

Deployment Time: $(date)
Environment: $ENVIRONMENT
Stack Name: $STACK_NAME
Dry Run: $DRY_RUN

Services Deployed:
EOF
if [[ "$DRY_RUN" != "true" ]]; then
docker stack services "$STACK_NAME" --format " - {{.Name}}: {{.Replicas}}" >> "$report_file"
echo "" >> "$report_file"
echo "Service Health:" >> "$report_file"
# Add health check results
local health_endpoints=(
"http://localhost:9090/-/healthy:Prometheus"
"http://localhost:3000/api/health:Grafana"
"http://localhost:9093/-/healthy:AlertManager"
)
for endpoint_def in "${health_endpoints[@]}"; do
# Split on the last colon (the URL portion contains colons)
local endpoint="${endpoint_def%:*}"
local service="${endpoint_def##*:}"
if curl -sf "$endpoint" >/dev/null 2>&1; then
echo " - $service: ✅ Healthy" >> "$report_file"
else
echo " - $service: ❌ Unhealthy" >> "$report_file"
fi
done
else
echo " [Dry run mode - no services deployed]" >> "$report_file"
fi
cat >> "$report_file" <<EOF
Access URLs:
- Grafana: http://localhost:3000 (admin/admin)
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093

Configuration:
- Log file: $LOG_FILE
- Backup directory: /rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}
- Config version: $CONFIG_VERSION

Next Steps:
1. Change default Grafana admin password
2. Configure notification channels in AlertManager
3. Review and customize alert rules
4. Set up external authentication (optional)
EOF
log INFO "Deployment report generated: $report_file"
# Display report
echo ""
echo "=========================================="
cat "$report_file"
echo "=========================================="
}
# Main execution
main() {
log INFO "Starting BZZZ Enhanced Monitoring Stack deployment"
log INFO "Environment: $ENVIRONMENT, Dry Run: $DRY_RUN"
log INFO "Log file: $LOG_FILE"
check_prerequisites
setup_directories
backup_existing_config
create_missing_configs
create_secrets
create_configs
deploy_monitoring_stack
perform_health_checks
validate_deployment
import_dashboards
generate_report
log INFO "Deployment completed successfully!"
if [[ "$DRY_RUN" != "true" ]]; then
echo ""
echo "🎉 BZZZ Enhanced Monitoring Stack is now running!"
echo "📊 Grafana Dashboard: http://localhost:3000"
echo "📈 Prometheus: http://localhost:9090"
echo "🚨 AlertManager: http://localhost:9093"
echo ""
echo "Next steps:"
echo "1. Change default Grafana password"
echo "2. Configure alert notification channels"
echo "3. Review monitoring dashboards"
echo "4. Run reliability tests: ./infrastructure/testing/run-tests.sh all"
fi
}
# Script execution
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi
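
Usage sketch (the script filename below is an assumption; the environment variables and secrets paths come from the script itself). A dry run on a Swarm manager validates prerequisites and logs what would be created; pre-seeding the secrets files makes `create_secrets()` reuse existing credentials instead of generating random ones:

```bash
# Hypothetical filename -- adjust to wherever this script lands in the repo.
# Optionally pre-seed credentials so the script reuses them.
mkdir -p "$HOME/chorus/business/secrets"
printf '%s\n' 'example-grafana-admin-password' > "$HOME/chorus/business/secrets/grafana-admin-password"
printf '%s\n' 'example-postgres-password'      > "$HOME/chorus/business/secrets/postgres-password"

# Dry run: checks prerequisites and logs intended changes without deploying.
DRY_RUN=true ./deploy-enhanced-monitoring.sh

# Real deployment with a longer health-check window (in seconds).
ENVIRONMENT=production HEALTH_CHECK_TIMEOUT=600 ./deploy-enhanced-monitoring.sh
```

Progress is logged to /tmp/bzzz-deploy-<timestamp>.log, and a deployment report is written to /tmp/bzzz-monitoring-deployment-report-<timestamp>.txt.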