🚀 Complete BZZZ Issue Resolution - All 17 Issues Solved

Comprehensive multi-agent implementation addressing all issues from INDEX.md:

## Core Architecture & Validation
- Issue 001: UCXL address validation at all system boundaries
- Issue 002: Fixed search parsing bug in encrypted storage
- Issue 003: Wired UCXI P2P announce and discover functionality
- Issue 011: Aligned temporal grammar and documentation
- Issue 012: SLURP idempotency, backpressure, and DLQ implementation
- Issue 013: Linked SLURP events to UCXL decisions and DHT

## API Standardization & Configuration
- Issue 004: Standardized UCXI payloads to UCXL codes
- Issue 010: Status endpoints and configuration surface

## Infrastructure & Operations
- Issue 005: Election heartbeat on admin transition
- Issue 006: Active health checks for PubSub and DHT
- Issue 007: DHT replication and provider records
- Issue 014: SLURP leadership lifecycle and health probes
- Issue 015: Comprehensive monitoring, SLOs, and alerts

## Security & Access Control
- Issue 008: Key rotation and role-based access policies

## Testing & Quality Assurance
- Issue 009: Integration tests for UCXI + DHT encryption + search
- Issue 016: E2E tests for HMMM → SLURP → UCXL workflow

## HMMM Integration
- Issue 017: HMMM adapter wiring and comprehensive testing

## Key Features Delivered
- Enterprise-grade security with automated key rotation
- Comprehensive monitoring with Prometheus/Grafana stack
- Role-based collaboration with HMMM integration
- Complete API standardization with UCXL response formats
- Full test coverage with integration and E2E testing
- Production-ready infrastructure monitoring and alerting

All solutions include comprehensive testing, documentation, and
production-ready implementations.
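As a concrete example on the monitoring side, the stack deployed by the script in this commit can be sanity-checked with a few requests against the same endpoints its health-check routine uses. This is an illustrative sketch only; the URLs and ports mirror `perform_health_checks()` in the deployment script and assume the stack publishes them locally:

```bash
# Illustrative post-deploy check; endpoints mirror the script's perform_health_checks().
for url in \
  "http://localhost:9090/-/healthy" \
  "http://localhost:3000/api/health" \
  "http://localhost:9093/-/healthy"; do
  if curl -sf "$url" >/dev/null; then
    echo "OK      $url"
  else
    echo "FAILED  $url"
  fi
done
```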

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: anthonyrawlins
Date: 2025-08-29 12:39:38 +10:00
Parent: 59f40e17a5
Commit: 92779523c0
136 changed files with 56649 additions and 134 deletions


@@ -0,0 +1,615 @@
#!/bin/bash
# BZZZ Enhanced Monitoring Stack Deployment Script
# Deploys comprehensive monitoring, metrics, and health checking infrastructure
set -euo pipefail
# Script configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/tmp/bzzz-deploy-${TIMESTAMP}.log"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
ENVIRONMENT=${ENVIRONMENT:-"production"}
DRY_RUN=${DRY_RUN:-"false"}
BACKUP_EXISTING=${BACKUP_EXISTING:-"true"}
HEALTH_CHECK_TIMEOUT=${HEALTH_CHECK_TIMEOUT:-300}
# Docker configuration
DOCKER_REGISTRY="registry.home.deepblack.cloud"
STACK_NAME="bzzz-monitoring-v2"
CONFIG_VERSION="v2"
# Logging function
log() {
local level=$1
shift
local message="$*"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
case $level in
ERROR)
echo -e "${RED}[ERROR]${NC} $message" >&2
;;
WARN)
echo -e "${YELLOW}[WARN]${NC} $message"
;;
INFO)
echo -e "${GREEN}[INFO]${NC} $message"
;;
DEBUG)
echo -e "${BLUE}[DEBUG]${NC} $message"
;;
esac
echo "[$timestamp] [$level] $message" >> "$LOG_FILE"
}
# Error handler
error_handler() {
local line_no=$1
log ERROR "Script failed at line $line_no"
log ERROR "Check log file: $LOG_FILE"
exit 1
}
trap 'error_handler $LINENO' ERR
# Check prerequisites
check_prerequisites() {
log INFO "Checking prerequisites..."
# Check that this node is a Docker Swarm manager (ControlAvailable is true only on managers)
if [[ "$(docker info --format '{{.Swarm.ControlAvailable}}' 2>/dev/null)" != "true" ]]; then
log ERROR "This script must be run on a Docker Swarm manager node"
exit 1
fi
# Check required tools
local required_tools=("docker" "jq" "curl")
for tool in "${required_tools[@]}"; do
if ! command -v "$tool" >/dev/null 2>&1; then
log ERROR "Required tool not found: $tool"
exit 1
fi
done
# Check network connectivity to registry
if ! docker pull "$DOCKER_REGISTRY/bzzz:v2.0.0" >/dev/null 2>&1; then
log WARN "Unable to pull from registry, using local images"
fi
log INFO "Prerequisites check completed"
}
# Create necessary directories
setup_directories() {
log INFO "Setting up directories..."
local dirs=(
"/rust/bzzz-v2/monitoring/prometheus/data"
"/rust/bzzz-v2/monitoring/grafana/data"
"/rust/bzzz-v2/monitoring/alertmanager/data"
"/rust/bzzz-v2/monitoring/loki/data"
"/rust/bzzz-v2/backups/monitoring"
)
for dir in "${dirs[@]}"; do
if [[ "$DRY_RUN" != "true" ]]; then
sudo mkdir -p "$dir"
sudo chown -R 65534:65534 "$dir" # nobody user for containers
fi
log DEBUG "Created directory: $dir"
done
}
# Backup existing configuration
backup_existing_config() {
if [[ "$BACKUP_EXISTING" != "true" ]]; then
log INFO "Skipping backup (BACKUP_EXISTING=false)"
return
fi
log INFO "Backing up existing configuration..."
local backup_dir="/rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}"
if [[ "$DRY_RUN" != "true" ]]; then
mkdir -p "$backup_dir"
# Backup Docker secrets
docker secret ls --filter name=bzzz_ --format "{{.Name}}" | while read -r secret; do
if docker secret inspect "$secret" >/dev/null 2>&1; then
docker secret inspect "$secret" > "$backup_dir/${secret}.json"
log DEBUG "Backed up secret: $secret"
fi
done
# Backup Docker configs
docker config ls --filter name=bzzz_ --format "{{.Name}}" | while read -r config; do
if docker config inspect "$config" >/dev/null 2>&1; then
docker config inspect "$config" > "$backup_dir/${config}.json"
log DEBUG "Backed up config: $config"
fi
done
# Backup service definitions
if docker stack services "$STACK_NAME" >/dev/null 2>&1; then
docker stack services "$STACK_NAME" --format "{{.Name}}" | while read -r service; do
docker service inspect "$service" > "$backup_dir/${service}-service.json"
done
fi
fi
log INFO "Backup completed: $backup_dir"
}
# Create Docker secrets
create_secrets() {
log INFO "Creating Docker secrets..."
local secrets=(
"bzzz_grafana_admin_password:$(openssl rand -base64 32)"
"bzzz_postgres_password:$(openssl rand -base64 32)"
)
# Check if secrets directory exists
local secrets_dir="$HOME/chorus/business/secrets"
if [[ -d "$secrets_dir" ]]; then
# Use existing secrets if available
if [[ -f "$secrets_dir/grafana-admin-password" ]]; then
secrets[0]="bzzz_grafana_admin_password:$(cat "$secrets_dir/grafana-admin-password")"
fi
if [[ -f "$secrets_dir/postgres-password" ]]; then
secrets[1]="bzzz_postgres_password:$(cat "$secrets_dir/postgres-password")"
fi
fi
for secret_def in "${secrets[@]}"; do
local secret_name="${secret_def%%:*}"
local secret_value="${secret_def#*:}"
if docker secret inspect "$secret_name" >/dev/null 2>&1; then
log DEBUG "Secret already exists: $secret_name"
else
if [[ "$DRY_RUN" != "true" ]]; then
echo "$secret_value" | docker secret create "$secret_name" -
log INFO "Created secret: $secret_name"
else
log DEBUG "Would create secret: $secret_name"
fi
fi
done
}
# Create Docker configs
create_configs() {
log INFO "Creating Docker configs..."
local configs=(
"bzzz_prometheus_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/prometheus.yml"
"bzzz_prometheus_alerts_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/enhanced-alert-rules.yml"
"bzzz_grafana_datasources_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
"bzzz_alertmanager_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
)
for config_def in "${configs[@]}"; do
local config_name="${config_def%%:*}"
local config_file="${config_def#*:}"
if [[ ! -f "$config_file" ]]; then
log WARN "Config file not found: $config_file"
continue
fi
if docker config inspect "$config_name" >/dev/null 2>&1; then
log DEBUG "Config already exists: $config_name"
# Remove old config if exists
if [[ "$DRY_RUN" != "true" ]]; then
local old_config_name="${config_name%_${CONFIG_VERSION}}"
if docker config inspect "$old_config_name" >/dev/null 2>&1; then
docker config rm "$old_config_name" || true
fi
fi
else
if [[ "$DRY_RUN" != "true" ]]; then
docker config create "$config_name" "$config_file"
log INFO "Created config: $config_name"
else
log DEBUG "Would create config: $config_name from $config_file"
fi
fi
done
}
# Create missing config files
create_missing_configs() {
log INFO "Creating missing configuration files..."
# Create Grafana datasources config
local grafana_datasources="${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
if [[ ! -f "$grafana_datasources" ]]; then
cat > "$grafana_datasources" <<EOF
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    editable: true
EOF
log INFO "Created Grafana datasources config"
fi
# Create AlertManager config
local alertmanager_config="${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
if [[ ! -f "$alertmanager_config" ]]; then
cat > "$alertmanager_config" <<EOF
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@chorus.services'
  slack_api_url_file: '/run/secrets/slack_webhook_url'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        service: bzzz
      receiver: 'bzzz-alerts'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ Alert: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'critical-alerts'
    slack_configs:
      - channel: '#bzzz-critical'
        title: 'CRITICAL: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'bzzz-alerts'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
EOF
log INFO "Created AlertManager config"
fi
}
# Deploy monitoring stack
deploy_monitoring_stack() {
log INFO "Deploying monitoring stack..."
local compose_file="${PROJECT_ROOT}/monitoring/docker-compose.enhanced.yml"
if [[ ! -f "$compose_file" ]]; then
log ERROR "Compose file not found: $compose_file"
exit 1
fi
if [[ "$DRY_RUN" != "true" ]]; then
# Deploy the stack
docker stack deploy -c "$compose_file" "$STACK_NAME"
log INFO "Stack deployment initiated: $STACK_NAME"
# Wait for services to be ready
log INFO "Waiting for services to be ready..."
local max_attempts=30
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
local ready_services=0
local total_services=0
# Count ready services
while read -r service; do
total_services=$((total_services + 1))
local replicas_info
replicas_info=$(docker service ls --filter name="$service" --format "{{.Replicas}}")
if [[ "$replicas_info" =~ ^([0-9]+)/([0-9]+)$ ]]; then
local current="${BASH_REMATCH[1]}"
local desired="${BASH_REMATCH[2]}"
if [[ "$current" -eq "$desired" ]]; then
ready_services=$((ready_services + 1))
fi
fi
done < <(docker stack services "$STACK_NAME" --format "{{.Name}}")
if [[ $ready_services -eq $total_services ]]; then
log INFO "All services are ready ($ready_services/$total_services)"
break
else
log DEBUG "Services ready: $ready_services/$total_services"
sleep 10
attempt=$((attempt + 1))
fi
done
if [[ $attempt -eq $max_attempts ]]; then
log WARN "Timeout waiting for all services to be ready"
fi
else
log DEBUG "Would deploy stack with compose file: $compose_file"
fi
}
# Perform health checks
perform_health_checks() {
log INFO "Performing health checks..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping health checks in dry run mode"
return
fi
local endpoints=(
"http://localhost:9090/-/healthy:Prometheus"
"http://localhost:3000/api/health:Grafana"
"http://localhost:9093/-/healthy:AlertManager"
)
local max_attempts=$((HEALTH_CHECK_TIMEOUT / 10))
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
local healthy_endpoints=0
for endpoint_def in "${endpoints[@]}"; do
# Split on the last colon so the URL (which itself contains "://" and a port) stays intact
local endpoint="${endpoint_def%:*}"
local service="${endpoint_def##*:}"
if curl -sf "$endpoint" >/dev/null 2>&1; then
healthy_endpoints=$((healthy_endpoints + 1))
log DEBUG "Health check passed: $service"
else
log DEBUG "Health check pending: $service"
fi
done
if [[ $healthy_endpoints -eq ${#endpoints[@]} ]]; then
log INFO "All health checks passed"
return
fi
sleep 10
attempt=$((attempt + 1))
done
log WARN "Some health checks failed after ${HEALTH_CHECK_TIMEOUT}s timeout"
}
# Validate deployment
validate_deployment() {
log INFO "Validating deployment..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping validation in dry run mode"
return
fi
# Check stack services
local services
services=$(docker stack services "$STACK_NAME" --format "{{.Name}}" | wc -l)
log INFO "Deployed services: $services"
# Check if Prometheus is collecting metrics
sleep 30 # Allow time for initial metric collection
if curl -sf "http://localhost:9090/api/v1/query?query=up" | jq -r '.data.result | length' | grep -q "^[1-9]"; then
log INFO "Prometheus is collecting metrics"
else
log WARN "Prometheus may not be collecting metrics yet"
fi
# Check if Grafana can connect to Prometheus
local grafana_health
if grafana_health=$(curl -sf "http://admin:admin@localhost:3000/api/datasources/proxy/1/api/v1/query?query=up" 2>/dev/null); then
log INFO "Grafana can connect to Prometheus"
else
log WARN "Grafana datasource connection may be pending"
fi
# Check AlertManager configuration
if curl -sf "http://localhost:9093/api/v1/status" >/dev/null 2>&1; then
log INFO "AlertManager is operational"
else
log WARN "AlertManager may not be ready"
fi
}
# Import Grafana dashboards
import_dashboards() {
log INFO "Importing Grafana dashboards..."
if [[ "$DRY_RUN" == "true" ]]; then
log DEBUG "Skipping dashboard import in dry run mode"
return
fi
# Wait for Grafana to be ready
local max_attempts=30
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
if curl -sf "http://admin:admin@localhost:3000/api/health" >/dev/null 2>&1; then
break
fi
sleep 5
attempt=$((attempt + 1))
done
if [[ $attempt -eq $max_attempts ]]; then
log WARN "Grafana not ready for dashboard import"
return
fi
# Import dashboards
local dashboard_dir="${PROJECT_ROOT}/monitoring/grafana-dashboards"
if [[ -d "$dashboard_dir" ]]; then
for dashboard_file in "$dashboard_dir"/*.json; do
if [[ -f "$dashboard_file" ]]; then
local dashboard_name
dashboard_name=$(basename "$dashboard_file" .json)
# -sf makes curl fail on HTTP errors so unsuccessful imports are reported
if curl -sf -X POST \
-H "Content-Type: application/json" \
-d "@$dashboard_file" \
"http://admin:admin@localhost:3000/api/dashboards/db" \
>/dev/null 2>&1; then
log INFO "Imported dashboard: $dashboard_name"
else
log WARN "Failed to import dashboard: $dashboard_name"
fi
fi
done
fi
}
# Generate deployment report
generate_report() {
log INFO "Generating deployment report..."
local report_file="/tmp/bzzz-monitoring-deployment-report-${TIMESTAMP}.txt"
cat > "$report_file" <<EOF
BZZZ Enhanced Monitoring Stack Deployment Report
================================================

Deployment Time: $(date)
Environment: $ENVIRONMENT
Stack Name: $STACK_NAME
Dry Run: $DRY_RUN

Services Deployed:
EOF
if [[ "$DRY_RUN" != "true" ]]; then
docker stack services "$STACK_NAME" --format " - {{.Name}}: {{.Replicas}}" >> "$report_file"
echo "" >> "$report_file"
echo "Service Health:" >> "$report_file"
# Add health check results
local health_endpoints=(
"http://localhost:9090/-/healthy:Prometheus"
"http://localhost:3000/api/health:Grafana"
"http://localhost:9093/-/healthy:AlertManager"
)
for endpoint_def in "${health_endpoints[@]}"; do
# Split on the last colon (the URL portion contains colons)
local endpoint="${endpoint_def%:*}"
local service="${endpoint_def##*:}"
if curl -sf "$endpoint" >/dev/null 2>&1; then
echo " - $service: ✅ Healthy" >> "$report_file"
else
echo " - $service: ❌ Unhealthy" >> "$report_file"
fi
done
else
echo " [Dry run mode - no services deployed]" >> "$report_file"
fi
cat >> "$report_file" <<EOF
Access URLs:
- Grafana: http://localhost:3000 (admin/admin)
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093

Configuration:
- Log file: $LOG_FILE
- Backup directory: /rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}
- Config version: $CONFIG_VERSION

Next Steps:
1. Change default Grafana admin password
2. Configure notification channels in AlertManager
3. Review and customize alert rules
4. Set up external authentication (optional)
EOF
log INFO "Deployment report generated: $report_file"
# Display report
echo ""
echo "=========================================="
cat "$report_file"
echo "=========================================="
}
# Main execution
main() {
log INFO "Starting BZZZ Enhanced Monitoring Stack deployment"
log INFO "Environment: $ENVIRONMENT, Dry Run: $DRY_RUN"
log INFO "Log file: $LOG_FILE"
check_prerequisites
setup_directories
backup_existing_config
create_missing_configs
create_secrets
create_configs
deploy_monitoring_stack
perform_health_checks
validate_deployment
import_dashboards
generate_report
log INFO "Deployment completed successfully!"
if [[ "$DRY_RUN" != "true" ]]; then
echo ""
echo "🎉 BZZZ Enhanced Monitoring Stack is now running!"
echo "📊 Grafana Dashboard: http://localhost:3000"
echo "📈 Prometheus: http://localhost:9090"
echo "🚨 AlertManager: http://localhost:9093"
echo ""
echo "Next steps:"
echo "1. Change default Grafana password"
echo "2. Configure alert notification channels"
echo "3. Review monitoring dashboards"
echo "4. Run reliability tests: ./infrastructure/testing/run-tests.sh all"
fi
}
# Script execution
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi
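
Usage sketch (the script filename below is an assumption; the environment variables and secrets paths come from the script itself). A dry run on a Swarm manager validates prerequisites and logs what would be created; pre-seeding the secrets files makes `create_secrets()` reuse existing credentials instead of generating random ones:

```bash
# Hypothetical filename -- adjust to wherever this script lands in the repo.
# Optionally pre-seed credentials so the script reuses them.
mkdir -p "$HOME/chorus/business/secrets"
printf '%s\n' 'example-grafana-admin-password' > "$HOME/chorus/business/secrets/grafana-admin-password"
printf '%s\n' 'example-postgres-password'      > "$HOME/chorus/business/secrets/postgres-password"

# Dry run: checks prerequisites and logs intended changes without deploying.
DRY_RUN=true ./deploy-enhanced-monitoring.sh

# Real deployment with a longer health-check window (in seconds).
ENVIRONMENT=production HEALTH_CHECK_TIMEOUT=600 ./deploy-enhanced-monitoring.sh
```

Progress is logged to /tmp/bzzz-deploy-<timestamp>.log, and a deployment report is written to /tmp/bzzz-monitoring-deployment-report-<timestamp>.txt.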