Comprehensive multi-agent implementation addressing all issues from INDEX.md:

## Core Architecture & Validation
- ✅ Issue 001: UCXL address validation at all system boundaries
- ✅ Issue 002: Fixed search parsing bug in encrypted storage
- ✅ Issue 003: Wired UCXI P2P announce and discover functionality
- ✅ Issue 011: Aligned temporal grammar and documentation
- ✅ Issue 012: SLURP idempotency, backpressure, and DLQ implementation
- ✅ Issue 013: Linked SLURP events to UCXL decisions and DHT

## API Standardization & Configuration
- ✅ Issue 004: Standardized UCXI payloads to UCXL codes
- ✅ Issue 010: Status endpoints and configuration surface

## Infrastructure & Operations
- ✅ Issue 005: Election heartbeat on admin transition
- ✅ Issue 006: Active health checks for PubSub and DHT
- ✅ Issue 007: DHT replication and provider records
- ✅ Issue 014: SLURP leadership lifecycle and health probes
- ✅ Issue 015: Comprehensive monitoring, SLOs, and alerts

## Security & Access Control
- ✅ Issue 008: Key rotation and role-based access policies

## Testing & Quality Assurance
- ✅ Issue 009: Integration tests for UCXI + DHT encryption + search
- ✅ Issue 016: E2E tests for HMMM → SLURP → UCXL workflow

## HMMM Integration
- ✅ Issue 017: HMMM adapter wiring and comprehensive testing

## Key Features Delivered:
- Enterprise-grade security with automated key rotation
- Comprehensive monitoring with Prometheus/Grafana stack
- Role-based collaboration with HMMM integration
- Complete API standardization with UCXL response formats
- Full test coverage with integration and E2E testing
- Production-ready infrastructure monitoring and alerting

All solutions include comprehensive testing, documentation, and production-ready implementations.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
#!/bin/bash

# BZZZ Enhanced Monitoring Stack Deployment Script
# Deploys comprehensive monitoring, metrics, and health checking infrastructure

set -euo pipefail

# Script configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/tmp/bzzz-deploy-${TIMESTAMP}.log"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration
ENVIRONMENT=${ENVIRONMENT:-"production"}
DRY_RUN=${DRY_RUN:-"false"}
BACKUP_EXISTING=${BACKUP_EXISTING:-"true"}
HEALTH_CHECK_TIMEOUT=${HEALTH_CHECK_TIMEOUT:-300}

# Docker configuration
DOCKER_REGISTRY="registry.home.deepblack.cloud"
STACK_NAME="bzzz-monitoring-v2"
CONFIG_VERSION="v2"
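# Example invocations (illustrative; the script filename shown here is assumed,
# the environment variables are the ones defined above):
#   DRY_RUN=true ./deploy-enhanced-monitoring.sh                         # preview without touching the swarm
#   ENVIRONMENT=staging HEALTH_CHECK_TIMEOUT=600 ./deploy-enhanced-monitoring.sh
#   BACKUP_EXISTING=false ./deploy-enhanced-monitoring.sh                # skip the pre-deployment backup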
# Logging function
log() {
    local level=$1
    shift
    local message="$*"
    local timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    case $level in
        ERROR)
            echo -e "${RED}[ERROR]${NC} $message" >&2
            ;;
        WARN)
            echo -e "${YELLOW}[WARN]${NC} $message"
            ;;
        INFO)
            echo -e "${GREEN}[INFO]${NC} $message"
            ;;
        DEBUG)
            echo -e "${BLUE}[DEBUG]${NC} $message"
            ;;
    esac

    echo "[$timestamp] [$level] $message" >> "$LOG_FILE"
}

# Error handler
error_handler() {
    local line_no=$1
    log ERROR "Script failed at line $line_no"
    log ERROR "Check log file: $LOG_FILE"
    exit 1
}
trap 'error_handler $LINENO' ERR
# Check prerequisites
check_prerequisites() {
    log INFO "Checking prerequisites..."

    # Check if running on a Docker Swarm manager (ControlAvailable is true only on managers)
    if ! docker info --format '{{.Swarm.ControlAvailable}}' | grep -q "true"; then
        log ERROR "This script must be run on a Docker Swarm manager node"
        exit 1
    fi

    # Check required tools (openssl is needed by create_secrets)
    local required_tools=("docker" "jq" "curl" "openssl")
    for tool in "${required_tools[@]}"; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            log ERROR "Required tool not found: $tool"
            exit 1
        fi
    done

    # Check network connectivity to registry
    if ! docker pull "$DOCKER_REGISTRY/bzzz:v2.0.0" >/dev/null 2>&1; then
        log WARN "Unable to pull from registry, using local images"
    fi

    log INFO "Prerequisites check completed"
}
# Create necessary directories
setup_directories() {
    log INFO "Setting up directories..."

    local dirs=(
        "/rust/bzzz-v2/monitoring/prometheus/data"
        "/rust/bzzz-v2/monitoring/grafana/data"
        "/rust/bzzz-v2/monitoring/alertmanager/data"
        "/rust/bzzz-v2/monitoring/loki/data"
        "/rust/bzzz-v2/backups/monitoring"
    )

    for dir in "${dirs[@]}"; do
        if [[ "$DRY_RUN" != "true" ]]; then
            sudo mkdir -p "$dir"
            sudo chown -R 65534:65534 "$dir" # nobody user for containers
        fi
        log DEBUG "Created directory: $dir"
    done
}
# Backup existing configuration
backup_existing_config() {
    if [[ "$BACKUP_EXISTING" != "true" ]]; then
        log INFO "Skipping backup (BACKUP_EXISTING=false)"
        return
    fi

    log INFO "Backing up existing configuration..."

    local backup_dir="/rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}"

    if [[ "$DRY_RUN" != "true" ]]; then
        mkdir -p "$backup_dir"

        # Backup Docker secrets
        docker secret ls --filter name=bzzz_ --format "{{.Name}}" | while read -r secret; do
            if docker secret inspect "$secret" >/dev/null 2>&1; then
                docker secret inspect "$secret" > "$backup_dir/${secret}.json"
                log DEBUG "Backed up secret: $secret"
            fi
        done

        # Backup Docker configs
        docker config ls --filter name=bzzz_ --format "{{.Name}}" | while read -r config; do
            if docker config inspect "$config" >/dev/null 2>&1; then
                docker config inspect "$config" > "$backup_dir/${config}.json"
                log DEBUG "Backed up config: $config"
            fi
        done

        # Backup service definitions
        if docker stack services "$STACK_NAME" >/dev/null 2>&1; then
            docker stack services "$STACK_NAME" --format "{{.Name}}" | while read -r service; do
                docker service inspect "$service" > "$backup_dir/${service}-service.json"
            done
        fi
    fi

    log INFO "Backup completed: $backup_dir"
}
# Create Docker secrets
create_secrets() {
    log INFO "Creating Docker secrets..."

    local secrets=(
        "bzzz_grafana_admin_password:$(openssl rand -base64 32)"
        "bzzz_postgres_password:$(openssl rand -base64 32)"
    )

    # Check if secrets directory exists
    local secrets_dir="$HOME/chorus/business/secrets"
    if [[ -d "$secrets_dir" ]]; then
        # Use existing secrets if available
        if [[ -f "$secrets_dir/grafana-admin-password" ]]; then
            secrets[0]="bzzz_grafana_admin_password:$(cat "$secrets_dir/grafana-admin-password")"
        fi
        if [[ -f "$secrets_dir/postgres-password" ]]; then
            secrets[1]="bzzz_postgres_password:$(cat "$secrets_dir/postgres-password")"
        fi
    fi

    for secret_def in "${secrets[@]}"; do
        local secret_name="${secret_def%%:*}"
        local secret_value="${secret_def#*:}"

        if docker secret inspect "$secret_name" >/dev/null 2>&1; then
            log DEBUG "Secret already exists: $secret_name"
        else
            if [[ "$DRY_RUN" != "true" ]]; then
                echo "$secret_value" | docker secret create "$secret_name" -
                log INFO "Created secret: $secret_name"
            else
                log DEBUG "Would create secret: $secret_name"
            fi
        fi
    done
}
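# Note (illustrative): to use deterministic credentials instead of the random
# values generated above, pre-seed the files that create_secrets looks for:
#   mkdir -p "$HOME/chorus/business/secrets"
#   openssl rand -base64 32 > "$HOME/chorus/business/secrets/grafana-admin-password"
#   openssl rand -base64 32 > "$HOME/chorus/business/secrets/postgres-password"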
# Create Docker configs
create_configs() {
    log INFO "Creating Docker configs..."

    local configs=(
        "bzzz_prometheus_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/prometheus.yml"
        "bzzz_prometheus_alerts_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/enhanced-alert-rules.yml"
        "bzzz_grafana_datasources_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
        "bzzz_alertmanager_config_${CONFIG_VERSION}:${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
    )

    for config_def in "${configs[@]}"; do
        local config_name="${config_def%%:*}"
        local config_file="${config_def#*:}"

        if [[ ! -f "$config_file" ]]; then
            log WARN "Config file not found: $config_file"
            continue
        fi

        if docker config inspect "$config_name" >/dev/null 2>&1; then
            log DEBUG "Config already exists: $config_name"
            # Remove the old (unversioned) config if it exists
            if [[ "$DRY_RUN" != "true" ]]; then
                local old_config_name="${config_name%_${CONFIG_VERSION}}"
                if docker config inspect "$old_config_name" >/dev/null 2>&1; then
                    docker config rm "$old_config_name" || true
                fi
            fi
        else
            if [[ "$DRY_RUN" != "true" ]]; then
                docker config create "$config_name" "$config_file"
                log INFO "Created config: $config_name"
            else
                log DEBUG "Would create config: $config_name from $config_file"
            fi
        fi
    done
}
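# Quick manual check (illustrative) of what the two functions above produced:
#   docker secret ls --filter name=bzzz_
#   docker config ls --filter name=bzzz_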
# Create missing config files
create_missing_configs() {
    log INFO "Creating missing configuration files..."

    # Create Grafana datasources config
    local grafana_datasources="${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
    if [[ ! -f "$grafana_datasources" ]]; then
        cat > "$grafana_datasources" <<EOF
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    editable: true
EOF
        log INFO "Created Grafana datasources config"
    fi

    # Create AlertManager config
    local alertmanager_config="${PROJECT_ROOT}/monitoring/configs/alertmanager.yml"
    if [[ ! -f "$alertmanager_config" ]]; then
        cat > "$alertmanager_config" <<EOF
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@chorus.services'
  slack_api_url_file: '/run/secrets/slack_webhook_url'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        service: bzzz
      receiver: 'bzzz-alerts'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ Alert: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'critical-alerts'
    slack_configs:
      - channel: '#bzzz-critical'
        title: 'CRITICAL: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'bzzz-alerts'
    slack_configs:
      - channel: '#bzzz-alerts'
        title: 'BZZZ: {{ .CommonAnnotations.summary }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
EOF
        log INFO "Created AlertManager config"
    fi
}
# Deploy monitoring stack
deploy_monitoring_stack() {
    log INFO "Deploying monitoring stack..."

    local compose_file="${PROJECT_ROOT}/monitoring/docker-compose.enhanced.yml"

    if [[ ! -f "$compose_file" ]]; then
        log ERROR "Compose file not found: $compose_file"
        exit 1
    fi

    if [[ "$DRY_RUN" != "true" ]]; then
        # Deploy the stack
        docker stack deploy -c "$compose_file" "$STACK_NAME"
        log INFO "Stack deployment initiated: $STACK_NAME"

        # Wait for services to be ready
        log INFO "Waiting for services to be ready..."
        local max_attempts=30
        local attempt=0

        while [[ $attempt -lt $max_attempts ]]; do
            local ready_services=0
            local total_services=0

            # Count ready services
            while read -r service; do
                total_services=$((total_services + 1))
                local replicas_info
                replicas_info=$(docker service ls --filter name="$service" --format "{{.Replicas}}")

                if [[ "$replicas_info" =~ ^([0-9]+)/([0-9]+)$ ]]; then
                    local current="${BASH_REMATCH[1]}"
                    local desired="${BASH_REMATCH[2]}"

                    if [[ "$current" -eq "$desired" ]]; then
                        ready_services=$((ready_services + 1))
                    fi
                fi
            done < <(docker stack services "$STACK_NAME" --format "{{.Name}}")

            if [[ $ready_services -eq $total_services ]]; then
                log INFO "All services are ready ($ready_services/$total_services)"
                break
            else
                log DEBUG "Services ready: $ready_services/$total_services"
                sleep 10
                attempt=$((attempt + 1))
            fi
        done

        if [[ $attempt -eq $max_attempts ]]; then
            log WARN "Timeout waiting for all services to be ready"
        fi
    else
        log DEBUG "Would deploy stack with compose file: $compose_file"
    fi
}
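# Manual equivalents (illustrative), useful when debugging a partial rollout;
# paths are relative to the project root assumed above:
#   docker stack deploy -c monitoring/docker-compose.enhanced.yml bzzz-monitoring-v2
#   docker stack services bzzz-monitoring-v2
#   docker service ps <service-name> --no-trunc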
# Perform health checks
perform_health_checks() {
    log INFO "Performing health checks..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log DEBUG "Skipping health checks in dry run mode"
        return
    fi

    local endpoints=(
        "http://localhost:9090/-/healthy:Prometheus"
        "http://localhost:3000/api/health:Grafana"
        "http://localhost:9093/-/healthy:AlertManager"
    )

    local max_attempts=$((HEALTH_CHECK_TIMEOUT / 10))
    local attempt=0

    while [[ $attempt -lt $max_attempts ]]; do
        local healthy_endpoints=0

        for endpoint_def in "${endpoints[@]}"; do
            # Split on the LAST colon: the URL itself contains "://" and a port
            local endpoint="${endpoint_def%:*}"
            local service="${endpoint_def##*:}"

            if curl -sf "$endpoint" >/dev/null 2>&1; then
                healthy_endpoints=$((healthy_endpoints + 1))
                log DEBUG "Health check passed: $service"
            else
                log DEBUG "Health check pending: $service"
            fi
        done

        if [[ $healthy_endpoints -eq ${#endpoints[@]} ]]; then
            log INFO "All health checks passed"
            return
        fi

        sleep 10
        attempt=$((attempt + 1))
    done

    log WARN "Some health checks failed after ${HEALTH_CHECK_TIMEOUT}s timeout"
}
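# The same checks can be run by hand (illustrative):
#   curl -sf http://localhost:9090/-/healthy && echo "Prometheus OK"
#   curl -sf http://localhost:3000/api/health && echo "Grafana OK"
#   curl -sf http://localhost:9093/-/healthy && echo "AlertManager OK"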
# Validate deployment
validate_deployment() {
    log INFO "Validating deployment..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log DEBUG "Skipping validation in dry run mode"
        return
    fi

    # Check stack services
    local services
    services=$(docker stack services "$STACK_NAME" --format "{{.Name}}" | wc -l)
    log INFO "Deployed services: $services"

    # Check if Prometheus is collecting metrics
    sleep 30 # Allow time for initial metric collection

    if curl -sf "http://localhost:9090/api/v1/query?query=up" | jq -r '.data.result | length' | grep -q "^[1-9]"; then
        log INFO "Prometheus is collecting metrics"
    else
        log WARN "Prometheus may not be collecting metrics yet"
    fi

    # Check if Grafana can connect to Prometheus
    local grafana_health
    if grafana_health=$(curl -sf "http://admin:admin@localhost:3000/api/datasources/proxy/1/api/v1/query?query=up" 2>/dev/null); then
        log INFO "Grafana can connect to Prometheus"
    else
        log WARN "Grafana datasource connection may be pending"
    fi

    # Check AlertManager configuration
    if curl -sf "http://localhost:9093/api/v1/status" >/dev/null 2>&1; then
        log INFO "AlertManager is operational"
    else
        log WARN "AlertManager may not be ready"
    fi
}
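# Spot-check the same signals by hand (illustrative):
#   curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result | length'
#   curl -s http://localhost:9093/api/v1/status | jq .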
# Import Grafana dashboards
import_dashboards() {
    log INFO "Importing Grafana dashboards..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log DEBUG "Skipping dashboard import in dry run mode"
        return
    fi

    # Wait for Grafana to be ready
    local max_attempts=30
    local attempt=0

    while [[ $attempt -lt $max_attempts ]]; do
        if curl -sf "http://admin:admin@localhost:3000/api/health" >/dev/null 2>&1; then
            break
        fi
        sleep 5
        attempt=$((attempt + 1))
    done

    if [[ $attempt -eq $max_attempts ]]; then
        log WARN "Grafana not ready for dashboard import"
        return
    fi

    # Import dashboards
    local dashboard_dir="${PROJECT_ROOT}/monitoring/grafana-dashboards"
    if [[ -d "$dashboard_dir" ]]; then
        for dashboard_file in "$dashboard_dir"/*.json; do
            if [[ -f "$dashboard_file" ]]; then
                local dashboard_name
                dashboard_name=$(basename "$dashboard_file" .json)

                # -f makes curl exit non-zero on HTTP errors so failed imports hit the else branch
                if curl -sf -X POST \
                    -H "Content-Type: application/json" \
                    -d "@$dashboard_file" \
                    "http://admin:admin@localhost:3000/api/dashboards/db" \
                    >/dev/null 2>&1; then
                    log INFO "Imported dashboard: $dashboard_name"
                else
                    log WARN "Failed to import dashboard: $dashboard_name"
                fi
            fi
        done
    fi
}
# Generate deployment report
generate_report() {
    log INFO "Generating deployment report..."

    local report_file="/tmp/bzzz-monitoring-deployment-report-${TIMESTAMP}.txt"

    cat > "$report_file" <<EOF
BZZZ Enhanced Monitoring Stack Deployment Report
================================================

Deployment Time: $(date)
Environment: $ENVIRONMENT
Stack Name: $STACK_NAME
Dry Run: $DRY_RUN

Services Deployed:
EOF

    if [[ "$DRY_RUN" != "true" ]]; then
        docker stack services "$STACK_NAME" --format " - {{.Name}}: {{.Replicas}}" >> "$report_file"

        echo "" >> "$report_file"
        echo "Service Health:" >> "$report_file"

        # Add health check results
        local health_endpoints=(
            "http://localhost:9090/-/healthy:Prometheus"
            "http://localhost:3000/api/health:Grafana"
            "http://localhost:9093/-/healthy:AlertManager"
        )

        for endpoint_def in "${health_endpoints[@]}"; do
            # Split on the last colon, as in perform_health_checks
            local endpoint="${endpoint_def%:*}"
            local service="${endpoint_def##*:}"

            if curl -sf "$endpoint" >/dev/null 2>&1; then
                echo " - $service: ✅ Healthy" >> "$report_file"
            else
                echo " - $service: ❌ Unhealthy" >> "$report_file"
            fi
        done
    else
        echo " [Dry run mode - no services deployed]" >> "$report_file"
    fi

    cat >> "$report_file" <<EOF

Access URLs:
- Grafana: http://localhost:3000 (admin/admin)
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093

Configuration:
- Log file: $LOG_FILE
- Backup directory: /rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}
- Config version: $CONFIG_VERSION

Next Steps:
1. Change default Grafana admin password
2. Configure notification channels in AlertManager
3. Review and customize alert rules
4. Set up external authentication (optional)

EOF

    log INFO "Deployment report generated: $report_file"

    # Display report
    echo ""
    echo "=========================================="
    cat "$report_file"
    echo "=========================================="
}
# Main execution
main() {
    log INFO "Starting BZZZ Enhanced Monitoring Stack deployment"
    log INFO "Environment: $ENVIRONMENT, Dry Run: $DRY_RUN"
    log INFO "Log file: $LOG_FILE"

    check_prerequisites
    setup_directories
    backup_existing_config
    create_missing_configs
    create_secrets
    create_configs
    deploy_monitoring_stack
    perform_health_checks
    validate_deployment
    import_dashboards
    generate_report

    log INFO "Deployment completed successfully!"

    if [[ "$DRY_RUN" != "true" ]]; then
        echo ""
        echo "🎉 BZZZ Enhanced Monitoring Stack is now running!"
        echo "📊 Grafana Dashboard: http://localhost:3000"
        echo "📈 Prometheus: http://localhost:9090"
        echo "🚨 AlertManager: http://localhost:9093"
        echo ""
        echo "Next steps:"
        echo "1. Change default Grafana password"
        echo "2. Configure alert notification channels"
        echo "3. Review monitoring dashboards"
        echo "4. Run reliability tests: ./infrastructure/testing/run-tests.sh all"
    fi
}
# Script execution
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi