#!/bin/bash
# BZZZ Enhanced Monitoring Stack Deployment Script
# Deploys comprehensive monitoring, metrics, and health checking infrastructure
#
# Optional environment overrides:
#   ENVIRONMENT            deployment environment label (default: production)
#   DRY_RUN                "true" logs the actions without changing anything
#   BACKUP_EXISTING        "false" skips the pre-deployment backup
#   HEALTH_CHECK_TIMEOUT   health-check timeout in seconds (default: 300)
#   GRAFANA_USER           user for Grafana HTTP API calls (default: admin)
#   GRAFANA_PASSWORD       password for Grafana HTTP API calls (default: admin)

# -E (errtrace) is required for the ERR trap below to fire inside functions.
set -Eeuo pipefail

# Script configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/tmp/bzzz-deploy-${TIMESTAMP}.log"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration
ENVIRONMENT=${ENVIRONMENT:-"production"}
DRY_RUN=${DRY_RUN:-"false"}
BACKUP_EXISTING=${BACKUP_EXISTING:-"true"}
HEALTH_CHECK_TIMEOUT=${HEALTH_CHECK_TIMEOUT:-300}

# Credentials used for Grafana HTTP API calls. The defaults reproduce the
# previously hard-coded admin:admin.
GRAFANA_USER=${GRAFANA_USER:-"admin"}
GRAFANA_PASSWORD=${GRAFANA_PASSWORD:-"admin"}

# Docker configuration
DOCKER_REGISTRY="registry.home.deepblack.cloud"
STACK_NAME="bzzz-monitoring-v2"
CONFIG_VERSION="v2"

#######################################
# Print a colorized message to the console and append a timestamped
# plain-text copy to the log file.
# Globals:   LOG_FILE, RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level (ERROR, WARN, INFO or DEBUG); remaining - message
# Outputs:   ERROR goes to stderr, other known levels to stdout; a level
#            outside that set is written to the log file only
#######################################
log() {
  local level=$1
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # %b expands the color escapes; %s keeps the message itself literal.
  case "$level" in
    ERROR) printf '%b %s\n' "${RED}[ERROR]${NC}" "$message" >&2 ;;
    WARN) printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$message" ;;
    INFO) printf '%b %s\n' "${GREEN}[INFO]${NC}" "$message" ;;
    DEBUG) printf '%b %s\n' "${BLUE}[DEBUG]${NC}" "$message" ;;
  esac

  printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"
}

#######################################
# ERR trap target: report the failing line and abort.
# Arguments: $1 - line number at which the failing command ran
#######################################
error_handler() {
  local line_no=$1
  log ERROR "Script failed at line $line_no"
  log ERROR "Check log file: $LOG_FILE"
  exit 1
}

trap 'error_handler $LINENO' ERR

#######################################
# Verify required tools and Swarm manager status before deploying.
# Globals: DOCKER_REGISTRY (read)
# Exits 1 when a tool is missing or the node is not an active manager.
#######################################
check_prerequisites() {
  log INFO "Checking prerequisites..."

  # Check required tools first: docker itself is needed for the Swarm check.
  local required_tools=("docker" "jq" "curl")
  local tool
  for tool in "${required_tools[@]}"; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      log ERROR "Required tool not found: $tool"
      exit 1
    fi
  done

  # Check if running on Docker Swarm manager. The comparison is exact because
  # a substring match on "active" also accepts "inactive".
  local swarm_state
  swarm_state=$(docker info \
    --format '{{.Swarm.LocalNodeState}} {{.Swarm.ControlAvailable}}' \
    2>/dev/null) || swarm_state=""
  if [[ "$swarm_state" != "active true" ]]; then
    log ERROR "This script must be run on a Docker Swarm manager node"
    exit 1
  fi

  # Check network connectivity to registry (best effort: only warns).
  if ! docker pull "$DOCKER_REGISTRY/bzzz:v2.0.0" >/dev/null 2>&1; then
    log WARN "Unable to pull from registry, using local images"
  fi

  log INFO "Prerequisites check completed"
}

#######################################
# Create the data and backup directories and hand them to uid/gid 65534.
# Globals: DRY_RUN (read)
#######################################
setup_directories() {
  log INFO "Setting up directories..."

  local dirs=(
    "/rust/bzzz-v2/monitoring/prometheus/data"
    "/rust/bzzz-v2/monitoring/grafana/data"
    "/rust/bzzz-v2/monitoring/alertmanager/data"
    "/rust/bzzz-v2/monitoring/loki/data"
    "/rust/bzzz-v2/backups/monitoring"
  )

  local dir
  for dir in "${dirs[@]}"; do
    if [[ "$DRY_RUN" == "true" ]]; then
      log DEBUG "Would create directory: $dir"
      continue
    fi
    sudo mkdir -p "$dir"
    sudo chown -R 65534:65534 "$dir" # nobody user for containers
    log DEBUG "Created directory: $dir"
  done
}

#######################################
# Save `docker <kind> inspect` output for every bzzz_* object of one kind.
# Arguments: $1 - object kind ("secret" or "config")
#            $2 - existing directory that receives <name>.json files
#######################################
_backup_docker_objects() {
  local kind=$1
  local backup_dir=$2
  local names name

  names=$(docker "$kind" ls --filter name=bzzz_ --format '{{.Name}}')
  while IFS= read -r name; do
    [[ -n "$name" ]] || continue
    if docker "$kind" inspect "$name" >/dev/null 2>&1; then
      docker "$kind" inspect "$name" > "$backup_dir/${name}.json"
      log DEBUG "Backed up ${kind}: $name"
    fi
  done <<< "$names"
}

#######################################
# Snapshot inspect output of existing bzzz_* secrets and configs and of the
# stack's services into a timestamped backup directory.
# Globals: BACKUP_EXISTING, DRY_RUN, TIMESTAMP, STACK_NAME (read)
#######################################
backup_existing_config() {
  if [[ "$BACKUP_EXISTING" != "true" ]]; then
    log INFO "Skipping backup (BACKUP_EXISTING=false)"
    return
  fi

  log INFO "Backing up existing configuration..."
  local backup_dir="/rust/bzzz-v2/backups/monitoring/backup_${TIMESTAMP}"

  if [[ "$DRY_RUN" == "true" ]]; then
    log DEBUG "Would back up existing configuration to: $backup_dir"
    return
  fi

  # setup_directories creates the parent with sudo and chowns it to 65534, so
  # a plain mkdir fails for any other non-root user. Create the directory the
  # same way and take ownership so the redirections below can write into it.
  sudo mkdir -p "$backup_dir"
  sudo chown "$(id -u):$(id -g)" "$backup_dir"

  # Backup Docker secrets and configs
  _backup_docker_objects secret "$backup_dir"
  _backup_docker_objects config "$backup_dir"

  # Backup service definitions
  if docker stack services "$STACK_NAME" >/dev/null 2>&1; then
    local services service
    services=$(docker stack services "$STACK_NAME" --format '{{.Name}}')
    while IFS= read -r service; do
      [[ -n "$service" ]] || continue
      docker service inspect "$service" > "$backup_dir/${service}-service.json"
    done <<< "$services"
  fi

  log INFO "Backup completed: $backup_dir"
}

#######################################
# Create the Docker secrets that do not exist yet. A value is read from
# $HOME/chorus/business/secrets/<file> when that file exists, otherwise a
# random one is generated with openssl.
# Globals: DRY_RUN, HOME (read)
#######################################
create_secrets() {
  log INFO "Creating Docker secrets..."

  local secrets_dir="$HOME/chorus/business/secrets"
  # "<docker secret name>:<file name inside secrets_dir>"
  local secrets=(
    "bzzz_grafana_admin_password:grafana-admin-password"
    "bzzz_postgres_password:postgres-password"
  )

  local secret_def secret_name secret_file secret_value
  for secret_def in "${secrets[@]}"; do
    secret_name="${secret_def%%:*}"
    secret_file="$secrets_dir/${secret_def#*:}"

    if docker secret inspect "$secret_name" >/dev/null 2>&1; then
      log DEBUG "Secret already exists: $secret_name"
      continue
    fi
    if [[ "$DRY_RUN" == "true" ]]; then
      log DEBUG "Would create secret: $secret_name"
      continue
    fi

    # Resolve the value only when it is really needed, so openssl is not
    # invoked for secrets that already exist or are provided as files.
    if [[ -f "$secret_file" ]]; then
      secret_value=$(cat "$secret_file")
    else
      secret_value=$(openssl rand -base64 32)
    fi

    # printf '%s' rather than echo: echo would append a newline that becomes
    # part of the stored secret.
    printf '%s' "$secret_value" | docker secret create "$secret_name" -
    log INFO "Created secret: $secret_name"
  done
}

#######################################
# Create versioned Docker configs from files under monitoring/configs.
# When a versioned config already exists, a same-named config without the
# version suffix is removed if present.
# Globals: CONFIG_VERSION, PROJECT_ROOT, DRY_RUN (read)
#######################################
create_configs() {
  log INFO "Creating Docker configs..."

  local config_root="${PROJECT_ROOT}/monitoring/configs"
  # "<docker config name>:<source file>"
  local configs=(
    "bzzz_prometheus_config_${CONFIG_VERSION}:${config_root}/prometheus.yml"
    "bzzz_prometheus_alerts_${CONFIG_VERSION}:${config_root}/enhanced-alert-rules.yml"
    "bzzz_grafana_datasources_${CONFIG_VERSION}:${config_root}/grafana-datasources.yml"
    "bzzz_alertmanager_config_${CONFIG_VERSION}:${config_root}/alertmanager.yml"
  )

  local config_def config_name config_file old_config_name
  for config_def in "${configs[@]}"; do
    config_name="${config_def%%:*}"
    config_file="${config_def#*:}"

    if [[ ! -f "$config_file" ]]; then
      log WARN "Config file not found: $config_file"
      continue
    fi

    if docker config inspect "$config_name" >/dev/null 2>&1; then
      log DEBUG "Config already exists: $config_name"
      # Remove old config if exists
      if [[ "$DRY_RUN" != "true" ]]; then
        old_config_name="${config_name%_"${CONFIG_VERSION}"}"
        if docker config inspect "$old_config_name" >/dev/null 2>&1; then
          # Deliberately best effort: a failed removal must not abort the run.
          docker config rm "$old_config_name" || true
        fi
      fi
    elif [[ "$DRY_RUN" != "true" ]]; then
      docker config create "$config_name" "$config_file"
      log INFO "Created config: $config_name"
    else
      log DEBUG "Would create config: $config_name from $config_file"
    fi
  done
}

# ---------------------------------------------------------------------------
# NOTE(review): SOURCE IS CORRUPTED HERE - restore from version control.
# The text between each "<" and the following ">" is missing, which removed
# the heredoc bodies of create_missing_configs, everything defined between it
# and the health-check loop, and the opening of the function containing that
# loop. Nothing is reconstructed by guesswork. The surviving text is kept
# verbatim below (wrapped for width only), and create_missing_configs is left
# undefined so that calling it fails immediately.
#
# create_missing_configs() { log INFO "Creating missing configuration files..."
# # Create Grafana datasources config local
# grafana_datasources="${PROJECT_ROOT}/monitoring/configs/grafana-datasources.yml"
# if [[ ! -f "$grafana_datasources" ]]; then cat > "$grafana_datasources" <
# "$alertmanager_config" </dev/null 2>&1; then
# healthy_endpoints=$((healthy_endpoints + 1)) log DEBUG "Health check passed:
# $service" else log DEBUG "Health check pending: $service" fi done if [[
# $healthy_endpoints -eq ${#endpoints[@]} ]]; then log INFO "All health checks
# passed" return fi sleep 10 attempt=$((attempt + 1)) done log WARN "Some health
# checks failed after ${HEALTH_CHECK_TIMEOUT}s timeout" }
# ---------------------------------------------------------------------------

#######################################
# Post-deployment smoke checks. Failed checks only produce warnings.
# Globals: DRY_RUN, STACK_NAME, GRAFANA_USER, GRAFANA_PASSWORD (read)
#######################################
validate_deployment() {
  log INFO "Validating deployment..."

  if [[ "$DRY_RUN" == "true" ]]; then
    log DEBUG "Skipping validation in dry run mode"
    return
  fi

  # Check stack services
  local services
  services=$(docker stack services "$STACK_NAME" --format "{{.Name}}" | wc -l)
  log INFO "Deployed services: $services"

  # Check if Prometheus is collecting metrics
  sleep 30 # Allow time for initial metric collection
  if curl -sf "http://localhost:9090/api/v1/query?query=up" \
    | jq -e '.data.result | length > 0' >/dev/null 2>&1; then
    log INFO "Prometheus is collecting metrics"
  else
    log WARN "Prometheus may not be collecting metrics yet"
  fi

  # Check if Grafana can connect to Prometheus
  if curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \
    "http://localhost:3000/api/datasources/proxy/1/api/v1/query?query=up" \
    >/dev/null 2>&1; then
    log INFO "Grafana can connect to Prometheus"
  else
    log WARN "Grafana datasource connection may be pending"
  fi

  # Check AlertManager configuration. The v2 status endpoint is used because
  # newer Alertmanager releases no longer serve /api/v1.
  if curl -sf "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
    log INFO "AlertManager is operational"
  else
    log WARN "AlertManager may not be ready"
  fi
}

#######################################
# Wait for Grafana (up to 30 polls, 5 s apart) and POST every *.json file
# under monitoring/grafana-dashboards to its dashboard API.
# Globals: DRY_RUN, PROJECT_ROOT, GRAFANA_USER, GRAFANA_PASSWORD (read)
#######################################
import_dashboards() {
  log INFO "Importing Grafana dashboards..."

  if [[ "$DRY_RUN" == "true" ]]; then
    log DEBUG "Skipping dashboard import in dry run mode"
    return
  fi

  local grafana_url="http://localhost:3000"
  local grafana_auth="${GRAFANA_USER}:${GRAFANA_PASSWORD}"

  # Wait for Grafana to be ready
  local max_attempts=30
  local attempt=0
  while [[ $attempt -lt $max_attempts ]]; do
    if curl -sf -u "$grafana_auth" "$grafana_url/api/health" \
      >/dev/null 2>&1; then
      break
    fi
    sleep 5
    attempt=$((attempt + 1))
  done

  if [[ $attempt -eq $max_attempts ]]; then
    log WARN "Grafana not ready for dashboard import"
    return
  fi

  # Import dashboards
  local dashboard_dir="${PROJECT_ROOT}/monitoring/grafana-dashboards"
  [[ -d "$dashboard_dir" ]] || return 0

  local dashboard_file dashboard_name
  for dashboard_file in "$dashboard_dir"/*.json; do
    # Also skips the literal pattern left behind when nothing matches.
    [[ -f "$dashboard_file" ]] || continue
    dashboard_name=$(basename "$dashboard_file" .json)

    # -f makes curl fail on HTTP 4xx/5xx; without it a rejected upload was
    # still logged as imported.
    # NOTE(review): the file is posted as-is; confirm the dashboard files
    # already have the request shape the Grafana API expects.
    if curl -sf -X POST \
      -u "$grafana_auth" \
      -H "Content-Type: application/json" \
      -d "@$dashboard_file" \
      "$grafana_url/api/dashboards/db" \
      >/dev/null 2>&1; then
      log INFO "Imported dashboard: $dashboard_name"
    else
      log WARN "Failed to import dashboard: $dashboard_name"
    fi
  done
}

# ---------------------------------------------------------------------------
# NOTE(review): SOURCE IS CORRUPTED AND TRUNCATED HERE - restore from version
# control. Both heredocs of generate_report are missing and the text ends in
# the middle of the second one; anything that followed is not visible. The
# surviving text is kept verbatim below (wrapped for width only), and
# generate_report is left undefined so that calling it fails immediately.
# When restoring, note that "${endpoint_def%%:*}" on
# "http://host:port/path:Name" yields just "http" (and "#*:" yields
# "//host:port/path:Name"); "${endpoint_def%:*}" and "${endpoint_def##*:}"
# split at the last colon instead.
#
# generate_report() { log INFO "Generating deployment report..." local
# report_file="/tmp/bzzz-monitoring-deployment-report-${TIMESTAMP}.txt" cat >
# "$report_file" <> "$report_file" echo "" >> "$report_file" echo "Service
# Health:" >> "$report_file" # Add health check results local health_endpoints=(
# "http://localhost:9090/-/healthy:Prometheus"
# "http://localhost:3000/api/health:Grafana"
# "http://localhost:9093/-/healthy:AlertManager" ) for endpoint_def in
# "${health_endpoints[@]}"; do local endpoint="${endpoint_def%%:*}" local
# service="${endpoint_def#*:}" if curl -sf "$endpoint" >/dev/null 2>&1; then echo
# " - $service: ✅ Healthy" >> "$report_file" else echo " - $service: ❌
# Unhealthy" >> "$report_file" fi done else echo " [Dry run mode - no services
# deployed]" >> "$report_file" fi cat >> "$report_file" <
# ---------------------------------------------------------------------------