---
# Prometheus alerting rules for the WHOOSH stack.
#
# One rule group covering the backend service, its PostgreSQL and Redis
# dependencies, and host-level memory/disk saturation. Every rule carries a
# `severity` label (critical | warning) and `summary`/`description`
# annotations.
groups:
  - name: whoosh_alerts
    rules:
      # --- WHOOSH backend ---------------------------------------------------

      # Fires when the scrape target for job "whoosh-backend" has reported
      # up == 0 continuously for 1 minute.
      - alert: WhooshBackendDown
        expr: up{job="whoosh-backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "WHOOSH Backend is down"
          description: "WHOOSH Backend has been down for more than 1 minute."

      # Fires when the 95th-percentile request duration stays above 2 for
      # 5 minutes. The bucket counters are cumulative, so they are wrapped in
      # rate() to make the quantile reflect the last 5 minutes of traffic
      # instead of everything observed since the process started.
      - alert: WhooshHighResponseTime
        expr: >-
          histogram_quantile(
            0.95,
            rate(http_request_duration_seconds_bucket{job="whoosh-backend"}[5m])
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time on WHOOSH Backend"
          description: "95th percentile response time is {{ $value }}s"

      # Fires when the per-second rate of requests with a 5xx status exceeds
      # 0.1 for 5 minutes.
      # NOTE(review): the comparison is evaluated per matching series, not on
      # an aggregate - wrap in sum() if a service-wide rate is intended.
      - alert: WhooshHighErrorRate
        expr: rate(http_requests_total{job="whoosh-backend",status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on WHOOSH Backend"
          description: "Error rate is {{ $value }} errors per second"

      # --- PostgreSQL -------------------------------------------------------

      # Fires when the scrape target for job "whoosh-postgres" has reported
      # up == 0 continuously for 1 minute.
      - alert: PostgreSQLDown
        expr: up{job="whoosh-postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database has been down for more than 1 minute."

      # Fires when pg_stat_database_numbackends exceeds 80 for 5 minutes.
      # NOTE(review): presumably this metric is reported per database, so the
      # threshold would apply to each database separately rather than to the
      # server-wide total - confirm against the exporter in use.
      - alert: PostgreSQLHighConnections
        expr: pg_stat_database_numbackends > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of PostgreSQL connections"
          description: "Number of connections is {{ $value }}"

      # --- Redis ------------------------------------------------------------

      # Fires when the scrape target for job "whoosh-redis" has reported
      # up == 0 continuously for 1 minute. Severity is warning, unlike the
      # backend and PostgreSQL "down" rules, which are critical.
      - alert: RedisDown
        expr: up{job="whoosh-redis"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Redis is down"
          description: "Redis cache has been down for more than 1 minute."

      # --- Host resources ---------------------------------------------------

      # Fires when the used-memory fraction, (total - available) / total,
      # stays above 0.9 for 5 minutes.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      # Fires when the used-space fraction, (size - avail) / size, stays
      # above 0.9 for 5 minutes.
      # NOTE(review): there is no fstype/mountpoint selector, so every
      # filesystem series is evaluated - confirm that is intended.
      - alert: HighDiskUsage
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_avail_bytes)
          / node_filesystem_size_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage"
          description: "Disk usage is {{ $value | humanizePercentage }}"