Prepare for v2 development: Add MCP integration and future development planning
- Add FUTURE_DEVELOPMENT.md with comprehensive v2 protocol specification
- Add MCP integration design and implementation foundation
- Add infrastructure and deployment configurations
- Update system architecture for v2 evolution

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
669
infrastructure/BZZZ_V2_INFRASTRUCTURE_ARCHITECTURE.md
Normal file
669
infrastructure/BZZZ_V2_INFRASTRUCTURE_ARCHITECTURE.md
Normal file
@@ -0,0 +1,669 @@
|
||||
# BZZZ v2 Infrastructure Architecture & Deployment Strategy
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This document outlines the comprehensive infrastructure architecture and deployment strategy for BZZZ v2 evolution. The design maintains the existing 3-node cluster reliability while enabling advanced protocol features including content-addressed storage, DHT networking, OpenAI integration, and MCP server capabilities.
|
||||
|
||||
## Current Infrastructure Analysis
|
||||
|
||||
### Existing v1 Deployment
|
||||
- **Cluster**: WALNUT (192.168.1.27), IRONWOOD (192.168.1.113), ACACIA (192.168.1.xxx)
|
||||
- **Deployment**: SystemD services with P2P mesh networking
|
||||
- **Protocol**: libp2p with mDNS discovery and pubsub messaging
|
||||
- **Storage**: File-based configuration and in-memory state
|
||||
- **Integration**: Basic Hive API connectivity and task coordination
|
||||
|
||||
### Infrastructure Dependencies
|
||||
- **Docker Swarm**: Existing cluster with `tengig` network
|
||||
- **Traefik**: Load balancing and SSL termination
|
||||
- **Private Registry**: registry.home.deepblack.cloud
|
||||
- **GitLab CI/CD**: gitlab.deepblack.cloud
|
||||
- **Secrets**: ~/chorus/business/secrets/ management
|
||||
- **Storage**: NFS mounts on /rust/ for shared data
|
||||
|
||||
## BZZZ v2 Architecture Design
|
||||
|
||||
### 1. Protocol Evolution Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────── BZZZ v2 Protocol Stack ───────────────────────┐
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────┐ │
|
||||
│ │ MCP Server │ │ OpenAI Proxy │ │ bzzz:// Resolver │ │
|
||||
│ │ (Port 3001) │ │ (Port 3002) │ │ (Port 3003) │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Content Layer │ │
|
||||
│ │ ┌─────────────┐ ┌──────────────┐ ┌─────────────────────────┐ │ │
|
||||
│ │ │ Conversation│ │ Content Store│ │ BLAKE3 Hasher │ │ │
|
||||
│ │ │ Threading │ │ (CAS Blobs) │ │ (Content Addressing) │ │ │
|
||||
│ │ └─────────────┘ └──────────────┘ └─────────────────────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ P2P Layer │ │
|
||||
│ │ ┌─────────────┐ ┌──────────────┐ ┌─────────────────────────┐ │ │
|
||||
│ │ │ libp2p DHT │ │Content Route │ │ Stream Multiplexing │ │ │
|
||||
│ │ │ (Discovery)│ │ (Routing) │ │ (Yamux/mplex) │ │ │
|
||||
│ │ └─────────────┘ └──────────────┘ └─────────────────────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 2. Content-Addressed Storage (CAS) Architecture
|
||||
|
||||
```
|
||||
┌────────────────── Content-Addressed Storage System ──────────────────┐
|
||||
│ │
|
||||
│ ┌─────────────────────────── Node Distribution ────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ WALNUT IRONWOOD ACACIA │ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ Primary │────▶│ Secondary │────▶│ Tertiary │ │ │
|
||||
│ │ │ Blob Store │ │ Replica │ │ Replica │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ │ │ │ │ │ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │BLAKE3 Index │ │BLAKE3 Index │ │BLAKE3 Index │ │ │
|
||||
│ │ │ (Primary) │ │ (Secondary) │ │ (Tertiary) │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────── Storage Layout ──────────────────────────────┐ │
|
||||
│ │ /rust/bzzz-v2/blobs/ │ │
|
||||
│ │ ├── data/ # Raw blob storage │ │
|
||||
│ │ │ ├── bl/ # BLAKE3 prefix sharding │ │
|
||||
│ │ │ │ └── 3k/ # Further sharding │ │
|
||||
│ │ │ └── conversations/ # Conversation threads │ │
|
||||
│ │ ├── index/ # BLAKE3 hash indices │ │
|
||||
│ │ │ ├── primary.db # Primary hash->location mapping │ │
|
||||
│ │ │ └── replication.db # Replication metadata │ │
|
||||
│ │ └── temp/ # Temporary staging area │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 3. DHT and Network Architecture
|
||||
|
||||
```
|
||||
┌────────────────────── DHT Network Topology ──────────────────────────┐
|
||||
│ │
|
||||
│ ┌─────────────────── Bootstrap & Discovery ────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ WALNUT │────▶│ IRONWOOD │────▶│ ACACIA │ │ │
|
||||
│ │ │(Bootstrap 1)│◀────│(Bootstrap 2)│◀────│(Bootstrap 3)│ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ ┌─────────────────── DHT Responsibilities ────────────────────┐ │ │
|
||||
│ │ │ WALNUT: Content Routing + Agent Discovery │ │ │
|
||||
│ │ │ IRONWOOD: Conversation Threading + OpenAI Coordination │ │ │
|
||||
│ │ │ ACACIA: MCP Services + External Integration │ │ │
|
||||
│ │ └─────────────────────────────────────────────────────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────── Network Protocols ────────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ Protocol Support: │ │
|
||||
│ │ • bzzz:// semantic addressing (DHT resolution) │ │
|
||||
│ │ • Content routing via DHT (BLAKE3 hash lookup) │ │
|
||||
│ │ • Agent discovery and capability broadcasting │ │
|
||||
│ │ • Stream multiplexing for concurrent conversations │ │
|
||||
│ │ • NAT traversal and hole punching │ │
|
||||
│ │ │ │
|
||||
│ │ Port Allocation: │ │
|
||||
│ │ • P2P Listen: 9000-9100 (configurable range) │ │
|
||||
│ │ • DHT Bootstrap: 9101-9103 (per node) │ │
|
||||
│ │ • Content Routing: 9200-9300 (dynamic allocation) │ │
|
||||
│ │ • mDNS Discovery: 5353 (standard multicast DNS) │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 4. Service Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────── BZZZ v2 Service Stack ────────────────────────┐
|
||||
│ │
|
||||
│ ┌─────────────────── External Layer ───────────────────────────────┐ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ Traefik │────▶│ OpenAI │────▶│ MCP │ │ │
|
||||
│ │ │Load Balancer│ │ Gateway │ │ Clients │ │ │
|
||||
│ │ │ (SSL Term) │ │(Rate Limit) │ │(External) │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────── Application Layer ────────────────────────────┐ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ BZZZ Agent │────▶│ Conversation│────▶│ Content │ │ │
|
||||
│ │ │ Manager │ │ Threading │ │ Resolver │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ │ │ │ │ │ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ MCP │ │ OpenAI │ │ DHT │ │ │
|
||||
│ │ │ Server │ │ Client │ │ Manager │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────── Storage Layer ─────────────────────────────────┐ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ CAS │────▶│ PostgreSQL │────▶│ Redis │ │ │
|
||||
│ │ │ Blob Store │ │(Metadata) │ │ (Cache) │ │ │
|
||||
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Migration Strategy
|
||||
|
||||
### Phase 1: Parallel Deployment (Weeks 1-2)
|
||||
|
||||
#### 1.1 Infrastructure Preparation
|
||||
```bash
|
||||
# Create v2 directory structure
|
||||
/rust/bzzz-v2/
|
||||
├── config/
|
||||
│ ├── swarm/
|
||||
│ ├── systemd/
|
||||
│ └── secrets/
|
||||
├── data/
|
||||
│ ├── blobs/
|
||||
│ ├── conversations/
|
||||
│ └── dht/
|
||||
└── logs/
|
||||
├── application/
|
||||
├── p2p/
|
||||
└── monitoring/
|
||||
```
|
||||
|
||||
#### 1.2 Service Deployment Strategy
|
||||
- Deploy v2 services on non-standard ports (9000+ range)
|
||||
- Maintain v1 SystemD services during transition
|
||||
- Use Docker Swarm stack for v2 components
|
||||
- Implement health checks and readiness probes
|
||||
|
||||
#### 1.3 Database Migration
|
||||
- Create new PostgreSQL schema for v2 metadata
|
||||
- Implement data migration scripts for conversation history
|
||||
- Set up Redis cluster for DHT caching
|
||||
- Configure backup and recovery procedures
|
||||
|
||||
### Phase 2: Feature Migration (Weeks 3-4)
|
||||
|
||||
#### 2.1 Content Store Migration
|
||||
```bash
|
||||
# Migration workflow
|
||||
1. Export v1 conversation logs from Hypercore
|
||||
2. Convert to BLAKE3-addressed blobs
|
||||
3. Populate content store with historical data
|
||||
4. Verify data integrity and accessibility
|
||||
5. Update references in conversation threads
|
||||
```
|
||||
|
||||
#### 2.2 P2P Protocol Upgrade
|
||||
- Implement dual-protocol support (v1 + v2)
|
||||
- Migrate peer discovery from mDNS to DHT
|
||||
- Update message formats and routing
|
||||
- Maintain backward compatibility during transition
|
||||
|
||||
### Phase 3: Service Cutover (Weeks 5-6)
|
||||
|
||||
#### 3.1 Traffic Migration
|
||||
- Implement feature flags for v2 protocol
|
||||
- Gradual migration of agents to v2 endpoints
|
||||
- Monitor performance and error rates
|
||||
- Implement automatic rollback triggers
|
||||
|
||||
#### 3.2 Monitoring and Validation
|
||||
- Deploy comprehensive monitoring stack
|
||||
- Validate all v2 protocol operations
|
||||
- Performance benchmarking vs v1
|
||||
- Load testing with conversation threading
|
||||
|
||||
### Phase 4: Production Deployment (Weeks 7-8)
|
||||
|
||||
#### 4.1 Full Cutover
|
||||
- Disable v1 protocol endpoints
|
||||
- Remove v1 SystemD services
|
||||
- Update all client configurations
|
||||
- Archive v1 data and configurations
|
||||
|
||||
#### 4.2 Optimization and Tuning
|
||||
- Performance optimization based on production load
|
||||
- Resource allocation tuning
|
||||
- Security hardening and audit
|
||||
- Documentation and training completion
|
||||
|
||||
## Container Orchestration
|
||||
|
||||
### Docker Swarm Stack Configuration
|
||||
|
||||
```yaml
|
||||
# docker-compose.swarm.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
bzzz-agent:
|
||||
image: registry.home.deepblack.cloud/bzzz:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9000-9100:9000-9100"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data:/app/data
|
||||
- /rust/bzzz-v2/config:/app/config
|
||||
environment:
|
||||
- BZZZ_VERSION=2.0.0
|
||||
- BZZZ_PROTOCOL=bzzz://
|
||||
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.bzzz-agent.rule=Host(`bzzz.deepblack.cloud`)"
|
||||
- "traefik.http.services.bzzz-agent.loadbalancer.server.port=9000"
|
||||
|
||||
mcp-server:
|
||||
image: registry.home.deepblack.cloud/bzzz-mcp:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
ports:
|
||||
- "3001:3001"
|
||||
environment:
|
||||
- MCP_VERSION=1.0.0
|
||||
- BZZZ_ENDPOINT=http://bzzz-agent:9000
|
||||
deploy:
|
||||
replicas: 3
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.mcp-server.rule=Host(`mcp.deepblack.cloud`)"
|
||||
|
||||
openai-proxy:
|
||||
image: registry.home.deepblack.cloud/bzzz-openai-proxy:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "3002:3002"
|
||||
environment:
|
||||
- OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
|
||||
- RATE_LIMIT_RPM=1000
|
||||
- COST_TRACKING_ENABLED=true
|
||||
secrets:
|
||||
- openai_api_key
|
||||
deploy:
|
||||
replicas: 2
|
||||
|
||||
content-resolver:
|
||||
image: registry.home.deepblack.cloud/bzzz-resolver:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "3003:3003"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/blobs:/app/blobs:ro
|
||||
deploy:
|
||||
replicas: 3
|
||||
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
networks:
|
||||
- bzzz-internal
|
||||
environment:
|
||||
- POSTGRES_DB=bzzz_v2
|
||||
- POSTGRES_USER_FILE=/run/secrets/postgres_user
|
||||
- POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/postgres:/var/lib/postgresql/data
|
||||
secrets:
|
||||
- postgres_user
|
||||
- postgres_password
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
networks:
|
||||
- bzzz-internal
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/redis:/data
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
|
||||
networks:
|
||||
tengig:
|
||||
external: true
|
||||
bzzz-internal:
|
||||
driver: overlay
|
||||
internal: true
|
||||
|
||||
secrets:
|
||||
openai_api_key:
|
||||
external: true
|
||||
postgres_user:
|
||||
external: true
|
||||
postgres_password:
|
||||
external: true
|
||||
```
|
||||
|
||||
## CI/CD Pipeline Configuration
|
||||
|
||||
### GitLab CI Pipeline
|
||||
|
||||
```yaml
|
||||
# .gitlab-ci.yml
|
||||
stages:
|
||||
- build
|
||||
- test
|
||||
- deploy-staging
|
||||
- deploy-production
|
||||
|
||||
variables:
|
||||
REGISTRY: registry.home.deepblack.cloud
|
||||
IMAGE_TAG: ${CI_COMMIT_SHORT_SHA}
|
||||
|
||||
build:
|
||||
stage: build
|
||||
script:
|
||||
- docker build -t ${REGISTRY}/bzzz:${IMAGE_TAG} .
|
||||
- docker build -t ${REGISTRY}/bzzz-mcp:${IMAGE_TAG} -f Dockerfile.mcp .
|
||||
- docker build -t ${REGISTRY}/bzzz-openai-proxy:${IMAGE_TAG} -f Dockerfile.proxy .
|
||||
- docker build -t ${REGISTRY}/bzzz-resolver:${IMAGE_TAG} -f Dockerfile.resolver .
|
||||
- docker push ${REGISTRY}/bzzz:${IMAGE_TAG}
|
||||
- docker push ${REGISTRY}/bzzz-mcp:${IMAGE_TAG}
|
||||
- docker push ${REGISTRY}/bzzz-openai-proxy:${IMAGE_TAG}
|
||||
- docker push ${REGISTRY}/bzzz-resolver:${IMAGE_TAG}
|
||||
only:
|
||||
- main
|
||||
- develop
|
||||
|
||||
test-protocol:
|
||||
stage: test
|
||||
script:
|
||||
- go test ./...
|
||||
- docker run --rm ${REGISTRY}/bzzz:${IMAGE_TAG} /app/test-suite
|
||||
dependencies:
|
||||
- build
|
||||
|
||||
test-integration:
|
||||
stage: test
|
||||
script:
|
||||
- docker-compose -f docker-compose.test.yml up -d
|
||||
- ./scripts/integration-tests.sh
|
||||
- docker-compose -f docker-compose.test.yml down
|
||||
dependencies:
|
||||
- build
|
||||
|
||||
deploy-staging:
|
||||
stage: deploy-staging
|
||||
script:
|
||||
- docker stack deploy -c docker-compose.staging.yml bzzz-v2-staging
|
||||
environment:
|
||||
name: staging
|
||||
only:
|
||||
- develop
|
||||
|
||||
deploy-production:
|
||||
stage: deploy-production
|
||||
script:
|
||||
- docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
environment:
|
||||
name: production
|
||||
only:
|
||||
- main
|
||||
when: manual
|
||||
```
|
||||
|
||||
## Monitoring and Operations
|
||||
|
||||
### Monitoring Stack
|
||||
|
||||
```yaml
|
||||
# docker-compose.monitoring.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- /rust/bzzz-v2/data/prometheus:/prometheus
|
||||
deploy:
|
||||
replicas: 1
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
networks:
|
||||
- monitoring
|
||||
- tengig
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/grafana:/var/lib/grafana
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.bzzz-grafana.rule=Host(`bzzz-monitor.deepblack.cloud`)"
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml
|
||||
deploy:
|
||||
replicas: 1
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: overlay
|
||||
tengig:
|
||||
external: true
|
||||
```
|
||||
|
||||
### Key Metrics to Monitor
|
||||
|
||||
1. **Protocol Metrics**
|
||||
- DHT lookup latency and success rate
|
||||
- Content resolution time
|
||||
- Peer discovery and connection stability
|
||||
- bzzz:// address resolution performance
|
||||
|
||||
2. **Service Metrics**
|
||||
- MCP server response times
|
||||
- OpenAI API usage and costs
|
||||
- Conversation threading performance
|
||||
- Content store I/O operations
|
||||
|
||||
3. **Infrastructure Metrics**
|
||||
- Docker Swarm service health
|
||||
- Network connectivity between nodes
|
||||
- Storage utilization and performance
|
||||
- Resource utilization (CPU, memory, disk)
|
||||
|
||||
### Alerting Configuration
|
||||
|
||||
```yaml
|
||||
# monitoring/alertmanager.yml
|
||||
global:
|
||||
smtp_smarthost: 'localhost:587'
|
||||
smtp_from: 'alerts@deepblack.cloud'
|
||||
|
||||
route:
|
||||
group_by: ['alertname']
|
||||
group_wait: 10s
|
||||
group_interval: 10s
|
||||
repeat_interval: 1h
|
||||
receiver: 'web.hook'
|
||||
|
||||
receivers:
|
||||
- name: 'web.hook'
|
||||
slack_configs:
|
||||
- api_url: 'YOUR_SLACK_WEBHOOK_URL'
|
||||
channel: '#bzzz-alerts'
|
||||
title: 'BZZZ v2 Alert'
|
||||
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'dev', 'instance']
|
||||
```
|
||||
|
||||
## Security and Networking
|
||||
|
||||
### Security Architecture
|
||||
|
||||
1. **Network Isolation**
|
||||
- Internal overlay network for inter-service communication
|
||||
- External network exposure only through Traefik
|
||||
- Firewall rules restricting P2P ports to local network
|
||||
|
||||
2. **Secret Management**
|
||||
- Docker Swarm secrets for sensitive data
|
||||
- Encrypted storage of API keys and credentials
|
||||
- Regular secret rotation procedures
|
||||
|
||||
3. **Access Control**
|
||||
- mTLS for P2P communication
|
||||
- API authentication and authorization
|
||||
- Role-based access for MCP endpoints
|
||||
|
||||
### Networking Configuration
|
||||
|
||||
```bash
|
||||
# UFW firewall rules for BZZZ v2
|
||||
sudo ufw allow from 192.168.1.0/24 to any port 9000:9300 proto tcp
|
||||
sudo ufw allow from 192.168.1.0/24 to any port 5353 proto udp
|
||||
sudo ufw allow from 192.168.1.0/24 to any port 2377 proto tcp # Docker Swarm
|
||||
sudo ufw allow from 192.168.1.0/24 to any port 7946 proto tcp # Docker Swarm
|
||||
sudo ufw allow from 192.168.1.0/24 to any port 4789 proto udp # Docker Swarm
|
||||
```
|
||||
|
||||
## Rollback Procedures
|
||||
|
||||
### Automatic Rollback Triggers
|
||||
|
||||
1. **Health Check Failures**
|
||||
- Service health checks failing for > 5 minutes
|
||||
- DHT network partition detection
|
||||
- Content store corruption detection
|
||||
- Critical error rate > 5%
|
||||
|
||||
2. **Performance Degradation**
|
||||
- Response time increase > 200% from baseline
|
||||
- Memory usage > 90% for > 10 minutes
|
||||
- Storage I/O errors > 1% rate
|
||||
|
||||
### Manual Rollback Process
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# rollback-v2.sh - Emergency rollback to v1
|
||||
|
||||
echo "🚨 Initiating BZZZ v2 rollback procedure..."
|
||||
|
||||
# Step 1: Stop v2 services
|
||||
docker stack rm bzzz-v2
|
||||
sleep 30
|
||||
|
||||
# Step 2: Restart v1 SystemD services
|
||||
sudo systemctl start bzzz@walnut
|
||||
sudo systemctl start bzzz@ironwood
|
||||
sudo systemctl start bzzz@acacia
|
||||
|
||||
# Step 3: Verify v1 connectivity
|
||||
./scripts/verify-v1-mesh.sh
|
||||
|
||||
# Step 4: Update load balancer configuration
|
||||
./scripts/update-traefik-v1.sh
|
||||
|
||||
# Step 5: Notify operations team
|
||||
curl -X POST -H 'Content-Type: application/json' "$SLACK_WEBHOOK" -d '{"text":"🚨 BZZZ rollback to v1 completed"}'
|
||||
|
||||
echo "✅ Rollback completed successfully"
|
||||
```
|
||||
|
||||
## Resource Requirements
|
||||
|
||||
### Node Specifications
|
||||
|
||||
| Component | CPU | Memory | Storage | Network |
|
||||
|-----------|-----|---------|---------|---------|
|
||||
| BZZZ Agent | 2 cores | 4GB | 20GB | 1Gbps |
|
||||
| MCP Server | 1 core | 2GB | 5GB | 100Mbps |
|
||||
| OpenAI Proxy | 1 core | 2GB | 5GB | 100Mbps |
|
||||
| Content Store | 2 cores | 8GB | 500GB | 1Gbps |
|
||||
| DHT Manager | 1 core | 4GB | 50GB | 1Gbps |
|
||||
|
||||
### Scaling Considerations
|
||||
|
||||
1. **Horizontal Scaling**
|
||||
- Add nodes to DHT for increased capacity
|
||||
- Scale MCP servers based on external demand
|
||||
- Replicate content store across availability zones
|
||||
|
||||
2. **Vertical Scaling**
|
||||
- Increase memory for larger conversation contexts
|
||||
- Add storage for content addressing requirements
|
||||
- Enhance network capacity for P2P traffic
|
||||
|
||||
## Operational Procedures
|
||||
|
||||
### Daily Operations
|
||||
|
||||
1. **Health Monitoring**
|
||||
- Review Grafana dashboards for anomalies
|
||||
- Check DHT network connectivity
|
||||
- Verify content store replication status
|
||||
- Monitor OpenAI API usage and costs
|
||||
|
||||
2. **Maintenance Tasks**
|
||||
- Log rotation and archival
|
||||
- Content store garbage collection
|
||||
- DHT routing table optimization
|
||||
- Security patch deployment
|
||||
|
||||
### Weekly Operations
|
||||
|
||||
1. **Performance Review**
|
||||
- Analyze response time trends
|
||||
- Review resource utilization patterns
|
||||
- Assess scaling requirements
|
||||
- Update capacity planning
|
||||
|
||||
2. **Security Audit**
|
||||
- Review access logs
|
||||
- Validate secret rotation
|
||||
- Check for security updates
|
||||
- Test backup and recovery procedures
|
||||
|
||||
### Incident Response
|
||||
|
||||
1. **Incident Classification**
|
||||
- P0: Complete service outage
|
||||
- P1: Major feature degradation
|
||||
- P2: Performance issues
|
||||
- P3: Minor functionality problems
|
||||
|
||||
2. **Response Procedures**
|
||||
- Automated alerting and escalation
|
||||
- Incident commander assignment
|
||||
- Communication protocols
|
||||
- Post-incident review process
|
||||
|
||||
This comprehensive infrastructure architecture provides a robust foundation for BZZZ v2 deployment while maintaining operational excellence and enabling future growth. The design prioritizes reliability, security, and maintainability while introducing advanced protocol features required for the next generation of the BZZZ ecosystem.
|
||||
643
infrastructure/ci-cd/.gitlab-ci.yml
Normal file
643
infrastructure/ci-cd/.gitlab-ci.yml
Normal file
@@ -0,0 +1,643 @@
|
||||
# BZZZ v2 GitLab CI/CD Pipeline
|
||||
# Comprehensive build, test, and deployment pipeline for BZZZ v2
|
||||
|
||||
variables:
|
||||
REGISTRY: registry.home.deepblack.cloud
|
||||
REGISTRY_NAMESPACE: bzzz
|
||||
GO_VERSION: "1.21"
|
||||
DOCKER_BUILDKIT: "1"
|
||||
COMPOSE_DOCKER_CLI_BUILD: "1"
|
||||
POSTGRES_VERSION: "15"
|
||||
REDIS_VERSION: "7"
|
||||
|
||||
# Semantic versioning
|
||||
VERSION_PREFIX: "v2"
|
||||
|
||||
stages:
|
||||
- lint
|
||||
- test
|
||||
- build
|
||||
- security-scan
|
||||
- integration-test
|
||||
- deploy-staging
|
||||
- performance-test
|
||||
- deploy-production
|
||||
- post-deploy-validation
|
||||
|
||||
# Cache configuration
|
||||
cache:
|
||||
key: "${CI_COMMIT_REF_SLUG}"
|
||||
paths:
|
||||
- .cache/go-mod/
|
||||
- .cache/docker/
|
||||
- vendor/
|
||||
|
||||
before_script:
|
||||
- export GOPATH=$CI_PROJECT_DIR/.cache/go-mod
|
||||
- export GOCACHE=$CI_PROJECT_DIR/.cache/go-build
|
||||
- mkdir -p .cache/go-mod .cache/go-build .cache/docker
|
||||
|
||||
# ================================
|
||||
# LINT STAGE
|
||||
# ================================
|
||||
|
||||
golang-lint:
|
||||
stage: lint
|
||||
image: golangci/golangci-lint:v1.55-alpine
|
||||
script:
|
||||
- golangci-lint run ./... --timeout 10m
|
||||
- go mod tidy
|
||||
- git diff --exit-code go.mod go.sum
|
||||
rules:
|
||||
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
- if: '$CI_COMMIT_BRANCH == "develop"'
|
||||
|
||||
dockerfile-lint:
|
||||
stage: lint
|
||||
image: hadolint/hadolint:latest-debian
|
||||
script:
|
||||
- hadolint infrastructure/dockerfiles/Dockerfile.*
|
||||
- hadolint Dockerfile
|
||||
rules:
|
||||
- changes:
|
||||
- "infrastructure/dockerfiles/*"
|
||||
- "Dockerfile*"
|
||||
|
||||
yaml-lint:
|
||||
stage: lint
|
||||
image: cytopia/yamllint:latest
|
||||
script:
|
||||
- yamllint infrastructure/
|
||||
- yamllint .gitlab-ci.yml
|
||||
rules:
|
||||
- changes:
|
||||
- "infrastructure/**/*.yml"
|
||||
- "infrastructure/**/*.yaml"
|
||||
- ".gitlab-ci.yml"
|
||||
|
||||
# ================================
|
||||
# TEST STAGE
|
||||
# ================================
|
||||
|
||||
unit-tests:
|
||||
stage: test
|
||||
image: golang:$GO_VERSION-alpine
|
||||
services:
|
||||
- name: postgres:$POSTGRES_VERSION-alpine
|
||||
alias: postgres
|
||||
- name: redis:$REDIS_VERSION-alpine
|
||||
alias: redis
|
||||
variables:
|
||||
POSTGRES_DB: bzzz_test
|
||||
POSTGRES_USER: test
|
||||
POSTGRES_PASSWORD: testpass
|
||||
POSTGRES_HOST: postgres
|
||||
REDIS_HOST: redis
|
||||
CGO_ENABLED: 0
|
||||
before_script:
|
||||
- apk add --no-cache git make gcc musl-dev
|
||||
- export GOPATH=$CI_PROJECT_DIR/.cache/go-mod
|
||||
- export GOCACHE=$CI_PROJECT_DIR/.cache/go-build
|
||||
script:
|
||||
- go mod download
|
||||
- go test -v -race -coverprofile=coverage.out ./...
|
||||
- go tool cover -html=coverage.out -o coverage.html
|
||||
- go tool cover -func=coverage.out | grep total | awk '{print "Coverage: " $3}'
- go run github.com/boombuler/gocover-cobertura@latest < coverage.out > coverage.xml
|
||||
coverage: '/Coverage: \d+\.\d+/'
|
||||
artifacts:
|
||||
reports:
|
||||
coverage_report:
|
||||
coverage_format: cobertura
|
||||
path: coverage.xml
|
||||
paths:
|
||||
- coverage.html
|
||||
- coverage.out
|
||||
expire_in: 1 week
|
||||
|
||||
p2p-protocol-tests:
|
||||
stage: test
|
||||
image: golang:$GO_VERSION-alpine
|
||||
script:
|
||||
- apk add --no-cache git make gcc musl-dev
|
||||
- go test -v -tags=p2p ./p2p/... ./dht/...
|
||||
- go test -v -tags=integration ./test/p2p/...
|
||||
rules:
|
||||
- changes:
|
||||
- "p2p/**/*"
|
||||
- "dht/**/*"
|
||||
- "test/p2p/**/*"
|
||||
|
||||
content-store-tests:
|
||||
stage: test
|
||||
image: golang:$GO_VERSION-alpine
|
||||
script:
|
||||
- apk add --no-cache git make gcc musl-dev
|
||||
- go test -v -tags=storage ./storage/... ./blake3/...
|
||||
- go test -v -benchmem -bench=. ./storage/...
|
||||
artifacts:
|
||||
paths:
|
||||
- benchmark.out
|
||||
expire_in: 1 week
|
||||
rules:
|
||||
- changes:
|
||||
- "storage/**/*"
|
||||
- "blake3/**/*"
|
||||
|
||||
conversation-tests:
|
||||
stage: test
|
||||
image: golang:$GO_VERSION-alpine
|
||||
services:
|
||||
- name: postgres:$POSTGRES_VERSION-alpine
|
||||
alias: postgres
|
||||
variables:
|
||||
POSTGRES_DB: bzzz_conversation_test
|
||||
POSTGRES_USER: test
|
||||
POSTGRES_PASSWORD: testpass
|
||||
POSTGRES_HOST: postgres
|
||||
script:
|
||||
- apk add --no-cache git make gcc musl-dev postgresql-client
|
||||
- until pg_isready -h postgres -p 5432 -U test; do sleep 1; done
|
||||
- go test -v -tags=conversation ./conversation/... ./threading/...
|
||||
rules:
|
||||
- changes:
|
||||
- "conversation/**/*"
|
||||
- "threading/**/*"
|
||||
|
||||
# ================================
|
||||
# BUILD STAGE
|
||||
# ================================
|
||||
|
||||
build-binaries:
|
||||
stage: build
|
||||
image: golang:$GO_VERSION-alpine
|
||||
before_script:
|
||||
- apk add --no-cache git make gcc musl-dev upx
|
||||
- export GOPATH=$CI_PROJECT_DIR/.cache/go-mod
|
||||
- export GOCACHE=$CI_PROJECT_DIR/.cache/go-build
|
||||
script:
|
||||
- make build-all
|
||||
- upx --best --lzma dist/bzzz-*
|
||||
- ls -la dist/
|
||||
artifacts:
|
||||
paths:
|
||||
- dist/
|
||||
expire_in: 1 week
|
||||
|
||||
build-docker-images:
|
||||
stage: build
|
||||
image: docker:24
|
||||
services:
|
||||
- docker:24-dind
|
||||
variables:
|
||||
IMAGE_TAG: ${CI_COMMIT_SHORT_SHA}
|
||||
DOCKER_DRIVER: overlay2
|
||||
before_script:
|
||||
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $REGISTRY
|
||||
- docker buildx create --use --driver docker-container
|
||||
script:
|
||||
# Build all images in parallel
|
||||
- |
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=${VERSION_PREFIX}.${CI_PIPELINE_ID} \
|
||||
--build-arg COMMIT=${CI_COMMIT_SHORT_SHA} \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-agent:$IMAGE_TAG \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-agent:latest \
|
||||
--file infrastructure/dockerfiles/Dockerfile.agent \
|
||||
--push .
|
||||
|
||||
- |
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-mcp:$IMAGE_TAG \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-mcp:latest \
|
||||
--file infrastructure/dockerfiles/Dockerfile.mcp \
|
||||
--push .
|
||||
|
||||
- |
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-openai-proxy:$IMAGE_TAG \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-openai-proxy:latest \
|
||||
--file infrastructure/dockerfiles/Dockerfile.proxy \
|
||||
--push .
|
||||
|
||||
- |
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-resolver:$IMAGE_TAG \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-resolver:latest \
|
||||
--file infrastructure/dockerfiles/Dockerfile.resolver \
|
||||
--push .
|
||||
|
||||
- |
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-dht:$IMAGE_TAG \
|
||||
--tag $REGISTRY/$REGISTRY_NAMESPACE/bzzz-dht:latest \
|
||||
--file infrastructure/dockerfiles/Dockerfile.dht \
|
||||
--push .
|
||||
|
||||
dependencies:
|
||||
- build-binaries
|
||||
|
||||
# ================================
|
||||
# SECURITY SCAN STAGE
|
||||
# ================================
|
||||
|
||||
container-security-scan:
|
||||
stage: security-scan
|
||||
image: aquasec/trivy:latest
|
||||
script:
|
||||
- |
|
||||
for component in agent mcp openai-proxy resolver dht; do
|
||||
echo "Scanning bzzz-${component}..."
|
||||
trivy image --exit-code 1 --severity HIGH,CRITICAL \
|
||||
--format json --output trivy-${component}.json \
|
||||
$REGISTRY/$REGISTRY_NAMESPACE/bzzz-${component}:${CI_COMMIT_SHORT_SHA}
|
||||
done
|
||||
artifacts:
|
||||
reports:
|
||||
container_scanning: trivy-*.json
|
||||
expire_in: 1 week
|
||||
dependencies:
|
||||
- build-docker-images
|
||||
allow_failure: true
|
||||
|
||||
dependency-security-scan:
|
||||
stage: security-scan
|
||||
image: golang:$GO_VERSION-alpine
|
||||
script:
|
||||
- go install golang.org/x/vuln/cmd/govulncheck@latest
|
||||
- govulncheck ./...
|
||||
allow_failure: true
|
||||
|
||||
secrets-scan:
|
||||
stage: security-scan
|
||||
image: trufflesecurity/trufflehog:latest
|
||||
script:
|
||||
- trufflehog filesystem --directory=. --fail --json
|
||||
allow_failure: true
|
||||
rules:
|
||||
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
|
||||
|
||||
# ================================
|
||||
# INTEGRATION TEST STAGE
|
||||
# ================================
|
||||
|
||||
p2p-integration-test:
|
||||
stage: integration-test
|
||||
image: docker:24-dind
|
||||
services:
|
||||
- docker:24-dind
|
||||
variables:
|
||||
COMPOSE_PROJECT_NAME: bzzz-integration-${CI_PIPELINE_ID}
|
||||
before_script:
|
||||
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $REGISTRY
|
||||
- apk add --no-cache docker-compose curl jq
|
||||
script:
|
||||
- cd infrastructure/testing
|
||||
- docker-compose -f docker-compose.integration.yml up -d
|
||||
- sleep 60 # Wait for services to start
|
||||
- ./scripts/test-p2p-mesh.sh
|
||||
- ./scripts/test-dht-discovery.sh
|
||||
- ./scripts/test-content-addressing.sh
|
||||
- docker-compose -f docker-compose.integration.yml logs
|
||||
after_script:
|
||||
- cd infrastructure/testing
|
||||
- docker-compose -f docker-compose.integration.yml down -v
|
||||
artifacts:
|
||||
paths:
|
||||
- infrastructure/testing/test-results/
|
||||
expire_in: 1 week
|
||||
when: always
|
||||
dependencies:
|
||||
- build-docker-images
|
||||
|
||||
mcp-integration-test:
|
||||
stage: integration-test
|
||||
image: node:18-alpine
|
||||
services:
|
||||
- name: $REGISTRY/$REGISTRY_NAMESPACE/bzzz-mcp:${CI_COMMIT_SHORT_SHA}
|
||||
alias: mcp-server
|
||||
- name: $REGISTRY/$REGISTRY_NAMESPACE/bzzz-agent:${CI_COMMIT_SHORT_SHA}
|
||||
alias: bzzz-agent
|
||||
script:
|
||||
- cd test/mcp
|
||||
- npm install
|
||||
- npm test
|
||||
artifacts:
|
||||
reports:
|
||||
junit: test/mcp/junit.xml
|
||||
dependencies:
|
||||
- build-docker-images
|
||||
|
||||
openai-proxy-test:
|
||||
stage: integration-test
|
||||
image: python:3.11-alpine
|
||||
services:
|
||||
- name: $REGISTRY/$REGISTRY_NAMESPACE/bzzz-openai-proxy:${CI_COMMIT_SHORT_SHA}
|
||||
alias: openai-proxy
|
||||
- name: redis:$REDIS_VERSION-alpine
|
||||
alias: redis
|
||||
variables:
|
||||
OPENAI_API_KEY: "test-key-mock"
|
||||
REDIS_HOST: redis
|
||||
script:
|
||||
- cd test/openai-proxy
|
||||
- pip install -r requirements.txt
|
||||
- python -m pytest -v --junitxml=junit.xml
|
||||
artifacts:
|
||||
reports:
|
||||
junit: test/openai-proxy/junit.xml
|
||||
dependencies:
|
||||
- build-docker-images
|
||||
|
||||
# ================================
|
||||
# STAGING DEPLOYMENT
|
||||
# ================================
|
||||
|
||||
deploy-staging:
|
||||
stage: deploy-staging
|
||||
image: docker:24-dind
|
||||
services:
|
||||
- docker:24-dind
|
||||
variables:
|
||||
DEPLOY_ENV: staging
|
||||
STACK_NAME: bzzz-v2-staging
|
||||
environment:
|
||||
name: staging
|
||||
url: https://bzzz-staging.deepblack.cloud
|
||||
before_script:
|
||||
- apk add --no-cache openssh-client
|
||||
- eval $(ssh-agent -s)
|
||||
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
|
||||
- mkdir -p ~/.ssh
|
||||
- chmod 700 ~/.ssh
|
||||
- ssh-keyscan -H 192.168.1.27 >> ~/.ssh/known_hosts
|
||||
script:
|
||||
# Copy deployment files to staging environment
|
||||
- scp infrastructure/docker-compose.staging.yml tony@192.168.1.27:/rust/bzzz-v2/
|
||||
- scp infrastructure/configs/staging/* tony@192.168.1.27:/rust/bzzz-v2/config/
|
||||
|
||||
# Deploy to staging swarm
|
||||
- |
|
||||
ssh tony@192.168.1.27 << 'EOF'
|
||||
cd /rust/bzzz-v2
|
||||
export IMAGE_TAG=${CI_COMMIT_SHORT_SHA}
|
||||
docker stack deploy -c docker-compose.staging.yml ${STACK_NAME}
|
||||
|
||||
# Wait for deployment
|
||||
timeout 300 bash -c 'until docker service ls --filter label=com.docker.stack.namespace=${STACK_NAME} --format "{{.Replicas}}" | grep -v "0/"; do sleep 10; done'
|
||||
EOF
|
||||
|
||||
# Health check staging deployment
|
||||
- sleep 60
|
||||
- curl -f https://bzzz-staging.deepblack.cloud/health
|
||||
dependencies:
|
||||
- build-docker-images
|
||||
- p2p-integration-test
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH == "develop"'
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
|
||||
# ================================
|
||||
# PERFORMANCE TESTING
|
||||
# ================================
|
||||
|
||||
performance-test:
|
||||
stage: performance-test
|
||||
image: loadimpact/k6:latest
|
||||
script:
|
||||
- cd test/performance
|
||||
- k6 run --out json=performance-results.json performance-test.js
|
||||
- k6 run --out json=dht-performance.json dht-performance-test.js
|
||||
artifacts:
|
||||
paths:
|
||||
- test/performance/performance-results.json
|
||||
- test/performance/dht-performance.json
|
||||
reports:
|
||||
performance: test/performance/performance-results.json
|
||||
expire_in: 1 week
|
||||
environment:
|
||||
name: staging
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
- when: manual
|
||||
if: '$CI_COMMIT_BRANCH == "develop"'
|
||||
|
||||
load-test:
|
||||
stage: performance-test
|
||||
image: python:3.11-alpine
|
||||
script:
|
||||
- cd test/load
|
||||
- pip install locust requests
|
||||
- locust --headless --users 100 --spawn-rate 10 --run-time 5m --host https://bzzz-staging.deepblack.cloud
|
||||
artifacts:
|
||||
paths:
|
||||
- test/load/locust_stats.html
|
||||
expire_in: 1 week
|
||||
environment:
|
||||
name: staging
|
||||
rules:
|
||||
- when: manual
|
||||
|
||||
# ================================
|
||||
# PRODUCTION DEPLOYMENT
|
||||
# ================================
|
||||
|
||||
deploy-production:
|
||||
stage: deploy-production
|
||||
image: docker:24-dind
|
||||
services:
|
||||
- docker:24-dind
|
||||
variables:
|
||||
DEPLOY_ENV: production
|
||||
STACK_NAME: bzzz-v2
|
||||
environment:
|
||||
name: production
|
||||
url: https://bzzz.deepblack.cloud
|
||||
before_script:
|
||||
- apk add --no-cache openssh-client
|
||||
- eval $(ssh-agent -s)
|
||||
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
|
||||
- mkdir -p ~/.ssh
|
||||
- chmod 700 ~/.ssh
|
||||
- ssh-keyscan -H 192.168.1.27 >> ~/.ssh/known_hosts
|
||||
script:
|
||||
# Backup current production state
|
||||
- |
|
||||
ssh tony@192.168.1.27 << 'EOF'
|
||||
mkdir -p /rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)
|
||||
docker service ls --filter label=com.docker.stack.namespace=bzzz-v2 --format "table {{.Name}}\t{{.Image}}" > /rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)/pre-deployment-services.txt
|
||||
EOF
|
||||
|
||||
# Copy production deployment files
|
||||
- scp infrastructure/docker-compose.swarm.yml tony@192.168.1.27:/rust/bzzz-v2/
|
||||
- scp infrastructure/configs/production/* tony@192.168.1.27:/rust/bzzz-v2/config/
|
||||
|
||||
# Deploy to production with blue-green strategy
|
||||
- |
|
||||
ssh tony@192.168.1.27 << 'EOF'
|
||||
cd /rust/bzzz-v2
|
||||
export IMAGE_TAG=${CI_COMMIT_SHORT_SHA}
|
||||
|
||||
# Deploy new version
|
||||
docker stack deploy -c docker-compose.swarm.yml ${STACK_NAME}
|
||||
|
||||
# Wait for healthy deployment
|
||||
timeout 600 bash -c 'until docker service ls --filter label=com.docker.stack.namespace=${STACK_NAME} --format "{{.Replicas}}" | grep -v "0/" | wc -l | grep -q 8; do sleep 15; done'
|
||||
|
||||
echo "Production deployment completed successfully"
|
||||
EOF
|
||||
|
||||
# Verify production health
|
||||
- sleep 120
|
||||
- curl -f https://bzzz.deepblack.cloud/health
|
||||
- curl -f https://mcp.deepblack.cloud/health
|
||||
dependencies:
|
||||
- deploy-staging
|
||||
- performance-test
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
when: manual
|
||||
|
||||
rollback-production:
|
||||
stage: deploy-production
|
||||
image: docker:24-dind
|
||||
variables:
|
||||
STACK_NAME: bzzz-v2
|
||||
environment:
|
||||
name: production
|
||||
action: rollback
|
||||
before_script:
|
||||
- apk add --no-cache openssh-client
|
||||
- eval $(ssh-agent -s)
|
||||
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
|
||||
- mkdir -p ~/.ssh
|
||||
- chmod 700 ~/.ssh
|
||||
- ssh-keyscan -H 192.168.1.27 >> ~/.ssh/known_hosts
|
||||
script:
|
||||
- |
|
||||
ssh tony@192.168.1.27 << 'EOF'
|
||||
cd /rust/bzzz-v2
|
||||
|
||||
# Get previous stable image tags
|
||||
PREVIOUS_TAG=$(docker service inspect bzzz-v2_bzzz-agent --format '{{.Spec.TaskTemplate.ContainerSpec.Image}}' | cut -d: -f2)
|
||||
|
||||
# Rollback by redeploying previous version
|
||||
export IMAGE_TAG=$PREVIOUS_TAG
|
||||
docker stack deploy -c docker-compose.swarm.yml ${STACK_NAME}
|
||||
|
||||
echo "Production rollback completed"
|
||||
EOF
|
||||
rules:
|
||||
- when: manual
|
||||
if: '$CI_COMMIT_BRANCH == "main"'
|
||||
|
||||
# ================================
|
||||
# POST-DEPLOYMENT VALIDATION
|
||||
# ================================
|
||||
|
||||
post-deploy-validation:
|
||||
stage: post-deploy-validation
|
||||
image: curlimages/curl:latest
|
||||
script:
|
||||
- curl -f https://bzzz.deepblack.cloud/health
|
||||
- curl -f https://mcp.deepblack.cloud/health
|
||||
- curl -f https://resolve.deepblack.cloud/health
|
||||
- curl -f https://openai.deepblack.cloud/health
|
||||
|
||||
# Test basic functionality
|
||||
- |
|
||||
# Test bzzz:// address resolution
|
||||
CONTENT_HASH=$(curl -s https://bzzz.deepblack.cloud/api/v2/test-content | jq -r '.hash')
|
||||
curl -f "https://resolve.deepblack.cloud/bzzz://${CONTENT_HASH}"
|
||||
|
||||
# Test MCP endpoint
|
||||
curl -X POST https://mcp.deepblack.cloud/api/tools/list \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"method": "tools/list"}'
|
||||
environment:
|
||||
name: production
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
needs:
|
||||
- deploy-production
|
||||
|
||||
smoke-tests:
|
||||
stage: post-deploy-validation
|
||||
image: golang:$GO_VERSION-alpine
|
||||
script:
|
||||
- cd test/smoke
|
||||
- go test -v ./... -base-url=https://bzzz.deepblack.cloud
|
||||
environment:
|
||||
name: production
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH == "main"'
|
||||
needs:
|
||||
- deploy-production
|
||||
|
||||
# ================================
|
||||
# NOTIFICATION STAGE (implicit)
|
||||
# ================================
|
||||
|
||||
notify-success:
|
||||
stage: .post
|
||||
image: curlimages/curl:latest
|
||||
script:
|
||||
- |
|
||||
curl -X POST $SLACK_WEBHOOK_URL \
|
||||
-H 'Content-type: application/json' \
|
||||
-d '{
|
||||
"text": "🚀 BZZZ v2 Pipeline Success",
|
||||
"attachments": [{
|
||||
"color": "good",
|
||||
"fields": [{
|
||||
"title": "Branch",
|
||||
"value": "'$CI_COMMIT_BRANCH'",
|
||||
"short": true
|
||||
}, {
|
||||
"title": "Commit",
|
||||
"value": "'$CI_COMMIT_SHORT_SHA'",
|
||||
"short": true
|
||||
}, {
|
||||
"title": "Pipeline",
|
||||
"value": "'$CI_PIPELINE_URL'",
|
||||
"short": false
|
||||
}]
|
||||
}]
|
||||
}'
|
||||
rules:
|
||||
- if: '$CI_PIPELINE_STATUS == "success" && $CI_COMMIT_BRANCH == "main"'
|
||||
when: on_success
|
||||
|
||||
notify-failure:
|
||||
stage: .post
|
||||
image: curlimages/curl:latest
|
||||
script:
|
||||
- |
|
||||
curl -X POST $SLACK_WEBHOOK_URL \
|
||||
-H 'Content-type: application/json' \
|
||||
-d '{
|
||||
"text": "❌ BZZZ v2 Pipeline Failed",
|
||||
"attachments": [{
|
||||
"color": "danger",
|
||||
"fields": [{
|
||||
"title": "Branch",
|
||||
"value": "'$CI_COMMIT_BRANCH'",
|
||||
"short": true
|
||||
}, {
|
||||
"title": "Commit",
|
||||
"value": "'$CI_COMMIT_SHORT_SHA'",
|
||||
"short": true
|
||||
}, {
|
||||
"title": "Pipeline",
|
||||
"value": "'$CI_PIPELINE_URL'",
|
||||
"short": false
|
||||
}]
|
||||
}]
|
||||
}'
|
||||
rules:
|
||||
- when: on_failure
|
||||
402
infrastructure/docker-compose.swarm.yml
Normal file
402
infrastructure/docker-compose.swarm.yml
Normal file
@@ -0,0 +1,402 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# BZZZ v2 Main Agent
|
||||
bzzz-agent:
|
||||
image: registry.home.deepblack.cloud/bzzz:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9000-9100:9000-9100"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data:/app/data
|
||||
- /rust/bzzz-v2/config:/app/config:ro
|
||||
environment:
|
||||
- BZZZ_VERSION=2.0.0
|
||||
- BZZZ_PROTOCOL=bzzz://
|
||||
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
|
||||
- CONTENT_STORE_PATH=/app/data/blobs
|
||||
- POSTGRES_HOST=postgres
|
||||
- REDIS_HOST=redis
|
||||
- LOG_LEVEL=info
|
||||
secrets:
|
||||
- postgres_password
|
||||
- openai_api_key
|
||||
configs:
|
||||
- source: bzzz_config
|
||||
target: /app/config/config.yaml
|
||||
deploy:
|
||||
replicas: 3
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
constraints:
|
||||
- node.labels.bzzz.role == agent
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
cpus: '2.0'
|
||||
reservations:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 10s
|
||||
max_attempts: 3
|
||||
update_config:
|
||||
parallelism: 1
|
||||
delay: 30s
|
||||
failure_action: rollback
|
||||
order: stop-first
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.bzzz-agent.rule=Host(`bzzz.deepblack.cloud`)"
|
||||
- "traefik.http.services.bzzz-agent.loadbalancer.server.port=9000"
|
||||
- "traefik.http.routers.bzzz-agent.tls=true"
|
||||
- "traefik.http.routers.bzzz-agent.tls.certresolver=letsencrypt"
|
||||
|
||||
# MCP Server for external tool integration
|
||||
mcp-server:
|
||||
image: registry.home.deepblack.cloud/bzzz-mcp:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "3001:3001"
|
||||
environment:
|
||||
- MCP_VERSION=1.0.0
|
||||
- BZZZ_ENDPOINT=http://bzzz-agent:9000
|
||||
- MAX_CONNECTIONS=1000
|
||||
- TIMEOUT_SECONDS=30
|
||||
configs:
|
||||
- source: mcp_config
|
||||
target: /app/config/mcp.yaml
|
||||
deploy:
|
||||
replicas: 3
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 5s
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.mcp-server.rule=Host(`mcp.deepblack.cloud`)"
|
||||
- "traefik.http.services.mcp-server.loadbalancer.server.port=3001"
|
||||
- "traefik.http.routers.mcp-server.tls=true"
|
||||
|
||||
# OpenAI Proxy with rate limiting and cost tracking
|
||||
openai-proxy:
|
||||
image: registry.home.deepblack.cloud/bzzz-openai-proxy:v2.0.0
|
||||
networks:
|
||||
- tengig
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "3002:3002"
|
||||
environment:
|
||||
- RATE_LIMIT_RPM=1000
|
||||
- RATE_LIMIT_TPM=100000
|
||||
- COST_TRACKING_ENABLED=true
|
||||
- REDIS_HOST=redis
|
||||
- POSTGRES_HOST=postgres
|
||||
- LOG_REQUESTS=true
|
||||
secrets:
|
||||
- openai_api_key
|
||||
- postgres_password
|
||||
configs:
|
||||
- source: proxy_config
|
||||
target: /app/config/proxy.yaml
|
||||
deploy:
|
||||
replicas: 2
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 1G
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 10s
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.openai-proxy.rule=Host(`openai.deepblack.cloud`)"
|
||||
- "traefik.http.services.openai-proxy.loadbalancer.server.port=3002"
|
||||
- "traefik.http.routers.openai-proxy.tls=true"
|
||||
|
||||
# Content Resolver for bzzz:// address resolution
|
||||
content-resolver:
|
||||
image: registry.home.deepblack.cloud/bzzz-resolver:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
- tengig
|
||||
ports:
|
||||
- "3003:3003"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/blobs:/app/blobs:ro
|
||||
environment:
|
||||
- BLAKE3_INDEX_PATH=/app/blobs/index
|
||||
- DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
|
||||
- CACHE_SIZE_MB=512
|
||||
deploy:
|
||||
replicas: 3
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.content-resolver.rule=Host(`resolve.deepblack.cloud`)"
|
||||
|
||||
# DHT Bootstrap Nodes (one per physical node)
|
||||
dht-bootstrap-walnut:
|
||||
image: registry.home.deepblack.cloud/bzzz-dht:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9101:9101"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/dht/walnut:/app/data
|
||||
environment:
|
||||
- DHT_PORT=9101
|
||||
- NODE_NAME=walnut
|
||||
- PEER_STORE_PATH=/app/data/peers
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
dht-bootstrap-ironwood:
|
||||
image: registry.home.deepblack.cloud/bzzz-dht:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9102:9102"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/dht/ironwood:/app/data
|
||||
environment:
|
||||
- DHT_PORT=9102
|
||||
- NODE_NAME=ironwood
|
||||
- PEER_STORE_PATH=/app/data/peers
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
dht-bootstrap-acacia:
|
||||
image: registry.home.deepblack.cloud/bzzz-dht:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
ports:
|
||||
- "9103:9103"
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/dht/acacia:/app/data
|
||||
environment:
|
||||
- DHT_PORT=9103
|
||||
- NODE_NAME=acacia
|
||||
- PEER_STORE_PATH=/app/data/peers
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == acacia
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# PostgreSQL for metadata and conversation threading
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
networks:
|
||||
- bzzz-internal
|
||||
environment:
|
||||
- POSTGRES_DB=bzzz_v2
|
||||
- POSTGRES_USER=bzzz
|
||||
- POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password
|
||||
- POSTGRES_INITDB_ARGS=--auth-host=scram-sha-256
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/postgres:/var/lib/postgresql/data
|
||||
- /rust/bzzz-v2/config/postgres/init:/docker-entrypoint-initdb.d:ro
|
||||
secrets:
|
||||
- postgres_password
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == walnut
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
cpus: '2.0'
|
||||
reservations:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 10s
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U bzzz -d bzzz_v2"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Redis for caching and DHT coordination
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
networks:
|
||||
- bzzz-internal
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/redis:/data
|
||||
configs:
|
||||
- source: redis_config
|
||||
target: /usr/local/etc/redis/redis.conf
|
||||
command: redis-server /usr/local/etc/redis/redis.conf
|
||||
deploy:
|
||||
replicas: 1
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ironwood
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
reservations:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Conversation Thread Manager
|
||||
conversation-manager:
|
||||
image: registry.home.deepblack.cloud/bzzz-conversation:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
environment:
|
||||
- POSTGRES_HOST=postgres
|
||||
- REDIS_HOST=redis
|
||||
- LAMPORT_CLOCK_PRECISION=microsecond
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/conversations:/app/conversations
|
||||
secrets:
|
||||
- postgres_password
|
||||
deploy:
|
||||
replicas: 2
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 2G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
# Content Store Manager
|
||||
content-store:
|
||||
image: registry.home.deepblack.cloud/bzzz-content-store:v2.0.0
|
||||
networks:
|
||||
- bzzz-internal
|
||||
volumes:
|
||||
- /rust/bzzz-v2/data/blobs:/app/blobs
|
||||
environment:
|
||||
- BLAKE3_SHARD_DEPTH=2
|
||||
- REPLICATION_FACTOR=3
|
||||
- GARBAGE_COLLECTION_INTERVAL=24h
|
||||
deploy:
|
||||
replicas: 3
|
||||
placement:
|
||||
max_replicas_per_node: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G
|
||||
cpus: '2.0'
|
||||
reservations:
|
||||
memory: 4G
|
||||
cpus: '1.0'
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
|
||||
networks:
|
||||
tengig:
|
||||
external: true
|
||||
bzzz-internal:
|
||||
driver: overlay
|
||||
internal: true
|
||||
attachable: false
|
||||
ipam:
|
||||
driver: default
|
||||
config:
|
||||
- subnet: 10.200.0.0/16
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/data/postgres"
|
||||
|
||||
redis_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: nfs
|
||||
o: addr=192.168.1.27,rw,sync
|
||||
device: ":/rust/bzzz-v2/data/redis"
|
||||
|
||||
secrets:
|
||||
openai_api_key:
|
||||
external: true
|
||||
name: bzzz_openai_api_key
|
||||
postgres_password:
|
||||
external: true
|
||||
name: bzzz_postgres_password
|
||||
|
||||
configs:
|
||||
bzzz_config:
|
||||
external: true
|
||||
name: bzzz_v2_config
|
||||
mcp_config:
|
||||
external: true
|
||||
name: bzzz_mcp_config
|
||||
proxy_config:
|
||||
external: true
|
||||
name: bzzz_proxy_config
|
||||
redis_config:
|
||||
external: true
|
||||
name: bzzz_redis_config
|
||||
581
infrastructure/docs/DEPLOYMENT_RUNBOOK.md
Normal file
581
infrastructure/docs/DEPLOYMENT_RUNBOOK.md
Normal file
@@ -0,0 +1,581 @@
|
||||
# BZZZ v2 Deployment Runbook
|
||||
|
||||
## Overview
|
||||
|
||||
This runbook provides step-by-step procedures for deploying, operating, and maintaining BZZZ v2 infrastructure. It covers normal operations, emergency procedures, and troubleshooting guidelines.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### System Requirements
|
||||
|
||||
- **Cluster**: 3 nodes (WALNUT, IRONWOOD, ACACIA)
|
||||
- **OS**: Ubuntu 22.04 LTS or newer
|
||||
- **Docker**: Version 24+ with Swarm mode enabled
|
||||
- **Storage**: NFS mount at `/rust/` with 500GB+ available
|
||||
- **Network**: Internal 192.168.1.0/24 with external internet access
|
||||
- **Secrets**: OpenAI API key and database credentials
|
||||
|
||||
### Access Requirements
|
||||
|
||||
- SSH access to all cluster nodes
|
||||
- Docker Swarm manager privileges
|
||||
- Sudo access for system configuration
|
||||
- GitLab access for CI/CD pipeline management
|
||||
|
||||
## Pre-Deployment Checklist
|
||||
|
||||
### Infrastructure Verification
|
||||
|
||||
```bash
|
||||
# Verify Docker Swarm status
|
||||
docker node ls
|
||||
docker network ls | grep tengig
|
||||
|
||||
# Check available storage
|
||||
df -h /rust/
|
||||
|
||||
# Verify network connectivity
|
||||
ping -c 3 192.168.1.27 # WALNUT
|
||||
ping -c 3 192.168.1.113 # IRONWOOD
|
||||
ping -c 3 192.168.1.xxx # ACACIA
|
||||
|
||||
# Test registry access
|
||||
docker pull registry.home.deepblack.cloud/hello-world || echo "Registry access test"
|
||||
```
|
||||
|
||||
### Security Hardening
|
||||
|
||||
```bash
|
||||
# Run security hardening script
|
||||
cd /home/tony/chorus/project-queues/active/BZZZ/infrastructure/security
|
||||
sudo ./security-hardening.sh
|
||||
|
||||
# Verify firewall status
|
||||
sudo ufw status verbose
|
||||
|
||||
# Check fail2ban status
|
||||
sudo fail2ban-client status
|
||||
```
|
||||
|
||||
## Deployment Procedures
|
||||
|
||||
### 1. Initial Deployment (Fresh Install)
|
||||
|
||||
#### Step 1: Prepare Infrastructure
|
||||
|
||||
```bash
|
||||
# Create directory structure
|
||||
mkdir -p /rust/bzzz-v2/{config,data,logs,backup}
|
||||
mkdir -p /rust/bzzz-v2/data/{blobs,conversations,dht,postgres,redis}
|
||||
mkdir -p /rust/bzzz-v2/config/{swarm,monitoring,security}
|
||||
|
||||
# Set permissions
|
||||
sudo chown -R tony:tony /rust/bzzz-v2
|
||||
chmod -R 755 /rust/bzzz-v2
|
||||
```
|
||||
|
||||
#### Step 2: Configure Secrets and Configs
|
||||
|
||||
```bash
|
||||
cd /home/tony/chorus/project-queues/active/BZZZ/infrastructure
|
||||
|
||||
# Create Docker secrets
|
||||
docker secret create bzzz_postgres_password config/secrets/postgres_password
|
||||
docker secret create bzzz_openai_api_key ~/chorus/business/secrets/openai-api-key
|
||||
docker secret create bzzz_grafana_admin_password config/secrets/grafana_admin_password
|
||||
|
||||
# Create Docker configs
|
||||
docker config create bzzz_v2_config config/bzzz-config.yaml
|
||||
docker config create bzzz_prometheus_config monitoring/configs/prometheus.yml
|
||||
docker config create bzzz_alertmanager_config monitoring/configs/alertmanager.yml
|
||||
```
|
||||
|
||||
#### Step 3: Deploy Core Services
|
||||
|
||||
```bash
|
||||
# Deploy main BZZZ v2 stack
|
||||
docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
|
||||
# Wait for services to start (this may take 5-10 minutes)
|
||||
watch docker stack ps bzzz-v2
|
||||
```
|
||||
|
||||
#### Step 4: Deploy Monitoring Stack
|
||||
|
||||
```bash
|
||||
# Deploy monitoring services
|
||||
docker stack deploy -c monitoring/docker-compose.monitoring.yml bzzz-monitoring
|
||||
|
||||
# Verify monitoring services
|
||||
curl -f http://localhost:9090/-/healthy # Prometheus
|
||||
curl -f http://localhost:3000/api/health # Grafana
|
||||
```
|
||||
|
||||
#### Step 5: Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check all services are running
|
||||
docker service ls --filter label=com.docker.stack.namespace=bzzz-v2
|
||||
|
||||
# Test external endpoints
|
||||
curl -f https://bzzz.deepblack.cloud/health
|
||||
curl -f https://mcp.deepblack.cloud/health
|
||||
curl -f https://resolve.deepblack.cloud/health
|
||||
|
||||
# Check P2P mesh connectivity
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_bzzz-agent | head -1) \
|
||||
curl -s http://localhost:9000/api/v2/peers | jq '.connected_peers | length'
|
||||
```
|
||||
|
||||
### 2. Update Deployment (Rolling Update)
|
||||
|
||||
#### Step 1: Pre-Update Checks
|
||||
|
||||
```bash
|
||||
# Check current deployment health
|
||||
docker stack ps bzzz-v2 | grep -v "Shutdown\|Failed"
|
||||
|
||||
# Backup current configuration
|
||||
mkdir -p /rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)
|
||||
docker config ls | grep bzzz_ > /rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)/configs.txt
|
||||
docker secret ls | grep bzzz_ > /rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)/secrets.txt
|
||||
```
|
||||
|
||||
#### Step 2: Update Images
|
||||
|
||||
```bash
|
||||
# Update to new image version
|
||||
export NEW_IMAGE_TAG="v2.1.0"
|
||||
|
||||
# Update Docker Compose file with new image tags
|
||||
sed -i "s/registry.home.deepblack.cloud\/bzzz:.*$/registry.home.deepblack.cloud\/bzzz:${NEW_IMAGE_TAG}/g" \
|
||||
docker-compose.swarm.yml
|
||||
|
||||
# Deploy updated stack (rolling update)
|
||||
docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
```
|
||||
|
||||
#### Step 3: Monitor Update Progress
|
||||
|
||||
```bash
|
||||
# Watch rolling update progress
|
||||
watch "docker service ps bzzz-v2_bzzz-agent | head -20"
|
||||
|
||||
# Check for any failed updates
|
||||
docker service ps bzzz-v2_bzzz-agent --filter desired-state=running --filter current-state=failed
|
||||
```
|
||||
|
||||
### 3. Migration from v1 to v2
|
||||
|
||||
```bash
|
||||
# Use the automated migration script
|
||||
cd /home/tony/chorus/project-queues/active/BZZZ/infrastructure/migration-scripts
|
||||
|
||||
# Dry run first to preview changes
|
||||
./migrate-v1-to-v2.sh --dry-run
|
||||
|
||||
# Execute full migration
|
||||
./migrate-v1-to-v2.sh
|
||||
|
||||
# If rollback is needed
|
||||
./migrate-v1-to-v2.sh --rollback
|
||||
```
|
||||
|
||||
## Monitoring and Health Checks
|
||||
|
||||
### Health Check Commands
|
||||
|
||||
```bash
|
||||
# Service health checks
|
||||
docker service ls --filter label=com.docker.stack.namespace=bzzz-v2
|
||||
docker service ps bzzz-v2_bzzz-agent --filter desired-state=running
|
||||
|
||||
# Application health checks
|
||||
curl -f https://bzzz.deepblack.cloud/health
|
||||
curl -f https://mcp.deepblack.cloud/health
|
||||
curl -f https://resolve.deepblack.cloud/health
|
||||
curl -f https://openai.deepblack.cloud/health
|
||||
|
||||
# P2P network health
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_bzzz-agent | head -1) \
|
||||
curl -s http://localhost:9000/api/v2/dht/stats | jq '.'
|
||||
|
||||
# Database connectivity
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_postgres) \
|
||||
pg_isready -U bzzz -d bzzz_v2
|
||||
|
||||
# Cache connectivity
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_redis) \
|
||||
redis-cli ping
|
||||
```
|
||||
|
||||
### Performance Monitoring
|
||||
|
||||
```bash
|
||||
# Check resource usage
|
||||
docker stats --no-stream
|
||||
|
||||
# Monitor disk usage
|
||||
df -h /rust/bzzz-v2/data/
|
||||
|
||||
# Check network connections
|
||||
netstat -tuln | grep -E ":(9000|3001|3002|3003|9101|9102|9103)"
|
||||
|
||||
# Monitor OpenAI API usage
|
||||
curl -s http://localhost:9203/metrics | grep openai_cost
|
||||
```
|
||||
|
||||
## Troubleshooting Guide
|
||||
|
||||
### Common Issues and Solutions
|
||||
|
||||
#### 1. Service Won't Start
|
||||
|
||||
**Symptoms:** Service stuck in `preparing` or constantly restarting
|
||||
|
||||
**Diagnosis:**
|
||||
```bash
|
||||
# Check service logs
|
||||
docker service logs bzzz-v2_bzzz-agent --tail 50
|
||||
|
||||
# Check node resources
|
||||
docker node ls
|
||||
docker system df
|
||||
|
||||
# Verify secrets and configs
|
||||
docker secret ls | grep bzzz_
|
||||
docker config ls | grep bzzz_
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
- Check resource constraints and availability
|
||||
- Verify secrets and configs are accessible
|
||||
- Ensure image is available and correct
|
||||
- Check node labels and placement constraints
|
||||
|
||||
#### 2. P2P Network Issues
|
||||
|
||||
**Symptoms:** Agents not discovering each other, DHT lookups failing
|
||||
|
||||
**Diagnosis:**
|
||||
```bash
|
||||
# Check peer connections
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_bzzz-agent | head -1) \
|
||||
curl -s http://localhost:9000/api/v2/peers
|
||||
|
||||
# Check DHT bootstrap nodes
|
||||
curl http://localhost:9101/health
|
||||
curl http://localhost:9102/health
|
||||
curl http://localhost:9103/health
|
||||
|
||||
# Check network connectivity
|
||||
docker network inspect bzzz-internal
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
- Restart DHT bootstrap services
|
||||
- Check firewall rules for P2P ports
|
||||
- Verify Docker Swarm overlay network
|
||||
- Check for port conflicts
|
||||
|
||||
#### 3. High OpenAI Costs
|
||||
|
||||
**Symptoms:** Cost alerts triggering, rate limits being hit
|
||||
|
||||
**Diagnosis:**
|
||||
```bash
|
||||
# Check current usage
|
||||
curl -s http://localhost:9203/metrics | grep -E "openai_(cost|requests|tokens)"
|
||||
|
||||
# Check rate limiting
|
||||
docker service logs bzzz-v2_openai-proxy --tail 100 | grep "rate limit"
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
- Adjust rate limiting parameters
|
||||
- Review conversation patterns for excessive API calls
|
||||
- Implement request caching
|
||||
- Consider model selection optimization
|
||||
|
||||
#### 4. Database Connection Issues
|
||||
|
||||
**Symptoms:** Service errors related to database connectivity
|
||||
|
||||
**Diagnosis:**
|
||||
```bash
|
||||
# Check PostgreSQL status
|
||||
docker service logs bzzz-v2_postgres --tail 50
|
||||
|
||||
# Test connection from agent
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_bzzz-agent | head -1) \
|
||||
pg_isready -h postgres -U bzzz
|
||||
|
||||
# Check connection limits
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_postgres) \
|
||||
psql -U bzzz -d bzzz_v2 -c "SELECT count(*) FROM pg_stat_activity;"
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
- Restart PostgreSQL service
|
||||
- Check connection pool settings
|
||||
- Increase max_connections if needed
|
||||
- Review long-running queries
|
||||
|
||||
#### 5. Storage Issues
|
||||
|
||||
**Symptoms:** Disk full alerts, content store errors
|
||||
|
||||
**Diagnosis:**
|
||||
```bash
|
||||
# Check disk usage
|
||||
df -h /rust/bzzz-v2/data/
|
||||
du -sh /rust/bzzz-v2/data/blobs/
|
||||
|
||||
# Check content store health
|
||||
curl -s http://localhost:9202/metrics | grep content_store
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
- Run garbage collection on old blobs
|
||||
- Clean up old conversation threads
|
||||
- Increase storage capacity
|
||||
- Adjust retention policies
|
||||
|
||||
## Emergency Procedures
|
||||
|
||||
### Service Outage Response
|
||||
|
||||
#### Priority 1: Complete Service Outage
|
||||
|
||||
```bash
|
||||
# 1. Check cluster status
|
||||
docker node ls
|
||||
docker service ls --filter label=com.docker.stack.namespace=bzzz-v2
|
||||
|
||||
# 2. Emergency restart of critical services
|
||||
docker service update --force bzzz-v2_bzzz-agent
|
||||
docker service update --force bzzz-v2_postgres
|
||||
docker service update --force bzzz-v2_redis
|
||||
|
||||
# 3. If stack is corrupted, redeploy
|
||||
docker stack rm bzzz-v2
|
||||
sleep 60
|
||||
docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
|
||||
# 4. Monitor recovery
|
||||
watch docker stack ps bzzz-v2
|
||||
```
|
||||
|
||||
#### Priority 2: Partial Service Degradation
|
||||
|
||||
```bash
|
||||
# 1. Identify problematic services
|
||||
docker service ps bzzz-v2_bzzz-agent --filter desired-state=running --filter current-state=failed
|
||||
|
||||
# 2. Scale up healthy replicas
|
||||
docker service update --replicas 3 bzzz-v2_bzzz-agent
|
||||
|
||||
# 3. Remove unhealthy tasks
|
||||
docker service update --force bzzz-v2_bzzz-agent
|
||||
```
|
||||
|
||||
### Security Incident Response
|
||||
|
||||
#### Step 1: Immediate Containment
|
||||
|
||||
```bash
|
||||
# 1. Block suspicious IPs
|
||||
sudo ufw insert 1 deny from SUSPICIOUS_IP
|
||||
|
||||
# 2. Check for compromise indicators
|
||||
sudo fail2ban-client status
|
||||
sudo tail -100 /var/log/audit/audit.log | grep -i "denied\|failed\|error"
|
||||
|
||||
# 3. Isolate affected services
|
||||
docker service update --replicas 0 AFFECTED_SERVICE
|
||||
```
|
||||
|
||||
#### Step 2: Investigation
|
||||
|
||||
```bash
|
||||
# 1. Check access logs
|
||||
docker service logs bzzz-v2_bzzz-agent --since 1h | grep -i "error\|failed\|unauthorized"
|
||||
|
||||
# 2. Review monitoring alerts
|
||||
curl -s http://localhost:9093/api/v1/alerts | jq '.data[] | select(.state=="firing")'
|
||||
|
||||
# 3. Examine network connections
|
||||
netstat -tuln
|
||||
ss -tulpn | grep -E ":(9000|3001|3002|3003)"
|
||||
```
|
||||
|
||||
#### Step 3: Recovery
|
||||
|
||||
```bash
|
||||
# 1. Update security rules
|
||||
./infrastructure/security/security-hardening.sh
|
||||
|
||||
# 2. Rotate secrets if compromised
|
||||
docker secret rm bzzz_postgres_password
|
||||
openssl rand -base64 32 | docker secret create bzzz_postgres_password -
|
||||
|
||||
# 3. Restart services with new secrets
|
||||
docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
```
|
||||
|
||||
### Data Recovery Procedures
|
||||
|
||||
#### Backup Restoration
|
||||
|
||||
```bash
|
||||
# 1. Stop services
|
||||
docker stack rm bzzz-v2
|
||||
|
||||
# 2. Restore from backup
|
||||
BACKUP_DATE="20241201-120000"
|
||||
rsync -av /rust/bzzz-v2/backup/$BACKUP_DATE/ /rust/bzzz-v2/data/
|
||||
|
||||
# 3. Restart services
|
||||
docker stack deploy -c docker-compose.swarm.yml bzzz-v2
|
||||
```
|
||||
|
||||
#### Database Recovery
|
||||
|
||||
```bash
|
||||
# 1. Stop application services
|
||||
docker service scale bzzz-v2_bzzz-agent=0
|
||||
|
||||
# 2. Create database backup
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_postgres) \
|
||||
pg_dump -U bzzz bzzz_v2 > /rust/bzzz-v2/backup/database-$(date +%Y%m%d-%H%M%S).sql
|
||||
|
||||
# 3. Restore database (use the timestamped dump created in step 2)
docker exec -i $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_postgres) \
  psql -U bzzz -d bzzz_v2 < /rust/bzzz-v2/backup/database-YYYYMMDD-HHMMSS.sql
|
||||
|
||||
# 4. Restart application services
|
||||
docker service scale bzzz-v2_bzzz-agent=3
|
||||
```
|
||||
|
||||
## Maintenance Procedures
|
||||
|
||||
### Routine Maintenance (Weekly)
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Weekly maintenance script
|
||||
|
||||
# 1. Check service health
|
||||
docker service ls --filter label=com.docker.stack.namespace=bzzz-v2
|
||||
docker system df
|
||||
|
||||
# 2. Clean up unused resources
|
||||
docker system prune -f
|
||||
docker volume prune -f
|
||||
|
||||
# 3. Backup critical data
|
||||
pg_dump -h localhost -U bzzz bzzz_v2 | gzip > \
|
||||
/rust/bzzz-v2/backup/weekly-db-$(date +%Y%m%d).sql.gz
|
||||
|
||||
# 4. Rotate logs
|
||||
find /rust/bzzz-v2/logs -name "*.log" -mtime +7 -delete
|
||||
|
||||
# 5. Check certificate expiration
|
||||
openssl x509 -in /rust/bzzz-v2/config/tls/server/walnut.pem -noout -dates
|
||||
|
||||
# 6. Update security rules
|
||||
fail2ban-client reload
|
||||
|
||||
# 7. Generate maintenance report
|
||||
echo "Maintenance completed on $(date)" >> /rust/bzzz-v2/logs/maintenance.log
|
||||
```
|
||||
|
||||
### Scaling Procedures
|
||||
|
||||
#### Scale Up
|
||||
|
||||
```bash
|
||||
# Increase replica count
|
||||
docker service scale bzzz-v2_bzzz-agent=5
|
||||
docker service scale bzzz-v2_mcp-server=5
|
||||
|
||||
# Add new node to cluster (run on new node)
|
||||
docker swarm join --token $WORKER_TOKEN $MANAGER_IP:2377
|
||||
|
||||
# Label new node
|
||||
docker node update --label-add bzzz.role=agent NEW_NODE_HOSTNAME
|
||||
```
|
||||
|
||||
#### Scale Down
|
||||
|
||||
```bash
|
||||
# Gracefully reduce replicas
|
||||
docker service scale bzzz-v2_bzzz-agent=2
|
||||
docker service scale bzzz-v2_mcp-server=2
|
||||
|
||||
# Remove node from cluster
|
||||
docker node update --availability drain NODE_HOSTNAME
|
||||
docker node rm NODE_HOSTNAME
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Database Optimization
|
||||
|
||||
```bash
|
||||
# PostgreSQL tuning
|
||||
docker exec $(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_postgres) \
|
||||
psql -U bzzz -d bzzz_v2 -c "
|
||||
ALTER SYSTEM SET shared_buffers = '1GB';
|
||||
ALTER SYSTEM SET max_connections = 200;
|
||||
ALTER SYSTEM SET checkpoint_timeout = '15min';
|
||||
SELECT pg_reload_conf();
|
||||
"
|
||||
```
|
||||
|
||||
### Storage Optimization
|
||||
|
||||
```bash
|
||||
# Content store optimization
|
||||
find /rust/bzzz-v2/data/blobs -name "*.tmp" -mtime +1 -delete
|
||||
find /rust/bzzz-v2/data/blobs -type f -size 0 -delete
|
||||
|
||||
# Compress old logs
|
||||
find /rust/bzzz-v2/logs -name "*.log" -mtime +3 -exec gzip {} \;
|
||||
```
|
||||
|
||||
### Network Optimization
|
||||
|
||||
```bash
|
||||
# Optimize network buffer sizes
|
||||
echo 'net.core.rmem_max = 134217728' | sudo tee -a /etc/sysctl.conf
|
||||
echo 'net.core.wmem_max = 134217728' | sudo tee -a /etc/sysctl.conf
|
||||
echo 'net.ipv4.tcp_rmem = 4096 87380 134217728' | sudo tee -a /etc/sysctl.conf
|
||||
echo 'net.ipv4.tcp_wmem = 4096 65536 134217728' | sudo tee -a /etc/sysctl.conf
|
||||
sudo sysctl -p
|
||||
```
|
||||
|
||||
## Contact Information
|
||||
|
||||
### On-Call Procedures
|
||||
|
||||
- **Primary Contact**: DevOps Team Lead
|
||||
- **Secondary Contact**: Senior Site Reliability Engineer
|
||||
- **Escalation**: Platform Engineering Manager
|
||||
|
||||
### Communication Channels
|
||||
|
||||
- **Slack**: #bzzz-incidents
|
||||
- **Email**: devops@deepblack.cloud
|
||||
- **Phone**: Emergency On-Call Rotation
|
||||
|
||||
### Documentation
|
||||
|
||||
- **Runbooks**: This document
|
||||
- **Architecture**: `/docs/BZZZ_V2_INFRASTRUCTURE_ARCHITECTURE.md`
|
||||
- **API Documentation**: https://bzzz.deepblack.cloud/docs
|
||||
- **Monitoring Dashboards**: https://grafana.deepblack.cloud
|
||||
|
||||
---
|
||||
|
||||
*This runbook should be reviewed and updated monthly. Record the date of the last revision here whenever the runbook changes.*
|
||||
514
infrastructure/migration-scripts/migrate-v1-to-v2.sh
Executable file
514
infrastructure/migration-scripts/migrate-v1-to-v2.sh
Executable file
@@ -0,0 +1,514 @@
|
||||
#!/bin/bash
set -euo pipefail

# BZZZ v1 to v2 Migration Script
# This script handles the complete migration from BZZZ v1 (SystemD) to v2 (Docker Swarm)

# Directory this script lives in; used to locate sibling helper scripts
# and the infrastructure/ compose files.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Every run gets its own timestamped log file and backup directory.
LOG_FILE="/var/log/bzzz-migration-$(date +%Y%m%d-%H%M%S).log"
# The script refuses to run as root (see check_prerequisites), and an
# unprivileged user typically cannot create files under /var/log. Fall back
# to /tmp when the log file is not writable -- otherwise every log() call
# would fail its `tee -a`, which aborts the whole script under
# `set -e` + `set -o pipefail`.
if ! touch "$LOG_FILE" 2>/dev/null; then
    LOG_FILE="/tmp/bzzz-migration-$(date +%Y%m%d-%H%M%S).log"
fi
BACKUP_DIR="/rust/bzzz-v2/backup/$(date +%Y%m%d-%H%M%S)"
# Set DRY_RUN=true in the environment (or pass --dry-run) to preview
# every step without changing anything.
DRY_RUN=${DRY_RUN:-false}

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
|
||||
|
||||
# --- Logging helpers -------------------------------------------------------
# Each helper echoes a color-coded message to the console and appends the
# same line to $LOG_FILE via `tee -a`.

# Timestamped informational message.
log() {
    echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" | tee -a "$LOG_FILE"
}

# Fatal error: log the message and abort the script immediately.
error() {
    echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
    exit 1
}

# Non-fatal warning; execution continues.
warn() {
    echo -e "${YELLOW}[WARN]${NC} $1" | tee -a "$LOG_FILE"
}

# Milestone completed successfully.
success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
check_prerequisites() {
    # Verify the environment is ready for migration before touching anything.
    # Checks: non-root invocation, required CLI tools, an active Docker
    # Swarm, and at least 10GB free space under /rust. Any failure calls
    # error(), which exits the script.
    log "Checking prerequisites..."

    # Running the whole script as root would create root-owned files under
    # /rust/bzzz-v2; sudo is used only for the specific steps that need it.
    if [[ $EUID -eq 0 ]]; then
        error "This script should not be run as root. Run as tony user with sudo access."
    fi

    # All external tools the migration depends on.
    local commands=("docker" "systemctl" "pg_dump" "rsync" "curl")
    local cmd
    for cmd in "${commands[@]}"; do
        if ! command -v "$cmd" &> /dev/null; then
            error "Required command '$cmd' not found"
        fi
    done

    # v2 is deployed as a Docker stack, which requires an active swarm.
    if ! docker info | grep -q "Swarm: active"; then
        error "Docker Swarm is not active. Please initialize swarm first."
    fi

    # Declare and assign separately so a failing command substitution is not
    # masked by `local` (shellcheck SC2155) under `set -e`.
    local available
    available=$(df /rust | awk 'NR==2 {print $4}')
    local required=10485760 # 10GB in KB
    if [[ $available -lt $required ]]; then
        error "Insufficient disk space. Need at least 10GB available in /rust"
    fi

    success "Prerequisites check passed"
}
|
||||
|
||||
backup_v1_data() {
    # Snapshot everything v1-related into $BACKUP_DIR before anything is
    # modified or removed, so rollback_to_v1() has material to restore.
    log "Creating backup of v1 data..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would create backup at: $BACKUP_DIR"
        return 0
    fi

    mkdir -p "$BACKUP_DIR"

    # v1 source tree.
    local v1_src="/home/tony/chorus/project-queues/active/BZZZ"
    if [[ -d "$v1_src" ]]; then
        rsync -av "$v1_src/" "$BACKUP_DIR/v1-source/"
    fi

    # SystemD unit file (may not exist on this node).
    sudo cp /etc/systemd/system/bzzz.service "$BACKUP_DIR/" 2>/dev/null || true

    # Hypercore logs and other per-user configuration.
    local v1_cfg="/home/tony/.config/bzzz"
    if [[ -d "$v1_cfg" ]]; then
        rsync -av "$v1_cfg/" "$BACKUP_DIR/config/"
    fi

    # Per-node data directories, if present.
    local node
    for node in walnut ironwood acacia; do
        if [[ -d "/rust/bzzz/$node" ]]; then
            rsync -av "/rust/bzzz/$node/" "$BACKUP_DIR/data/$node/"
        fi
    done

    success "Backup completed at: $BACKUP_DIR"
}
|
||||
|
||||
stop_v1_services() {
    # Stop and disable the v1 SystemD units -- both the templated
    # "bzzz@<node>" form and the plain "bzzz" unit, since either may be in
    # use -- so the v2 stack can take over the P2P ports.
    log "Stopping BZZZ v1 services..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would stop v1 systemd services"
        return 0
    fi

    local host
    for host in walnut ironwood acacia; do
        if sudo systemctl is-active --quiet "bzzz@$host" 2>/dev/null || sudo systemctl is-active --quiet bzzz 2>/dev/null; then
            log "Stopping BZZZ service on $host..."
            sudo systemctl stop "bzzz@$host" 2>/dev/null || sudo systemctl stop bzzz 2>/dev/null || true
            sudo systemctl disable "bzzz@$host" 2>/dev/null || sudo systemctl disable bzzz 2>/dev/null || true
        fi
    done

    # Give in-flight work a moment to drain before continuing.
    sleep 10

    success "v1 services stopped"
}
|
||||
|
||||
setup_v2_infrastructure() {
    # Create the /rust/bzzz-v2 directory tree (config, data, logs and their
    # subdirectories), set ownership/permissions, and write a placeholder
    # agent configuration file that operators fill in afterwards.
    log "Setting up v2 infrastructure..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would create v2 directory structure"
        return 0
    fi

    # Create directory structure
    mkdir -p /rust/bzzz-v2/{config,data,logs}
    mkdir -p /rust/bzzz-v2/data/{blobs,conversations,dht,postgres,redis}
    mkdir -p /rust/bzzz-v2/data/blobs/{data,index,temp}
    mkdir -p /rust/bzzz-v2/data/dht/{walnut,ironwood,acacia}
    mkdir -p /rust/bzzz-v2/config/{swarm,systemd,secrets}
    mkdir -p /rust/bzzz-v2/logs/{application,p2p,monitoring}

    # Set permissions
    sudo chown -R tony:tony /rust/bzzz-v2
    chmod -R 755 /rust/bzzz-v2

    # Create placeholder configuration files.
    # Note: agent.id and hive_api.api_key are intentionally empty here and
    # must be populated before the agent is useful.
    cat > /rust/bzzz-v2/config/bzzz-config.yaml << 'EOF'
agent:
  id: ""
  specialization: "advanced_reasoning"
  capabilities: ["code_generation", "debugging", "analysis"]
  models: ["llama3.2:70b", "qwen2.5:72b"]
  max_tasks: 3

hive_api:
  base_url: "http://hive.deepblack.cloud"
  api_key: ""

dht:
  bootstrap_nodes:
    - "walnut:9101"
    - "ironwood:9102"
    - "acacia:9103"

content_store:
  path: "/app/data/blobs"
  replication_factor: 3
  shard_depth: 2

openai:
  rate_limit_rpm: 1000
  rate_limit_tpm: 100000
  cost_tracking: true
EOF

    success "v2 infrastructure setup completed"
}
|
||||
|
||||
migrate_conversation_data() {
    # Convert any v1 hypercore logs into the v2 content-addressed blob
    # store using the companion convert-hypercore-to-cas.py script.
    # Missing logs or a missing converter produce warnings, not failures.
    log "Migrating conversation data..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would migrate hypercore logs to content-addressed storage"
        return 0
    fi

    # Collect whichever per-node hypercore logs actually exist.
    local candidate
    local log_files=()
    for candidate in /home/tony/.config/bzzz/hypercore-{walnut,ironwood,acacia}.log; do
        if [[ -f "$candidate" ]]; then
            log_files+=("$candidate")
        fi
    done

    if [[ ${#log_files[@]} -eq 0 ]]; then
        warn "No hypercore logs found for migration"
        return 0
    fi

    # Hand the logs to the converter, writing blobs into the v2 store.
    local converter="$SCRIPT_DIR/convert-hypercore-to-cas.py"
    if [[ ! -f "$converter" ]]; then
        warn "Migration script not found, skipping conversation data migration"
        return 0
    fi

    python3 "$converter" "${log_files[@]}" --output-dir "/rust/bzzz-v2/data/blobs/data"
    success "Conversation data migrated to content-addressed storage"
}
|
||||
|
||||
setup_docker_secrets() {
    # Register runtime credentials as Docker secrets. Already-existing
    # secrets are left untouched (create errors are suppressed).
    log "Setting up Docker secrets..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would create Docker secrets"
        return 0
    fi

    local secrets_root="/home/tony/chorus/business/secrets"

    # PostgreSQL password: prefer the managed secret file; otherwise
    # generate a random one.
    if [[ -f "$secrets_root/postgres-bzzz-password" ]]; then
        docker secret create bzzz_postgres_password "$secrets_root/postgres-bzzz-password" 2>/dev/null || true
    else
        openssl rand -base64 32 | docker secret create bzzz_postgres_password - 2>/dev/null || true
    fi

    # The OpenAI API key has no sensible generated fallback -- warn instead.
    if [[ -f "$secrets_root/openai-api-key" ]]; then
        docker secret create bzzz_openai_api_key "$secrets_root/openai-api-key" 2>/dev/null || true
    else
        warn "OpenAI API key not found in secrets directory"
    fi

    success "Docker secrets configured"
}
|
||||
|
||||
setup_docker_configs() {
    # Register application configuration files as Docker configs so stack
    # services can mount them. Existing configs are left untouched
    # (`docker config create` errors are suppressed). Temp files written to
    # /tmp are removed after registration.
    log "Setting up Docker configs..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would create Docker configs"
        return 0
    fi

    # Create main BZZZ config (written earlier by setup_v2_infrastructure)
    docker config create bzzz_v2_config /rust/bzzz-v2/config/bzzz-config.yaml 2>/dev/null || true

    # Create MCP server config
    cat > /tmp/mcp-config.yaml << 'EOF'
server:
  port: 3001
  max_connections: 1000
  timeout_seconds: 30

tools:
  enabled: true
  max_execution_time: 300

logging:
  level: info
  format: json
EOF
    docker config create bzzz_mcp_config /tmp/mcp-config.yaml 2>/dev/null || true
    rm /tmp/mcp-config.yaml

    # Create proxy config
    cat > /tmp/proxy-config.yaml << 'EOF'
openai:
  rate_limit:
    requests_per_minute: 1000
    tokens_per_minute: 100000
  cost_tracking:
    enabled: true
    log_requests: true
  models:
    - "gpt-4"
    - "gpt-4-turbo"
    - "gpt-3.5-turbo"

server:
  port: 3002
  timeout: 30s
EOF
    docker config create bzzz_proxy_config /tmp/proxy-config.yaml 2>/dev/null || true
    rm /tmp/proxy-config.yaml

    # Create Redis config
    # NOTE(review): "keepalive" is not a standard redis.conf directive
    # (Redis uses "tcp-keepalive"); confirm against the target Redis version.
    cat > /tmp/redis.conf << 'EOF'
bind 0.0.0.0
port 6379
timeout 0
keepalive 300
maxclients 10000
maxmemory 1gb
maxmemory-policy allkeys-lru
save 900 1
save 300 10
save 60 10000
EOF
    docker config create bzzz_redis_config /tmp/redis.conf 2>/dev/null || true
    rm /tmp/redis.conf

    success "Docker configs created"
}
|
||||
|
||||
deploy_v2_stack() {
    # Deploy the v2 stack to Docker Swarm and block until every service
    # reports all replicas running (up to 5 minutes); time out via error().
    log "Deploying BZZZ v2 Docker stack..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would deploy Docker stack with: docker stack deploy -c docker-compose.swarm.yml bzzz-v2"
        return 0
    fi

    cd "$SCRIPT_DIR/.."

    # Validate the compose file before deploying anything.
    if ! docker-compose -f infrastructure/docker-compose.swarm.yml config > /dev/null; then
        error "Docker compose file validation failed"
    fi

    # Deploy the stack (acts as a rolling update if already deployed).
    docker stack deploy -c infrastructure/docker-compose.swarm.yml bzzz-v2

    log "Waiting for services to become ready..."
    local max_wait=300 # 5 minutes
    local wait_time=0
    local ready_services total_services

    while [[ $wait_time -lt $max_wait ]]; do
        # A service is ready only when its replica count reads "N/N".
        # (The previous check excluded only "0/..." rows, so a partially
        # scaled service like "2/3" counted as ready, and the "table"
        # format's header row polluted both counts.) The field split on
        # '/' and ' ' also tolerates suffixes such as "3/3 (max 1 per node)".
        ready_services=$(docker service ls \
            --filter label=com.docker.stack.namespace=bzzz-v2 \
            --format '{{.Replicas}}' | awk -F'[/ ]' '$1 == $2' | wc -l)
        total_services=$(docker service ls \
            --filter label=com.docker.stack.namespace=bzzz-v2 \
            --format '{{.Name}}' | wc -l)

        if [[ $total_services -gt 0 && $ready_services -eq $total_services ]]; then
            success "All services are ready"
            break
        fi

        log "Waiting for services... ($ready_services/$total_services ready)"
        sleep 10
        wait_time=$((wait_time + 10))
    done

    if [[ $wait_time -ge $max_wait ]]; then
        error "Timeout waiting for services to become ready"
    fi
}
|
||||
|
||||
verify_v2_deployment() {
    # Post-deployment smoke tests: confirm the core services have running
    # tasks (fatal if not), then probe the DHT, MCP, and content-resolver
    # health endpoints (warnings only).
    log "Verifying v2 deployment..."

    # Check service health
    local services=("bzzz-v2_bzzz-agent" "bzzz-v2_postgres" "bzzz-v2_redis" "bzzz-v2_mcp-server")
    for service in "${services[@]}"; do
        # NOTE(review): this matches the literal string "Running" anywhere in
        # the `docker service ps` output; historical failed tasks plus a
        # single running task still pass. Consider filtering with
        # --filter desired-state=running for a stricter check.
        if ! docker service ps "$service" | grep -q "Running"; then
            error "Service $service is not running properly"
        fi
    done

    # Test DHT connectivity (from inside the walnut bootstrap container)
    log "Testing DHT connectivity..."
    if ! timeout 30 docker exec "$(docker ps -q -f label=com.docker.swarm.service.name=bzzz-v2_dht-bootstrap-walnut)" \
        curl -f http://localhost:9101/health > /dev/null 2>&1; then
        warn "DHT bootstrap node (walnut) health check failed"
    fi

    # Test MCP server
    log "Testing MCP server..."
    if ! timeout 10 curl -f http://localhost:3001/health > /dev/null 2>&1; then
        warn "MCP server health check failed"
    fi

    # Test content resolver
    log "Testing content resolver..."
    if ! timeout 10 curl -f http://localhost:3003/health > /dev/null 2>&1; then
        warn "Content resolver health check failed"
    fi

    success "v2 deployment verification completed"
}
|
||||
|
||||
update_node_labels() {
    # Label the three cluster nodes so stack placement constraints
    # (bzzz.role == agent) can schedule services onto them. Errors are
    # ignored so a missing node does not abort the migration.
    log "Updating Docker node labels for service placement..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would update node labels"
        return 0
    fi

    # Set node labels for service placement
    local node
    for node in walnut ironwood acacia; do
        docker node update --label-add bzzz.role=agent "$node" 2>/dev/null || true
    done

    success "Node labels updated"
}
|
||||
|
||||
cleanup_v1_artifacts() {
    # Remove the v1 SystemD units and move the v1 binary aside into the
    # backup directory. Runs only after v2 has been verified; everything
    # removed here was already copied by backup_v1_data().
    log "Cleaning up v1 artifacts..."

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY RUN] Would clean up v1 systemd files and binaries"
        return 0
    fi

    # Remove systemd service files (but keep backup)
    sudo rm -f /etc/systemd/system/bzzz.service
    sudo rm -f /etc/systemd/system/bzzz@.service
    sudo systemctl daemon-reload

    # Move v1 binaries to backup location (restored by rollback_to_v1)
    if [[ -f "/home/tony/chorus/project-queues/active/BZZZ/bzzz" ]]; then
        mv "/home/tony/chorus/project-queues/active/BZZZ/bzzz" "$BACKUP_DIR/bzzz-v1-binary"
    fi

    success "v1 cleanup completed"
}
|
||||
|
||||
print_migration_summary() {
    # Print a human-readable summary of what the migration did, plus the
    # external endpoints, monitoring commands, and backup/log locations an
    # operator should check next. Output also lands in $LOG_FILE via log().
    log "Migration Summary:"
    log "=================="
    log "✅ v1 services stopped and disabled"
    log "✅ v2 infrastructure deployed to Docker Swarm"
    log "✅ Data migrated to content-addressed storage"
    log "✅ DHT network established across 3 nodes"
    log "✅ MCP server and OpenAI proxy deployed"
    log "✅ Monitoring and health checks configured"
    log ""
    log "Access Points:"
    log "- BZZZ Agent API: https://bzzz.deepblack.cloud"
    log "- MCP Server: https://mcp.deepblack.cloud"
    log "- Content Resolver: https://resolve.deepblack.cloud"
    log "- OpenAI Proxy: https://openai.deepblack.cloud"
    log ""
    log "Monitoring:"
    log "- docker service ls --filter label=com.docker.stack.namespace=bzzz-v2"
    log "- docker stack ps bzzz-v2"
    log "- docker service logs bzzz-v2_bzzz-agent"
    log ""
    log "Backup Location: $BACKUP_DIR"
    log "Migration Log: $LOG_FILE"
}
|
||||
|
||||
rollback_to_v1() {
    # Emergency rollback: tear down the v2 stack and restore the v1
    # SystemD unit and binary from the most recent migration backup.
    log "Rolling back to v1..."

    # BUG FIX: $BACKUP_DIR is stamped with the *current* time at script
    # start, so on a fresh --rollback invocation it points at a directory
    # that was never created and nothing would be restored. Fall back to
    # the newest existing backup under /rust/bzzz-v2/backup.
    local restore_dir="$BACKUP_DIR"
    if [[ ! -d "$restore_dir" ]]; then
        restore_dir=$(ls -1d /rust/bzzz-v2/backup/*/ 2>/dev/null | sort | tail -1)
        restore_dir="${restore_dir%/}"
    fi
    if [[ -z "$restore_dir" || ! -d "$restore_dir" ]]; then
        error "No backup directory found under /rust/bzzz-v2/backup - cannot roll back"
    fi
    log "Restoring from backup: $restore_dir"

    # Stop v2 services and give the swarm time to remove them.
    docker stack rm bzzz-v2 2>/dev/null || true
    sleep 30

    # Restore v1 systemd service, then re-enable and start it.
    if [[ -f "$restore_dir/bzzz.service" ]]; then
        sudo cp "$restore_dir/bzzz.service" /etc/systemd/system/
        sudo systemctl daemon-reload
        sudo systemctl enable bzzz
        sudo systemctl start bzzz
    fi

    # Restore the v1 binary moved aside by cleanup_v1_artifacts().
    if [[ -f "$restore_dir/bzzz-v1-binary" ]]; then
        cp "$restore_dir/bzzz-v1-binary" "/home/tony/chorus/project-queues/active/BZZZ/bzzz"
        chmod +x "/home/tony/chorus/project-queues/active/BZZZ/bzzz"
    fi

    success "Rollback to v1 completed"
}
|
||||
|
||||
main() {
    # Orchestrate the full v1 -> v2 migration. Steps run in dependency
    # order; under `set -e` plus the ERR trap below, any failing step
    # aborts the script with a line-numbered error message.
    log "Starting BZZZ v1 to v2 migration..."
    log "DRY_RUN mode: $DRY_RUN"

    # Handle rollback if requested
    if [[ "${1:-}" == "--rollback" ]]; then
        rollback_to_v1
        return 0
    fi

    # Trap to handle errors. Single quotes are deliberate: $LINENO must
    # expand when the trap fires, not when it is installed.
    trap 'error "Migration failed at line $LINENO"' ERR

    check_prerequisites
    backup_v1_data
    stop_v1_services
    setup_v2_infrastructure
    migrate_conversation_data
    setup_docker_secrets
    setup_docker_configs
    update_node_labels
    deploy_v2_stack
    verify_v2_deployment
    cleanup_v1_artifacts
    print_migration_summary

    success "BZZZ v2 migration completed successfully!"
    log "Run with --rollback to revert to v1 if needed"
}
|
||||
|
||||
# Handle script arguments
|
||||
# Handle script arguments
case "${1:-}" in
    --dry-run)
        # Preview mode: each mutating step logs what it would do instead
        # of doing it.
        DRY_RUN=true
        main
        ;;
    --rollback)
        # Emergency revert to the v1 SystemD deployment.
        main --rollback
        ;;
    --help|-h)
        echo "Usage: $0 [--dry-run|--rollback|--help]"
        echo ""
        echo "Options:"
        echo " --dry-run Preview migration steps without making changes"
        echo " --rollback Rollback to v1 (emergency use only)"
        echo " --help Show this help message"
        exit 0
        ;;
    *)
        # Default (no argument): run the full migration.
        main
        ;;
esac
|
||||
339
infrastructure/monitoring/configs/alert-rules.yml
Normal file
339
infrastructure/monitoring/configs/alert-rules.yml
Normal file
@@ -0,0 +1,339 @@
|
||||
# BZZZ v2 Prometheus Alert Rules
|
||||
|
||||
groups:
|
||||
# P2P Network Health Rules
|
||||
- name: p2p-network
|
||||
rules:
|
||||
- alert: P2PNetworkPartition
|
||||
expr: bzzz_p2p_connected_peers < 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
component: p2p
|
||||
annotations:
|
||||
summary: "P2P network partition detected"
|
||||
description: "Node {{ $labels.instance }} has less than 2 peers connected for more than 5 minutes"
|
||||
|
||||
- alert: P2PHighLatency
  # histogram_quantile() must be fed per-second rates of the histogram's
  # _bucket series, not the raw metric name, or it returns no data.
  expr: histogram_quantile(0.95, rate(bzzz_p2p_message_duration_seconds_bucket[5m])) > 5
  for: 2m
  labels:
    severity: warning
    component: p2p
  annotations:
    summary: "High P2P message latency"
    description: "95th percentile P2P message latency is {{ $value }}s on {{ $labels.instance }}"
|
||||
|
||||
- alert: P2PMessageDropRate
  # $value here is a per-second drop rate (messages/s), not a 0-1 ratio,
  # so humanizePercentage would misrender it; humanize is correct.
  expr: rate(bzzz_p2p_messages_dropped_total[5m]) > 0.1
  for: 2m
  labels:
    severity: warning
    component: p2p
  annotations:
    summary: "High P2P message drop rate"
    description: "P2P message drop rate is {{ $value | humanize }} msg/s on {{ $labels.instance }}"
|
||||
|
||||
# DHT Network Rules
|
||||
- name: dht-network
|
||||
rules:
|
||||
- alert: DHTBootstrapNodeDown
|
||||
expr: up{job="dht-bootstrap"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
component: dht
|
||||
annotations:
|
||||
summary: "DHT bootstrap node is down"
|
||||
description: "DHT bootstrap node {{ $labels.instance }} has been down for more than 1 minute"
|
||||
|
||||
- alert: DHTRoutingTableSize
|
||||
expr: bzzz_dht_routing_table_size < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
component: dht
|
||||
annotations:
|
||||
summary: "DHT routing table is small"
|
||||
description: "DHT routing table size is {{ $value }} on {{ $labels.instance }}, indicating poor network connectivity"
|
||||
|
||||
- alert: DHTLookupFailureRate
|
||||
expr: rate(bzzz_dht_lookup_failures_total[5m]) / rate(bzzz_dht_lookups_total[5m]) > 0.2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
component: dht
|
||||
annotations:
|
||||
summary: "High DHT lookup failure rate"
|
||||
description: "DHT lookup failure rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
|
||||
|
||||
# Content Store Rules
|
||||
- name: content-store
|
||||
rules:
|
||||
- alert: ContentStoreDiskUsage
  # The expression already yields a 0-100 percentage; humanizePercentage
  # expects a 0-1 ratio and would render e.g. "8600%", so use humanize + "%".
  expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 85
  for: 5m
  labels:
    severity: warning
    component: content-store
    disk_usage: "{{ $value | humanize }}"
  annotations:
    summary: "Content store disk usage is high"
    description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: ContentStoreDiskFull
  # Expression yields a 0-100 percentage, so format with humanize + "%"
  # (humanizePercentage would multiply by 100 again).
  expr: (bzzz_content_store_disk_used_bytes / bzzz_content_store_disk_total_bytes) * 100 > 95
  for: 1m
  labels:
    severity: critical
    component: content-store
    disk_usage: "{{ $value | humanize }}"
  annotations:
    summary: "Content store disk is nearly full"
    description: "Content store disk usage is {{ $value | humanize }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: ContentReplicationFailed
|
||||
expr: increase(bzzz_content_replication_failures_total[10m]) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
component: content-store
|
||||
annotations:
|
||||
summary: "Content replication failures detected"
|
||||
description: "{{ $value }} content replication failures in the last 10 minutes on {{ $labels.instance }}"
|
||||
|
||||
- alert: BLAKE3HashCollision
|
||||
expr: increase(bzzz_blake3_hash_collisions_total[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
component: content-store
|
||||
annotations:
|
||||
summary: "BLAKE3 hash collision detected"
|
||||
description: "BLAKE3 hash collision detected on {{ $labels.instance }} - immediate investigation required"
|
||||
|
||||
# OpenAI Integration Rules
|
||||
- name: openai-integration
|
||||
rules:
|
||||
- alert: OpenAIHighCost
|
||||
expr: bzzz_openai_cost_daily_usd > 100
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
component: openai-cost
|
||||
current_cost: "{{ $value }}"
|
||||
cost_threshold: "100"
|
||||
cost_period: "daily"
|
||||
annotations:
|
||||
summary: "OpenAI daily cost exceeds threshold"
|
||||
description: "Daily OpenAI cost is ${{ $value }}, exceeding the $100 threshold"
|
||||
|
||||
- alert: OpenAICriticalCost
|
||||
expr: bzzz_openai_cost_daily_usd > 500
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
component: openai-cost
|
||||
current_cost: "{{ $value }}"
|
||||
cost_threshold: "500"
|
||||
cost_period: "daily"
|
||||
annotations:
|
||||
summary: "OpenAI daily cost critically high"
|
||||
description: "Daily OpenAI cost is ${{ $value }}, which is critically high - consider rate limiting"
|
||||
|
||||
- alert: OpenAIRateLimitHit
|
||||
expr: increase(bzzz_openai_rate_limit_hits_total[5m]) > 10
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
component: openai-cost
|
||||
annotations:
|
||||
summary: "OpenAI rate limit frequently hit"
|
||||
description: "OpenAI rate limit hit {{ $value }} times in the last 5 minutes"
|
||||
|
||||
- alert: OpenAIProxyDown
|
||||
expr: up{job="openai-proxy"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "OpenAI proxy is down"
|
||||
description: "OpenAI proxy service is down on {{ $labels.instance }}"
|
||||
|
||||
# MCP Server Rules
|
||||
- name: mcp-server
|
||||
rules:
|
||||
- alert: MCPServerDown
|
||||
expr: up{job="mcp-server"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "MCP server is down"
|
||||
description: "MCP server is down on {{ $labels.instance }}"
|
||||
|
||||
- alert: MCPHighResponseTime
  # histogram_quantile() requires rated _bucket series; the raw histogram
  # metric name produces no result.
  expr: histogram_quantile(0.95, rate(bzzz_mcp_request_duration_seconds_bucket[5m])) > 10
  for: 5m
  labels:
    severity: warning
    component: service-health
  annotations:
    summary: "MCP server high response time"
    description: "95th percentile MCP response time is {{ $value }}s on {{ $labels.instance }}"
|
||||
|
||||
- alert: MCPConnectionLimit
|
||||
expr: bzzz_mcp_active_connections / bzzz_mcp_max_connections > 0.8
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "MCP server connection limit approaching"
|
||||
description: "MCP server connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
|
||||
|
||||
# Conversation Threading Rules
|
||||
- name: conversation-threading
|
||||
rules:
|
||||
- alert: ConversationThreadLag
|
||||
expr: bzzz_conversation_lamport_clock_lag_seconds > 30
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
component: conversation
|
||||
annotations:
|
||||
summary: "Conversation thread lag detected"
|
||||
description: "Lamport clock lag is {{ $value }}s on {{ $labels.instance }}, indicating thread synchronization issues"
|
||||
|
||||
- alert: ConversationStorageFailure
|
||||
expr: increase(bzzz_conversation_storage_failures_total[5m]) > 3
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
component: conversation
|
||||
annotations:
|
||||
summary: "Conversation storage failures"
|
||||
description: "{{ $value }} conversation storage failures in the last 5 minutes on {{ $labels.instance }}"
|
||||
|
||||
# System Resource Rules
|
||||
- name: system-resources
|
||||
rules:
|
||||
- alert: NodeDown
|
||||
expr: up{job="node-exporter"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
component: system
|
||||
annotations:
|
||||
summary: "Node is down"
|
||||
description: "Node {{ $labels.instance }} has been down for more than 1 minute"
|
||||
|
||||
- alert: HighCPUUsage
  # Expression yields a 0-100 percentage; humanizePercentage expects a
  # 0-1 ratio, so format with humanize + "%" instead.
  expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
  for: 5m
  labels:
    severity: warning
    component: resources
    resource_type: "cpu"
    usage_percent: "{{ $value | humanize }}"
    threshold: "80"
  annotations:
    summary: "High CPU usage"
    description: "CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: HighMemoryUsage
  # Expression yields a 0-100 percentage; format with humanize + "%"
  # (humanizePercentage is only for 0-1 ratios).
  expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
  for: 5m
  labels:
    severity: warning
    component: resources
    resource_type: "memory"
    usage_percent: "{{ $value | humanize }}"
    threshold: "85"
  annotations:
    summary: "High memory usage"
    description: "Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: DiskSpaceLow
  # Go templates have no arithmetic, so "{{ 100 - $value }}" in the original
  # labels/annotations could never render. Invert the expression to yield the
  # *used* percentage directly (avail < 15% ⇔ used > 85%), letting templates
  # print $value as-is. Alert semantics are unchanged.
  expr: 100 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 > 85
  for: 5m
  labels:
    severity: warning
    component: resources
    resource_type: "disk"
    usage_percent: "{{ $value | humanize }}"
    threshold: "85"
  annotations:
    summary: "Low disk space"
    description: "Disk is {{ $value | humanize }}% full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
|
||||
|
||||
# Database Rules
|
||||
- name: database
|
||||
rules:
|
||||
- alert: PostgreSQLDown
|
||||
expr: up{job="postgres"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "PostgreSQL is down"
|
||||
description: "PostgreSQL database is down on {{ $labels.instance }}"
|
||||
|
||||
- alert: PostgreSQLHighConnections
|
||||
expr: pg_stat_database_numbackends / pg_settings_max_connections > 0.8
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "PostgreSQL connection limit approaching"
|
||||
description: "PostgreSQL connection usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
|
||||
|
||||
- alert: RedisDown
|
||||
expr: up{job="redis"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
component: service-health
|
||||
annotations:
|
||||
summary: "Redis is down"
|
||||
description: "Redis cache is down on {{ $labels.instance }}"
|
||||
|
||||
# Security Rules
|
||||
- name: security
|
||||
rules:
|
||||
- alert: UnauthorizedP2PConnection
|
||||
expr: increase(bzzz_p2p_unauthorized_connections_total[5m]) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
component: security
|
||||
security_type: "unauthorized_connection"
|
||||
annotations:
|
||||
summary: "Unauthorized P2P connection attempts"
|
||||
description: "{{ $value }} unauthorized P2P connection attempts in the last 5 minutes on {{ $labels.instance }}"
|
||||
|
||||
- alert: SuspiciousContentRequest
|
||||
expr: increase(bzzz_content_suspicious_requests_total[5m]) > 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
component: security
|
||||
security_type: "suspicious_content"
|
||||
annotations:
|
||||
summary: "Suspicious content requests detected"
|
||||
description: "{{ $value }} suspicious content requests in the last 5 minutes on {{ $labels.instance }}"
|
||||
|
||||
- alert: FailedAuthentication
|
||||
expr: increase(bzzz_auth_failures_total[5m]) > 20
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
component: security
|
||||
security_type: "authentication_failure"
|
||||
annotations:
|
||||
summary: "High authentication failure rate"
|
||||
description: "{{ $value }} authentication failures in the last 5 minutes on {{ $labels.instance }}"
|
||||
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
255
infrastructure/monitoring/configs/alertmanager.yml
Normal file
@@ -0,0 +1,255 @@
|
||||
# AlertManager Configuration for BZZZ v2
|
||||
|
||||
global:
|
||||
smtp_smarthost: 'localhost:587'
|
||||
smtp_from: 'alerts@deepblack.cloud'
|
||||
smtp_require_tls: true
|
||||
resolve_timeout: 5m
|
||||
|
||||
# Template files
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# Route configuration
|
||||
route:
|
||||
group_by: ['cluster', 'alertname', 'service']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: 'default'
|
||||
routes:
|
||||
# Critical P2P network issues
|
||||
- match:
|
||||
severity: critical
|
||||
component: p2p
|
||||
receiver: 'p2p-critical'
|
||||
group_wait: 10s
|
||||
repeat_interval: 5m
|
||||
|
||||
# DHT network issues
|
||||
- match:
|
||||
component: dht
|
||||
receiver: 'dht-alerts'
|
||||
group_wait: 1m
|
||||
repeat_interval: 30m
|
||||
|
||||
# Content store issues
|
||||
- match:
|
||||
component: content-store
|
||||
receiver: 'storage-alerts'
|
||||
group_wait: 2m
|
||||
repeat_interval: 1h
|
||||
|
||||
# OpenAI cost alerts
|
||||
- match:
|
||||
component: openai-cost
|
||||
receiver: 'cost-alerts'
|
||||
group_wait: 5m
|
||||
repeat_interval: 6h
|
||||
|
||||
# Service health alerts
|
||||
- match:
|
||||
component: service-health
|
||||
receiver: 'service-alerts'
|
||||
group_wait: 1m
|
||||
repeat_interval: 15m
|
||||
|
||||
# Resource exhaustion
|
||||
- match:
|
||||
severity: warning
|
||||
component: resources
|
||||
receiver: 'resource-alerts'
|
||||
group_wait: 5m
|
||||
repeat_interval: 2h
|
||||
|
||||
# Security alerts
|
||||
- match:
|
||||
component: security
|
||||
receiver: 'security-alerts'
|
||||
group_wait: 30s
|
||||
repeat_interval: 1h
|
||||
|
||||
# Inhibition rules
|
||||
inhibit_rules:
|
||||
# Silence warning if critical alert is firing
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['cluster', 'service', 'instance']
|
||||
|
||||
# Silence service alerts if node is down
|
||||
- source_match:
|
||||
alertname: 'NodeDown'
|
||||
target_match:
|
||||
component: 'service-health'
|
||||
equal: ['instance']
|
||||
|
||||
# Receiver configurations
|
||||
receivers:
|
||||
# Default receiver
|
||||
- name: 'default'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-monitoring'
|
||||
title: 'BZZZ v2 Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
*Severity:* {{ .Labels.severity }}
|
||||
*Instance:* {{ .Labels.instance }}
|
||||
*Service:* {{ .Labels.service }}
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
|
||||
# Critical P2P network alerts
|
||||
- name: 'p2p-critical'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-critical'
|
||||
title: '🚨 CRITICAL P2P Network Issue'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*CRITICAL P2P ALERT*
|
||||
|
||||
*Summary:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
*Node:* {{ .Labels.instance }}
|
||||
*Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
|
||||
|
||||
*Immediate Action Required*
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
pagerduty_configs:
|
||||
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
|
||||
description: '{{ .GroupLabels.alertname }} - {{ .Annotations.summary }}'
|
||||
|
||||
# DHT network alerts
|
||||
- name: 'dht-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-dht'
|
||||
title: '🔗 DHT Network Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*DHT Network Issue*
|
||||
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
*Bootstrap Node:* {{ .Labels.instance }}
|
||||
*Peers Connected:* {{ .Labels.peer_count | default "unknown" }}
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
|
||||
# Storage alerts
|
||||
- name: 'storage-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-storage'
|
||||
title: '💾 Content Store Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*Storage Alert*
|
||||
|
||||
*Issue:* {{ .Annotations.summary }}
|
||||
*Details:* {{ .Annotations.description }}
|
||||
*Node:* {{ .Labels.instance }}
|
||||
*Usage:* {{ .Labels.disk_usage | default "unknown" }}%
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
|
||||
# OpenAI cost alerts
|
||||
- name: 'cost-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-costs'
|
||||
title: '💰 OpenAI Cost Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*Cost Alert*
|
||||
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Current Cost:* ${{ .Labels.current_cost | default "unknown" }}
|
||||
*Threshold:* ${{ .Labels.cost_threshold | default "unknown" }}
|
||||
*Period:* {{ .Labels.cost_period | default "daily" }}
|
||||
*Action:* {{ .Annotations.description }}
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
email_configs:
|
||||
- to: 'finance@deepblack.cloud'
|
||||
subject: 'BZZZ v2 OpenAI Cost Alert'
|
||||
body: |
|
||||
OpenAI usage has exceeded cost thresholds.
|
||||
|
||||
{{ range .Alerts }}
|
||||
Alert: {{ .Annotations.summary }}
|
||||
Current Cost: ${{ .Labels.current_cost }}
|
||||
Threshold: ${{ .Labels.cost_threshold }}
|
||||
{{ end }}
|
||||
|
||||
# Service health alerts
|
||||
- name: 'service-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-services'
|
||||
title: '🔧 Service Health Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*Service Health Issue*
|
||||
|
||||
*Service:* {{ .Labels.service }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Node:* {{ .Labels.instance }}
|
||||
*Status:* {{ .Labels.status | default "unknown" }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
|
||||
# Resource alerts
|
||||
- name: 'resource-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-resources'
|
||||
title: '⚡ Resource Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*Resource Warning*
|
||||
|
||||
*Resource:* {{ .Labels.resource_type | default "unknown" }}
|
||||
*Node:* {{ .Labels.instance }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Current Usage:* {{ .Labels.usage_percent | default "unknown" }}%
|
||||
*Threshold:* {{ .Labels.threshold | default "unknown" }}%
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
|
||||
# Security alerts
|
||||
- name: 'security-alerts'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#bzzz-security'
|
||||
title: '🔒 Security Alert'
|
||||
text: |
|
||||
{{ range .Alerts }}
|
||||
*SECURITY ALERT*
|
||||
|
||||
*Type:* {{ .Labels.security_type | default "unknown" }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Source:* {{ .Labels.instance }}
|
||||
*Details:* {{ .Annotations.description }}
|
||||
*Severity:* {{ .Labels.severity }}
|
||||
{{ end }}
|
||||
send_resolved: true
|
||||
email_configs:
|
||||
- to: 'security@deepblack.cloud'
|
||||
subject: 'BZZZ v2 Security Alert'
|
||||
body: |
|
||||
Security alert triggered in BZZZ v2 cluster.
|
||||
|
||||
{{ range .Alerts }}
|
||||
Alert: {{ .Annotations.summary }}
|
||||
Severity: {{ .Labels.severity }}
|
||||
Source: {{ .Labels.instance }}
|
||||
Details: {{ .Annotations.description }}
|
||||
{{ end }}
|
||||
216
infrastructure/monitoring/configs/prometheus.yml
Normal file
216
infrastructure/monitoring/configs/prometheus.yml
Normal file
@@ -0,0 +1,216 @@
|
||||
# Prometheus Configuration for BZZZ v2 Monitoring
|
||||
|
||||
global:
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
evaluation_interval: 30s
|
||||
external_labels:
|
||||
cluster: 'deepblack-cloud'
|
||||
environment: 'production'
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/rules.yml"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# System metrics from node exporters
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'walnut:9100'
|
||||
- 'ironwood:9100'
|
||||
- 'acacia:9100'
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# Container metrics from cAdvisor
|
||||
- job_name: 'cadvisor'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'walnut:8080'
|
||||
- 'ironwood:8080'
|
||||
- 'acacia:8080'
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# BZZZ v2 Application Services
|
||||
- job_name: 'bzzz-agent'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 9000
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: __tmp_service_name
|
||||
- source_labels: [__tmp_service_name]
|
||||
regex: bzzz-v2_bzzz-agent
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_node_id]
|
||||
target_label: node_id
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# MCP Server Metrics
|
||||
- job_name: 'mcp-server'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 3001
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_mcp-server
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# OpenAI Proxy Metrics
|
||||
- job_name: 'openai-proxy'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 3002
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_openai-proxy
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# Content Resolver Metrics
|
||||
- job_name: 'content-resolver'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 3003
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_content-resolver
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# DHT Bootstrap Nodes
|
||||
- job_name: 'dht-bootstrap'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'walnut:9101'
|
||||
- 'ironwood:9102'
|
||||
- 'acacia:9103'
|
||||
labels:
|
||||
service: 'dht-bootstrap'
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# P2P Network Metrics
|
||||
- job_name: 'bzzz-p2p-exporter'
|
||||
static_configs:
|
||||
- targets: ['bzzz-p2p-exporter:9200']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# DHT Network Monitoring
|
||||
- job_name: 'dht-monitor'
|
||||
static_configs:
|
||||
- targets: ['dht-monitor:9201']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 60s
|
||||
|
||||
# Content Store Monitoring
|
||||
- job_name: 'content-monitor'
|
||||
static_configs:
|
||||
- targets: ['content-monitor:9202']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 300s # 5 minutes for storage checks
|
||||
|
||||
# OpenAI Cost Monitoring
|
||||
- job_name: 'openai-cost-monitor'
|
||||
static_configs:
|
||||
- targets: ['openai-cost-monitor:9203']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 60s
|
||||
|
||||
# Database Metrics (PostgreSQL)
|
||||
- job_name: 'postgres'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 5432
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_postgres
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
params:
|
||||
dbname: [bzzz_v2]
|
||||
|
||||
# Cache Metrics (Redis)
|
||||
- job_name: 'redis'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 6379
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_redis
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# Traefik Load Balancer Metrics
|
||||
- job_name: 'traefik'
|
||||
static_configs:
|
||||
- targets: ['traefik:8080']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# Conversation Management Metrics
|
||||
- job_name: 'conversation-manager'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
port: 8090
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
regex: bzzz-v2_conversation-manager
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# External Service Monitoring (Webhook endpoints)
|
||||
- job_name: 'external-health'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'bzzz.deepblack.cloud'
|
||||
- 'mcp.deepblack.cloud'
|
||||
- 'resolve.deepblack.cloud'
|
||||
- 'openai.deepblack.cloud'
|
||||
metrics_path: /health
|
||||
scrape_interval: 60s
|
||||
scrape_timeout: 10s
|
||||
|
||||
# Remote write configuration for long-term storage (optional)
|
||||
# remote_write:
|
||||
# - url: "https://prometheus-remote-write.example.com/api/v1/write"
|
||||
# basic_auth:
|
||||
# username: "bzzz-cluster"
|
||||
# password_file: "/etc/prometheus/remote-write-password"
|
||||
### New file: `infrastructure/monitoring/docker-compose.monitoring.yml` (372 lines)
# BZZZ v2 monitoring stack (Docker Swarm).
# Pinned services: Prometheus/Grafana/AlertManager/Loki/Jaeger plus per-node
# exporters and BZZZ-specific monitors. External access goes through Traefik
# on the `tengig` network; scrape traffic stays on the `monitoring` overlay.
version: '3.8'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:v2.48.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9090:9090"
    volumes:
      # NOTE(review): the ro volume mounts over /etc/prometheus while the
      # Swarm configs below also target files inside /etc/prometheus —
      # confirm the config mounts still resolve with the volume in place.
      - /rust/bzzz-v2/config/prometheus:/etc/prometheus:ro
      - /rust/bzzz-v2/data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--web.external-url=https://prometheus.deepblack.cloud'
    configs:
      - source: prometheus_config
        target: /etc/prometheus/prometheus.yml
      - source: prometheus_rules
        target: /etc/prometheus/rules.yml
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 4G
          cpus: '2.0'
        reservations:
          memory: 2G
          cpus: '1.0'
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.prometheus.rule=Host(`prometheus.deepblack.cloud`)"
        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
        - "traefik.http.routers.prometheus.tls=true"

  # Grafana for visualization
  grafana:
    image: grafana/grafana:10.2.0
    networks:
      - tengig
      - monitoring
    ports:
      - "3000:3000"
    environment:
      # Admin password is read from a Swarm secret file, not the environment.
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      - GF_SERVER_ROOT_URL=https://grafana.deepblack.cloud
      - GF_SERVER_DOMAIN=grafana.deepblack.cloud
      - GF_ANALYTICS_REPORTING_ENABLED=false
      - GF_ANALYTICS_CHECK_FOR_UPDATES=false
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel
    volumes:
      - /rust/bzzz-v2/data/grafana:/var/lib/grafana
      - /rust/bzzz-v2/config/grafana/provisioning:/etc/grafana/provisioning:ro
    secrets:
      - grafana_admin_password
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == walnut
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.deepblack.cloud`)"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
        - "traefik.http.routers.grafana.tls=true"

  # AlertManager for alerting
  alertmanager:
    image: prom/alertmanager:v0.26.0
    networks:
      - tengig
      - monitoring
    ports:
      - "9093:9093"
    volumes:
      - /rust/bzzz-v2/data/alertmanager:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=https://alerts.deepblack.cloud'
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/config.yml
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.alertmanager.rule=Host(`alerts.deepblack.cloud`)"
        - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
        - "traefik.http.routers.alertmanager.tls=true"

  # Node Exporter for system metrics (one instance per Swarm node)
  node-exporter:
    image: prom/node-exporter:v1.6.1
    networks:
      - monitoring
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /etc/hostname:/etc/nodename:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      # NOTE(review): no volume mounts the textfile directory below —
      # confirm whether textfile collection is actually intended.
      - '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.5'
        reservations:
          memory: 128M
          cpus: '0.25'

  # cAdvisor for container metrics (one instance per Swarm node)
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    networks:
      - monitoring
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command:
      - '--housekeeping_interval=10s'
      - '--docker_only=true'
      - '--disable_metrics=percpu,process,sched,tcp,udp,disk,diskIO,accelerator,hugetlb,referenced_memory,cpu_topology,resctrl'
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'

  # BZZZ P2P Metrics Exporter
  bzzz-p2p-exporter:
    image: registry.home.deepblack.cloud/bzzz/p2p-exporter:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9200:9200"
    environment:
      - BZZZ_AGENT_ENDPOINTS=http://bzzz-v2_bzzz-agent:9000
      - DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
      - METRICS_PORT=9200
      - SCRAPE_INTERVAL=30s
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == acacia
      resources:
        limits:
          memory: 512M
          cpus: '0.5'

  # DHT Network Monitor
  dht-monitor:
    image: registry.home.deepblack.cloud/bzzz/dht-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9201:9201"
    environment:
      - DHT_BOOTSTRAP_NODES=walnut:9101,ironwood:9102,acacia:9103
      - MONITOR_PORT=9201
      - PEER_CHECK_INTERVAL=60s
    deploy:
      replicas: 1
      resources:
        limits:
          memory: 256M
          cpus: '0.25'

  # Content Store Monitor
  content-monitor:
    image: registry.home.deepblack.cloud/bzzz/content-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9202:9202"
    environment:
      - CONTENT_STORE_PATH=/rust/bzzz-v2/data/blobs
      - MONITOR_PORT=9202
      - CHECK_INTERVAL=300s
    volumes:
      - /rust/bzzz-v2/data/blobs:/data/blobs:ro
    deploy:
      replicas: 1
      resources:
        limits:
          memory: 256M
          cpus: '0.25'

  # OpenAI Cost Monitor
  openai-cost-monitor:
    image: registry.home.deepblack.cloud/bzzz/openai-cost-monitor:v2.0.0
    networks:
      - monitoring
      - bzzz-internal
    ports:
      - "9203:9203"
    environment:
      - POSTGRES_HOST=bzzz-v2_postgres
      - POSTGRES_DB=bzzz_v2
      - POSTGRES_USER=bzzz
      - MONITOR_PORT=9203
      - COST_ALERT_THRESHOLD=100.00
    secrets:
      - postgres_password
    deploy:
      replicas: 1
      resources:
        limits:
          memory: 256M
          cpus: '0.25'

  # Log aggregation with Loki
  loki:
    image: grafana/loki:2.9.0
    networks:
      - monitoring
    ports:
      - "3100:3100"
    volumes:
      - /rust/bzzz-v2/data/loki:/loki
    command: -config.file=/etc/loki/local-config.yaml
    configs:
      - source: loki_config
        target: /etc/loki/local-config.yaml
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == acacia
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'

  # Promtail for log shipping (one instance per Swarm node)
  promtail:
    image: grafana/promtail:2.9.0
    networks:
      - monitoring
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /rust/bzzz-v2/logs:/app/logs:ro
    command: -config.file=/etc/promtail/config.yml
    configs:
      - source: promtail_config
        target: /etc/promtail/config.yml
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.25'

  # Jaeger for distributed tracing
  jaeger:
    image: jaegertracing/all-in-one:1.49
    networks:
      - tengig
      - monitoring
    ports:
      - "16686:16686"
      - "14268:14268"
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      # Badger storage is persisted to the /badger volume below.
      - SPAN_STORAGE_TYPE=badger
      - BADGER_EPHEMERAL=false
      - BADGER_DIRECTORY_VALUE=/badger/data
      - BADGER_DIRECTORY_KEY=/badger/key
    volumes:
      - /rust/bzzz-v2/data/jaeger:/badger
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == ironwood
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.jaeger.rule=Host(`tracing.deepblack.cloud`)"
        - "traefik.http.services.jaeger.loadbalancer.server.port=16686"
        - "traefik.http.routers.jaeger.tls=true"

networks:
  tengig:
    external: true
  monitoring:
    driver: overlay
    attachable: true
  bzzz-internal:
    external: true

secrets:
  grafana_admin_password:
    external: true
    name: bzzz_grafana_admin_password
  postgres_password:
    external: true
    name: bzzz_postgres_password

configs:
  prometheus_config:
    external: true
    name: bzzz_prometheus_config
  prometheus_rules:
    external: true
    name: bzzz_prometheus_rules
  alertmanager_config:
    external: true
    name: bzzz_alertmanager_config
  loki_config:
    external: true
    name: bzzz_loki_config
  promtail_config:
    external: true
    name: bzzz_promtail_config
|
||||
### New file: `infrastructure/security/network-policy.yaml` (335 lines)
# Kubernetes Network Policy for BZZZ v2 (if migrating to K8s later)
# Currently using Docker Swarm, but this provides a template for K8s migration
#
# Structure: a namespace-wide default-deny policy, followed by per-app
# allow policies (agent P2P, DHT bootstrap, MCP server, OpenAI proxy,
# content resolver, databases, and monitoring).

# Default deny: selects every pod in the namespace and allows nothing.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: bzzz-v2-network-policy
  namespace: bzzz-v2
spec:
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress

  # Default deny all ingress and egress
  ingress: []
  egress: []

---
# Allow internal cluster communication (agent-to-agent P2P on 9000/tcp+udp)
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: bzzz-internal-communication
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: bzzz-agent
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: bzzz-v2
      ports:
        - protocol: TCP
          port: 9000
        - protocol: UDP
          port: 9000
  egress:
    - to:
        - namespaceSelector:
            matchLabels:
              name: bzzz-v2
      ports:
        - protocol: TCP
          port: 9000
        - protocol: UDP
          port: 9000

---
# DHT Bootstrap Network Policy (bootstrap nodes listen on 9101-9103/tcp)
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: dht-bootstrap-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: dht-bootstrap
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: bzzz-v2
      ports:
        - protocol: TCP
          port: 9101
        - protocol: TCP
          port: 9102
        - protocol: TCP
          port: 9103
  egress:
    - to:
        - namespaceSelector:
            matchLabels:
              name: bzzz-v2
      ports:
        - protocol: TCP
          port: 9101
        - protocol: TCP
          port: 9102
        - protocol: TCP
          port: 9103

---
# MCP Server Network Policy: ingress from Traefik and agents on 3001,
# egress only to agents on 9000.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: mcp-server-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: mcp-server
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: traefik
      ports:
        - protocol: TCP
          port: 3001
    - from:
        - podSelector:
            matchLabels:
              app: bzzz-agent
      ports:
        - protocol: TCP
          port: 3001
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: bzzz-agent
      ports:
        - protocol: TCP
          port: 9000

---
# OpenAI Proxy Network Policy: ingress on 3002; egress to any host on 443
# (the upstream OpenAI API) plus Redis and PostgreSQL inside the namespace.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: openai-proxy-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: openai-proxy
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: traefik
      ports:
        - protocol: TCP
          port: 3002
    - from:
        - podSelector:
            matchLabels:
              app: bzzz-agent
      ports:
        - protocol: TCP
          port: 3002
  egress:
    # Allow outbound to OpenAI API
    - to: []
      ports:
        - protocol: TCP
          port: 443
    # Allow access to Redis and PostgreSQL
    - to:
        - podSelector:
            matchLabels:
              app: redis
      ports:
        - protocol: TCP
          port: 6379
    - to:
        - podSelector:
            matchLabels:
              app: postgres
      ports:
        - protocol: TCP
          port: 5432

---
# Content Resolver Network Policy: ingress on 3003; egress to DHT bootstrap.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: content-resolver-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: content-resolver
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: traefik
      ports:
        - protocol: TCP
          port: 3003
    - from:
        - podSelector:
            matchLabels:
              app: bzzz-agent
      ports:
        - protocol: TCP
          port: 3003
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: dht-bootstrap
      ports:
        - protocol: TCP
          port: 9101
        - protocol: TCP
          port: 9102
        - protocol: TCP
          port: 9103

---
# Database Network Policy: PostgreSQL accepts only from the listed apps.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: postgres-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: postgres
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: bzzz-agent
        - podSelector:
            matchLabels:
              app: openai-proxy
        - podSelector:
            matchLabels:
              app: conversation-manager
        - podSelector:
            matchLabels:
              app: openai-cost-monitor
      ports:
        - protocol: TCP
          port: 5432

---
# Redis Network Policy: accepts only from agents and the OpenAI proxy.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: redis-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      app: redis
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: bzzz-agent
        - podSelector:
            matchLabels:
              app: openai-proxy
      ports:
        - protocol: TCP
          port: 6379

---
# Monitoring Network Policy: pods labeled monitoring="true" receive traffic
# from the monitoring and traefik namespaces and may scrape every exporter
# port used by the stack.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-policy
  namespace: bzzz-v2
spec:
  podSelector:
    matchLabels:
      monitoring: "true"
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: monitoring
        - namespaceSelector:
            matchLabels:
              name: traefik
      ports:
        - protocol: TCP
          port: 9090
        - protocol: TCP
          port: 3000
        - protocol: TCP
          port: 9093
  egress:
    # Allow monitoring to scrape all services
    - to:
        - namespaceSelector:
            matchLabels:
              name: bzzz-v2
      ports:
        - protocol: TCP
          port: 9000
        - protocol: TCP
          port: 3001
        - protocol: TCP
          port: 3002
        - protocol: TCP
          port: 3003
        - protocol: TCP
          port: 9100
        - protocol: TCP
          port: 8080
        - protocol: TCP
          port: 9200
        - protocol: TCP
          port: 9201
        - protocol: TCP
          port: 9202
        - protocol: TCP
          port: 9203
|
||||
### New file: `infrastructure/security/security-hardening.sh` (675 lines, executable)
#!/bin/bash
# BZZZ v2 Security Hardening Script
# Applies comprehensive security configurations for the cluster

# Abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

# Absolute directory containing this script.
# NOTE(review): SCRIPT_DIR is not referenced in the visible portion of the
# script — confirm it is used further down or remove it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Timestamped log file; every helper below tees its output into it.
LOG_FILE="/var/log/bzzz-security-hardening-$(date +%Y%m%d-%H%M%S).log"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Print a timestamped informational message to stdout and the log file.
log() {
    echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" | tee -a "$LOG_FILE"
}

# Print an error message and abort the whole script with exit status 1.
error() {
    echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
    exit 1
}

# Print a non-fatal warning.
warn() {
    echo -e "${YELLOW}[WARN]${NC} $1" | tee -a "$LOG_FILE"
}

# Print a success message for a completed hardening step.
success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Refuse to run as root: the script escalates selectively via sudo instead.
check_root() {
    [[ $EUID -ne 0 ]] || error "This script should not be run as root. Run as tony user with sudo access."
}
|
||||
|
||||
# Configure UFW with a default-deny inbound posture: SSH (rate-limited) and
# HTTP/HTTPS are exposed externally; Docker Swarm, BZZZ P2P, database, and
# monitoring ports are restricted to the 192.168.1.0/24 cluster LAN.
# NOTE(review): setup_network_security later loads a raw ruleset via
# iptables-restore, which can clobber the chains UFW manages — confirm which
# mechanism is authoritative.
configure_firewall() {
    log "Configuring UFW firewall for BZZZ v2..."

    # Enable UFW if not enabled
    sudo ufw --force enable

    # Default policies
    sudo ufw default deny incoming
    sudo ufw default allow outgoing

    # SSH access
    sudo ufw allow ssh

    # Docker Swarm ports (internal cluster only)
    sudo ufw allow from 192.168.1.0/24 to any port 2376 proto tcp comment "Docker daemon TLS"
    sudo ufw allow from 192.168.1.0/24 to any port 2377 proto tcp comment "Docker Swarm management"
    sudo ufw allow from 192.168.1.0/24 to any port 7946 proto tcp comment "Docker Swarm node communication"
    sudo ufw allow from 192.168.1.0/24 to any port 7946 proto udp comment "Docker Swarm node communication"
    sudo ufw allow from 192.168.1.0/24 to any port 4789 proto udp comment "Docker Swarm overlay networks"

    # BZZZ v2 P2P ports (internal cluster only)
    sudo ufw allow from 192.168.1.0/24 to any port 9000:9300 proto tcp comment "BZZZ v2 P2P"
    sudo ufw allow from 192.168.1.0/24 to any port 9000:9300 proto udp comment "BZZZ v2 P2P"

    # DHT bootstrap ports
    sudo ufw allow from 192.168.1.0/24 to any port 9101:9103 proto tcp comment "BZZZ DHT Bootstrap"

    # mDNS discovery (local network only)
    sudo ufw allow from 192.168.1.0/24 to any port 5353 proto udp comment "mDNS discovery"

    # HTTP/HTTPS through Traefik (external access)
    sudo ufw allow 80/tcp comment "HTTP"
    sudo ufw allow 443/tcp comment "HTTPS"

    # Internal service ports (cluster only)
    sudo ufw allow from 192.168.1.0/24 to any port 3000:3100 proto tcp comment "BZZZ v2 services"
    sudo ufw allow from 192.168.1.0/24 to any port 5432 proto tcp comment "PostgreSQL"
    sudo ufw allow from 192.168.1.0/24 to any port 6379 proto tcp comment "Redis"

    # Monitoring ports (cluster only)
    sudo ufw allow from 192.168.1.0/24 to any port 9090:9203 proto tcp comment "Monitoring"

    # Rate limiting rules
    sudo ufw limit ssh comment "Rate limit SSH"

    # Log denied connections
    sudo ufw logging on

    success "Firewall configured successfully"
}
|
||||
|
||||
# Harden the Docker daemon: JSON log rotation, disabled inter-container
# communication (icc), user-namespace remapping, no-new-privileges, and a
# custom seccomp allowlist; then restart the daemon to apply.
# NOTE(review): "userns-remap" has known limitations with Swarm services, and
# the seccomp allowlist below is very small (no wait4, kill, getrandom, etc.)
# — confirm the BZZZ images run under both before rolling out cluster-wide.
# The daemon restart disrupts containers; "live-restore" mitigates but does
# not eliminate this.
configure_docker_security() {
    log "Configuring Docker security..."

    # Create Docker daemon configuration
    sudo mkdir -p /etc/docker

    # Quoted 'EOF' heredoc: content is written verbatim, no shell expansion.
    cat << 'EOF' | sudo tee /etc/docker/daemon.json > /dev/null
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "3"
  },
  "live-restore": true,
  "userland-proxy": false,
  "icc": false,
  "userns-remap": "default",
  "no-new-privileges": true,
  "seccomp-profile": "/etc/docker/seccomp-default.json",
  "apparmor-profile": "docker-default",
  "storage-driver": "overlay2",
  "storage-opts": [
    "overlay2.override_kernel_check=true"
  ],
  "default-ulimits": {
    "nofile": {
      "Name": "nofile",
      "Hard": 65536,
      "Soft": 65536
    }
  },
  "registry-mirrors": ["https://registry.home.deepblack.cloud"],
  "insecure-registries": ["registry.home.deepblack.cloud:5000"],
  "features": {
    "buildkit": true
  }
}
EOF

    # Create custom seccomp profile: default-deny (SCMP_ACT_ERRNO) with an
    # explicit allowlist of syscalls.
    cat << 'EOF' | sudo tee /etc/docker/seccomp-default.json > /dev/null
{
  "defaultAction": "SCMP_ACT_ERRNO",
  "architectures": [
    "SCMP_ARCH_X86_64",
    "SCMP_ARCH_X86",
    "SCMP_ARCH_X32"
  ],
  "syscalls": [
    {
      "names": [
        "accept",
        "access",
        "arch_prctl",
        "bind",
        "brk",
        "chdir",
        "chmod",
        "chown",
        "clone",
        "close",
        "connect",
        "dup",
        "dup2",
        "epoll_create",
        "epoll_ctl",
        "epoll_wait",
        "execve",
        "exit",
        "exit_group",
        "fcntl",
        "fstat",
        "futex",
        "getcwd",
        "getdents",
        "getgid",
        "getpid",
        "getppid",
        "gettid",
        "getuid",
        "listen",
        "lstat",
        "mmap",
        "mprotect",
        "munmap",
        "nanosleep",
        "open",
        "openat",
        "pipe",
        "poll",
        "prctl",
        "read",
        "readlink",
        "recv",
        "recvfrom",
        "rt_sigaction",
        "rt_sigprocmask",
        "rt_sigreturn",
        "sched_yield",
        "send",
        "sendto",
        "set_robust_list",
        "setsockopt",
        "socket",
        "stat",
        "write"
      ],
      "action": "SCMP_ACT_ALLOW"
    }
  ]
}
EOF

    # Restart Docker to apply changes
    sudo systemctl daemon-reload
    sudo systemctl restart docker

    success "Docker security configuration applied"
}
|
||||
|
||||
# Generate a private CA plus per-node server certificates and a shared client
# certificate under /rust/bzzz-v2/config/tls. Idempotent: existing key files
# are never regenerated.
setup_tls_certificates() {
    log "Setting up TLS certificates..."

    # Create certificates directory
    mkdir -p /rust/bzzz-v2/config/tls/{ca,server,client}

    # Generate CA key and certificate (10-year validity)
    if [[ ! -f /rust/bzzz-v2/config/tls/ca/ca-key.pem ]]; then
        openssl genrsa -out /rust/bzzz-v2/config/tls/ca/ca-key.pem 4096
        openssl req -new -x509 -days 3650 -key /rust/bzzz-v2/config/tls/ca/ca-key.pem \
            -out /rust/bzzz-v2/config/tls/ca/ca.pem \
            -subj "/C=US/ST=Cloud/L=DeepBlack/O=BZZZ/CN=bzzz-ca"

        log "Generated new CA certificate"
    fi

    # Per-node IP addresses for the subjectAltName entries.
    # BUG FIX: the previous version hard-coded walnut's address
    # (192.168.1.27) into every node's certificate, so TLS verification by IP
    # would fail on ironwood and acacia.
    # TODO(review): add acacia's static IP once confirmed.
    declare -A node_ips=(
        [walnut]="192.168.1.27"
        [ironwood]="192.168.1.113"
    )

    # Generate server certificates for each node
    local nodes=("walnut" "ironwood" "acacia")
    for node in "${nodes[@]}"; do
        if [[ ! -f "/rust/bzzz-v2/config/tls/server/${node}-key.pem" ]]; then
            # Generate server key
            openssl genrsa -out "/rust/bzzz-v2/config/tls/server/${node}-key.pem" 4096

            # Generate server certificate request
            openssl req -new -key "/rust/bzzz-v2/config/tls/server/${node}-key.pem" \
                -out "/rust/bzzz-v2/config/tls/server/${node}.csr" \
                -subj "/C=US/ST=Cloud/L=DeepBlack/O=BZZZ/CN=${node}.deepblack.cloud"

            # Build the SAN list, including the node's own IP when known.
            local san="DNS:${node}.deepblack.cloud,DNS:${node},DNS:localhost,IP:127.0.0.1"
            if [[ -n "${node_ips[$node]:-}" ]]; then
                san="${san},IP:${node_ips[$node]}"
            fi

            # Create extensions file. No section header: 'openssl x509 -req'
            # treats a sectionless -extfile as the default extension section.
            cat > "/rust/bzzz-v2/config/tls/server/${node}-ext.cnf" << EOF
subjectAltName = ${san}
extendedKeyUsage = serverAuth,clientAuth
EOF

            # Generate server certificate.
            # BUG FIX: removed '-extensions v3_req' — the extfile contains no
            # [v3_req] section, so openssl failed to load the extensions.
            openssl x509 -req -days 365 -in "/rust/bzzz-v2/config/tls/server/${node}.csr" \
                -CA /rust/bzzz-v2/config/tls/ca/ca.pem \
                -CAkey /rust/bzzz-v2/config/tls/ca/ca-key.pem \
                -out "/rust/bzzz-v2/config/tls/server/${node}.pem" \
                -extfile "/rust/bzzz-v2/config/tls/server/${node}-ext.cnf" \
                -CAcreateserial

            # Clean up CSR and extensions file
            rm "/rust/bzzz-v2/config/tls/server/${node}.csr" "/rust/bzzz-v2/config/tls/server/${node}-ext.cnf"

            log "Generated TLS certificate for $node"
        fi
    done

    # Generate client certificates for inter-service communication
    if [[ ! -f /rust/bzzz-v2/config/tls/client/client-key.pem ]]; then
        openssl genrsa -out /rust/bzzz-v2/config/tls/client/client-key.pem 4096
        openssl req -new -key /rust/bzzz-v2/config/tls/client/client-key.pem \
            -out /rust/bzzz-v2/config/tls/client/client.csr \
            -subj "/C=US/ST=Cloud/L=DeepBlack/O=BZZZ/CN=bzzz-client"

        openssl x509 -req -days 365 -in /rust/bzzz-v2/config/tls/client/client.csr \
            -CA /rust/bzzz-v2/config/tls/ca/ca.pem \
            -CAkey /rust/bzzz-v2/config/tls/ca/ca-key.pem \
            -out /rust/bzzz-v2/config/tls/client/client.pem \
            -CAcreateserial

        rm /rust/bzzz-v2/config/tls/client/client.csr

        log "Generated client certificate"
    fi

    # Set appropriate permissions: keys/certs readable by owner only, then
    # restore execute (traverse) permission on the directories themselves.
    chmod -R 600 /rust/bzzz-v2/config/tls
    chmod 755 /rust/bzzz-v2/config/tls /rust/bzzz-v2/config/tls/{ca,server,client}

    success "TLS certificates configured"
}
|
||||
|
||||
# Provision random credentials under /rust/bzzz-v2/config/secrets and mirror
# each one into a Docker Swarm secret named bzzz_<name>. Existing files and
# secrets are left untouched, so the function is safe to re-run.
configure_secrets_management() {
    log "Configuring secrets management..."

    # Restricted directory holding the raw secret material
    mkdir -p /rust/bzzz-v2/config/secrets
    chmod 700 /rust/bzzz-v2/config/secrets

    local names=(
        "postgres_password"
        "redis_password"
        "grafana_admin_password"
        "prometheus_web_password"
        "alertmanager_web_password"
    )

    # Pass 1: generate any missing secret files (32 random bytes, base64).
    local name path
    for name in "${names[@]}"; do
        path="/rust/bzzz-v2/config/secrets/${name}"
        if [[ ! -f "$path" ]]; then
            openssl rand -base64 32 > "$path"
            chmod 600 "$path"
            log "Generated secret: $name"
        fi
    done

    # Pass 2: register each file as a Docker secret unless already present.
    for name in "${names[@]}"; do
        path="/rust/bzzz-v2/config/secrets/${name}"
        if docker secret inspect "bzzz_${name}" >/dev/null 2>&1; then
            log "Docker secret bzzz_${name} already exists"
        else
            docker secret create "bzzz_${name}" "$path"
            log "Created Docker secret: bzzz_${name}"
        fi
    done

    # The OpenAI API key is managed out-of-band; import it when available.
    local openai_key_file="/home/tony/chorus/business/secrets/openai-api-key"
    if [[ -f "$openai_key_file" ]]; then
        if ! docker secret inspect bzzz_openai_api_key >/dev/null 2>&1; then
            docker secret create bzzz_openai_api_key "$openai_key_file"
            log "Created OpenAI API key secret"
        fi
    else
        warn "OpenAI API key not found at $openai_key_file"
    fi

    success "Secrets management configured"
}
|
||||
|
||||
# Apply host-level network hardening: an iptables ruleset for container
# isolation plus kernel sysctl parameters.
# BUG FIX: the previous version appended the sysctl parameters to
# /etc/sysctl.conf with 'tee -a' on every run, duplicating entries each time
# the script was re-executed. The parameters now live in a dedicated
# /etc/sysctl.d drop-in that is overwritten atomically, and are applied with
# 'sysctl --system' (plain 'sysctl -p' only reads /etc/sysctl.conf).
setup_network_security() {
    log "Setting up network security..."

    # Configure iptables rules for container isolation.
    # NOTE(review): configure_firewall manages rules through UFW; loading a
    # raw ruleset with iptables-restore replaces the running tables and can
    # clobber UFW's chains — confirm which mechanism is authoritative.
    cat << 'EOF' | sudo tee /etc/iptables/rules.v4 > /dev/null
*filter
:INPUT ACCEPT [0:0]
:FORWARD DROP [0:0]
:OUTPUT ACCEPT [0:0]
:DOCKER-USER - [0:0]

# Allow established connections
-A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT

# Allow loopback
-A INPUT -i lo -j ACCEPT

# Allow SSH (with rate limiting)
-A INPUT -p tcp --dport 22 -m state --state NEW -m recent --set
-A INPUT -p tcp --dport 22 -m state --state NEW -m recent --update --seconds 60 --hitcount 4 -j DROP
-A INPUT -p tcp --dport 22 -j ACCEPT

# Allow HTTP/HTTPS
-A INPUT -p tcp --dport 80 -j ACCEPT
-A INPUT -p tcp --dport 443 -j ACCEPT

# Allow Docker Swarm (internal network only)
-A INPUT -s 192.168.1.0/24 -p tcp --dport 2376 -j ACCEPT
-A INPUT -s 192.168.1.0/24 -p tcp --dport 2377 -j ACCEPT
-A INPUT -s 192.168.1.0/24 -p tcp --dport 7946 -j ACCEPT
-A INPUT -s 192.168.1.0/24 -p udp --dport 7946 -j ACCEPT
-A INPUT -s 192.168.1.0/24 -p udp --dport 4789 -j ACCEPT

# Allow BZZZ P2P (internal network only)
-A INPUT -s 192.168.1.0/24 -p tcp --dport 9000:9300 -j ACCEPT
-A INPUT -s 192.168.1.0/24 -p udp --dport 9000:9300 -j ACCEPT

# Block container-to-host access except for specific services
-A DOCKER-USER -i docker_gwbridge -j ACCEPT
-A DOCKER-USER -i docker0 -j ACCEPT
-A DOCKER-USER -j DROP

# Drop everything else
-A INPUT -j DROP

COMMIT
EOF

    # Apply iptables rules
    sudo iptables-restore < /etc/iptables/rules.v4

    # Kernel security parameters, written to an idempotent sysctl.d drop-in
    # (overwritten, never appended). Includes the IP forwarding settings
    # Docker requires.
    cat << 'EOF' | sudo tee /etc/sysctl.d/99-bzzz-security.conf > /dev/null
# BZZZ v2 Security Parameters

# IP forwarding required by Docker
net.ipv4.ip_forward=1
net.ipv6.conf.all.forwarding=1

net.ipv4.conf.all.rp_filter=1
net.ipv4.conf.default.rp_filter=1
net.ipv4.icmp_echo_ignore_broadcasts=1
net.ipv4.icmp_ignore_bogus_error_responses=1
net.ipv4.tcp_syncookies=1
net.ipv4.conf.all.log_martians=1
net.ipv4.conf.default.log_martians=1
net.ipv4.conf.all.accept_source_route=0
net.ipv4.conf.default.accept_source_route=0
net.ipv6.conf.all.accept_source_route=0
net.ipv6.conf.default.accept_source_route=0
net.ipv4.conf.all.accept_redirects=0
net.ipv4.conf.default.accept_redirects=0
net.ipv6.conf.all.accept_redirects=0
net.ipv6.conf.default.accept_redirects=0
net.ipv4.conf.all.secure_redirects=0
net.ipv4.conf.default.secure_redirects=0
net.ipv4.conf.all.send_redirects=0
net.ipv4.conf.default.send_redirects=0

# Kernel hardening
kernel.dmesg_restrict=1
kernel.kptr_restrict=2
kernel.yama.ptrace_scope=1
fs.suid_dumpable=0
kernel.core_uses_pid=1
EOF

    # Apply settings from all sysctl configuration files, including the
    # drop-in written above.
    sudo sysctl --system

    success "Network security configured"
}
|
||||
|
||||
# Install auditd, write the BZZZ v2 audit ruleset into /etc/audit/rules.d/,
# load it, and hook audit logs into logrotate.
configure_audit_logging() {
    log "Configuring audit logging..."

    # Install auditd if not present
    if ! command -v auditctl &> /dev/null; then
        sudo apt-get update
        sudo apt-get install -y auditd audispd-plugins
    fi

    # Configure audit rules.
    # NOTE(review): the execve/socket rules below log every process launch and
    # every socket creation on the host — expect high log volume; confirm the
    # retention budget before rollout.
    cat << 'EOF' | sudo tee /etc/audit/rules.d/bzzz-v2.rules > /dev/null
# BZZZ v2 Audit Rules

# Monitor file changes in sensitive directories
-w /etc/docker/ -p wa -k docker-config
-w /rust/bzzz-v2/config/secrets/ -p wa -k bzzz-secrets
-w /rust/bzzz-v2/config/tls/ -p wa -k bzzz-tls
-w /etc/ssl/ -p wa -k ssl-config

# Monitor process execution
-a always,exit -F arch=b64 -S execve -k process-execution
-a always,exit -F arch=b32 -S execve -k process-execution

# Monitor network connections
-a always,exit -F arch=b64 -S socket -k network-socket
-a always,exit -F arch=b32 -S socket -k network-socket

# Monitor file permission changes
-a always,exit -F arch=b64 -S chmod,fchmod,fchmodat -k file-permissions
-a always,exit -F arch=b32 -S chmod,fchmod,fchmodat -k file-permissions

# Monitor privilege escalation
-w /usr/bin/sudo -p x -k privilege-escalation
-w /bin/su -p x -k privilege-escalation

# Monitor Docker daemon
-w /var/lib/docker/ -p wa -k docker-data
-w /usr/bin/docker -p x -k docker-exec
-w /usr/bin/dockerd -p x -k docker-daemon

# Make rules immutable (requires a reboot to change rules afterwards)
-e 2
EOF

    # Load the rules. augenrules(8) is the documented way to compile and load
    # everything under /etc/audit/rules.d/; a plain `systemctl restart auditd`
    # is refused on many distributions (the unit sets RefuseManualStop=yes).
    if command -v augenrules &> /dev/null; then
        sudo augenrules --load
    else
        sudo systemctl restart auditd
    fi

    # Configure log rotation for audit logs.
    # `service auditd rotate` sends SIGUSR1, the supported way to reopen the
    # audit log after rotation (restarting the daemon would drop rules loaded
    # with -e 2 until reboot).
    cat << 'EOF' | sudo tee /etc/logrotate.d/bzzz-audit > /dev/null
/var/log/audit/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 640 root adm
    postrotate
        /sbin/service auditd rotate > /dev/null 2>&1 || true
    endscript
}
EOF

    success "Audit logging configured"
}
|
||||
|
||||
# Install fail2ban and configure jails + filters for SSH, the Docker daemon,
# the BZZZ P2P ports, and Traefik. The heredoc bodies below are written
# verbatim to /etc/fail2ban/ — do not edit them casually.
setup_intrusion_detection() {
    log "Setting up intrusion detection..."

    # Install fail2ban if not present
    if ! command -v fail2ban-server &> /dev/null; then
        sudo apt-get update
        sudo apt-get install -y fail2ban
    fi

    # Configure fail2ban for BZZZ v2.
    # Defaults: 5 failures within 10 minutes => 1 hour ban; jails below
    # override per-service. `backend = systemd` reads the journal, but three
    # jails also name explicit logpath files — TODO confirm those paths exist
    # on every node or the jail will fail to start.
    cat << 'EOF' | sudo tee /etc/fail2ban/jail.d/bzzz-v2.conf > /dev/null
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 5
backend = systemd

[sshd]
enabled = true
port = ssh
filter = sshd
logpath = /var/log/auth.log
maxretry = 3
bantime = 7200

[docker-auth]
enabled = true
port = 2376
filter = docker-auth
logpath = /var/log/audit/audit.log
maxretry = 3
bantime = 3600

[bzzz-p2p]
enabled = true
port = 9000:9300
filter = bzzz-p2p
logpath = /rust/bzzz-v2/logs/application/bzzz-agent.log
maxretry = 10
bantime = 1800

[traefik-auth]
enabled = true
port = http,https
filter = traefik-auth
logpath = /var/log/traefik/access.log
maxretry = 5
bantime = 3600
EOF

    # Custom filter: failed dockerd syscalls recorded by auditd.
    # NOTE(review): audit SYSCALL records carry no client IP, so this filter
    # has no <HOST> capture — verify fail2ban accepts it before relying on it.
    cat << 'EOF' | sudo tee /etc/fail2ban/filter.d/docker-auth.conf > /dev/null
[Definition]
failregex = ^.*type=SYSCALL.*comm="dockerd".*res=failed.*$
ignoreregex =
EOF

    # Custom filter: unauthorized-peer and rate-limit warnings emitted by the
    # BZZZ agent's structured log output.
    cat << 'EOF' | sudo tee /etc/fail2ban/filter.d/bzzz-p2p.conf > /dev/null
[Definition]
failregex = ^.*level=error.*msg="unauthorized connection attempt".*peer=<HOST>.*$
            ^.*level=warn.*msg="rate limit exceeded".*source=<HOST>.*$
ignoreregex =
EOF

    # Custom filter: 401/403 responses in Traefik's access log.
    cat << 'EOF' | sudo tee /etc/fail2ban/filter.d/traefik-auth.conf > /dev/null
[Definition]
failregex = ^<HOST>.*"(GET|POST|PUT|DELETE).*" (401|403) .*$
ignoreregex =
EOF

    # Start and enable fail2ban
    sudo systemctl enable fail2ban
    sudo systemctl start fail2ban

    success "Intrusion detection configured"
}
|
||||
|
||||
# Install an AppArmor profile and a seccomp allowlist for BZZZ containers.
configure_container_security() {
    log "Configuring container security policies..."

    # Create AppArmor profile for BZZZ containers.
    # NOTE(review): bare `capability,` grants ALL capabilities; consider
    # narrowing to the specific set the agent needs once known.
    cat << 'EOF' | sudo tee /etc/apparmor.d/bzzz-container > /dev/null
#include <tunables/global>

profile bzzz-container flags=(attach_disconnected,mediate_deleted) {
  #include <abstractions/base>

  capability,
  file,
  network,

  deny @{PROC}/* w,
  deny @{PROC}/sys/fs/** w,
  deny @{PROC}/sysrq-trigger rwklx,
  deny @{PROC}/mem rwklx,
  deny @{PROC}/kmem rwklx,
  deny @{PROC}/sys/kernel/[^s][^h][^m]* w,
  deny mount,
  deny /sys/[^f]** wklx,
  deny /sys/f[^s]** wklx,
  deny /sys/fs/[^c]** wklx,
  deny /sys/fs/c[^g]** wklx,
  deny /sys/fs/cg[^r]** wklx,
  deny /sys/firmware/** rwklx,
  deny /sys/kernel/security/** rwklx,

  # Allow access to application directories
  /app/** r,
  /app/bzzz rix,
  /data/** rw,
  /config/** r,

  # Allow temporary files
  /tmp/** rw,

  # Network access
  network inet,
  network inet6,
  network unix,
}
EOF

    # Load AppArmor profile
    sudo apparmor_parser -r /etc/apparmor.d/bzzz-container

    # Create seccomp profile for BZZZ containers.
    # The allowlist includes the syscalls modern runtimes (Go, glibc/musl)
    # issue at startup — epoll_create1, getrandom, sigaltstack, madvise,
    # sched_getaffinity, newfstatat, etc. The original list omitted these,
    # which would kill the container before main() ran.
    mkdir -p /rust/bzzz-v2/config/security
    cat << 'EOF' > /rust/bzzz-v2/config/security/bzzz-seccomp.json
{
  "defaultAction": "SCMP_ACT_ERRNO",
  "architectures": [
    "SCMP_ARCH_X86_64",
    "SCMP_ARCH_X86",
    "SCMP_ARCH_X32"
  ],
  "syscalls": [
    {
      "names": [
        "accept", "accept4", "access", "arch_prctl", "bind", "brk",
        "chdir", "chmod", "chown", "clock_gettime", "clock_nanosleep",
        "clone", "clone3", "close", "connect", "dup", "dup2", "dup3",
        "epoll_create", "epoll_create1", "epoll_ctl", "epoll_pwait",
        "epoll_wait", "eventfd2", "execve", "exit", "exit_group",
        "fcntl", "fdatasync", "flock", "fstat", "fstatfs", "fsync",
        "ftruncate", "futex", "getcwd", "getdents", "getdents64",
        "getegid", "geteuid", "getgid", "getpeername", "getpid",
        "getppid", "getrandom", "getrlimit", "getsockname",
        "getsockopt", "gettid", "getuid", "listen", "lseek", "lstat",
        "madvise", "membarrier", "mkdirat", "mmap", "mprotect",
        "munmap", "nanosleep", "newfstatat", "open", "openat", "pipe",
        "pipe2", "poll", "prctl", "pread64", "prlimit64", "pselect6",
        "pwrite64", "read", "readlink", "readlinkat", "readv", "recv",
        "recvfrom", "recvmsg", "renameat", "restart_syscall",
        "rt_sigaction", "rt_sigprocmask", "rt_sigreturn",
        "rt_sigtimedwait", "sched_getaffinity", "sched_yield", "send",
        "sendmsg", "sendto", "set_robust_list", "set_tid_address",
        "setitimer", "setsockopt", "shutdown", "sigaltstack", "socket",
        "socketpair", "stat", "statfs", "tgkill", "uname", "unlinkat",
        "wait4", "write", "writev"
      ],
      "action": "SCMP_ACT_ALLOW"
    }
  ]
}
EOF

    success "Container security policies configured"
}
|
||||
|
||||
# Entry point: verify privileges, then run every hardening stage in a fixed
# order and print follow-up verification commands.
main() {
    log "Starting BZZZ v2 security hardening..."

    check_root

    # Hardening stages, executed strictly in this order.
    local -a stages=(
        configure_firewall
        configure_docker_security
        setup_tls_certificates
        configure_secrets_management
        setup_network_security
        configure_audit_logging
        setup_intrusion_detection
        configure_container_security
    )

    local stage
    for stage in "${stages[@]}"; do
        "$stage"
    done

    success "BZZZ v2 security hardening completed successfully!"
    log "Security configuration saved to: $LOG_FILE"
    log "Review firewall rules: sudo ufw status verbose"
    log "Check fail2ban status: sudo fail2ban-client status"
    log "Verify audit rules: sudo auditctl -l"
}
|
||||
|
||||
# Execute main function
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user