#!/usr/bin/env python3
"""
Cluster Setup Service for WHOOSH
Handles initial cluster setup, infrastructure discovery, and BZZZ agent deployment
"""

import asyncio
import json
import logging
import shlex
import subprocess
import tempfile
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlsplit

import aiohttp
import asyncssh

logger = logging.getLogger(__name__)


@dataclass
class ClusterNode:
    """Cluster node configuration"""

    hostname: str
    ip_address: str
    ssh_user: str
    ssh_port: int = 22
    ssh_key_path: Optional[str] = None
    # Excluded from repr so the password cannot end up in logs or tracebacks.
    ssh_password: Optional[str] = field(default=None, repr=False)
    role: str = "worker"  # coordinator, worker, storage
    status: str = "pending"  # pending, connecting, ready, error
    capabilities: List[str] = field(default_factory=list)
    ollama_models: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Callers may still pass None explicitly; normalize to empty lists.
        if self.capabilities is None:
            self.capabilities = []
        if self.ollama_models is None:
            self.ollama_models = []


@dataclass
class ClusterSetupState:
    """Overall cluster setup state"""

    infrastructure_configured: bool = False
    age_keys_generated: bool = False
    models_selected: bool = False
    first_agent_deployed: bool = False
    cluster_initialized: bool = False
    nodes: List[ClusterNode] = field(default_factory=list)
    selected_models: List[str] = field(default_factory=list)
    # Holds the Age private key; excluded from repr to keep it out of logs.
    age_keys: Dict[str, str] = field(default_factory=dict, repr=False)

    def __post_init__(self):
        # Callers may still pass None explicitly; normalize to empty containers.
        if self.nodes is None:
            self.nodes = []
        if self.selected_models is None:
            self.selected_models = []
        if self.age_keys is None:
            self.age_keys = {}


class ClusterSetupService:
    """
    Service for setting up the WHOOSH distributed cluster infrastructure.
    Handles infrastructure discovery, age key generation, model selection, and BZZZ deployment.

    Every public step returns a dict with a boolean ``"success"`` key; errors
    are logged and reported in an ``"error"`` entry instead of being raised.
    """

    def __init__(self):
        self.setup_state = ClusterSetupState()
        self.session: Optional[aiohttp.ClientSession] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _connect(node: ClusterNode):
        """Return the asyncssh connection context manager for ``node``.

        Key authentication is used when ``ssh_key_path`` is set, otherwise the
        (possibly ``None``) password is passed through.
        """
        if node.ssh_key_path:
            auth_kwargs: Dict[str, Any] = {"client_keys": [node.ssh_key_path]}
        else:
            auth_kwargs = {"password": node.ssh_password}

        return asyncssh.connect(
            node.ip_address,
            port=node.ssh_port,
            username=node.ssh_user,
            # NOTE(security): host key verification is disabled, which allows
            # man-in-the-middle attacks. Kept as-is to preserve behaviour.
            known_hosts=None,
            **auth_kwargs,
        )

    @staticmethod
    def _format_remote_error(exc: BaseException) -> str:
        """Render an exception, appending remote stderr when the exception carries it."""
        message = str(exc)
        stderr = getattr(exc, "stderr", None)
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        if stderr and stderr.strip():
            message = f"{message}: {stderr.strip()}"
        return message

    @staticmethod
    async def _run_best_effort(conn: Any, command: str, description: str) -> Any:
        """Run a provisioning command whose failure must not abort the step.

        A non-zero exit status is logged as a warning so it is no longer
        silently ignored; the completed-process result is returned either way.
        """
        result = await conn.run(command)
        if result.exit_status:
            logger.warning(
                "⚠️ %s exited with status %s: %s",
                description,
                result.exit_status,
                (result.stderr or "").strip(),
            )
        return result

    @staticmethod
    def _parse_age_keygen_output(output: str) -> Dict[str, str]:
        """Extract the key pair from ``age-keygen`` stdout.

        The secret key is on a line starting with ``AGE-SECRET-KEY-``. The
        public key is read from the ``# public key: age1...`` comment line;
        a bare line starting with ``age`` is also accepted.

        Returns:
            Dict with ``private_key`` and ``public_key`` (empty string when absent).
        """
        private_key = ""
        public_key = ""
        for raw_line in output.splitlines():
            line = raw_line.strip()
            if line.startswith("AGE-SECRET-KEY-"):
                private_key = line
            elif line.lower().startswith("# public key:"):
                public_key = line.split(":", 1)[1].strip()
            elif line.startswith("age"):
                public_key = line
        return {"private_key": private_key, "public_key": public_key}

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    async def initialize(self) -> bool:
        """Initialize the cluster setup service

        Creates the shared HTTP session (reusing an open one if present) and
        probes for an existing cluster.

        Returns:
            True on success, False if initialization raised.
        """
        try:
            logger.info("🚀 Initializing Cluster Setup Service")

            # Avoid leaking the previous session when initialize() is called twice.
            if self.session is None or self.session.closed:
                self.session = aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=30)
                )

            # Check if cluster is already set up
            await self._detect_existing_cluster()

            logger.info("✅ Cluster Setup Service initialized")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to initialize cluster setup service: {e}")
            return False

    async def _detect_existing_cluster(self) -> None:
        """Detect if cluster infrastructure already exists

        Queries ``/api/agent/status`` on each known endpoint; any endpoint that
        answers HTTP 200 is recorded as a ready node and the setup state is
        marked as configured, deployed and initialized.
        """
        try:
            # Check for existing BZZZ agents on known endpoints
            known_endpoints: List[str] = [
                # Direct BZZZ connections disabled - WHOOSH should use BZZZ API instead
                # "http://192.168.1.27:8080",   # walnut
                # "http://192.168.1.72:8080",   # acacia
                # "http://192.168.1.113:8080",  # ironwood
                # "http://192.168.1.106:8080",  # oak
            ]

            active_nodes = []
            for endpoint in known_endpoints:
                try:
                    host = urlsplit(endpoint).hostname or endpoint
                    async with self.session.get(
                        f"{endpoint}/api/agent/status",
                        timeout=aiohttp.ClientTimeout(total=5),
                    ) as response:
                        if response.status == 200:
                            data = await response.json()
                            node_info = ClusterNode(
                                hostname=data.get("hostname", host),
                                ip_address=host,
                                ssh_user="auto-detected",
                                status="ready",
                                capabilities=data.get("capabilities", []),
                                ollama_models=data.get("models", []),
                            )
                            active_nodes.append(node_info)
                            logger.info(f"🔍 Detected active BZZZ agent: {endpoint}")
                except Exception as e:
                    logger.debug(f"No BZZZ agent at {endpoint}: {e}")

            if active_nodes:
                self.setup_state.nodes = active_nodes
                self.setup_state.infrastructure_configured = True
                self.setup_state.first_agent_deployed = True
                self.setup_state.cluster_initialized = True
                logger.info(
                    f"🎯 Detected existing cluster with {len(active_nodes)} nodes"
                )
            else:
                logger.info("🆕 No existing cluster detected - fresh setup required")

        except Exception as e:
            logger.error(f"❌ Error detecting existing cluster: {e}")

    # ------------------------------------------------------------------
    # Status
    # ------------------------------------------------------------------

    async def get_setup_status(self) -> Dict[str, Any]:
        """Get current cluster setup status

        Node entries keep the ``ssh_password`` key for shape compatibility, but
        its value is always ``None`` so credentials never leave the service.
        """
        nodes = []
        for node in self.setup_state.nodes:
            node_info = asdict(node)
            node_info["ssh_password"] = None  # never expose credentials
            nodes.append(node_info)

        return {
            "cluster_exists": self.setup_state.cluster_initialized,
            "infrastructure_configured": self.setup_state.infrastructure_configured,
            "age_keys_generated": self.setup_state.age_keys_generated,
            "models_selected": self.setup_state.models_selected,
            "first_agent_deployed": self.setup_state.first_agent_deployed,
            "cluster_initialized": self.setup_state.cluster_initialized,
            "nodes": nodes,
            "selected_models": self.setup_state.selected_models,
            "next_step": self._get_next_setup_step(),
        }

    def _get_next_setup_step(self) -> str:
        """Determine the next step in cluster setup

        Returns the name of the first step whose flag is still False, or
        ``"complete"`` when every flag is set.
        """
        state = self.setup_state
        ordered_steps = (
            (state.infrastructure_configured, "configure_infrastructure"),
            (state.age_keys_generated, "generate_age_keys"),
            (state.models_selected, "select_models"),
            (state.first_agent_deployed, "deploy_first_agent"),
            (state.cluster_initialized, "initialize_cluster"),
        )
        for done, step_name in ordered_steps:
            if not done:
                return step_name
        return "complete"

    # ------------------------------------------------------------------
    # Model catalogue
    # ------------------------------------------------------------------

    async def fetch_ollama_models(self) -> List[Dict[str, Any]]:
        """Return the catalogue of Ollama models offered during setup

        The list is a static, curated catalogue built on every call; no network
        request to the ollama.com registry is made.
        """
        models = [
            # Popular General Purpose Models
            {
                "name": "llama3.1:8b",
                "description": "Llama 3.1 8B - State-of-the-art model from Meta available in 8B parameters",
                "size": "4.7GB",
                "category": "general",
                "capabilities": ["tools", "chat", "reasoning", "code"],
            },
            {
                "name": "llama3.1:70b",
                "description": "Llama 3.1 70B - Large high-performance model for demanding tasks",
                "size": "40GB",
                "category": "advanced",
                "capabilities": ["tools", "chat", "reasoning", "code", "complex"],
            },
            {
                "name": "llama3.2:3b",
                "description": "Meta's Llama 3.2 3B - Compact model that runs efficiently",
                "size": "2.0GB",
                "category": "general",
                "capabilities": ["tools", "chat", "lightweight"],
            },
            {
                "name": "llama3.2:1b",
                "description": "Meta's Llama 3.2 1B - Ultra lightweight for edge devices",
                "size": "1.3GB",
                "category": "lightweight",
                "capabilities": ["tools", "chat", "edge", "fast"],
            },
            # Coding Models
            {
                "name": "qwen2.5-coder:7b",
                "description": "Latest Code-Specific Qwen model with significant improvements in code generation",
                "size": "4.1GB",
                "category": "code",
                "capabilities": ["tools", "code", "reasoning", "programming"],
            },
            {
                "name": "codellama:7b",
                "description": "Code Llama 7B - Large language model for code generation and discussion",
                "size": "3.8GB",
                "category": "code",
                "capabilities": ["code", "programming", "debugging"],
            },
            {
                "name": "deepseek-coder:6.7b",
                "description": "DeepSeek Coder 6.7B - Trained on code and natural language tokens",
                "size": "3.8GB",
                "category": "code",
                "capabilities": ["code", "programming", "generation"],
            },
            # Reasoning Models
            {
                "name": "deepseek-r1:7b",
                "description": "DeepSeek-R1 7B - Open reasoning model with advanced thinking capabilities",
                "size": "4.2GB",
                "category": "reasoning",
                "capabilities": ["tools", "thinking", "reasoning", "analysis"],
            },
            {
                "name": "qwen3:8b",
                "description": "Qwen3 8B - Latest generation with dense and mixture-of-experts models",
                "size": "4.6GB",
                "category": "general",
                "capabilities": ["tools", "thinking", "reasoning", "multilingual"],
            },
            # Efficient Models
            {
                "name": "mistral:7b",
                "description": "Mistral 7B - Fast general purpose model updated to version 0.3",
                "size": "4.1GB",
                "category": "general",
                "capabilities": ["tools", "chat", "reasoning", "fast"],
            },
            {
                "name": "gemma2:9b",
                "description": "Google Gemma 2 9B - High-performing efficient model with multilingual support",
                "size": "5.4GB",
                "category": "general",
                "capabilities": ["chat", "reasoning", "math", "analysis"],
            },
            {
                "name": "qwen2.5:7b",
                "description": "Qwen2.5 7B - Multilingual model with 128K context length",
                "size": "4.4GB",
                "category": "general",
                "capabilities": ["tools", "chat", "multilingual", "reasoning"],
            },
            # Embedding Models
            {
                "name": "nomic-embed-text",
                "description": "High-performing open embedding model with large token context window",
                "size": "274MB",
                "category": "embedding",
                "capabilities": ["embedding", "search", "similarity"],
            },
            {
                "name": "mxbai-embed-large",
                "description": "State-of-the-art large embedding model from mixedbread.ai",
                "size": "670MB",
                "category": "embedding",
                "capabilities": ["embedding", "search", "retrieval"],
            },
        ]

        logger.info(f"📋 Fetched {len(models)} available models from registry")
        return models

    # ------------------------------------------------------------------
    # Step 1: infrastructure
    # ------------------------------------------------------------------

    async def configure_infrastructure(self, nodes: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Configure cluster infrastructure with provided node information

        Args:
            nodes: One dict per node. ``hostname``, ``ip_address`` and
                ``ssh_user`` are required; ``ssh_port``, ``ssh_key_path``,
                ``ssh_password`` and ``role`` are optional.

        Returns:
            On success: counts of configured/accessible nodes and the per-node
            connectivity results. On failure: ``{"success": False, "error": ...}``.
        """
        try:
            logger.info(f"🏗️ Configuring infrastructure with {len(nodes)} nodes")

            # Convert dict nodes to ClusterNode objects
            cluster_nodes = [
                ClusterNode(
                    hostname=node_data["hostname"],
                    ip_address=node_data["ip_address"],
                    ssh_user=node_data["ssh_user"],
                    ssh_port=node_data.get("ssh_port", 22),
                    ssh_key_path=node_data.get("ssh_key_path"),
                    ssh_password=node_data.get("ssh_password"),
                    role=node_data.get("role", "worker"),
                )
                for node_data in nodes
            ]

            # Test SSH connectivity to all nodes
            connectivity_results = await self._test_node_connectivity(cluster_nodes)

            # Update node statuses based on connectivity (results are in node order)
            for node, result in zip(cluster_nodes, connectivity_results):
                node.status = "ready" if result["success"] else "error"

            self.setup_state.nodes = cluster_nodes
            self.setup_state.infrastructure_configured = True

            successful_nodes = sum(
                1 for result in connectivity_results if result["success"]
            )

            return {
                "success": True,
                "nodes_configured": len(nodes),
                "nodes_accessible": successful_nodes,
                "connectivity_results": connectivity_results,
            }

        except Exception as e:
            logger.error(f"❌ Error configuring infrastructure: {e}")
            return {"success": False, "error": str(e)}

    async def _test_node_connectivity(self, nodes: List[ClusterNode]) -> List[Dict[str, Any]]:
        """Test SSH connectivity to all cluster nodes

        All nodes are probed concurrently. The returned list has one entry per
        node, in the same order as ``nodes``.
        """

        async def test_node(node: ClusterNode) -> Dict[str, Any]:
            try:
                async with self._connect(node) as conn:
                    result = await conn.run('echo "SSH test successful"')
                    return {
                        "hostname": node.hostname,
                        "success": True,
                        "message": "SSH connection successful",
                        "output": result.stdout.strip(),
                    }
            except Exception as e:
                return {
                    "hostname": node.hostname,
                    "success": False,
                    "message": f"SSH connection failed: {str(e)}",
                }

        # Test all nodes concurrently
        results = await asyncio.gather(
            *(test_node(node) for node in nodes), return_exceptions=True
        )

        # Anything that escaped test_node (e.g. cancellation) is reported as a failure
        formatted_results = []
        for node, result in zip(nodes, results):
            if isinstance(result, BaseException):
                formatted_results.append({
                    "hostname": node.hostname,
                    "success": False,
                    "message": f"Connection test failed: {str(result)}",
                })
            else:
                formatted_results.append(result)

        return formatted_results

    # ------------------------------------------------------------------
    # Step 2: Age keys
    # ------------------------------------------------------------------

    async def generate_age_keys(self) -> Dict[str, Any]:
        """Generate Age encryption keys for secure P2P communication

        Runs ``age-keygen`` as an asyncio subprocess so the event loop is not
        blocked, parses the key pair from its stdout and stores it in the setup
        state. Only the public key is returned to the caller.

        Returns:
            ``{"success": True, "public_key": ..., "message": ...}`` or
            ``{"success": False, "error": ...}``.
        """
        try:
            logger.info("🔐 Generating Age encryption keys")

            process = await asyncio.create_subprocess_exec(
                "age-keygen",
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout_bytes, stderr_bytes = await process.communicate()
            stdout_text = stdout_bytes.decode("utf-8", errors="replace")
            stderr_text = stderr_bytes.decode("utf-8", errors="replace")

            if process.returncode != 0:
                raise RuntimeError(f"age-keygen failed: {stderr_text}")

            keys = self._parse_age_keygen_output(stdout_text)
            if not keys["private_key"] or not keys["public_key"]:
                # Do not mark the step complete with an unusable key pair.
                raise RuntimeError("could not parse key pair from age-keygen output")

            self.setup_state.age_keys = {
                "private_key": keys["private_key"],
                "public_key": keys["public_key"],
                "generated_at": datetime.now(timezone.utc).isoformat(),
            }
            self.setup_state.age_keys_generated = True

            logger.info("✅ Age keys generated successfully")

            return {
                "success": True,
                "public_key": keys["public_key"],
                "message": "Age encryption keys generated successfully",
            }

        except FileNotFoundError:
            logger.error("❌ age-keygen command not found - please install age")
            return {
                "success": False,
                "error": "age-keygen command not found - please install age encryption tool",
            }
        except Exception as e:
            logger.error(f"❌ Error generating age keys: {e}")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Step 3: model selection
    # ------------------------------------------------------------------

    async def select_models(self, model_names: List[str]) -> Dict[str, Any]:
        """Select models for the cluster

        Stores ``model_names`` in the setup state and marks the step done.
        """
        try:
            logger.info(f"📦 Selecting {len(model_names)} models for cluster")

            self.setup_state.selected_models = model_names
            self.setup_state.models_selected = True

            return {
                "success": True,
                "selected_models": model_names,
                "message": f"Selected {len(model_names)} models for deployment",
            }

        except Exception as e:
            logger.error(f"❌ Error selecting models: {e}")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Step 4: first agent
    # ------------------------------------------------------------------

    async def deploy_first_agent(self, coordinator_node_hostname: str) -> Dict[str, Any]:
        """Deploy the first BZZZ agent and pull selected models

        Args:
            coordinator_node_hostname: Hostname of a node previously registered
                via ``configure_infrastructure``.

        Returns:
            On success: deployment details, per-model pull results and
            ``models_pulled`` (the number of models that actually pulled).
            If the agent deployment fails, that failure dict is returned as-is.
        """
        try:
            logger.info(f"🚀 Deploying first BZZZ agent to {coordinator_node_hostname}")

            # Find the coordinator node
            coordinator_node = next(
                (
                    node
                    for node in self.setup_state.nodes
                    if node.hostname == coordinator_node_hostname
                ),
                None,
            )
            if coordinator_node is None:
                raise ValueError(
                    f"Coordinator node {coordinator_node_hostname} not found"
                )

            # Deploy BZZZ agent via SSH
            deployment_result = await self._deploy_bzzz_agent(
                coordinator_node, is_coordinator=True
            )
            if not deployment_result["success"]:
                return deployment_result

            # Pull selected models on the coordinator
            model_results = await self._pull_models_on_node(
                coordinator_node, self.setup_state.selected_models
            )
            # Record only the models that really pulled, as a fresh list so the
            # node does not alias the shared selection.
            pulled_models = [
                entry["model"]
                for entry in model_results
                if entry.get("success") and "model" in entry
            ]

            self.setup_state.first_agent_deployed = True
            coordinator_node.status = "ready"
            coordinator_node.ollama_models = pulled_models

            return {
                "success": True,
                "coordinator": coordinator_node_hostname,
                "models_pulled": len(pulled_models),
                "deployment_details": deployment_result,
                "model_results": model_results,
            }

        except Exception as e:
            logger.error(f"❌ Error deploying first agent: {e}")
            return {"success": False, "error": str(e)}

    async def _deploy_bzzz_agent(self, node: ClusterNode, is_coordinator: bool = False) -> Dict[str, Any]:
        """Deploy BZZZ agent as native systemd service to a specific node

        Steps over SSH: install the toolchain (best effort), re-clone the BZZZ
        repository, build it, write ``config/bzzz.json`` and run
        ``install-service.sh``. Apart from the toolchain install, any step that
        exits non-zero fails the deployment.

        Returns:
            ``{"success": True, ...build/install output...}`` or
            ``{"success": False, "error": ...}``.
        """
        bzzz_dir = "~/chorus/project-queues/active/BZZZ"
        try:
            async with self._connect(node) as conn:
                # Install Go and Git if not present. Best effort: the toolchain
                # may already exist even when the package manager call fails.
                await self._run_best_effort(
                    conn,
                    "sudo apt-get update && sudo apt-get install -y golang-go git build-essential",
                    f"Toolchain install on {node.hostname}",
                )

                # Clone BZZZ repository (WARNING: wipes any existing ~/chorus tree)
                await conn.run(
                    "rm -rf ~/chorus && mkdir -p ~/chorus/project-queues/active",
                    check=True,
                )
                await conn.run(
                    "cd ~/chorus/project-queues/active && git clone https://gitea.deepblack.cloud/tony/BZZZ.git",
                    check=True,
                )

                # Build BZZZ binary
                build_result = await conn.run(
                    f"cd {bzzz_dir} && go build -o bzzz", check=True
                )

                # Create BZZZ configuration (if needed - check if BZZZ uses config files)
                config = {
                    "node": {"id": node.hostname},
                    "agent": {"id": f"bzzz-{node.hostname}", "role": node.role},
                    "api": {"host": "0.0.0.0", "port": 8080},
                    "p2p": {"port": 4001},
                    "coordinator": is_coordinator,
                }

                # Write config file (adjust path as needed). The JSON is shell
                # quoted so hostnames/roles containing quotes cannot break out.
                config_json = json.dumps(config, indent=2)
                await conn.run(
                    f"mkdir -p {bzzz_dir}/config && "
                    f"echo {shlex.quote(config_json)} > {bzzz_dir}/config/bzzz.json",
                    check=True,
                )

                # Install BZZZ as systemd service
                install_result = await conn.run(
                    f"cd {bzzz_dir} && sudo ./install-service.sh", check=True
                )

                return {
                    "success": True,
                    "message": f"BZZZ agent deployed as systemd service to {node.hostname}",
                    "build_output": build_result.stdout,
                    "install_output": install_result.stdout,
                }

        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to deploy BZZZ agent to {node.hostname}: {self._format_remote_error(e)}",
            }

    async def _pull_models_on_node(self, node: ClusterNode, models: List[str]) -> List[Dict[str, Any]]:
        """Pull Ollama models on a specific node

        Returns:
            One ``{"model", "success", "output" | "error"}`` dict per model. If
            the SSH session itself fails, a single
            ``{"error": ..., "success": False}`` entry is returned.
        """
        try:
            async with self._connect(node) as conn:
                # Install Ollama if not present (best effort)
                await self._run_best_effort(
                    conn,
                    "curl -fsSL https://ollama.com/install.sh | sh",
                    f"Ollama install on {node.hostname}",
                )

                # Start Ollama service (best effort)
                await self._run_best_effort(
                    conn,
                    "sudo systemctl enable ollama && sudo systemctl start ollama",
                    f"Ollama service start on {node.hostname}",
                )

                # Pull each model
                results = []
                for model in models:
                    try:
                        # check=True makes a failed pull raise instead of being
                        # reported as a success.
                        result = await conn.run(
                            f"ollama pull {shlex.quote(model)}", check=True
                        )
                        results.append({
                            "model": model,
                            "success": True,
                            "output": result.stdout,
                        })
                        logger.info(f"✅ Pulled model {model} on {node.hostname}")
                    except Exception as e:
                        error_text = self._format_remote_error(e)
                        results.append({
                            "model": model,
                            "success": False,
                            "error": error_text,
                        })
                        logger.error(
                            f"❌ Failed to pull model {model} on {node.hostname}: {error_text}"
                        )

                return results

        except Exception as e:
            logger.error(f"❌ Error pulling models on {node.hostname}: {e}")
            return [{"error": str(e), "success": False}]

    # ------------------------------------------------------------------
    # Step 5: remaining nodes
    # ------------------------------------------------------------------

    async def initialize_cluster(self) -> Dict[str, Any]:
        """Initialize the complete cluster with P2P model distribution

        Deploys a BZZZ agent to every node whose status is not ``"ready"`` and
        marks the cluster as initialized regardless of individual failures;
        per-node outcomes are in ``deployment_results``.
        """
        try:
            logger.info("🌐 Initializing complete cluster")

            # Deploy BZZZ agents to remaining nodes
            remaining_nodes = [
                node for node in self.setup_state.nodes if node.status != "ready"
            ]

            deployment_results = []
            for node in remaining_nodes:
                result = await self._deploy_bzzz_agent(node, is_coordinator=False)
                deployment_results.append(result)

                if result["success"]:
                    node.status = "ready"

            # TODO: Implement P2P model distribution via BZZZ network
            # For now, we'll note that models should be distributed via P2P

            self.setup_state.cluster_initialized = True

            successful_deployments = sum(
                1 for r in deployment_results if r["success"]
            )

            return {
                "success": True,
                "cluster_nodes": len(self.setup_state.nodes),
                "successful_deployments": successful_deployments,
                "deployment_results": deployment_results,
                "message": "Cluster initialization completed",
            }

        except Exception as e:
            logger.error(f"❌ Error initializing cluster: {e}")
            return {"success": False, "error": str(e)}

    async def cleanup(self) -> None:
        """Cleanup cluster setup service resources

        Closes the HTTP session, if any, and drops the reference to it.
        """
        try:
            if self.session:
                await self.session.close()
                self.session = None
            logger.info("🧹 Cluster Setup Service cleanup completed")
        except Exception as e:
            logger.error(f"❌ Error during cleanup: {e}")


# Global service instance
cluster_setup_service = ClusterSetupService()