# hive/backend/app/services/cluster_setup_service.py

#!/usr/bin/env python3
"""
Cluster Setup Service for WHOOSH
Handles initial cluster setup, infrastructure discovery, and BZZZ agent deployment
"""

import asyncio
import json
import logging
import shlex
import subprocess
import tempfile
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import aiohttp
import asyncssh

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

@dataclass
class ClusterNode:
    """Connection details and runtime status for one machine in the cluster.

    ``capabilities`` and ``ollama_models`` accept ``None`` (the default) so
    that every instance receives its own fresh list in ``__post_init__``;
    after construction both attributes are always lists.
    """
    hostname: str
    ip_address: str
    ssh_user: str
    ssh_port: int = 22
    ssh_key_path: Optional[str] = None
    # Kept out of the generated repr so logging a node never prints the password.
    ssh_password: Optional[str] = field(default=None, repr=False)
    role: str = "worker"  # coordinator, worker, storage
    status: str = "pending"  # pending, connecting, ready, error
    capabilities: Optional[List[str]] = None
    ollama_models: Optional[List[str]] = None

    def __post_init__(self):
        """Replace ``None`` list fields with new, per-instance empty lists."""
        if self.capabilities is None:
            self.capabilities = []
        if self.ollama_models is None:
            self.ollama_models = []

@dataclass
class ClusterSetupState:
    """Progress flags and collected data for the cluster setup workflow.

    The container fields accept ``None`` (the default) so that every instance
    receives its own fresh list/dict in ``__post_init__``; after construction
    they are never ``None``.
    """
    infrastructure_configured: bool = False
    age_keys_generated: bool = False
    models_selected: bool = False
    first_agent_deployed: bool = False
    cluster_initialized: bool = False
    nodes: Optional[List[ClusterNode]] = None
    selected_models: Optional[List[str]] = None
    # Holds the Age private key once generated, so it is kept out of the repr.
    age_keys: Optional[Dict[str, str]] = field(default=None, repr=False)

    def __post_init__(self):
        """Replace ``None`` container fields with new, per-instance empty ones."""
        if self.nodes is None:
            self.nodes = []
        if self.selected_models is None:
            self.selected_models = []
        if self.age_keys is None:
            self.age_keys = {}

class ClusterSetupService:
    """
    Service for setting up the WHOOSH distributed cluster infrastructure.
    Handles infrastructure discovery, age key generation, model selection, and BZZZ deployment.
    """

    def __init__(self):
        """Create the service with a blank setup state and no HTTP session."""
        # The aiohttp session is only opened once initialize() is awaited.
        self.session: Optional[aiohttp.ClientSession] = None
        self.setup_state = ClusterSetupState()

    async def initialize(self) -> bool:
        """Open the shared HTTP session and probe for an existing cluster.

        Calling this more than once is safe: a session that is still open is
        reused instead of being overwritten, so no connections are leaked.

        Returns:
            True when initialization finished, False when it failed. On
            failure, a session opened by this call is closed again.
        """
        session_created = False
        try:
            logger.info("🚀 Initializing Cluster Setup Service")

            if self.session is None or self.session.closed:
                self.session = aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=30)
                )
                session_created = True

            # Check if cluster is already set up
            await self._detect_existing_cluster()

            logger.info("✅ Cluster Setup Service initialized")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to initialize cluster setup service: {e}")
            if session_created and self.session is not None:
                # Do not leave a half-initialized service holding an open session.
                try:
                    await self.session.close()
                except Exception as close_error:
                    logger.debug(f"Error closing HTTP session after failed init: {close_error}")
                self.session = None
            return False

    async def _detect_existing_cluster(self) -> None:
        """Probe the known BZZZ agent endpoints and adopt any that respond.

        Each endpoint's ``/api/agent/status`` is requested with a 5 second
        timeout. Every endpoint answering HTTP 200 becomes a ``ClusterNode``.
        When at least one node is found, the setup state is flagged as
        configured, deployed and initialized. Failures are logged and never
        propagated to the caller.
        """
        try:
            # Endpoints that may already host a running BZZZ agent
            candidate_endpoints = [
                # Direct BZZZ connections disabled - WHOOSH should use BZZZ API instead
                # "http://192.168.1.27:8080",  # walnut
                # "http://192.168.1.72:8080",  # acacia
                # "http://192.168.1.113:8080", # ironwood
                # "http://192.168.1.106:8080", # oak
            ]

            discovered = []
            for endpoint in candidate_endpoints:
                status_url = f"{endpoint}/api/agent/status"
                try:
                    async with self.session.get(status_url, timeout=aiohttp.ClientTimeout(total=5)) as response:
                        if response.status != 200:
                            continue

                        payload = await response.json()
                        # Host part of "scheme://host:port"; doubles as the hostname fallback.
                        host = endpoint.split("//")[1].split(":")[0]
                        discovered.append(
                            ClusterNode(
                                hostname=payload.get("hostname", host),
                                ip_address=host,
                                ssh_user="auto-detected",
                                status="ready",
                                capabilities=payload.get("capabilities", []),
                                ollama_models=payload.get("models", [])
                            )
                        )
                        logger.info(f"🔍 Detected active BZZZ agent: {endpoint}")

                except Exception as exc:
                    logger.debug(f"No BZZZ agent at {endpoint}: {exc}")

            if not discovered:
                logger.info("🆕 No existing cluster detected - fresh setup required")
                return

            state = self.setup_state
            state.nodes = discovered
            state.infrastructure_configured = True
            state.first_agent_deployed = True
            state.cluster_initialized = True
            logger.info(f"🎯 Detected existing cluster with {len(discovered)} nodes")

        except Exception as exc:
            logger.error(f"❌ Error detecting existing cluster: {exc}")

    async def get_setup_status(self) -> Dict[str, Any]:
        """Get current cluster setup status"""
        return {
            "cluster_exists": self.setup_state.cluster_initialized,
            "infrastructure_configured": self.setup_state.infrastructure_configured,
            "age_keys_generated": self.setup_state.age_keys_generated,
            "models_selected": self.setup_state.models_selected,
            "first_agent_deployed": self.setup_state.first_agent_deployed,
            "cluster_initialized": self.setup_state.cluster_initialized,
            "nodes": [asdict(node) for node in self.setup_state.nodes],
            "selected_models": self.setup_state.selected_models,
            "next_step": self._get_next_setup_step()
        }

    def _get_next_setup_step(self) -> str:
        """Determine the next step in cluster setup"""
        if not self.setup_state.infrastructure_configured:
            return "configure_infrastructure"
        elif not self.setup_state.age_keys_generated:
            return "generate_age_keys"
        elif not self.setup_state.models_selected:
            return "select_models"
        elif not self.setup_state.first_agent_deployed:
            return "deploy_first_agent"
        elif not self.setup_state.cluster_initialized:
            return "initialize_cluster"
        else:
            return "complete"

    async def fetch_ollama_models(self) -> List[Dict[str, Any]]:
        """Return the catalogue of Ollama models offered during cluster setup.

        NOTE: despite the name, no network request is made here; the catalogue
        is a static table maintained in this method. Each entry is a dict with
        the keys ``name``, ``description``, ``size``, ``category`` and
        ``capabilities``. An empty list is returned if building it fails.
        """
        try:
            field_names = ("name", "description", "size", "category", "capabilities")

            # One row per model, in the same order as ``field_names``.
            catalogue = (
                # Popular General Purpose Models
                (
                    "llama3.1:8b",
                    "Llama 3.1 8B - State-of-the-art model from Meta available in 8B parameters",
                    "4.7GB",
                    "general",
                    ["tools", "chat", "reasoning", "code"],
                ),
                (
                    "llama3.1:70b",
                    "Llama 3.1 70B - Large high-performance model for demanding tasks",
                    "40GB",
                    "advanced",
                    ["tools", "chat", "reasoning", "code", "complex"],
                ),
                (
                    "llama3.2:3b",
                    "Meta's Llama 3.2 3B - Compact model that runs efficiently",
                    "2.0GB",
                    "general",
                    ["tools", "chat", "lightweight"],
                ),
                (
                    "llama3.2:1b",
                    "Meta's Llama 3.2 1B - Ultra lightweight for edge devices",
                    "1.3GB",
                    "lightweight",
                    ["tools", "chat", "edge", "fast"],
                ),

                # Coding Models
                (
                    "qwen2.5-coder:7b",
                    "Latest Code-Specific Qwen model with significant improvements in code generation",
                    "4.1GB",
                    "code",
                    ["tools", "code", "reasoning", "programming"],
                ),
                (
                    "codellama:7b",
                    "Code Llama 7B - Large language model for code generation and discussion",
                    "3.8GB",
                    "code",
                    ["code", "programming", "debugging"],
                ),
                (
                    "deepseek-coder:6.7b",
                    "DeepSeek Coder 6.7B - Trained on code and natural language tokens",
                    "3.8GB",
                    "code",
                    ["code", "programming", "generation"],
                ),

                # Reasoning Models
                (
                    "deepseek-r1:7b",
                    "DeepSeek-R1 7B - Open reasoning model with advanced thinking capabilities",
                    "4.2GB",
                    "reasoning",
                    ["tools", "thinking", "reasoning", "analysis"],
                ),
                (
                    "qwen3:8b",
                    "Qwen3 8B - Latest generation with dense and mixture-of-experts models",
                    "4.6GB",
                    "general",
                    ["tools", "thinking", "reasoning", "multilingual"],
                ),

                # Efficient Models
                (
                    "mistral:7b",
                    "Mistral 7B - Fast general purpose model updated to version 0.3",
                    "4.1GB",
                    "general",
                    ["tools", "chat", "reasoning", "fast"],
                ),
                (
                    "gemma2:9b",
                    "Google Gemma 2 9B - High-performing efficient model with multilingual support",
                    "5.4GB",
                    "general",
                    ["chat", "reasoning", "math", "analysis"],
                ),
                (
                    "qwen2.5:7b",
                    "Qwen2.5 7B - Multilingual model with 128K context length",
                    "4.4GB",
                    "general",
                    ["tools", "chat", "multilingual", "reasoning"],
                ),

                # Embedding Models
                (
                    "nomic-embed-text",
                    "High-performing open embedding model with large token context window",
                    "274MB",
                    "embedding",
                    ["embedding", "search", "similarity"],
                ),
                (
                    "mxbai-embed-large",
                    "State-of-the-art large embedding model from mixedbread.ai",
                    "670MB",
                    "embedding",
                    ["embedding", "search", "retrieval"],
                ),
            )

            models = [dict(zip(field_names, row)) for row in catalogue]

            logger.info(f"📋 Fetched {len(models)} available models from registry")
            return models

        except Exception as exc:
            logger.error(f"❌ Error fetching ollama models: {exc}")
            return []

    async def configure_infrastructure(self, nodes: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Register the given nodes and test SSH connectivity to each of them.

        Args:
            nodes: One dict per node. ``hostname``, ``ip_address`` and
                ``ssh_user`` are required; ``ssh_port``, ``ssh_key_path``,
                ``ssh_password`` and ``role`` are optional.

        Returns:
            On success, a dict with ``success=True``, the number of nodes
            configured, the number reachable over SSH and the per-node
            connectivity results. If the list is empty, a required field is
            missing, or anything else fails, ``{"success": False, "error": ...}``
            is returned and the setup state is left unchanged.
        """
        try:
            logger.info(f"🏗️ Configuring infrastructure with {len(nodes)} nodes")

            # An empty list must not mark the infrastructure step as done.
            if not nodes:
                raise ValueError("No nodes provided")

            required_fields = ("hostname", "ip_address", "ssh_user")

            # Convert dict nodes to ClusterNode objects
            cluster_nodes = []
            for position, node_data in enumerate(nodes, start=1):
                missing = [name for name in required_fields if not node_data.get(name)]
                if missing:
                    label = node_data.get("hostname") or f"#{position}"
                    raise ValueError(
                        f"Node {label} is missing required field(s): {', '.join(missing)}"
                    )

                cluster_nodes.append(
                    ClusterNode(
                        hostname=node_data["hostname"],
                        ip_address=node_data["ip_address"],
                        ssh_user=node_data["ssh_user"],
                        ssh_port=node_data.get("ssh_port", 22),
                        ssh_key_path=node_data.get("ssh_key_path"),
                        ssh_password=node_data.get("ssh_password"),
                        role=node_data.get("role", "worker")
                    )
                )

            # Test SSH connectivity to all nodes
            connectivity_results = await self._test_node_connectivity(cluster_nodes)

            # Update node statuses based on connectivity (results are in node order)
            for node, result in zip(cluster_nodes, connectivity_results):
                node.status = "ready" if result["success"] else "error"

            self.setup_state.nodes = cluster_nodes
            self.setup_state.infrastructure_configured = True

            successful_nodes = sum(1 for result in connectivity_results if result["success"])

            return {
                "success": True,
                "nodes_configured": len(nodes),
                "nodes_accessible": successful_nodes,
                "connectivity_results": connectivity_results
            }

        except Exception as e:
            logger.error(f"❌ Error configuring infrastructure: {e}")
            return {"success": False, "error": str(e)}

    async def _test_node_connectivity(self, nodes: List[ClusterNode]) -> List[Dict[str, Any]]:
        """Open an SSH connection to every node concurrently and report the outcome.

        Returns one result dict per node, in the same order as ``nodes``, each
        carrying ``hostname``, ``success`` and ``message`` (plus the command
        ``output`` on success).
        """
        async def probe(node: ClusterNode) -> Dict[str, Any]:
            try:
                # Key-based auth is used whenever a key path is set; otherwise the password.
                if node.ssh_key_path:
                    auth_options = {"client_keys": [node.ssh_key_path]}
                else:
                    auth_options = {"password": node.ssh_password}

                async with asyncssh.connect(
                    node.ip_address,
                    port=node.ssh_port,
                    username=node.ssh_user,
                    known_hosts=None,  # Skip host key verification for now
                    **auth_options
                ) as conn:
                    completed = await conn.run('echo "SSH test successful"')
                    return {
                        "hostname": node.hostname,
                        "success": True,
                        "message": "SSH connection successful",
                        "output": completed.stdout.strip()
                    }

            except Exception as exc:
                return {
                    "hostname": node.hostname,
                    "success": False,
                    "message": f"SSH connection failed: {str(exc)}"
                }

        # Run every probe at once; exceptions are returned as values, not raised
        outcomes = await asyncio.gather(
            *[probe(node) for node in nodes],
            return_exceptions=True
        )

        report = []
        for node, outcome in zip(nodes, outcomes):
            if not isinstance(outcome, Exception):
                report.append(outcome)
                continue
            report.append({
                "hostname": node.hostname,
                "success": False,
                "message": f"Connection test failed: {str(outcome)}"
            })

        return report

    @staticmethod
    def _parse_age_keygen_output(output: str) -> Dict[str, str]:
        """Extract the private and public key from ``age-keygen`` output text.

        Recognizes the secret key line (``AGE-SECRET-KEY-...``), the public key
        reported as ``# public key: age1...`` or ``Public key: age1...``, and a
        bare line starting with ``age``. Missing keys are returned as ``""``.
        """
        keys = {"private_key": "", "public_key": ""}
        for raw_line in output.splitlines():
            line = raw_line.strip()
            if line.startswith("AGE-SECRET-KEY-"):
                keys["private_key"] = line
                continue

            # Drop a leading comment marker, then look for a "public key: <value>" pair.
            label, separator, value = line.lstrip("#").strip().partition(":")
            if separator and label.strip().lower() == "public key":
                keys["public_key"] = value.strip()
            elif line.startswith("age"):
                keys["public_key"] = line
        return keys

    async def generate_age_keys(self) -> Dict[str, Any]:
        """Generate an Age key pair with the ``age-keygen`` command-line tool.

        On success the key pair and a UTC timestamp are stored on the setup
        state and only the public key is returned to the caller.

        Returns:
            ``{"success": True, "public_key": ..., "message": ...}`` on success,
            otherwise ``{"success": False, "error": ...}`` (tool not installed,
            non-zero exit status, or output without a complete key pair).
        """
        try:
            logger.info("🔐 Generating Age encryption keys")

            # NOTE: subprocess.run is synchronous, so the event loop waits
            # while age-keygen runs.
            result = subprocess.run(
                ["age-keygen"],
                capture_output=True,
                text=True
            )

            if result.returncode != 0:
                raise Exception(f"age-keygen failed: {result.stderr}")

            # The public key is not on a line of its own: it appears as a
            # "# public key:" comment on stdout and may be echoed on stderr.
            parsed = self._parse_age_keygen_output(
                f"{result.stdout or ''}\n{result.stderr or ''}"
            )
            private_key = parsed["private_key"]
            public_key = parsed["public_key"]

            # Never mark the step as done with an incomplete key pair.
            if not private_key or not public_key:
                raise Exception("age-keygen output did not contain a complete key pair")

            self.setup_state.age_keys = {
                "private_key": private_key,
                "public_key": public_key,
                "generated_at": datetime.utcnow().isoformat()
            }
            self.setup_state.age_keys_generated = True

            logger.info("✅ Age keys generated successfully")
            return {
                "success": True,
                "public_key": public_key,
                "message": "Age encryption keys generated successfully"
            }

        except FileNotFoundError:
            logger.error("❌ age-keygen command not found - please install age")
            return {
                "success": False,
                "error": "age-keygen command not found - please install age encryption tool"
            }
        except Exception as e:
            logger.error(f"❌ Error generating age keys: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def select_models(self, model_names: List[str]) -> Dict[str, Any]:
        """Record which models the cluster should use and mark the step done.

        Returns a dict with ``success``, the ``selected_models`` and a
        human-readable ``message``; on failure ``success`` is False and
        ``error`` holds the exception text.
        """
        try:
            model_count = len(model_names)
            logger.info(f"📦 Selecting {model_count} models for cluster")

            state = self.setup_state
            state.selected_models = model_names
            state.models_selected = True

            summary = f"Selected {model_count} models for deployment"
            return {
                "success": True,
                "selected_models": model_names,
                "message": summary
            }

        except Exception as exc:
            logger.error(f"❌ Error selecting models: {exc}")
            return {"success": False, "error": str(exc)}

    async def deploy_first_agent(self, coordinator_node_hostname: str) -> Dict[str, Any]:
        """Deploy the first BZZZ agent to the coordinator and pull the selected models.

        Args:
            coordinator_node_hostname: Hostname of a node already present in
                the setup state.

        Returns:
            On success, a dict with ``success=True``, the coordinator hostname,
            ``models_pulled`` (the number of models whose pull succeeded), the
            deployment details and the per-model pull results. If the agent
            deployment fails its result dict is returned unchanged; any other
            error yields ``{"success": False, "error": ...}``.
        """
        try:
            logger.info(f"🚀 Deploying first BZZZ agent to {coordinator_node_hostname}")

            # Find the coordinator node
            coordinator_node = next(
                (node for node in self.setup_state.nodes if node.hostname == coordinator_node_hostname),
                None
            )
            if coordinator_node is None:
                raise Exception(f"Coordinator node {coordinator_node_hostname} not found")

            # Deploy BZZZ agent via SSH
            deployment_result = await self._deploy_bzzz_agent(coordinator_node, is_coordinator=True)
            if not deployment_result["success"]:
                return deployment_result

            # Pull selected models on the coordinator
            selected_models = list(self.setup_state.selected_models)
            model_results = await self._pull_models_on_node(coordinator_node, selected_models)

            # Only count models whose pull was reported as successful; a
            # connection-level failure yields a result without a "model" key.
            pulled_models = [
                entry["model"]
                for entry in model_results
                if entry.get("success") and "model" in entry
            ]
            if len(pulled_models) != len(selected_models):
                logger.warning(
                    f"⚠️ Pulled {len(pulled_models)} of {len(selected_models)} models on {coordinator_node_hostname}"
                )

            self.setup_state.first_agent_deployed = True
            coordinator_node.status = "ready"
            coordinator_node.ollama_models = pulled_models

            return {
                "success": True,
                "coordinator": coordinator_node_hostname,
                "models_pulled": len(pulled_models),
                "deployment_details": deployment_result,
                "model_results": model_results
            }

        except Exception as e:
            logger.error(f"❌ Error deploying first agent: {e}")
            return {"success": False, "error": str(e)}

    async def _deploy_bzzz_agent(self, node: ClusterNode, is_coordinator: bool = False) -> Dict[str, Any]:
        """Deploy BZZZ agent as native systemd service to a specific node.

        Over one SSH connection this installs build dependencies (best
        effort), clones and builds BZZZ, writes ``config/bzzz.json`` and runs
        the repository's ``install-service.sh``. The clone, build, config and
        install commands must exit with status 0; otherwise the deployment is
        reported as failed instead of silently continuing.

        Args:
            node: Target node; key authentication is used when
                ``ssh_key_path`` is set, otherwise password authentication.
            is_coordinator: Written to the ``coordinator`` config entry.

        Returns:
            ``{"success": True, "message", "build_output", "install_output"}``
            on success, otherwise ``{"success": False, "error": ...}``.
        """
        try:
            # SSH to node and deploy BZZZ
            if node.ssh_key_path:
                conn_kwargs = {"client_keys": [node.ssh_key_path]}
            else:
                conn_kwargs = {"password": node.ssh_password}

            async with asyncssh.connect(
                node.ip_address,
                port=node.ssh_port,
                username=node.ssh_user,
                known_hosts=None,
                **conn_kwargs
            ) as conn:

                # Install Go and Git if not present. Kept best-effort: the
                # package manager may fail on hosts where the tools already exist.
                deps_result = await conn.run("sudo apt-get update && sudo apt-get install -y golang-go git build-essential")
                if deps_result.exit_status != 0:
                    logger.warning(
                        f"⚠️ Dependency installation on {node.hostname} exited with status {deps_result.exit_status}"
                    )

                # Clone BZZZ repository
                await conn.run("rm -rf ~/chorus && mkdir -p ~/chorus/project-queues/active", check=True)
                clone_cmd = "cd ~/chorus/project-queues/active && git clone https://gitea.deepblack.cloud/tony/BZZZ.git"
                await conn.run(clone_cmd, check=True)

                # Build BZZZ binary
                build_cmd = "cd ~/chorus/project-queues/active/BZZZ && go build -o bzzz"
                build_result = await conn.run(build_cmd, check=True)

                # Create BZZZ configuration (if needed - check if BZZZ uses config files)
                config = {
                    "node": {"id": node.hostname},
                    "agent": {"id": f"bzzz-{node.hostname}", "role": node.role},
                    "api": {"host": "0.0.0.0", "port": 8080},
                    "p2p": {"port": 4001},
                    "coordinator": is_coordinator
                }

                # Write config file (adjust path as needed). The JSON is
                # shell-quoted so quotes in a hostname or role cannot break out
                # of the command; printf writes it verbatim plus a newline.
                config_json = json.dumps(config, indent=2)
                quoted_config = shlex.quote(config_json)
                await conn.run(
                    "mkdir -p ~/chorus/project-queues/active/BZZZ/config && "
                    f"printf '%s\\n' {quoted_config} > ~/chorus/project-queues/active/BZZZ/config/bzzz.json",
                    check=True
                )

                # Install BZZZ as systemd service
                install_cmd = "cd ~/chorus/project-queues/active/BZZZ && sudo ./install-service.sh"
                install_result = await conn.run(install_cmd, check=True)

                return {
                    "success": True,
                    "message": f"BZZZ agent deployed as systemd service to {node.hostname}",
                    "build_output": build_result.stdout,
                    "install_output": install_result.stdout
                }

        except asyncssh.ProcessError as e:
            # A remote command exited non-zero; surface which one and why.
            detail = str(e.stderr or "").strip() or str(e)
            return {
                "success": False,
                "error": (
                    f"Failed to deploy BZZZ agent to {node.hostname}: "
                    f"command {e.command!r} exited with status {e.exit_status}: {detail}"
                )
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to deploy BZZZ agent to {node.hostname}: {str(e)}"
            }

    async def _pull_models_on_node(self, node: ClusterNode, models: List[str]) -> List[Dict[str, Any]]:
        """Pull Ollama models on a specific node.

        Runs the Ollama install script and enables/starts the service (both
        best effort), then runs ``ollama pull`` once per model. A pull is only
        reported as successful when the command exits with status 0.

        Args:
            node: Target node; key authentication is used when
                ``ssh_key_path`` is set, otherwise password authentication.
            models: Model names to pull; each is shell-quoted before use.

        Returns:
            One dict per model with ``model``, ``success`` and either
            ``output`` or ``error``. If the connection itself fails, a single
            ``{"error": ..., "success": False}`` entry is returned.
        """
        try:
            if node.ssh_key_path:
                conn_kwargs = {"client_keys": [node.ssh_key_path]}
            else:
                conn_kwargs = {"password": node.ssh_password}

            async with asyncssh.connect(
                node.ip_address,
                port=node.ssh_port,
                username=node.ssh_user,
                known_hosts=None,
                **conn_kwargs
            ) as conn:

                # Run the Ollama install script
                await conn.run("curl -fsSL https://ollama.com/install.sh | sh")

                # Start Ollama service
                await conn.run("sudo systemctl enable ollama && sudo systemctl start ollama")

                # Pull each model
                results = []
                for model in models:
                    try:
                        # Quote the name so it is passed as a single argument and
                        # cannot inject extra shell commands; check=True turns a
                        # non-zero exit status into an exception.
                        result = await conn.run(f"ollama pull {shlex.quote(model)}", check=True)
                        results.append({
                            "model": model,
                            "success": True,
                            "output": result.stdout
                        })
                        logger.info(f"✅ Pulled model {model} on {node.hostname}")
                    except asyncssh.ProcessError as e:
                        detail = str(e.stderr or "").strip() or str(e)
                        results.append({
                            "model": model,
                            "success": False,
                            "error": detail
                        })
                        logger.error(f"❌ Failed to pull model {model} on {node.hostname}: {detail}")
                    except Exception as e:
                        results.append({
                            "model": model,
                            "success": False,
                            "error": str(e)
                        })
                        logger.error(f"❌ Failed to pull model {model} on {node.hostname}: {e}")

                return results

        except Exception as e:
            logger.error(f"❌ Error pulling models on {node.hostname}: {e}")
            return [{"error": str(e), "success": False}]

    async def initialize_cluster(self) -> Dict[str, Any]:
        """Deploy BZZZ agents to every node not yet ``"ready"`` and mark the cluster initialized.

        Nodes are processed one after another. The cluster is flagged as
        initialized and ``success=True`` is returned even when individual
        deployments fail; per-node outcomes are in ``deployment_results``.
        """
        try:
            logger.info("🌐 Initializing complete cluster")

            # NOTE(review): configure_infrastructure sets status "ready" for every
            # node that passed the SSH test, so those nodes are skipped here and
            # only nodes in another state get a deployment attempt - confirm
            # that this is intended.
            pending_nodes = [
                candidate
                for candidate in self.setup_state.nodes
                if candidate.status != "ready"
            ]

            deployment_results = []
            successful_deployments = 0
            for target in pending_nodes:
                outcome = await self._deploy_bzzz_agent(target, is_coordinator=False)
                deployment_results.append(outcome)

                if outcome["success"]:
                    target.status = "ready"
                    successful_deployments += 1

            # TODO: Implement P2P model distribution via BZZZ network
            # For now, we'll note that models should be distributed via P2P

            self.setup_state.cluster_initialized = True

            return {
                "success": True,
                "cluster_nodes": len(self.setup_state.nodes),
                "successful_deployments": successful_deployments,
                "deployment_results": deployment_results,
                "message": "Cluster initialization completed"
            }

        except Exception as exc:
            logger.error(f"❌ Error initializing cluster: {exc}")
            return {"success": False, "error": str(exc)}

    async def cleanup(self) -> None:
        """Close the HTTP session, if one was opened; errors are logged, not raised."""
        try:
            open_session = self.session
            if open_session:
                await open_session.close()
            logger.info("🧹 Cluster Setup Service cleanup completed")
        except Exception as exc:
            logger.error(f"❌ Error during cleanup: {exc}")

# Global service instance, created when this module is imported.
# Construction only builds an empty setup state; the HTTP session is opened
# later, when initialize() is awaited.
cluster_setup_service = ClusterSetupService()