hive/backend/app/services/ai_model_service.py
anthonyrawlins 268214d971 Major WHOOSH system refactoring and feature enhancements
- Migrated from HIVE branding to WHOOSH across all components
- Enhanced backend API with new services: AI models, BZZZ integration, templates, members
- Added comprehensive testing suite with security, performance, and integration tests
- Improved frontend with new components for project setup, AI models, and team management
- Updated MCP server implementation with WHOOSH-specific tools and resources
- Enhanced deployment configurations with production-ready Docker setups
- Added comprehensive documentation and setup guides
- Implemented age encryption service and UCXL integration

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-27 08:34:48 +10:00

"""
WHOOSH AI Model Service - Phase 6.1
Advanced AI model integration with distributed Ollama cluster
"""
import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional

import aiohttp

logger = logging.getLogger(__name__)


class ModelCapability(Enum):
    """AI model capabilities."""

    CODE_GENERATION = "code_generation"
    CODE_REVIEW = "code_review"
    DOCUMENTATION = "documentation"
    TESTING = "testing"
    ARCHITECTURE = "architecture"
    DEBUGGING = "debugging"
    REFACTORING = "refactoring"
    GENERAL_CHAT = "general_chat"
    SPECIALIZED_DOMAIN = "specialized_domain"

@dataclass
class AIModel:
    """AI model information."""

    name: str
    node_url: str
    capabilities: List[ModelCapability]
    context_length: int
    parameter_count: str
    specialization: Optional[str] = None
    performance_score: float = 0.0
    availability: bool = True
    last_used: Optional[datetime] = None
    usage_count: int = 0
    avg_response_time: float = 0.0


@dataclass
class ClusterNode:
    """Ollama cluster node information."""

    host: str
    port: int
    status: str = "unknown"
    # A default factory avoids a shared mutable default across instances
    models: List[str] = field(default_factory=list)
    load: float = 0.0
    last_ping: Optional[datetime] = None

class AIModelService:
    """Advanced AI Model Service for WHOOSH"""

    def __init__(self):
        # Distributed Ollama cluster nodes from CLAUDE.md
        self.cluster_nodes = [
            ClusterNode("192.168.1.27", 11434),   # Node 1
            ClusterNode("192.168.1.72", 11434),   # Node 2
            ClusterNode("192.168.1.113", 11434),  # Node 3
            ClusterNode("192.168.1.106", 11434),  # Node 4
        ]
        self.models: Dict[str, AIModel] = {}
        self.model_cache = {}
        self.load_balancer_state = {}
        self.session: Optional[aiohttp.ClientSession] = None

    async def initialize(self):
        """Initialize the AI model service."""
        logger.info("Initializing AI Model Service...")

        # Create a shared aiohttp session with a default request timeout
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=30)
        )

        # Discover all available models across the cluster
        await self.discover_cluster_models()

        # Set up load balancing
        await self.initialize_load_balancer()

        logger.info(
            f"AI Model Service initialized with {len(self.models)} models "
            f"across {len(self.cluster_nodes)} nodes"
        )

    async def discover_cluster_models(self):
        """Discover all available models across the Ollama cluster."""
        logger.info("Discovering models across Ollama cluster...")
        discovered_models = {}

        for node in self.cluster_nodes:
            try:
                node_url = f"http://{node.host}:{node.port}"

                # Check node health
                async with self.session.get(
                    f"{node_url}/api/tags",
                    timeout=aiohttp.ClientTimeout(total=5),
                ) as response:
                    if response.status == 200:
                        data = await response.json()
                        node.status = "healthy"
                        node.models = [model["name"] for model in data.get("models", [])]
                        node.last_ping = datetime.now()

                        # Process each model; the first node seen to host a
                        # model becomes its serving node
                        for model_info in data.get("models", []):
                            model_name = model_info["name"]

                            # Determine model capabilities based on name patterns
                            capabilities = self._determine_model_capabilities(model_name)

                            # Create or update model entry
                            if model_name not in discovered_models:
                                discovered_models[model_name] = AIModel(
                                    name=model_name,
                                    node_url=node_url,
                                    capabilities=capabilities,
                                    context_length=self._estimate_context_length(model_name),
                                    parameter_count=self._estimate_parameters(model_name),
                                    specialization=self._determine_specialization(model_name),
                                )

                        logger.info(f"Node {node.host}: {len(node.models)} models available")

            except Exception as e:
                logger.warning(f"Failed to connect to node {node.host}:{node.port}: {e}")
                node.status = "unavailable"
                node.models = []

        self.models = discovered_models
        logger.info(f"Discovered {len(self.models)} total models across cluster")

    def _determine_model_capabilities(self, model_name: str) -> List[ModelCapability]:
        """Determine model capabilities based on name patterns."""
        capabilities = []
        name_lower = model_name.lower()

        # Code-focused models
        if any(keyword in name_lower for keyword in ["code", "codellama", "deepseek", "starcoder", "wizard"]):
            capabilities.extend([
                ModelCapability.CODE_GENERATION,
                ModelCapability.CODE_REVIEW,
                ModelCapability.DEBUGGING,
                ModelCapability.REFACTORING,
            ])

        # Documentation models
        if any(keyword in name_lower for keyword in ["llama", "mistral", "gemma"]):
            capabilities.append(ModelCapability.DOCUMENTATION)

        # Testing models
        if "test" in name_lower or "wizard" in name_lower:
            capabilities.append(ModelCapability.TESTING)

        # Architecture models (larger models)
        if any(keyword in name_lower for keyword in ["70b", "34b", "33b"]):
            capabilities.append(ModelCapability.ARCHITECTURE)

        # General chat (most models)
        capabilities.append(ModelCapability.GENERAL_CHAT)

        # Default if no specific capabilities were found
        if len(capabilities) == 1:  # Only GENERAL_CHAT
            capabilities.append(ModelCapability.CODE_GENERATION)

        return capabilities

    def _estimate_context_length(self, model_name: str) -> int:
        """Estimate context length based on model name."""
        name_lower = model_name.lower()
        if "32k" in name_lower:
            return 32768
        elif "16k" in name_lower:
            return 16384
        elif "8k" in name_lower:
            return 8192
        elif any(size in name_lower for size in ["70b", "65b", "34b", "33b"]):
            return 4096
        else:
            return 2048  # Default

    def _estimate_parameters(self, model_name: str) -> str:
        """Estimate parameter count based on model name."""
        name_lower = model_name.lower()
        if "70b" in name_lower:
            return "70B"
        elif "34b" in name_lower or "33b" in name_lower:
            return "34B"
        elif "13b" in name_lower:
            return "13B"
        elif "7b" in name_lower:
            return "7B"
        elif "3b" in name_lower:
            return "3B"
        elif "1b" in name_lower:
            return "1B"
        else:
            return "Unknown"

    def _determine_specialization(self, model_name: str) -> Optional[str]:
        """Determine model specialization."""
        name_lower = model_name.lower()
        if "code" in name_lower:
            return "Programming"
        elif "math" in name_lower:
            return "Mathematics"
        elif "sql" in name_lower:
            return "Database"
        elif "medical" in name_lower:
            return "Healthcare"
        else:
            return None

    async def get_best_model_for_task(
        self,
        task_type: ModelCapability,
        context_requirements: int = 2048,
        prefer_specialized: bool = True,
    ) -> Optional[AIModel]:
        """Select the best model for a specific task."""
        # Filter models by capability, availability, and context length
        suitable_models = [
            model for model in self.models.values()
            if task_type in model.capabilities
            and model.availability
            and model.context_length >= context_requirements
        ]

        if not suitable_models:
            logger.warning(f"No suitable models found for task {task_type}")
            return None

        # Scoring: a weighted mix of performance, specialization, context
        # headroom, usage-based load balancing, and response time
        def score_model(model: AIModel) -> float:
            score = 0.0

            # Base score from performance
            score += model.performance_score * 0.3

            # Capability match bonus (always true after the filter above;
            # kept so the scorer also works standalone)
            if task_type in model.capabilities:
                score += 0.2

            # Specialization bonus
            if prefer_specialized and model.specialization:
                score += 0.2

            # Context length bonus (more is better, up to 2x the requirement)
            context_ratio = min(model.context_length / context_requirements, 2.0)
            score += context_ratio * 0.1

            # Load balancing - prefer less-used models
            if model.usage_count > 0:
                usage_penalty = min(model.usage_count / 100.0, 0.1)
                score -= usage_penalty

            # Response time bonus (faster is better)
            if model.avg_response_time > 0:
                time_bonus = max(0.1 - (model.avg_response_time / 10.0), 0)
                score += time_bonus

            return score

        # Pick the highest-scoring model
        best_model = max(suitable_models, key=score_model)
        logger.info(f"Selected model {best_model.name} for task {task_type}")
        return best_model

    async def generate_completion(
        self,
        model_name: str,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: int = 1000,
        temperature: float = 0.7,
    ) -> Dict[str, Any]:
        """Generate a completion using the specified model."""
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not available")

        model = self.models[model_name]
        start_time = time.time()

        try:
            # Prepare request
            request_data = {
                "model": model_name,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "num_predict": max_tokens,
                    "temperature": temperature,
                },
            }
            if system_prompt:
                request_data["system"] = system_prompt

            # Make request to Ollama
            async with self.session.post(
                f"{model.node_url}/api/generate",
                json=request_data,
            ) as response:
                if response.status == 200:
                    result = await response.json()

                    # Update model statistics
                    response_time = time.time() - start_time
                    model.usage_count += 1
                    model.last_used = datetime.now()

                    # Update average response time (exponential moving average)
                    if model.avg_response_time == 0:
                        model.avg_response_time = response_time
                    else:
                        model.avg_response_time = (
                            model.avg_response_time * 0.8 + response_time * 0.2
                        )

                    return {
                        "success": True,
                        "content": result.get("response", ""),
                        "model": model_name,
                        "response_time": response_time,
                        "usage_stats": {
                            "total_duration": result.get("total_duration", 0),
                            "load_duration": result.get("load_duration", 0),
                            "prompt_eval_count": result.get("prompt_eval_count", 0),
                            "eval_count": result.get("eval_count", 0),
                        },
                    }
                else:
                    error_text = await response.text()
                    raise Exception(f"API error {response.status}: {error_text}")

        except Exception as e:
            logger.error(f"Error generating completion with {model_name}: {e}")
            # Mark the model unavailable; a later discover_cluster_models()
            # run rebuilds the registry with fresh availability
            model.availability = False
            return {
                "success": False,
                "error": str(e),
                "model": model_name,
            }

    async def initialize_load_balancer(self):
        """Initialize load-balancing state for healthy cluster nodes."""
        logger.info("Initializing load balancer...")
        for node in self.cluster_nodes:
            if node.status == "healthy":
                self.load_balancer_state[f"{node.host}:{node.port}"] = {
                    "active_requests": 0,
                    "total_requests": 0,
                    "last_request": None,
                    "average_response_time": 0.0,
                }

    async def get_cluster_status(self) -> Dict[str, Any]:
        """Get comprehensive cluster status."""
        return {
            "total_nodes": len(self.cluster_nodes),
            "healthy_nodes": len([n for n in self.cluster_nodes if n.status == "healthy"]),
            "total_models": len(self.models),
            "models_by_capability": {
                capability.value: len([
                    m for m in self.models.values()
                    if capability in m.capabilities
                ])
                for capability in ModelCapability
            },
            "cluster_load": self._calculate_cluster_load(),
            "model_usage_stats": {
                name: {
                    "usage_count": model.usage_count,
                    "avg_response_time": model.avg_response_time,
                    "last_used": model.last_used.isoformat() if model.last_used else None,
                }
                for name, model in self.models.items()
            },
        }

    def _calculate_cluster_load(self) -> float:
        """Calculate cluster load as the average active requests per healthy node."""
        if not self.load_balancer_state:
            return 0.0

        total_load = sum(
            state["active_requests"]
            for state in self.load_balancer_state.values()
        )
        healthy_nodes = len([n for n in self.cluster_nodes if n.status == "healthy"])
        if healthy_nodes == 0:
            return 0.0
        return total_load / healthy_nodes

    async def cleanup(self):
        """Close the shared HTTP session."""
        if self.session:
            await self.session.close()


# Global instance
ai_model_service = AIModelService()
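
# Minimal usage sketch (illustration only, not part of the service API):
# exercises discovery, status, model selection, and completion against the
# nodes configured above. It assumes at least one Ollama node is reachable;
# the prompt and task type are arbitrary examples.
if __name__ == "__main__":
    async def _demo():
        await ai_model_service.initialize()
        try:
            status = await ai_model_service.get_cluster_status()
            print(
                f"Healthy nodes: {status['healthy_nodes']}/{status['total_nodes']}, "
                f"models: {status['total_models']}"
            )

            model = await ai_model_service.get_best_model_for_task(
                ModelCapability.CODE_GENERATION,
                context_requirements=4096,
            )
            if model:
                result = await ai_model_service.generate_completion(
                    model.name,
                    "Write a Python function that reverses a string.",
                    max_tokens=200,
                )
                if result["success"]:
                    print(result["content"])
                else:
                    print(f"Generation failed: {result['error']}")
        finally:
            await ai_model_service.cleanup()

    asyncio.run(_demo())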