""" WHOOSH AI Model Service - Phase 6.1 Advanced AI model integration with distributed Ollama cluster """ import asyncio import aiohttp import json import time from typing import Dict, List, Optional, Any from datetime import datetime, timedelta import logging from dataclasses import dataclass from enum import Enum logger = logging.getLogger(__name__) class ModelCapability(Enum): """AI Model capabilities""" CODE_GENERATION = "code_generation" CODE_REVIEW = "code_review" DOCUMENTATION = "documentation" TESTING = "testing" ARCHITECTURE = "architecture" DEBUGGING = "debugging" REFACTORING = "refactoring" GENERAL_CHAT = "general_chat" SPECIALIZED_DOMAIN = "specialized_domain" @dataclass class AIModel: """AI Model information""" name: str node_url: str capabilities: List[ModelCapability] context_length: int parameter_count: str specialization: Optional[str] = None performance_score: float = 0.0 availability: bool = True last_used: Optional[datetime] = None usage_count: int = 0 avg_response_time: float = 0.0 @dataclass class ClusterNode: """Ollama cluster node information""" host: str port: int status: str = "unknown" models: List[str] = None load: float = 0.0 last_ping: Optional[datetime] = None class AIModelService: """Advanced AI Model Service for WHOOSH""" def __init__(self): # Distributed Ollama cluster nodes from CLAUDE.md self.cluster_nodes = [ ClusterNode("192.168.1.27", 11434), # Node 1 ClusterNode("192.168.1.72", 11434), # Node 2 ClusterNode("192.168.1.113", 11434), # Node 3 ClusterNode("192.168.1.106", 11434), # Node 4 ] self.models: Dict[str, AIModel] = {} self.model_cache = {} self.load_balancer_state = {} self.session: Optional[aiohttp.ClientSession] = None async def initialize(self): """Initialize the AI model service""" logger.info("Initializing AI Model Service...") # Create aiohttp session self.session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=30) ) # Discover all available models across the cluster await self.discover_cluster_models() # Set up load balancing await self.initialize_load_balancer() logger.info(f"AI Model Service initialized with {len(self.models)} models across {len(self.cluster_nodes)} nodes") async def discover_cluster_models(self): """Discover all available models across the Ollama cluster""" logger.info("Discovering models across Ollama cluster...") discovered_models = {} for node in self.cluster_nodes: try: node_url = f"http://{node.host}:{node.port}" # Check node health async with self.session.get(f"{node_url}/api/tags", timeout=5) as response: if response.status == 200: data = await response.json() node.status = "healthy" node.models = [model["name"] for model in data.get("models", [])] node.last_ping = datetime.now() # Process each model for model_info in data.get("models", []): model_name = model_info["name"] # Determine model capabilities based on name patterns capabilities = self._determine_model_capabilities(model_name) # Create or update model entry if model_name not in discovered_models: discovered_models[model_name] = AIModel( name=model_name, node_url=node_url, capabilities=capabilities, context_length=self._estimate_context_length(model_name), parameter_count=self._estimate_parameters(model_name), specialization=self._determine_specialization(model_name) ) logger.info(f"Node {node.host}: {len(node.models)} models available") except Exception as e: logger.warning(f"Failed to connect to node {node.host}:{node.port}: {e}") node.status = "unavailable" node.models = [] self.models = discovered_models logger.info(f"Discovered {len(self.models)} total models across cluster") def _determine_model_capabilities(self, model_name: str) -> List[ModelCapability]: """Determine model capabilities based on name patterns""" capabilities = [] name_lower = model_name.lower() # Code-focused models if any(keyword in name_lower for keyword in ["code", "codellama", "deepseek", "starcoder", "wizard"]): capabilities.extend([ ModelCapability.CODE_GENERATION, ModelCapability.CODE_REVIEW, ModelCapability.DEBUGGING, ModelCapability.REFACTORING ]) # Documentation models if any(keyword in name_lower for keyword in ["llama", "mistral", "gemma"]): capabilities.append(ModelCapability.DOCUMENTATION) # Testing models if "test" in name_lower or "wizard" in name_lower: capabilities.append(ModelCapability.TESTING) # Architecture models (larger models) if any(keyword in name_lower for keyword in ["70b", "34b", "33b"]): capabilities.append(ModelCapability.ARCHITECTURE) # General chat (most models) capabilities.append(ModelCapability.GENERAL_CHAT) # Default if no specific capabilities found if len(capabilities) == 1: # Only GENERAL_CHAT capabilities.append(ModelCapability.CODE_GENERATION) return capabilities def _estimate_context_length(self, model_name: str) -> int: """Estimate context length based on model name""" name_lower = model_name.lower() if "32k" in name_lower: return 32768 elif "16k" in name_lower: return 16384 elif "8k" in name_lower: return 8192 elif any(size in name_lower for size in ["70b", "65b"]): return 4096 elif any(size in name_lower for size in ["34b", "33b"]): return 4096 else: return 2048 # Default def _estimate_parameters(self, model_name: str) -> str: """Estimate parameter count based on model name""" name_lower = model_name.lower() if "70b" in name_lower: return "70B" elif "34b" in name_lower or "33b" in name_lower: return "34B" elif "13b" in name_lower: return "13B" elif "7b" in name_lower: return "7B" elif "3b" in name_lower: return "3B" elif "1b" in name_lower: return "1B" else: return "Unknown" def _determine_specialization(self, model_name: str) -> Optional[str]: """Determine model specialization""" name_lower = model_name.lower() if "code" in name_lower: return "Programming" elif "math" in name_lower: return "Mathematics" elif "sql" in name_lower: return "Database" elif "medical" in name_lower: return "Healthcare" else: return None async def get_best_model_for_task(self, task_type: ModelCapability, context_requirements: int = 2048, prefer_specialized: bool = True) -> Optional[AIModel]: """Select the best model for a specific task""" # Filter models by capability suitable_models = [ model for model in self.models.values() if task_type in model.capabilities and model.availability and model.context_length >= context_requirements ] if not suitable_models: logger.warning(f"No suitable models found for task {task_type}") return None # Scoring algorithm def score_model(model: AIModel) -> float: score = 0.0 # Base score from performance score += model.performance_score * 0.3 # Capability match bonus if task_type in model.capabilities: score += 0.2 # Specialization bonus if prefer_specialized and model.specialization: score += 0.2 # Context length bonus (more is better up to a point) context_ratio = min(model.context_length / context_requirements, 2.0) score += context_ratio * 0.1 # Load balancing - prefer less used models if model.usage_count > 0: usage_penalty = min(model.usage_count / 100.0, 0.1) score -= usage_penalty # Response time bonus (faster is better) if model.avg_response_time > 0: time_bonus = max(0.1 - (model.avg_response_time / 10.0), 0) score += time_bonus return score # Sort by score and return best best_model = max(suitable_models, key=score_model) logger.info(f"Selected model {best_model.name} for task {task_type}") return best_model async def generate_completion(self, model_name: str, prompt: str, system_prompt: Optional[str] = None, max_tokens: int = 1000, temperature: float = 0.7) -> Dict[str, Any]: """Generate completion using specified model""" if model_name not in self.models: raise ValueError(f"Model {model_name} not available") model = self.models[model_name] start_time = time.time() try: # Prepare request request_data = { "model": model_name, "prompt": prompt, "stream": False, "options": { "num_predict": max_tokens, "temperature": temperature } } if system_prompt: request_data["system"] = system_prompt # Make request to Ollama async with self.session.post( f"{model.node_url}/api/generate", json=request_data ) as response: if response.status == 200: result = await response.json() # Update model statistics end_time = time.time() response_time = end_time - start_time model.usage_count += 1 model.last_used = datetime.now() # Update average response time if model.avg_response_time == 0: model.avg_response_time = response_time else: model.avg_response_time = (model.avg_response_time * 0.8) + (response_time * 0.2) return { "success": True, "content": result.get("response", ""), "model": model_name, "response_time": response_time, "usage_stats": { "total_duration": result.get("total_duration", 0), "load_duration": result.get("load_duration", 0), "prompt_eval_count": result.get("prompt_eval_count", 0), "eval_count": result.get("eval_count", 0) } } else: error_text = await response.text() raise Exception(f"API error {response.status}: {error_text}") except Exception as e: logger.error(f"Error generating completion with {model_name}: {e}") model.availability = False return { "success": False, "error": str(e), "model": model_name } async def initialize_load_balancer(self): """Initialize load balancing for the cluster""" logger.info("Initializing load balancer...") for node in self.cluster_nodes: if node.status == "healthy": self.load_balancer_state[f"{node.host}:{node.port}"] = { "active_requests": 0, "total_requests": 0, "last_request": None, "average_response_time": 0.0 } async def get_cluster_status(self) -> Dict[str, Any]: """Get comprehensive cluster status""" return { "total_nodes": len(self.cluster_nodes), "healthy_nodes": len([n for n in self.cluster_nodes if n.status == "healthy"]), "total_models": len(self.models), "models_by_capability": { capability.value: len([ m for m in self.models.values() if capability in m.capabilities ]) for capability in ModelCapability }, "cluster_load": self._calculate_cluster_load(), "model_usage_stats": { name: { "usage_count": model.usage_count, "avg_response_time": model.avg_response_time, "last_used": model.last_used.isoformat() if model.last_used else None } for name, model in self.models.items() } } def _calculate_cluster_load(self) -> float: """Calculate overall cluster load""" if not self.load_balancer_state: return 0.0 total_load = sum( state["active_requests"] for state in self.load_balancer_state.values() ) healthy_nodes = len([n for n in self.cluster_nodes if n.status == "healthy"]) if healthy_nodes == 0: return 0.0 return total_load / healthy_nodes async def cleanup(self): """Cleanup resources""" if self.session: await self.session.close() # Global instance ai_model_service = AIModelService()