| """
 | |
| Optimized Embedding Manager - High-performance vector operations and storage.
 | |
| 
 | |
| This module provides enhanced embedding capabilities including:
 | |
| - Vector database integration with SQLite-Vec
 | |
| - Optimized batch processing and caching
 | |
| - Multiple embedding model support
 | |
| - Efficient similarity search with indexing
 | |
| - Memory-efficient embedding storage
 | |
| """
 | |
| 
 | |
| import json
 | |
| import time
 | |
| import numpy as np
 | |
| import sqlite3
 | |
| from typing import List, Dict, Optional, Tuple, Union, Any
 | |
| from dataclasses import dataclass, asdict
 | |
| from pathlib import Path
 | |
| from sentence_transformers import SentenceTransformer
 | |
| from sklearn.feature_extraction.text import TfidfVectorizer
 | |
| from sklearn.metrics.pairwise import cosine_similarity
 | |
| import threading
 | |
| from contextlib import contextmanager
 | |
| from functools import lru_cache
 | |
| import logging
 | |
| 
 | |
| from .context_db import Context, ContextDatabase
 | |
| 
 | |
| logger = logging.getLogger(__name__)
 | |

@dataclass
class EmbeddingModel:
    """Configuration for embedding models."""
    name: str
    model_path: str
    dimension: int
    max_tokens: int = 512
    normalize: bool = True

@dataclass
class VectorSearchResult:
    """Result from vector search operations."""
    context_id: int
    score: float
    context: Optional[Context] = None
    metadata: Optional[Dict[str, Any]] = None

class VectorCache:
    """High-performance LRU cache for embeddings."""

    def __init__(self, max_size: int = 5000, ttl_seconds: int = 3600):
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds
        self.cache: Dict[str, Tuple[np.ndarray, float]] = {}
        self.access_times: Dict[str, float] = {}
        self.lock = threading.RLock()
        # Counters backing the hit-rate statistic reported by stats().
        self._hits = 0
        self._requests = 0

    def get(self, key: str) -> Optional[np.ndarray]:
        """Get embedding from cache."""
        with self.lock:
            current_time = time.time()
            self._requests += 1

            if key in self.cache:
                embedding, created_time = self.cache[key]

                # Check TTL
                if current_time - created_time < self.ttl_seconds:
                    self.access_times[key] = current_time
                    self._hits += 1
                    return embedding.copy()
                else:
                    # Expired
                    del self.cache[key]
                    del self.access_times[key]
        return None

    def put(self, key: str, embedding: np.ndarray) -> None:
        """Store embedding in cache."""
        with self.lock:
            current_time = time.time()

            # Evict if cache is full
            if len(self.cache) >= self.max_size:
                self._evict_lru()

            self.cache[key] = (embedding.copy(), current_time)
            self.access_times[key] = current_time

    def _evict_lru(self) -> None:
        """Evict least recently used item."""
        if not self.access_times:
            return

        lru_key = min(self.access_times.items(), key=lambda x: x[1])[0]
        del self.cache[lru_key]
        del self.access_times[lru_key]

    def clear(self) -> None:
        """Clear cache."""
        with self.lock:
            self.cache.clear()
            self.access_times.clear()

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        with self.lock:
            return {
                "size": len(self.cache),
                "max_size": self.max_size,
                "hit_rate": self._hits / max(self._requests, 1),
                "ttl_seconds": self.ttl_seconds
            }

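
# Illustrative usage of VectorCache (a minimal sketch, not exercised by this
# module directly): keys are arbitrary strings, values are numpy arrays, and
# stats() reports the hit rate accumulated by get().
#
#     cache = VectorCache(max_size=2, ttl_seconds=60)
#     cache.put("doc-1", np.zeros(384, dtype=np.float32))
#     hit = cache.get("doc-1")    # returns a copy of the stored vector
#     miss = cache.get("doc-2")   # returns None and counts against the hit rate
#     print(cache.stats()["hit_rate"])
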
class OptimizedEmbeddingManager:
    """
    High-performance embedding manager with vector database capabilities.
    """

    # Predefined embedding models
    MODELS = {
        "mini": EmbeddingModel("all-MiniLM-L6-v2", "all-MiniLM-L6-v2", 384),
        "base": EmbeddingModel("all-MiniLM-L12-v2", "all-MiniLM-L12-v2", 384),
        "large": EmbeddingModel("all-mpnet-base-v2", "all-mpnet-base-v2", 768),
        "multilingual": EmbeddingModel("paraphrase-multilingual-MiniLM-L12-v2",
                                       "paraphrase-multilingual-MiniLM-L12-v2", 384)
    }

    def __init__(self,
                 context_db: ContextDatabase,
                 model_name: str = "mini",
                 vector_db_path: Optional[str] = None,
                 cache_size: int = 5000,
                 batch_size: int = 32):
        self.context_db = context_db
        self.model_config = self.MODELS.get(model_name, self.MODELS["mini"])
        self.model = None  # Lazy loading
        self.vector_cache = VectorCache(cache_size)
        self.batch_size = batch_size

        # Vector database setup
        self.vector_db_path = vector_db_path or "hcfs_vectors.db"
        self._init_vector_db()

        # TF-IDF for hybrid search
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2
        )
        self._tfidf_fitted = False
        self._model_lock = threading.RLock()

        logger.info(f"Initialized OptimizedEmbeddingManager with model: {self.model_config.name}")

    def _get_model(self) -> SentenceTransformer:
        """Lazy load the embedding model."""
        if self.model is None:
            with self._model_lock:
                if self.model is None:
                    logger.info(f"Loading embedding model: {self.model_config.model_path}")
                    self.model = SentenceTransformer(self.model_config.model_path)
        return self.model

    def _init_vector_db(self) -> None:
        """Initialize SQLite vector database for fast similarity search."""
        conn = sqlite3.connect(self.vector_db_path)
        cursor = conn.cursor()

        # Create vectors table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS context_vectors (
                context_id INTEGER PRIMARY KEY,
                model_name TEXT NOT NULL,
                embedding_dimension INTEGER NOT NULL,
                vector_data BLOB NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Create index for fast lookups
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_context_vectors_model
            ON context_vectors(model_name, context_id)
        ''')

        conn.commit()
        conn.close()

        logger.info(f"Vector database initialized: {self.vector_db_path}")

    @contextmanager
    def _get_vector_db(self):
        """Get vector database connection with proper cleanup."""
        conn = sqlite3.connect(self.vector_db_path)
        try:
            yield conn
        finally:
            conn.close()

    def generate_embedding(self, text: str, use_cache: bool = True) -> np.ndarray:
        """Generate embedding for text with caching."""
        cache_key = f"{self.model_config.name}:{hash(text)}"

        if use_cache:
            cached = self.vector_cache.get(cache_key)
            if cached is not None:
                return cached

        model = self._get_model()
        embedding = model.encode(
            text,
            normalize_embeddings=self.model_config.normalize,
            show_progress_bar=False
        )

        if use_cache:
            self.vector_cache.put(cache_key, embedding)

        return embedding

    def generate_embeddings_batch(self, texts: List[str], use_cache: bool = True) -> List[np.ndarray]:
        """Generate embeddings for multiple texts efficiently."""
        if not texts:
            return []

        # Check cache first
        cache_results = []
        uncached_indices = []
        uncached_texts = []

        if use_cache:
            for i, text in enumerate(texts):
                cache_key = f"{self.model_config.name}:{hash(text)}"
                cached = self.vector_cache.get(cache_key)
                if cached is not None:
                    cache_results.append((i, cached))
                else:
                    uncached_indices.append(i)
                    uncached_texts.append(text)
        else:
            uncached_indices = list(range(len(texts)))
            uncached_texts = texts

        # Generate embeddings for uncached texts
        embeddings = [None] * len(texts)

        # Place cached results
        for i, embedding in cache_results:
            embeddings[i] = embedding

        if uncached_texts:
            model = self._get_model()

            # Process in batches
            for batch_start in range(0, len(uncached_texts), self.batch_size):
                batch_end = min(batch_start + self.batch_size, len(uncached_texts))
                batch_texts = uncached_texts[batch_start:batch_end]
                batch_indices = uncached_indices[batch_start:batch_end]

                batch_embeddings = model.encode(
                    batch_texts,
                    normalize_embeddings=self.model_config.normalize,
                    show_progress_bar=False,
                    batch_size=self.batch_size
                )

                # Store results and cache
                for i, (orig_idx, embedding) in enumerate(zip(batch_indices, batch_embeddings)):
                    embeddings[orig_idx] = embedding

                    if use_cache:
                        cache_key = f"{self.model_config.name}:{hash(batch_texts[i])}"
                        self.vector_cache.put(cache_key, embedding)

        return embeddings

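    # Illustrative sketch (not part of the class API): single and batched
    # generation share the same cache, so repeated texts are encoded only once
    # per process. The 384-dim shape assumes the default "mini" model.
    #
    #     manager = OptimizedEmbeddingManager(context_db)   # context_db assumed to exist
    #     vec = manager.generate_embedding("hello world")   # shape (384,)
    #     vecs = manager.generate_embeddings_batch(["a", "b", "hello world"])  # third call is a cache hit
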
    def store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store embedding in vector database."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            # Convert to bytes for storage
            vector_bytes = embedding.astype(np.float32).tobytes()

            cursor.execute('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', (context_id, self.model_config.name, embedding.shape[0], vector_bytes))

            conn.commit()

    def store_embeddings_batch(self, context_embeddings: List[Tuple[int, np.ndarray]]) -> None:
        """Store multiple embeddings efficiently."""
        if not context_embeddings:
            return

        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            data = [
                (context_id, self.model_config.name, embedding.shape[0],
                 embedding.astype(np.float32).tobytes())
                for context_id, embedding in context_embeddings
            ]

            cursor.executemany('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', data)

            conn.commit()

        logger.info(f"Stored {len(context_embeddings)} embeddings in batch")

    def get_embedding(self, context_id: int) -> Optional[np.ndarray]:
        """Retrieve embedding for a context."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                SELECT vector_data, embedding_dimension FROM context_vectors
                WHERE context_id = ? AND model_name = ?
            ''', (context_id, self.model_config.name))

            result = cursor.fetchone()
            if result:
                vector_bytes, dimension = result
                return np.frombuffer(vector_bytes, dtype=np.float32).reshape(dimension)

        return None

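    # Illustrative round trip (assumes a context with id 42 exists): embeddings
    # are stored as float32 BLOBs keyed by (context_id, model_name) and read back
    # with np.frombuffer, so values survive at float32 precision.
    #
    #     emb = manager.generate_embedding("some context text")
    #     manager.store_embedding(42, emb)
    #     restored = manager.get_embedding(42)
    #     assert np.allclose(emb.astype(np.float32), restored)
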
    def vector_similarity_search(self,
                                 query_embedding: np.ndarray,
                                 context_ids: Optional[List[int]] = None,
                                 top_k: int = 10,
                                 min_similarity: float = 0.0) -> List[VectorSearchResult]:
        """Efficient vector similarity search."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            # Build query
            if context_ids:
                placeholders = ','.join(['?'] * len(context_ids))
                query = f'''
                    SELECT context_id, vector_data, embedding_dimension
                    FROM context_vectors
                    WHERE model_name = ? AND context_id IN ({placeholders})
                '''
                params = [self.model_config.name] + context_ids
            else:
                query = '''
                    SELECT context_id, vector_data, embedding_dimension
                    FROM context_vectors
                    WHERE model_name = ?
                '''
                params = [self.model_config.name]

            cursor.execute(query, params)
            results = cursor.fetchall()

        if not results:
            return []

        # Calculate similarities
        similarities = []
        query_embedding = query_embedding.reshape(1, -1)

        for context_id, vector_bytes, dimension in results:
            stored_embedding = np.frombuffer(vector_bytes, dtype=np.float32).reshape(1, dimension)

            similarity = cosine_similarity(query_embedding, stored_embedding)[0][0]

            if similarity >= min_similarity:
                similarities.append(VectorSearchResult(
                    context_id=context_id,
                    score=float(similarity)
                ))

        # Sort by similarity and return top_k
        similarities.sort(key=lambda x: x.score, reverse=True)
        return similarities[:top_k]

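    # Scoring note (a sketch, not used by the method above): when embeddings are
    # L2-normalized (model_config.normalize=True), the cosine similarity computed
    # above reduces to a plain dot product between the two vectors:
    #
    #     a = a / np.linalg.norm(a)
    #     b = b / np.linalg.norm(b)
    #     cos_sim = float(np.dot(a, b))   # same value cosine_similarity returns
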
    def semantic_search_optimized(self,
                                  query: str,
                                  path_prefix: Optional[str] = None,
                                  top_k: int = 5,
                                  include_contexts: bool = True) -> List[VectorSearchResult]:
        """High-performance semantic search."""
        # Generate query embedding
        query_embedding = self.generate_embedding(query)

        # Get relevant context IDs based on path filter
        context_ids = None
        if path_prefix:
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob
                blobs = session.query(ContextBlob.id).filter(
                    ContextBlob.path.startswith(path_prefix)
                ).all()
                context_ids = [blob.id for blob in blobs]

                if not context_ids:
                    return []

        # Perform vector search
        results = self.vector_similarity_search(
            query_embedding,
            context_ids=context_ids,
            top_k=top_k
        )

        # Populate with context data if requested
        if include_contexts and results:
            context_map = {}
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob

                result_ids = [r.context_id for r in results]
                blobs = session.query(ContextBlob).filter(
                    ContextBlob.id.in_(result_ids)
                ).all()

                for blob in blobs:
                    context_map[blob.id] = Context(
                        id=blob.id,
                        path=blob.path,
                        content=blob.content,
                        summary=blob.summary,
                        author=blob.author,
                        created_at=blob.created_at,
                        updated_at=blob.updated_at,
                        version=blob.version
                    )

            # Add contexts to results
            for result in results:
                result.context = context_map.get(result.context_id)

        return results

    def hybrid_search_optimized(self,
                                query: str,
                                path_prefix: Optional[str] = None,
                                top_k: int = 5,
                                semantic_weight: float = 0.7,
                                rerank_top_n: int = 50) -> List[VectorSearchResult]:
        """Optimized hybrid search with two-stage ranking."""

        # Stage 1: Fast semantic search to get candidate set
        semantic_results = self.semantic_search_optimized(
            query, path_prefix, rerank_top_n, include_contexts=True
        )

        if not semantic_results or len(semantic_results) < 2:
            return semantic_results[:top_k]

        # Stage 2: Re-rank candidates that have context data with lexical scores
        # (TF-IDF cosine similarity; reported under the bm25_score key)
        scored_results = [r for r in semantic_results if r.context]
        if not scored_results:
            return semantic_results[:top_k]

        documents = [r.context.content for r in scored_results]

        # Compute lexical scores
        try:
            if not self._tfidf_fitted:
                self.tfidf_vectorizer.fit(documents)
                self._tfidf_fitted = True

            doc_vectors = self.tfidf_vectorizer.transform(documents)
            query_vector = self.tfidf_vectorizer.transform([query])
            bm25_scores = cosine_similarity(query_vector, doc_vectors)[0]

        except Exception as e:
            logger.warning(f"Lexical scoring failed: {e}, using semantic only")
            return semantic_results[:top_k]

        # Combine scores, pairing each re-ranked result with its lexical score
        for result, bm25_score in zip(scored_results, bm25_scores):
            semantic_score = result.score

            combined_score = (semantic_weight * semantic_score +
                              (1 - semantic_weight) * bm25_score)

            result.score = float(combined_score)
            result.metadata = {
                "semantic_score": float(semantic_score),
                "bm25_score": float(bm25_score),
                "semantic_weight": semantic_weight
            }

        # Re-sort by combined score
        semantic_results.sort(key=lambda x: x.score, reverse=True)
        return semantic_results[:top_k]

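    # Worked example of the score combination above (illustrative numbers):
    # with semantic_weight=0.7, a semantic score of 0.80 and a lexical score of
    # 0.40 combine to 0.7 * 0.80 + 0.3 * 0.40 = 0.68.
    #
    #     results = manager.hybrid_search_optimized("database migration", top_k=5)
    #     for r in results:
    #         print(r.context_id, r.score, r.metadata)  # metadata holds both component scores
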
    def build_embeddings_index(self, batch_size: int = 100) -> Dict[str, Any]:
        """Build embeddings for all contexts without embeddings."""
        start_time = time.time()

        # Get contexts without embeddings
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob

            # Find contexts missing embeddings
            with self._get_vector_db() as vector_conn:
                vector_cursor = vector_conn.cursor()
                vector_cursor.execute('''
                    SELECT context_id FROM context_vectors
                    WHERE model_name = ?
                ''', (self.model_config.name,))

                existing_ids = {row[0] for row in vector_cursor.fetchall()}

            # Get contexts that need embeddings
            all_blobs = session.query(ContextBlob).all()
            missing_blobs = [blob for blob in all_blobs if blob.id not in existing_ids]

        if not missing_blobs:
            return {
                "total_processed": 0,
                "processing_time": 0,
                "embeddings_per_second": 0,
                "message": "All contexts already have embeddings"
            }

        logger.info(f"Building embeddings for {len(missing_blobs)} contexts")

        # Process in batches
        total_processed = 0
        for batch_start in range(0, len(missing_blobs), batch_size):
            batch_end = min(batch_start + batch_size, len(missing_blobs))
            batch_blobs = missing_blobs[batch_start:batch_end]

            # Generate embeddings for batch
            texts = [blob.content for blob in batch_blobs]
            embeddings = self.generate_embeddings_batch(texts, use_cache=False)

            # Store embeddings
            context_embeddings = [
                (blob.id, embedding)
                for blob, embedding in zip(batch_blobs, embeddings)
            ]
            self.store_embeddings_batch(context_embeddings)

            total_processed += len(batch_blobs)
            logger.info(f"Processed {total_processed}/{len(missing_blobs)} contexts")

        processing_time = time.time() - start_time
        embeddings_per_second = total_processed / processing_time if processing_time > 0 else 0

        return {
            "total_processed": total_processed,
            "processing_time": processing_time,
            "embeddings_per_second": embeddings_per_second,
            "model_used": self.model_config.name,
            "embedding_dimension": self.model_config.dimension
        }

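    # Illustrative sketch: building the index only processes contexts that do
    # not yet have an embedding for the current model, so it can be re-run
    # safely after bulk-importing contexts. The returned dict is a progress report.
    #
    #     report = manager.build_embeddings_index(batch_size=100)
    #     print(report["total_processed"], report["embeddings_per_second"])
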
    def get_statistics(self) -> Dict[str, Any]:
        """Get embedding manager statistics."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                SELECT
                    COUNT(*) as total_embeddings,
                    COUNT(DISTINCT model_name) as unique_models,
                    AVG(embedding_dimension) as avg_dimension
                FROM context_vectors
            ''')

            db_stats = cursor.fetchone()

            cursor.execute('''
                SELECT model_name, COUNT(*) as count
                FROM context_vectors
                GROUP BY model_name
            ''')

            model_counts = dict(cursor.fetchall())

        return {
            "database_stats": {
                "total_embeddings": db_stats[0] if db_stats else 0,
                "unique_models": db_stats[1] if db_stats else 0,
                "average_dimension": db_stats[2] if db_stats else 0,
                "model_counts": model_counts
            },
            "cache_stats": self.vector_cache.stats(),
            "current_model": asdict(self.model_config),
            "vector_db_path": self.vector_db_path,
            "batch_size": self.batch_size
        }

    def cleanup_old_embeddings(self, days_old: int = 30) -> int:
        """Remove old embeddings whose contexts no longer exist."""
        # Context blobs live in the main context database, not the vector
        # database, so fetch the surviving ids first instead of joining in SQL.
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            valid_ids = {row.id for row in session.query(ContextBlob.id).all()}

        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            # Parameterize the age cutoff instead of formatting it into the SQL.
            cursor.execute('''
                SELECT context_id FROM context_vectors
                WHERE updated_at < datetime('now', ?)
            ''', (f'-{days_old} days',))

            stale_ids = [row[0] for row in cursor.fetchall() if row[0] not in valid_ids]

            cursor.executemany(
                'DELETE FROM context_vectors WHERE context_id = ?',
                [(context_id,) for context_id in stale_ids]
            )

            deleted_count = len(stale_ids)
            conn.commit()

        logger.info(f"Cleaned up {deleted_count} old embeddings")
        return deleted_count
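
# End-to-end sketch (hedged): the ContextDatabase constructor lives in
# .context_db and its signature is assumed here; everything else uses the
# methods defined above.
#
#     from hcfs.context_db import ContextDatabase   # import path is an assumption
#     db = ContextDatabase("hcfs_context.db")       # constructor signature is an assumption
#     manager = OptimizedEmbeddingManager(db, model_name="mini")
#     manager.build_embeddings_index()
#     for result in manager.semantic_search_optimized("deployment notes", top_k=3):
#         print(result.score, result.context.path if result.context else None)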
