Phase 2 build initial

This commit is contained in:
Claude Code
2025-07-30 09:34:16 +10:00
parent 8f19eaab25
commit a6ee31f237
68 changed files with 18055 additions and 3 deletions

View File

@@ -0,0 +1 @@
"""Core HCFS components."""

View File

@@ -0,0 +1,148 @@
"""
Context Database - Storage and retrieval of context blobs.
"""
from datetime import datetime
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
from pathlib import Path
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
# Declarative base for this module's ORM models; its metadata is what
# ContextDatabase uses to create the tables.
Base = declarative_base()
class ContextBlob(Base):
    """ORM mapping for one stored context blob (table ``context_blobs``)."""

    __tablename__ = "context_blobs"

    # Identity and location of the blob.
    id = Column(Integer, primary_key=True)
    path = Column(String(512), index=True, nullable=False)

    # Payload and its optional summary.
    content = Column(Text, nullable=False)
    summary = Column(Text)

    # Embedding bookkeeping; the vector itself is stored as JSON text.
    embedding_model = Column(String(100))
    embedding_vector = Column(Text)

    # Provenance and versioning.
    author = Column(String(100))
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, default=datetime.utcnow)
    version = Column(Integer, default=1)
@dataclass
class Context:
    """In-memory record describing a single context entry.

    ``id`` is ``None`` for a context that has not been stored yet; the
    remaining optional fields default to ``None`` and ``version`` to 1.
    """

    # Required fields.
    id: Optional[int]
    path: str
    content: str

    # Optional descriptive fields.
    summary: Optional[str] = None
    author: Optional[str] = None

    # Timestamps and version counter.
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    version: int = 1
class ContextDatabase:
    """Main interface for context storage and retrieval.

    Persists ``ContextBlob`` rows in a SQLite database through SQLAlchemy and
    hands them back to callers as plain ``Context`` dataclass instances.
    """

    def __init__(self, db_path: str = "hcfs_context.db"):
        """Open the SQLite database at *db_path*, creating tables if missing."""
        self.db_path = db_path
        self.engine = create_engine(f"sqlite:///{db_path}")
        Base.metadata.create_all(self.engine)
        self.SessionLocal = sessionmaker(bind=self.engine)

    def get_session(self) -> Session:
        """Get database session."""
        return self.SessionLocal()

    @staticmethod
    def _to_context(blob: ContextBlob) -> Context:
        """Copy the persisted fields of *blob* into a ``Context``."""
        return Context(
            id=blob.id,
            path=blob.path,
            content=blob.content,
            summary=blob.summary,
            author=blob.author,
            created_at=blob.created_at,
            updated_at=blob.updated_at,
            version=blob.version,
        )

    def store_context(self, context: Context) -> int:
        """Store a context blob and return its ID."""
        with self.get_session() as session:
            blob = ContextBlob(
                path=context.path,
                content=context.content,
                summary=context.summary,
                author=context.author,
                version=context.version,
            )
            session.add(blob)
            session.commit()
            # Reload so the database-assigned primary key is populated.
            session.refresh(blob)
            return blob.id

    def get_context_by_path(self, path: str, depth: int = 1) -> List[Context]:
        """Retrieve contexts for a path and up to *depth* parent paths.

        Results are ordered from the requested path outwards towards the
        root; within one path the newest context comes first.
        """
        contexts: List[Context] = []
        current_path = Path(path)
        with self.get_session() as session:
            for _ in range(depth + 1):
                # Context paths are "/"-separated keys, so always render the
                # POSIX form; str(Path) would produce backslashes on Windows.
                if current_path == Path("."):
                    search_path = "/"
                else:
                    search_path = current_path.as_posix()
                blobs = (
                    session.query(ContextBlob)
                    .filter(ContextBlob.path == search_path)
                    .order_by(ContextBlob.created_at.desc())
                    .all()
                )
                contexts.extend(self._to_context(blob) for blob in blobs)
                if current_path.parent == current_path:  # Root reached
                    break
                current_path = current_path.parent
        return contexts

    def list_contexts_at_path(self, path: str) -> List[Context]:
        """List all contexts stored at exactly *path*, newest first."""
        with self.get_session() as session:
            blobs = (
                session.query(ContextBlob)
                .filter(ContextBlob.path == path)
                .order_by(ContextBlob.created_at.desc())
                .all()
            )
            return [self._to_context(blob) for blob in blobs]

    def update_context(self, context_id: int, content: str, summary: Optional[str] = None) -> bool:
        """Update an existing context.

        Args:
            context_id: Primary key of the context to update.
            content: Replacement content.
            summary: Replacement summary. ``None`` leaves the stored summary
                untouched; any string (including ``""``) overwrites it.

        Returns:
            True if the context existed and was updated, False otherwise.
        """
        with self.get_session() as session:
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob is None:
                return False
            blob.content = content
            if summary is not None:
                blob.summary = summary
            # The version column is nullable; treat a missing value as 0.
            blob.version = (blob.version or 0) + 1
            blob.updated_at = datetime.utcnow()
            session.commit()
            return True

    def delete_context(self, context_id: int) -> bool:
        """Delete a context by ID; return True if a row was removed."""
        with self.get_session() as session:
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob is None:
                return False
            session.delete(blob)
            session.commit()
            return True

View File

@@ -0,0 +1,188 @@
"""
Embedding Manager - Generate and manage context embeddings.
"""
import json
import numpy as np
from typing import List, Dict, Optional, Tuple
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from .context_db import Context, ContextDatabase
class EmbeddingManager:
    """
    Manages embeddings for context blobs and semantic similarity search.

    Embeddings are stored as JSON text on the ``ContextBlob`` row together
    with the name of the model that produced them.
    """

    def __init__(self, context_db: ContextDatabase, model_name: str = "all-MiniLM-L6-v2"):
        """Load the sentence-transformer *model_name* and bind to *context_db*."""
        self.context_db = context_db
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self._tfidf_fitted = False

    @staticmethod
    def _blob_to_context(blob) -> Context:
        """Copy the persisted fields of a ``ContextBlob`` into a ``Context``."""
        return Context(
            id=blob.id,
            path=blob.path,
            content=blob.content,
            summary=blob.summary,
            author=blob.author,
            created_at=blob.created_at,
            updated_at=blob.updated_at,
            version=blob.version,
        )

    def generate_embedding(self, text: str) -> np.ndarray:
        """Generate a normalized embedding for a text."""
        return self.model.encode(text, normalize_embeddings=True)

    def store_context_with_embedding(self, context: Context) -> int:
        """Store context, generate its embedding, and return the context ID."""
        # Encode first so a model failure does not leave a row without a vector.
        embedding = self.generate_embedding(context.content)
        context_id = self.context_db.store_context(context)
        self._store_embedding(context_id, embedding)
        return context_id

    def _store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store embedding vector (as JSON) and model name on the blob row."""
        embedding_json = json.dumps(embedding.tolist())
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob:
                blob.embedding_model = self.model_name
                blob.embedding_vector = embedding_json
                session.commit()

    def semantic_search(self, query: str, path_prefix: Optional[str] = None,
                        top_k: int = 5) -> List[Tuple[Context, float]]:
        """
        Perform semantic search for contexts similar to query.

        Only vectors produced by the current model are compared, because
        scores across different embedding models are not comparable.

        Args:
            query: Search query text
            path_prefix: Optional path prefix to limit search scope
            top_k: Number of results to return
        Returns:
            List of (Context, similarity_score) tuples, best match first
        """
        if top_k <= 0:
            return []
        query_embedding = np.asarray(self.generate_embedding(query)).reshape(1, -1)
        scored: List[Tuple[Context, float]] = []
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            query_filter = session.query(ContextBlob).filter(
                ContextBlob.embedding_vector.isnot(None),
                ContextBlob.embedding_model == self.model_name,
            )
            if path_prefix:
                # autoescape keeps "%" and "_" in the prefix literal instead
                # of letting them act as LIKE wildcards.
                query_filter = query_filter.filter(
                    ContextBlob.path.startswith(path_prefix, autoescape=True)
                )
            for blob in query_filter.all():
                if not blob.embedding_vector:
                    continue
                stored_embedding = np.asarray(
                    json.loads(blob.embedding_vector), dtype=float
                ).reshape(1, -1)
                if stored_embedding.shape[1] != query_embedding.shape[1]:
                    # Dimension mismatch would make cosine_similarity raise.
                    continue
                similarity = cosine_similarity(query_embedding, stored_embedding)[0][0]
                scored.append((self._blob_to_context(blob), float(similarity)))
        # Sort by similarity and return top_k
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:top_k]

    def hybrid_search(self, query: str, path_prefix: Optional[str] = None, top_k: int = 5,
                      semantic_weight: float = 0.7) -> List[Tuple[Context, float]]:
        """
        Hybrid search combining semantic similarity and a lexical score.

        The lexical score is TF-IDF cosine similarity, used as an
        approximation of BM25. The vectorizer is refitted on the candidate
        documents of every call so the vocabulary always matches them.

        Args:
            query: Search query
            path_prefix: Optional path filter
            top_k: Number of results
            semantic_weight: Weight for semantic vs lexical score (0.0-1.0)
        Returns:
            List of (Context, combined_score) tuples, best match first
        """
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            query_filter = session.query(ContextBlob)
            if path_prefix:
                query_filter = query_filter.filter(
                    ContextBlob.path.startswith(path_prefix, autoescape=True)
                )
            contexts = [self._blob_to_context(blob) for blob in query_filter.all()]
        if not contexts:
            return []

        documents = [ctx.content for ctx in contexts]
        try:
            doc_vectors = self.tfidf_vectorizer.fit_transform(documents)
            self._tfidf_fitted = True
            query_vector = self.tfidf_vectorizer.transform([query])
            lexical_scores = cosine_similarity(query_vector, doc_vectors)[0]
        except ValueError:
            # Raised when the documents yield an empty vocabulary (e.g. only
            # stop words); fall back to the semantic component alone.
            lexical_scores = np.zeros(len(documents))

        # Semantic scoring; contexts without a usable embedding score 0.0.
        semantic_scores = {
            ctx.id: score
            for ctx, score in self.semantic_search(query, path_prefix, len(contexts))
        }

        combined_results = []
        for ctx, lexical_score in zip(contexts, lexical_scores):
            semantic_score = semantic_scores.get(ctx.id, 0.0)
            combined_score = (semantic_weight * semantic_score +
                              (1 - semantic_weight) * lexical_score)
            combined_results.append((ctx, float(combined_score)))
        # Sort and return top results
        combined_results.sort(key=lambda item: item[1], reverse=True)
        return combined_results[:top_k]

    def get_similar_contexts(self, context_id: int, top_k: int = 5) -> List[Tuple[Context, float]]:
        """Find contexts similar to a given context, excluding the context itself."""
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            reference_blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if not reference_blob or not reference_blob.content:
                return []
            reference_text = reference_blob.content
        # Ask for one extra hit because the reference normally matches itself.
        candidates = self.semantic_search(reference_text, top_k=top_k + 1)
        return [(ctx, score) for ctx, score in candidates if ctx.id != context_id][:top_k]

View File

@@ -0,0 +1,616 @@
"""
Optimized Embedding Manager - High-performance vector operations and storage.
This module provides enhanced embedding capabilities including:
- Vector storage in a dedicated SQLite database (embeddings kept as float32 BLOBs)
- Optimized batch processing and caching
- Multiple embedding model support
- Efficient similarity search with indexing
- Memory-efficient embedding storage
"""
import json
import time
import numpy as np
import sqlite3
from typing import List, Dict, Optional, Tuple, Union, Any
from dataclasses import dataclass, asdict
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import threading
from contextlib import contextmanager
from functools import lru_cache
import logging
from .context_db import Context, ContextDatabase
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@dataclass
class EmbeddingModel:
    """Static description of one embedding model configuration."""

    # Identification of the model and where to load it from.
    name: str
    model_path: str

    # Length of the vectors the model produces.
    dimension: int

    # Tunables with defaults.
    max_tokens: int = 512
    normalize: bool = True
@dataclass
class VectorSearchResult:
    """Result from vector search operations."""

    # ID of the matching context and its similarity (or combined) score.
    context_id: int
    score: float
    # Populated only when the caller asks for full context data.
    context: Optional[Context] = None
    # Optional extra scoring details; None until a search step fills it in.
    metadata: Optional[Dict[str, Any]] = None
class VectorCache:
    """Thread-safe LRU cache for embeddings with a per-entry time-to-live.

    ``cache`` maps key -> (embedding, creation time). ``access_times`` maps
    key -> last access time and is kept ordered from least to most recently
    used, so the first key is always the eviction candidate.
    """

    def __init__(self, max_size: int = 5000, ttl_seconds: int = 3600):
        """Create a cache holding at most *max_size* entries for *ttl_seconds* each."""
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds
        self.cache: Dict[str, Tuple[np.ndarray, float]] = {}
        self.access_times: Dict[str, float] = {}
        self.lock = threading.RLock()
        # Counters backing the hit rate reported by stats().
        self._hits = 0
        self._requests = 0

    def _touch(self, key: str, timestamp: float) -> None:
        """Record an access and move *key* to the most-recently-used end."""
        self.access_times.pop(key, None)
        self.access_times[key] = timestamp

    def get(self, key: str) -> Optional[np.ndarray]:
        """Return a copy of the cached embedding, or None if missing or expired."""
        with self.lock:
            self._requests += 1
            entry = self.cache.get(key)
            if entry is None:
                return None
            embedding, created_time = entry
            current_time = time.time()
            if current_time - created_time >= self.ttl_seconds:
                # Expired: drop the entry and report a miss.
                del self.cache[key]
                self.access_times.pop(key, None)
                return None
            self._touch(key, current_time)
            self._hits += 1
            # Copy so callers cannot mutate the cached array.
            return embedding.copy()

    def put(self, key: str, embedding: np.ndarray) -> None:
        """Store a copy of *embedding* under *key*, evicting the LRU entry if full."""
        with self.lock:
            if self.max_size <= 0:
                # A non-positive capacity means caching is disabled.
                return
            current_time = time.time()
            # Overwriting an existing key does not grow the cache, so only
            # evict when a genuinely new key would exceed the capacity.
            if key not in self.cache and len(self.cache) >= self.max_size:
                self._evict_lru()
            self.cache[key] = (embedding.copy(), current_time)
            self._touch(key, current_time)

    def _evict_lru(self) -> None:
        """Evict least recently used item."""
        if not self.access_times:
            return
        lru_key = next(iter(self.access_times))
        self.cache.pop(lru_key, None)
        del self.access_times[lru_key]

    def clear(self) -> None:
        """Remove all entries (hit/request counters are kept)."""
        with self.lock:
            self.cache.clear()
            self.access_times.clear()

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics: size, capacity, hit rate and TTL."""
        with self.lock:
            hit_rate = self._hits / self._requests if self._requests else 0.0
            return {
                "size": len(self.cache),
                "max_size": self.max_size,
                "hit_rate": hit_rate,
                "ttl_seconds": self.ttl_seconds
            }
class OptimizedEmbeddingManager:
    """
    High-performance embedding manager with vector database capabilities.

    Embeddings live in a separate SQLite file (``vector_db_path``) as float32
    BLOBs keyed by context ID; context rows themselves stay in ``context_db``.
    """

    # Predefined embedding models
    MODELS = {
        "mini": EmbeddingModel("all-MiniLM-L6-v2", "all-MiniLM-L6-v2", 384),
        "base": EmbeddingModel("all-MiniLM-L12-v2", "all-MiniLM-L12-v2", 384),
        "large": EmbeddingModel("all-mpnet-base-v2", "all-mpnet-base-v2", 768),
        "multilingual": EmbeddingModel("paraphrase-multilingual-MiniLM-L12-v2",
                                       "paraphrase-multilingual-MiniLM-L12-v2", 384)
    }

    # Maximum number of IDs bound into a single "IN (...)" clause; kept well
    # below SQLite's bound-parameter limit so large ID lists are chunked.
    _SQL_CHUNK_SIZE = 500

    def __init__(self,
                 context_db: ContextDatabase,
                 model_name: str = "mini",
                 vector_db_path: Optional[str] = None,
                 cache_size: int = 5000,
                 batch_size: int = 32):
        """Configure the manager; the embedding model itself is loaded lazily.

        Args:
            context_db: Database holding the context rows.
            model_name: Key into ``MODELS``; unknown keys fall back to "mini".
            vector_db_path: SQLite file for vectors (default "hcfs_vectors.db").
            cache_size: Maximum number of cached embeddings.
            batch_size: Number of texts encoded per model call.
        """
        self.context_db = context_db
        if model_name not in self.MODELS:
            logger.warning(f"Unknown embedding model {model_name!r}; falling back to 'mini'")
        self.model_config = self.MODELS.get(model_name, self.MODELS["mini"])
        self.model = None  # Lazy loading
        self._model_lock = threading.RLock()
        self.vector_cache = VectorCache(cache_size)
        self.batch_size = batch_size
        # Vector database setup
        self.vector_db_path = vector_db_path or "hcfs_vectors.db"
        self._init_vector_db()
        # TF-IDF template for hybrid search; each search fits a fresh copy.
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2
        )
        # Retained for backward compatibility; hybrid search no longer reads it.
        self._tfidf_fitted = False
        logger.info(f"Initialized OptimizedEmbeddingManager with model: {self.model_config.name}")

    def _get_model(self) -> SentenceTransformer:
        """Lazy load the embedding model (at most once, guarded by a lock)."""
        if self.model is None:
            with self._model_lock:
                if self.model is None:
                    logger.info(f"Loading embedding model: {self.model_config.model_path}")
                    self.model = SentenceTransformer(self.model_config.model_path)
        return self.model

    def _init_vector_db(self) -> None:
        """Create the vector table and its lookup index if they do not exist."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()
            # Create vectors table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS context_vectors (
                    context_id INTEGER PRIMARY KEY,
                    model_name TEXT NOT NULL,
                    embedding_dimension INTEGER NOT NULL,
                    vector_data BLOB NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            # Create index for fast lookups
            cursor.execute('''
                CREATE INDEX IF NOT EXISTS idx_context_vectors_model
                ON context_vectors(model_name, context_id)
            ''')
            conn.commit()
        logger.info(f"Vector database initialized: {self.vector_db_path}")

    @contextmanager
    def _get_vector_db(self):
        """Yield a vector database connection that is always closed afterwards."""
        conn = sqlite3.connect(self.vector_db_path)
        try:
            yield conn
        finally:
            conn.close()

    def _cache_key(self, text: str) -> str:
        """Build the cache key for *text* under the current model.

        A SHA-256 digest is used instead of ``hash()`` so that two different
        texts cannot silently share a cached embedding.
        """
        import hashlib  # local import: not part of this module's import block
        digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest()
        return f"{self.model_config.name}:{digest}"

    @staticmethod
    def _blob_to_context(blob) -> Context:
        """Copy the persisted fields of a ``ContextBlob`` into a ``Context``."""
        return Context(
            id=blob.id,
            path=blob.path,
            content=blob.content,
            summary=blob.summary,
            author=blob.author,
            created_at=blob.created_at,
            updated_at=blob.updated_at,
            version=blob.version,
        )

    def _vector_row(self, context_id: int, embedding: np.ndarray) -> Tuple[int, str, int, bytes]:
        """Build the (id, model, dimension, bytes) tuple written to the table."""
        vector = np.asarray(embedding, dtype=np.float32).ravel()
        # int() guards against numpy integer IDs, which sqlite3 cannot bind.
        return (int(context_id), self.model_config.name, int(vector.shape[0]), vector.tobytes())

    def generate_embedding(self, text: str, use_cache: bool = True) -> np.ndarray:
        """Generate embedding for text, consulting the cache when *use_cache* is set."""
        cache_key = self._cache_key(text) if use_cache else None
        if cache_key is not None:
            cached = self.vector_cache.get(cache_key)
            if cached is not None:
                return cached
        embedding = self._get_model().encode(
            text,
            normalize_embeddings=self.model_config.normalize,
            show_progress_bar=False
        )
        if cache_key is not None:
            self.vector_cache.put(cache_key, embedding)
        return embedding

    def generate_embeddings_batch(self, texts: List[str], use_cache: bool = True) -> List[np.ndarray]:
        """Generate embeddings for multiple texts, in input order.

        Cached texts are served from the cache; the rest are encoded in
        chunks of ``batch_size`` and cached afterwards.
        """
        if not texts:
            return []
        embeddings: List[Optional[np.ndarray]] = [None] * len(texts)
        pending: List[int] = []  # indices of texts that still need encoding
        for index, text in enumerate(texts):
            cached = self.vector_cache.get(self._cache_key(text)) if use_cache else None
            if cached is None:
                pending.append(index)
            else:
                embeddings[index] = cached
        if pending:
            model = self._get_model()
            step = max(1, self.batch_size)  # a non-positive batch size would break range()
            for start in range(0, len(pending), step):
                batch_indices = pending[start:start + step]
                batch_texts = [texts[i] for i in batch_indices]
                batch_embeddings = model.encode(
                    batch_texts,
                    normalize_embeddings=self.model_config.normalize,
                    show_progress_bar=False,
                    batch_size=step
                )
                for index, text, embedding in zip(batch_indices, batch_texts, batch_embeddings):
                    embeddings[index] = embedding
                    if use_cache:
                        self.vector_cache.put(self._cache_key(text), embedding)
        return embeddings

    def store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store (or replace) the embedding for *context_id* in the vector database."""
        with self._get_vector_db() as conn:
            conn.execute('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', self._vector_row(context_id, embedding))
            conn.commit()

    def store_embeddings_batch(self, context_embeddings: List[Tuple[int, np.ndarray]]) -> None:
        """Store multiple (context_id, embedding) pairs in one transaction."""
        if not context_embeddings:
            return
        rows = [self._vector_row(context_id, embedding)
                for context_id, embedding in context_embeddings]
        with self._get_vector_db() as conn:
            conn.executemany('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', rows)
            conn.commit()
        logger.info(f"Stored {len(context_embeddings)} embeddings in batch")

    def get_embedding(self, context_id: int) -> Optional[np.ndarray]:
        """Retrieve the current model's embedding for a context, or None."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT vector_data, embedding_dimension FROM context_vectors
                WHERE context_id = ? AND model_name = ?
            ''', (context_id, self.model_config.name))
            result = cursor.fetchone()
        if result is None:
            return None
        vector_bytes, dimension = result
        return np.frombuffer(vector_bytes, dtype=np.float32).reshape(dimension)

    def vector_similarity_search(self,
                                 query_embedding: np.ndarray,
                                 context_ids: Optional[List[int]] = None,
                                 top_k: int = 10,
                                 min_similarity: float = 0.0) -> List[VectorSearchResult]:
        """Rank stored vectors of the current model by cosine similarity.

        Args:
            query_embedding: Query vector.
            context_ids: Restrict the search to these IDs. ``None`` searches
                every stored vector; an empty collection yields no results.
            top_k: Maximum number of results.
            min_similarity: Results scoring below this are dropped.

        Returns:
            Matching results, best first. Stored vectors whose dimension
            differs from the query are skipped.
        """
        if top_k <= 0:
            return []
        id_list: Optional[List[int]] = None
        if context_ids is not None:
            id_list = [int(cid) for cid in context_ids]
            if not id_list:
                return []

        rows = []
        with self._get_vector_db() as conn:
            cursor = conn.cursor()
            if id_list is None:
                cursor.execute('''
                    SELECT context_id, vector_data, embedding_dimension
                    FROM context_vectors
                    WHERE model_name = ?
                ''', (self.model_config.name,))
                rows = cursor.fetchall()
            else:
                # Chunk the ID list so the statement never exceeds SQLite's
                # limit on bound parameters.
                for start in range(0, len(id_list), self._SQL_CHUNK_SIZE):
                    chunk = id_list[start:start + self._SQL_CHUNK_SIZE]
                    placeholders = ','.join('?' * len(chunk))
                    cursor.execute(f'''
                        SELECT context_id, vector_data, embedding_dimension
                        FROM context_vectors
                        WHERE model_name = ? AND context_id IN ({placeholders})
                    ''', [self.model_config.name, *chunk])
                    rows.extend(cursor.fetchall())
        if not rows:
            return []

        query_vector = np.asarray(query_embedding).reshape(1, -1)
        expected_dim = query_vector.shape[1]
        matched_ids: List[int] = []
        vectors: List[np.ndarray] = []
        for context_id, vector_bytes, _dimension in rows:
            stored = np.frombuffer(vector_bytes, dtype=np.float32)
            if stored.shape[0] != expected_dim:
                continue
            matched_ids.append(context_id)
            vectors.append(stored)
        if not vectors:
            return []

        # One vectorized call instead of one cosine_similarity call per row.
        scores = cosine_similarity(query_vector, np.vstack(vectors))[0]
        similarities = [
            VectorSearchResult(context_id=context_id, score=float(score))
            for context_id, score in zip(matched_ids, scores)
            if score >= min_similarity
        ]
        # Sort by similarity and return top_k
        similarities.sort(key=lambda result: result.score, reverse=True)
        return similarities[:top_k]

    def semantic_search_optimized(self,
                                  query: str,
                                  path_prefix: Optional[str] = None,
                                  top_k: int = 5,
                                  include_contexts: bool = True) -> List[VectorSearchResult]:
        """Semantic search over stored vectors, optionally limited to a path prefix.

        When *include_contexts* is true, each result's ``context`` is filled
        from the context database (it stays None if the row no longer exists).
        """
        query_embedding = self.generate_embedding(query)
        # Get relevant context IDs based on path filter
        context_ids = None
        if path_prefix:
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob
                # autoescape keeps "%" and "_" in the prefix literal instead
                # of letting them act as LIKE wildcards.
                rows = session.query(ContextBlob.id).filter(
                    ContextBlob.path.startswith(path_prefix, autoescape=True)
                ).all()
                context_ids = [row.id for row in rows]
            if not context_ids:
                return []
        # Perform vector search
        results = self.vector_similarity_search(
            query_embedding,
            context_ids=context_ids,
            top_k=top_k
        )
        # Populate with context data if requested
        if include_contexts and results:
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob
                result_ids = [result.context_id for result in results]
                blobs = session.query(ContextBlob).filter(
                    ContextBlob.id.in_(result_ids)
                ).all()
                context_map = {blob.id: self._blob_to_context(blob) for blob in blobs}
            for result in results:
                result.context = context_map.get(result.context_id)
        return results

    def hybrid_search_optimized(self,
                                query: str,
                                path_prefix: Optional[str] = None,
                                top_k: int = 5,
                                semantic_weight: float = 0.7,
                                rerank_top_n: int = 50) -> List[VectorSearchResult]:
        """Two-stage hybrid search.

        Stage 1 fetches up to *rerank_top_n* semantic candidates. Stage 2
        re-ranks them by mixing the semantic score with a lexical score
        (TF-IDF cosine similarity, used as a BM25 approximation). Candidates
        without context data get a lexical score of 0. If lexical scoring
        fails, the semantic ranking is returned unchanged.
        """
        # Stage 1: Fast semantic search to get candidate set
        semantic_results = self.semantic_search_optimized(
            query, path_prefix, rerank_top_n, include_contexts=True
        )
        if len(semantic_results) < 2:
            return semantic_results[:top_k]
        # Stage 2: Re-rank with lexical scores
        with_context = [result for result in semantic_results if result.context is not None]
        if not with_context:
            return semantic_results[:top_k]
        documents = [result.context.content for result in with_context]
        try:
            # Fit a fresh copy per call: the vocabulary then always matches
            # the current candidates and concurrent searches share no state.
            vectorizer = TfidfVectorizer(**self.tfidf_vectorizer.get_params())
            doc_vectors = vectorizer.fit_transform(documents)
            query_vector = vectorizer.transform([query])
            bm25_scores = cosine_similarity(query_vector, doc_vectors)[0]
        except Exception as e:
            # Deliberate best-effort: e.g. min_df pruning can leave an empty
            # vocabulary (ValueError) for small candidate sets.
            logger.warning(f"BM25 scoring failed: {e}, using semantic only")
            return semantic_results[:top_k]
        # Key lexical scores by context ID so they can never be paired with
        # the wrong result.
        bm25_by_id = {
            result.context_id: float(score)
            for result, score in zip(with_context, bm25_scores)
        }
        # Combine scores
        for result in semantic_results:
            semantic_score = result.score
            bm25_score = bm25_by_id.get(result.context_id, 0.0)
            combined_score = (semantic_weight * semantic_score +
                              (1 - semantic_weight) * bm25_score)
            result.score = float(combined_score)
            result.metadata = {
                "semantic_score": float(semantic_score),
                "bm25_score": float(bm25_score),
                "semantic_weight": semantic_weight
            }
        # Re-sort by combined score
        semantic_results.sort(key=lambda result: result.score, reverse=True)
        return semantic_results[:top_k]

    def build_embeddings_index(self, batch_size: int = 100) -> Dict[str, Any]:
        """Build embeddings for all contexts that lack one for the current model.

        Args:
            batch_size: Number of contexts embedded and stored per round.

        Returns:
            Summary dict with processed count, timing and model details.

        Raises:
            ValueError: If *batch_size* is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        start_time = time.time()
        # IDs that already have an embedding for this model
        with self._get_vector_db() as vector_conn:
            vector_cursor = vector_conn.cursor()
            vector_cursor.execute('''
                SELECT context_id FROM context_vectors
                WHERE model_name = ?
            ''', (self.model_config.name,))
            existing_ids = {row[0] for row in vector_cursor.fetchall()}
        # Load only the columns needed, as plain tuples that stay valid after
        # the session is closed.
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            missing = [
                (blob_id, content)
                for blob_id, content in session.query(ContextBlob.id, ContextBlob.content).all()
                if blob_id not in existing_ids
            ]
        if not missing:
            return {
                "total_processed": 0,
                "processing_time": 0,
                "embeddings_per_second": 0,
                "message": "All contexts already have embeddings"
            }
        logger.info(f"Building embeddings for {len(missing)} contexts")
        # Process in batches
        total_processed = 0
        for batch_start in range(0, len(missing), batch_size):
            batch = missing[batch_start:batch_start + batch_size]
            texts = [content for _, content in batch]
            embeddings = self.generate_embeddings_batch(texts, use_cache=False)
            self.store_embeddings_batch(
                [(blob_id, embedding) for (blob_id, _), embedding in zip(batch, embeddings)]
            )
            total_processed += len(batch)
            logger.info(f"Processed {total_processed}/{len(missing)} contexts")
        processing_time = time.time() - start_time
        embeddings_per_second = total_processed / processing_time if processing_time > 0 else 0
        return {
            "total_processed": total_processed,
            "processing_time": processing_time,
            "embeddings_per_second": embeddings_per_second,
            "model_used": self.model_config.name,
            "embedding_dimension": self.model_config.dimension
        }

    def get_statistics(self) -> Dict[str, Any]:
        """Get embedding manager statistics (database, cache and configuration)."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT
                    COUNT(*) as total_embeddings,
                    COUNT(DISTINCT model_name) as unique_models,
                    AVG(embedding_dimension) as avg_dimension
                FROM context_vectors
            ''')
            db_stats = cursor.fetchone()
            cursor.execute('''
                SELECT model_name, COUNT(*) as count
                FROM context_vectors
                GROUP BY model_name
            ''')
            model_counts = dict(cursor.fetchall())
        return {
            "database_stats": {
                "total_embeddings": db_stats[0] if db_stats else 0,
                "unique_models": db_stats[1] if db_stats else 0,
                # AVG() yields NULL on an empty table; report 0 instead.
                "average_dimension": (db_stats[2] or 0) if db_stats else 0,
                "model_counts": model_counts
            },
            "cache_stats": self.vector_cache.stats(),
            "current_model": asdict(self.model_config),
            "vector_db_path": self.vector_db_path,
            "batch_size": self.batch_size
        }

    def cleanup_old_embeddings(self, days_old: int = 30) -> int:
        """Remove embeddings older than *days_old* days whose context no longer exists.

        Context rows live in the context database, not in the vector
        database, so the set of live IDs is read through ``context_db``
        rather than via a SQL subquery against the vector file.

        Returns:
            Number of embeddings deleted.
        """
        days = int(days_old)
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            live_ids = {row[0] for row in session.query(ContextBlob.id).all()}
        with self._get_vector_db() as conn:
            cursor = conn.cursor()
            # The age is bound as a parameter instead of formatted into SQL.
            cursor.execute('''
                SELECT context_id FROM context_vectors
                WHERE updated_at < datetime('now', ?)
            ''', (f"-{days} days",))
            orphan_ids = [row[0] for row in cursor.fetchall() if row[0] not in live_ids]
            if orphan_ids:
                cursor.executemany(
                    'DELETE FROM context_vectors WHERE context_id = ?',
                    [(context_id,) for context_id in orphan_ids]
                )
                conn.commit()
            deleted_count = len(orphan_ids)
        logger.info(f"Cleaned up {deleted_count} old embeddings")
        return deleted_count

View File

@@ -0,0 +1,136 @@
"""
Trio-compatible wrapper for OptimizedEmbeddingManager.
This module provides async compatibility for the optimized embedding system
to work with FUSE filesystem operations that require Trio async context.
"""
import trio
from typing import List, Dict, Optional, Tuple, Any
from .embeddings_optimized import OptimizedEmbeddingManager, VectorSearchResult
from .context_db import Context
class TrioOptimizedEmbeddingManager:
    """
    Async facade over OptimizedEmbeddingManager for Trio callers.

    Every coroutine forwards its arguments positionally to the same-named
    synchronous method, executed in a worker thread via
    ``trio.to_thread.run_sync``.
    """

    def __init__(self, sync_embedding_manager: OptimizedEmbeddingManager):
        self.sync_manager = sync_embedding_manager

    async def _in_thread(self, func, *args):
        """Run ``func(*args)`` in a Trio worker thread and return its result."""
        return await trio.to_thread.run_sync(func, *args)

    async def generate_embedding(self, text: str, use_cache: bool = True) -> 'np.ndarray':
        """Async counterpart of the sync manager's generate_embedding."""
        manager = self.sync_manager
        return await self._in_thread(manager.generate_embedding, text, use_cache)

    async def generate_embeddings_batch(self, texts: List[str], use_cache: bool = True) -> List['np.ndarray']:
        """Async counterpart of the sync manager's generate_embeddings_batch."""
        manager = self.sync_manager
        return await self._in_thread(manager.generate_embeddings_batch, texts, use_cache)

    async def store_embedding(self, context_id: int, embedding: 'np.ndarray') -> None:
        """Async counterpart of the sync manager's store_embedding."""
        await self._in_thread(self.sync_manager.store_embedding, context_id, embedding)

    async def store_embeddings_batch(self, context_embeddings: List[Tuple[int, 'np.ndarray']]) -> None:
        """Async counterpart of the sync manager's store_embeddings_batch."""
        await self._in_thread(self.sync_manager.store_embeddings_batch, context_embeddings)

    async def get_embedding(self, context_id: int) -> Optional['np.ndarray']:
        """Async counterpart of the sync manager's get_embedding."""
        return await self._in_thread(self.sync_manager.get_embedding, context_id)

    async def semantic_search_optimized(self,
                                        query: str,
                                        path_prefix: str = None,
                                        top_k: int = 5,
                                        include_contexts: bool = True) -> List[VectorSearchResult]:
        """Async counterpart of the sync manager's semantic_search_optimized."""
        call_args = (query, path_prefix, top_k, include_contexts)
        return await self._in_thread(self.sync_manager.semantic_search_optimized, *call_args)

    async def hybrid_search_optimized(self,
                                      query: str,
                                      path_prefix: str = None,
                                      top_k: int = 5,
                                      semantic_weight: float = 0.7,
                                      rerank_top_n: int = 50) -> List[VectorSearchResult]:
        """Async counterpart of the sync manager's hybrid_search_optimized."""
        call_args = (query, path_prefix, top_k, semantic_weight, rerank_top_n)
        return await self._in_thread(self.sync_manager.hybrid_search_optimized, *call_args)

    async def vector_similarity_search(self,
                                       query_embedding: 'np.ndarray',
                                       context_ids: Optional[List[int]] = None,
                                       top_k: int = 10,
                                       min_similarity: float = 0.0) -> List[VectorSearchResult]:
        """Async counterpart of the sync manager's vector_similarity_search."""
        call_args = (query_embedding, context_ids, top_k, min_similarity)
        return await self._in_thread(self.sync_manager.vector_similarity_search, *call_args)

    async def build_embeddings_index(self, batch_size: int = 100) -> Dict[str, Any]:
        """Async counterpart of the sync manager's build_embeddings_index."""
        return await self._in_thread(self.sync_manager.build_embeddings_index, batch_size)

    async def get_statistics(self) -> Dict[str, Any]:
        """Async counterpart of the sync manager's get_statistics."""
        return await self._in_thread(self.sync_manager.get_statistics)

    async def cleanup_old_embeddings(self, days_old: int = 30) -> int:
        """Async counterpart of the sync manager's cleanup_old_embeddings."""
        return await self._in_thread(self.sync_manager.cleanup_old_embeddings, days_old)

    # Read-only passthroughs to attributes of the wrapped manager.
    @property
    def model_config(self):
        return self.sync_manager.model_config

    @property
    def vector_cache(self):
        return self.sync_manager.vector_cache

    @property
    def batch_size(self):
        return self.sync_manager.batch_size

View File

@@ -0,0 +1,179 @@
"""
HCFS Filesystem - FUSE-based virtual filesystem layer.
"""
import os
import stat
import errno
import time
from typing import Dict, Optional
from pathlib import Path
import pyfuse3
from pyfuse3 import FUSEError
from .context_db import ContextDatabase, Context
class HCFSFilesystem(pyfuse3.Operations):
    """
    HCFS FUSE filesystem implementation.

    Maps directory navigation to context scope and provides
    virtual files for context access. Every directory exposes three virtual
    files: ``.context`` (contexts for the directory and its parent),
    ``.context_list`` (contexts stored at the directory) and
    ``.context_push`` (write-only entry point for adding a context).
    """

    def __init__(self, context_db: ContextDatabase, mount_point: str):
        super().__init__()
        self.context_db = context_db
        self.mount_point = mount_point
        # Inode 1 is the root; further inodes are handed out on demand.
        self._inode_counter = 1
        self._inode_to_path: Dict[int, str] = {1: "/"}  # Root inode
        self._path_to_inode: Dict[str, int] = {"/": 1}
        # Virtual files
        self.CONTEXT_FILE = ".context"
        self.CONTEXT_LIST_FILE = ".context_list"
        self.CONTEXT_PUSH_FILE = ".context_push"

    def _get_inode(self, path: str) -> int:
        """Get or create inode for path."""
        inode = self._path_to_inode.get(path)
        if inode is None:
            self._inode_counter += 1
            inode = self._inode_counter
            self._inode_to_path[inode] = path
            self._path_to_inode[path] = inode
        return inode

    def _get_path(self, inode: int) -> str:
        """Get path for inode; unknown inodes resolve to the root path."""
        return self._inode_to_path.get(inode, "/")

    def _is_virtual_file(self, path: str) -> bool:
        """Check if path is a virtual context file."""
        return os.path.basename(path) in (
            self.CONTEXT_FILE, self.CONTEXT_LIST_FILE, self.CONTEXT_PUSH_FILE
        )

    def _child_path(self, parent_inode: int, name: bytes) -> str:
        """Join the parent's path with the UTF-8 decoded entry *name*."""
        child_path = os.path.join(self._get_path(parent_inode), name.decode('utf-8'))
        # Normalize path
        if child_path.startswith("//"):
            child_path = child_path[1:]
        return child_path

    def _render_virtual_file(self, path: str) -> str:
        """Build the text content served for the virtual file at *path*."""
        basename = os.path.basename(path)
        dir_path = os.path.dirname(path)
        if basename == self.CONTEXT_FILE:
            # Aggregated context for current directory and its parent
            contexts = self.context_db.get_context_by_path(dir_path, depth=1)
            return "\n".join(f"[{ctx.path}] {ctx.content}" for ctx in contexts)
        if basename == self.CONTEXT_LIST_FILE:
            # List contexts at current path
            contexts = self.context_db.list_contexts_at_path(dir_path)
            return "\n".join(
                f"ID: {ctx.id}, Path: {ctx.path}, Author: {ctx.author}, Created: {ctx.created_at}"
                for ctx in contexts
            )
        if basename == self.CONTEXT_PUSH_FILE:
            # Instructions for pushing context
            return f"Write to this file to push context to path: {dir_path}\nFormat: <content>"
        return "Unknown virtual file"

    async def getattr(self, inode: int, ctx=None) -> pyfuse3.EntryAttributes:
        """Get file attributes.

        Virtual files are reported as regular files with a placeholder size;
        everything else is reported as a directory.
        """
        path = self._get_path(inode)
        now_ns = int(time.time() * 1e9)
        entry = pyfuse3.EntryAttributes()
        entry.st_ino = inode
        entry.st_uid = os.getuid()
        entry.st_gid = os.getgid()
        entry.st_atime_ns = now_ns
        entry.st_mtime_ns = now_ns
        entry.st_ctime_ns = now_ns
        if self._is_virtual_file(path):
            # Virtual files are readable text files
            entry.st_mode = stat.S_IFREG | 0o644
            entry.st_size = 1024  # Placeholder size
        else:
            # Directories
            entry.st_mode = stat.S_IFDIR | 0o755
            entry.st_size = 0
        return entry

    async def lookup(self, parent_inode: int, name: bytes, ctx=None) -> pyfuse3.EntryAttributes:
        """Look up a directory entry (any name resolves, allocating an inode)."""
        child_inode = self._get_inode(self._child_path(parent_inode, name))
        return await self.getattr(child_inode, ctx)

    async def opendir(self, inode: int, ctx=None) -> int:
        """Open directory; the inode doubles as the directory handle."""
        return inode

    async def readdir(self, inode: int, start_id: int, token) -> None:
        """Read directory contents."""
        path = self._get_path(inode)
        # Always show virtual context files in every directory
        entries = []
        for name in (self.CONTEXT_FILE, self.CONTEXT_LIST_FILE, self.CONTEXT_PUSH_FILE):
            attr = await self.getattr(self._get_inode(os.path.join(path, name)))
            entries.append((name, attr))
        # Add subdirectories (you might want to make this dynamic based on context paths)
        # For now, allowing any directory to be created by navigation
        for i, (name, attr) in enumerate(entries):
            if i < start_id:
                continue
            # readdir_reply returns False once the reply buffer is full.
            if not pyfuse3.readdir_reply(token, name.encode('utf-8'), attr, i + 1):
                break

    async def open(self, inode: int, flags: int, ctx=None):
        """Open a virtual file; directories are rejected with EISDIR.

        Returns a ``pyfuse3.FileInfo`` whose handle is the inode. Direct I/O
        is requested so reads are not bounded by the placeholder ``st_size``
        reported by ``getattr`` (NOTE(review): confirm against the targeted
        pyfuse3/libfuse versions). If the installed pyfuse3 has no
        ``FileInfo`` class, the bare inode is returned as before.
        """
        path = self._get_path(inode)
        if not self._is_virtual_file(path):
            raise FUSEError(errno.EISDIR)
        file_info_cls = getattr(pyfuse3, "FileInfo", None)
        if file_info_cls is None:
            return inode
        return file_info_cls(fh=inode, direct_io=True)

    async def read(self, fh: int, offset: int, size: int) -> bytes:
        """Read from virtual files; returns the requested slice of the content."""
        content_bytes = self._render_virtual_file(self._get_path(fh)).encode('utf-8')
        return content_bytes[offset:offset + size]

    async def write(self, fh: int, offset: int, data: bytes) -> int:
        """Write to virtual files (context_push only).

        Each write becomes one new context at the file's directory. Writes
        that are empty after stripping whitespace are accepted but store
        nothing. *offset* is ignored.

        Raises:
            FUSEError: EACCES for any other file, EINVAL for non-UTF-8 data.
        """
        path = self._get_path(fh)
        if os.path.basename(path) != self.CONTEXT_PUSH_FILE:
            raise FUSEError(errno.EACCES)
        try:
            content = data.decode('utf-8').strip()
        except UnicodeDecodeError:
            raise FUSEError(errno.EINVAL)
        if content:
            # Push new context to current directory
            self.context_db.store_context(Context(
                id=None,
                path=os.path.dirname(path),
                content=content,
                author="fuse_user"
            ))
        return len(data)

    async def mkdir(self, parent_inode: int, name: bytes, mode: int, ctx=None) -> pyfuse3.EntryAttributes:
        """Create directory (virtual - just for navigation)."""
        new_inode = self._get_inode(self._child_path(parent_inode, name))
        return await self.getattr(new_inode, ctx)