Phase 2 build initial
hcfs-python/hcfs/core/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
"""Core HCFS components."""

hcfs-python/hcfs/core/context_db.py (Normal file, 148 lines)
@@ -0,0 +1,148 @@
"""
Context Database - Storage and retrieval of context blobs.
"""

from datetime import datetime
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
from pathlib import Path

from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session


Base = declarative_base()


class ContextBlob(Base):
    """Database model for context blobs."""

    __tablename__ = "context_blobs"

    id = Column(Integer, primary_key=True)
    path = Column(String(512), nullable=False, index=True)
    content = Column(Text, nullable=False)
    summary = Column(Text)
    embedding_model = Column(String(100))
    embedding_vector = Column(Text)  # JSON serialized vector
    author = Column(String(100))
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    version = Column(Integer, default=1)


@dataclass
class Context:
    """Context data structure."""
    id: Optional[int]
    path: str
    content: str
    summary: Optional[str] = None
    author: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    version: int = 1


class ContextDatabase:
    """Main interface for context storage and retrieval."""

    def __init__(self, db_path: str = "hcfs_context.db"):
        self.db_path = db_path
        self.engine = create_engine(f"sqlite:///{db_path}")
        Base.metadata.create_all(self.engine)
        self.SessionLocal = sessionmaker(bind=self.engine)

    def get_session(self) -> Session:
        """Get database session."""
        return self.SessionLocal()

    def store_context(self, context: Context) -> int:
        """Store a context blob and return its ID."""
        with self.get_session() as session:
            blob = ContextBlob(
                path=context.path,
                content=context.content,
                summary=context.summary,
                author=context.author,
                version=context.version
            )
            session.add(blob)
            session.commit()
            session.refresh(blob)
            return blob.id

    def get_context_by_path(self, path: str, depth: int = 1) -> List[Context]:
        """Retrieve contexts for a path and optionally parent paths."""
        contexts = []
        current_path = Path(path)

        with self.get_session() as session:
            # Get contexts for current path and parents up to depth
            for i in range(depth + 1):
                search_path = str(current_path) if current_path != Path(".") else "/"

                blobs = session.query(ContextBlob).filter(
                    ContextBlob.path == search_path
                ).order_by(ContextBlob.created_at.desc()).all()

                for blob in blobs:
                    contexts.append(Context(
                        id=blob.id,
                        path=blob.path,
                        content=blob.content,
                        summary=blob.summary,
                        author=blob.author,
                        created_at=blob.created_at,
                        updated_at=blob.updated_at,
                        version=blob.version
                    ))

                if current_path.parent == current_path:  # Root reached
                    break
                current_path = current_path.parent

        return contexts

    def list_contexts_at_path(self, path: str) -> List[Context]:
        """List all contexts at a specific path."""
        with self.get_session() as session:
            blobs = session.query(ContextBlob).filter(
                ContextBlob.path == path
            ).order_by(ContextBlob.created_at.desc()).all()

            return [Context(
                id=blob.id,
                path=blob.path,
                content=blob.content,
                summary=blob.summary,
                author=blob.author,
                created_at=blob.created_at,
                updated_at=blob.updated_at,
                version=blob.version
            ) for blob in blobs]

    def update_context(self, context_id: int, content: str, summary: str = None) -> bool:
        """Update an existing context."""
        with self.get_session() as session:
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob:
                blob.content = content
                if summary:
                    blob.summary = summary
                blob.version += 1
                blob.updated_at = datetime.utcnow()
                session.commit()
                return True
            return False

    def delete_context(self, context_id: int) -> bool:
        """Delete a context by ID."""
        with self.get_session() as session:
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob:
                session.delete(blob)
                session.commit()
                return True
            return False
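
A minimal usage sketch for ContextDatabase (illustrative only; the database filename is the default from the code above, while the example path, content, and author are invented for demonstration):

    from hcfs.core.context_db import ContextDatabase, Context

    db = ContextDatabase("hcfs_context.db")
    # Store a context blob at a path, then read it (and its parents) back.
    ctx_id = db.store_context(Context(id=None, path="/projects/api", content="REST endpoint notes", author="alice"))
    for ctx in db.get_context_by_path("/projects/api", depth=1):
        print(ctx.id, ctx.path, ctx.version)
    # Replace the blob's content, bumping its version counter in place.
    db.update_context(ctx_id, content="Revised REST endpoint notes")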

hcfs-python/hcfs/core/embeddings.py (Normal file, 188 lines)
@@ -0,0 +1,188 @@
"""
Embedding Manager - Generate and manage context embeddings.
"""

import json
import numpy as np
from typing import List, Dict, Optional, Tuple
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from .context_db import Context, ContextDatabase


class EmbeddingManager:
    """
    Manages embeddings for context blobs and semantic similarity search.
    """

    def __init__(self, context_db: ContextDatabase, model_name: str = "all-MiniLM-L6-v2"):
        self.context_db = context_db
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self._tfidf_fitted = False

    def generate_embedding(self, text: str) -> np.ndarray:
        """Generate embedding for a text."""
        return self.model.encode(text, normalize_embeddings=True)

    def store_context_with_embedding(self, context: Context) -> int:
        """Store context and generate its embedding."""
        # Generate embedding
        embedding = self.generate_embedding(context.content)

        # Store in database
        context_id = self.context_db.store_context(context)

        # Update with embedding (you'd extend ContextBlob model for this)
        self._store_embedding(context_id, embedding)

        return context_id

    def _store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store embedding vector in database."""
        embedding_json = json.dumps(embedding.tolist())

        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()
            if blob:
                blob.embedding_model = self.model_name
                blob.embedding_vector = embedding_json
                session.commit()

    def semantic_search(self, query: str, path_prefix: str = None, top_k: int = 5) -> List[Tuple[Context, float]]:
        """
        Perform semantic search for contexts similar to query.

        Args:
            query: Search query text
            path_prefix: Optional path prefix to limit search scope
            top_k: Number of results to return

        Returns:
            List of (Context, similarity_score) tuples
        """
        query_embedding = self.generate_embedding(query)

        with self.context_db.get_session() as session:
            from .context_db import ContextBlob

            query_filter = session.query(ContextBlob).filter(
                ContextBlob.embedding_vector.isnot(None)
            )

            if path_prefix:
                query_filter = query_filter.filter(ContextBlob.path.startswith(path_prefix))

            blobs = query_filter.all()

            if not blobs:
                return []

            # Calculate similarities
            similarities = []
            for blob in blobs:
                if blob.embedding_vector:
                    stored_embedding = np.array(json.loads(blob.embedding_vector))
                    similarity = cosine_similarity(
                        query_embedding.reshape(1, -1),
                        stored_embedding.reshape(1, -1)
                    )[0][0]

                    context = Context(
                        id=blob.id,
                        path=blob.path,
                        content=blob.content,
                        summary=blob.summary,
                        author=blob.author,
                        created_at=blob.created_at,
                        updated_at=blob.updated_at,
                        version=blob.version
                    )

                    similarities.append((context, float(similarity)))

            # Sort by similarity and return top_k
            similarities.sort(key=lambda x: x[1], reverse=True)
            return similarities[:top_k]

    def hybrid_search(self, query: str, path_prefix: str = None, top_k: int = 5,
                      semantic_weight: float = 0.7) -> List[Tuple[Context, float]]:
        """
        Hybrid search combining semantic similarity and BM25.

        Args:
            query: Search query
            path_prefix: Optional path filter
            top_k: Number of results
            semantic_weight: Weight for semantic vs BM25 (0.0-1.0)
        """
        # Get contexts for BM25
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob

            query_filter = session.query(ContextBlob)
            if path_prefix:
                query_filter = query_filter.filter(ContextBlob.path.startswith(path_prefix))

            blobs = query_filter.all()

            if not blobs:
                return []

            # Prepare documents for BM25
            documents = [blob.content for blob in blobs]

            # Fit TF-IDF if not already fitted or refitting needed
            if not self._tfidf_fitted or len(documents) > 100:  # Refit periodically
                self.tfidf_vectorizer.fit(documents)
                self._tfidf_fitted = True

            # BM25 scoring (using TF-IDF as approximation)
            doc_vectors = self.tfidf_vectorizer.transform(documents)
            query_vector = self.tfidf_vectorizer.transform([query])
            bm25_scores = cosine_similarity(query_vector, doc_vectors)[0]

            # Semantic scoring
            semantic_results = self.semantic_search(query, path_prefix, len(blobs))
            semantic_scores = {ctx.id: score for ctx, score in semantic_results}

            # Combine scores
            combined_results = []
            for i, blob in enumerate(blobs):
                bm25_score = bm25_scores[i]
                semantic_score = semantic_scores.get(blob.id, 0.0)

                combined_score = (semantic_weight * semantic_score +
                                  (1 - semantic_weight) * bm25_score)

                context = Context(
                    id=blob.id,
                    path=blob.path,
                    content=blob.content,
                    summary=blob.summary,
                    author=blob.author,
                    created_at=blob.created_at,
                    updated_at=blob.updated_at,
                    version=blob.version
                )

                combined_results.append((context, combined_score))

            # Sort and return top results
            combined_results.sort(key=lambda x: x[1], reverse=True)
            return combined_results[:top_k]

    def get_similar_contexts(self, context_id: int, top_k: int = 5) -> List[Tuple[Context, float]]:
        """Find contexts similar to a given context."""
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob
            reference_blob = session.query(ContextBlob).filter(ContextBlob.id == context_id).first()

            if not reference_blob or not reference_blob.content:
                return []

            return self.semantic_search(reference_blob.content, top_k=top_k)
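
The basic manager can be exercised end to end as follows (a sketch, not part of the commit; the paths and query strings are invented, and the model name is simply the default from __init__):

    from hcfs.core.context_db import ContextDatabase, Context
    from hcfs.core.embeddings import EmbeddingManager

    db = ContextDatabase("hcfs_context.db")
    em = EmbeddingManager(db, model_name="all-MiniLM-L6-v2")
    # Store content together with its embedding, then query it back two ways.
    em.store_context_with_embedding(Context(id=None, path="/projects/api", content="REST endpoint design notes"))
    for ctx, score in em.semantic_search("endpoint design", path_prefix="/projects", top_k=3):
        print(f"semantic {score:.3f}  {ctx.path}")
    for ctx, score in em.hybrid_search("endpoint design", path_prefix="/projects", top_k=3):
        print(f"hybrid   {score:.3f}  {ctx.path}")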

hcfs-python/hcfs/core/embeddings_optimized.py (Normal file, 616 lines)
@@ -0,0 +1,616 @@
"""
Optimized Embedding Manager - High-performance vector operations and storage.

This module provides enhanced embedding capabilities including:
- Vector database integration with SQLite-Vec
- Optimized batch processing and caching
- Multiple embedding model support
- Efficient similarity search with indexing
- Memory-efficient embedding storage
"""

import json
import time
import numpy as np
import sqlite3
from typing import List, Dict, Optional, Tuple, Union, Any
from dataclasses import dataclass, asdict
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import threading
from contextlib import contextmanager
from functools import lru_cache
import logging

from .context_db import Context, ContextDatabase

logger = logging.getLogger(__name__)


@dataclass
class EmbeddingModel:
    """Configuration for embedding models."""
    name: str
    model_path: str
    dimension: int
    max_tokens: int = 512
    normalize: bool = True


@dataclass
class VectorSearchResult:
    """Result from vector search operations."""
    context_id: int
    score: float
    context: Optional[Context] = None
    metadata: Dict[str, Any] = None


class VectorCache:
    """High-performance LRU cache for embeddings."""

    def __init__(self, max_size: int = 5000, ttl_seconds: int = 3600):
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds
        self.cache: Dict[str, Tuple[np.ndarray, float]] = {}
        self.access_times: Dict[str, float] = {}
        self.lock = threading.RLock()

    def get(self, key: str) -> Optional[np.ndarray]:
        """Get embedding from cache."""
        with self.lock:
            current_time = time.time()

            if key in self.cache:
                embedding, created_time = self.cache[key]

                # Check TTL
                if current_time - created_time < self.ttl_seconds:
                    self.access_times[key] = current_time
                    return embedding.copy()
                else:
                    # Expired
                    del self.cache[key]
                    del self.access_times[key]
            return None

    def put(self, key: str, embedding: np.ndarray) -> None:
        """Store embedding in cache."""
        with self.lock:
            current_time = time.time()

            # Evict if cache is full
            if len(self.cache) >= self.max_size:
                self._evict_lru()

            self.cache[key] = (embedding.copy(), current_time)
            self.access_times[key] = current_time

    def _evict_lru(self) -> None:
        """Evict least recently used item."""
        if not self.access_times:
            return

        lru_key = min(self.access_times.items(), key=lambda x: x[1])[0]
        del self.cache[lru_key]
        del self.access_times[lru_key]

    def clear(self) -> None:
        """Clear cache."""
        with self.lock:
            self.cache.clear()
            self.access_times.clear()

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        with self.lock:
            return {
                "size": len(self.cache),
                "max_size": self.max_size,
                "hit_rate": getattr(self, '_hits', 0) / max(getattr(self, '_requests', 1), 1),
                "ttl_seconds": self.ttl_seconds
            }


class OptimizedEmbeddingManager:
    """
    High-performance embedding manager with vector database capabilities.
    """

    # Predefined embedding models
    MODELS = {
        "mini": EmbeddingModel("all-MiniLM-L6-v2", "all-MiniLM-L6-v2", 384),
        "base": EmbeddingModel("all-MiniLM-L12-v2", "all-MiniLM-L12-v2", 384),
        "large": EmbeddingModel("all-mpnet-base-v2", "all-mpnet-base-v2", 768),
        "multilingual": EmbeddingModel("paraphrase-multilingual-MiniLM-L12-v2",
                                       "paraphrase-multilingual-MiniLM-L12-v2", 384)
    }

    def __init__(self,
                 context_db: ContextDatabase,
                 model_name: str = "mini",
                 vector_db_path: Optional[str] = None,
                 cache_size: int = 5000,
                 batch_size: int = 32):
        self.context_db = context_db
        self.model_config = self.MODELS.get(model_name, self.MODELS["mini"])
        self.model = None  # Lazy loading
        self.vector_cache = VectorCache(cache_size)
        self.batch_size = batch_size

        # Vector database setup
        self.vector_db_path = vector_db_path or "hcfs_vectors.db"
        self._init_vector_db()

        # TF-IDF for hybrid search
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2
        )
        self._tfidf_fitted = False
        self._model_lock = threading.RLock()

        logger.info(f"Initialized OptimizedEmbeddingManager with model: {self.model_config.name}")

    def _get_model(self) -> SentenceTransformer:
        """Lazy load the embedding model."""
        if self.model is None:
            with self._model_lock:
                if self.model is None:
                    logger.info(f"Loading embedding model: {self.model_config.model_path}")
                    self.model = SentenceTransformer(self.model_config.model_path)
        return self.model

    def _init_vector_db(self) -> None:
        """Initialize SQLite vector database for fast similarity search."""
        conn = sqlite3.connect(self.vector_db_path)
        cursor = conn.cursor()

        # Create vectors table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS context_vectors (
                context_id INTEGER PRIMARY KEY,
                model_name TEXT NOT NULL,
                embedding_dimension INTEGER NOT NULL,
                vector_data BLOB NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Create index for fast lookups
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_context_vectors_model
            ON context_vectors(model_name, context_id)
        ''')

        conn.commit()
        conn.close()

        logger.info(f"Vector database initialized: {self.vector_db_path}")

    @contextmanager
    def _get_vector_db(self):
        """Get vector database connection with proper cleanup."""
        conn = sqlite3.connect(self.vector_db_path)
        try:
            yield conn
        finally:
            conn.close()

    def generate_embedding(self, text: str, use_cache: bool = True) -> np.ndarray:
        """Generate embedding for text with caching."""
        cache_key = f"{self.model_config.name}:{hash(text)}"

        if use_cache:
            cached = self.vector_cache.get(cache_key)
            if cached is not None:
                return cached

        model = self._get_model()
        embedding = model.encode(
            text,
            normalize_embeddings=self.model_config.normalize,
            show_progress_bar=False
        )

        if use_cache:
            self.vector_cache.put(cache_key, embedding)

        return embedding

    def generate_embeddings_batch(self, texts: List[str], use_cache: bool = True) -> List[np.ndarray]:
        """Generate embeddings for multiple texts efficiently."""
        if not texts:
            return []

        # Check cache first
        cache_results = []
        uncached_indices = []
        uncached_texts = []

        if use_cache:
            for i, text in enumerate(texts):
                cache_key = f"{self.model_config.name}:{hash(text)}"
                cached = self.vector_cache.get(cache_key)
                if cached is not None:
                    cache_results.append((i, cached))
                else:
                    uncached_indices.append(i)
                    uncached_texts.append(text)
        else:
            uncached_indices = list(range(len(texts)))
            uncached_texts = texts

        # Generate embeddings for uncached texts
        embeddings = [None] * len(texts)

        # Place cached results
        for i, embedding in cache_results:
            embeddings[i] = embedding

        if uncached_texts:
            model = self._get_model()

            # Process in batches
            for batch_start in range(0, len(uncached_texts), self.batch_size):
                batch_end = min(batch_start + self.batch_size, len(uncached_texts))
                batch_texts = uncached_texts[batch_start:batch_end]
                batch_indices = uncached_indices[batch_start:batch_end]

                batch_embeddings = model.encode(
                    batch_texts,
                    normalize_embeddings=self.model_config.normalize,
                    show_progress_bar=False,
                    batch_size=self.batch_size
                )

                # Store results and cache
                for i, (orig_idx, embedding) in enumerate(zip(batch_indices, batch_embeddings)):
                    embeddings[orig_idx] = embedding

                    if use_cache:
                        cache_key = f"{self.model_config.name}:{hash(batch_texts[i])}"
                        self.vector_cache.put(cache_key, embedding)

        return embeddings

    def store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store embedding in vector database."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            # Convert to bytes for storage
            vector_bytes = embedding.astype(np.float32).tobytes()

            cursor.execute('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', (context_id, self.model_config.name, embedding.shape[0], vector_bytes))

            conn.commit()

    def store_embeddings_batch(self, context_embeddings: List[Tuple[int, np.ndarray]]) -> None:
        """Store multiple embeddings efficiently."""
        if not context_embeddings:
            return

        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            data = [
                (context_id, self.model_config.name, embedding.shape[0],
                 embedding.astype(np.float32).tobytes())
                for context_id, embedding in context_embeddings
            ]

            cursor.executemany('''
                INSERT OR REPLACE INTO context_vectors
                (context_id, model_name, embedding_dimension, vector_data, updated_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', data)

            conn.commit()

        logger.info(f"Stored {len(context_embeddings)} embeddings in batch")

    def get_embedding(self, context_id: int) -> Optional[np.ndarray]:
        """Retrieve embedding for a context."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                SELECT vector_data, embedding_dimension FROM context_vectors
                WHERE context_id = ? AND model_name = ?
            ''', (context_id, self.model_config.name))

            result = cursor.fetchone()
            if result:
                vector_bytes, dimension = result
                return np.frombuffer(vector_bytes, dtype=np.float32).reshape(dimension)

            return None

    def vector_similarity_search(self,
                                 query_embedding: np.ndarray,
                                 context_ids: Optional[List[int]] = None,
                                 top_k: int = 10,
                                 min_similarity: float = 0.0) -> List[VectorSearchResult]:
        """Efficient vector similarity search."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            # Build query
            if context_ids:
                placeholders = ','.join(['?'] * len(context_ids))
                query = f'''
                    SELECT context_id, vector_data, embedding_dimension
                    FROM context_vectors
                    WHERE model_name = ? AND context_id IN ({placeholders})
                '''
                params = [self.model_config.name] + context_ids
            else:
                query = '''
                    SELECT context_id, vector_data, embedding_dimension
                    FROM context_vectors
                    WHERE model_name = ?
                '''
                params = [self.model_config.name]

            cursor.execute(query, params)
            results = cursor.fetchall()

            if not results:
                return []

            # Calculate similarities
            similarities = []
            query_embedding = query_embedding.reshape(1, -1)

            for context_id, vector_bytes, dimension in results:
                stored_embedding = np.frombuffer(vector_bytes, dtype=np.float32).reshape(1, dimension)

                similarity = cosine_similarity(query_embedding, stored_embedding)[0][0]

                if similarity >= min_similarity:
                    similarities.append(VectorSearchResult(
                        context_id=context_id,
                        score=float(similarity)
                    ))

            # Sort by similarity and return top_k
            similarities.sort(key=lambda x: x.score, reverse=True)
            return similarities[:top_k]

    def semantic_search_optimized(self,
                                  query: str,
                                  path_prefix: str = None,
                                  top_k: int = 5,
                                  include_contexts: bool = True) -> List[VectorSearchResult]:
        """High-performance semantic search."""
        # Generate query embedding
        query_embedding = self.generate_embedding(query)

        # Get relevant context IDs based on path filter
        context_ids = None
        if path_prefix:
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob
                blobs = session.query(ContextBlob.id).filter(
                    ContextBlob.path.startswith(path_prefix)
                ).all()
                context_ids = [blob.id for blob in blobs]

            if not context_ids:
                return []

        # Perform vector search
        results = self.vector_similarity_search(
            query_embedding,
            context_ids=context_ids,
            top_k=top_k
        )

        # Populate with context data if requested
        if include_contexts and results:
            context_map = {}
            with self.context_db.get_session() as session:
                from .context_db import ContextBlob

                result_ids = [r.context_id for r in results]
                blobs = session.query(ContextBlob).filter(
                    ContextBlob.id.in_(result_ids)
                ).all()

                for blob in blobs:
                    context_map[blob.id] = Context(
                        id=blob.id,
                        path=blob.path,
                        content=blob.content,
                        summary=blob.summary,
                        author=blob.author,
                        created_at=blob.created_at,
                        updated_at=blob.updated_at,
                        version=blob.version
                    )

            # Add contexts to results
            for result in results:
                result.context = context_map.get(result.context_id)

        return results

    def hybrid_search_optimized(self,
                                query: str,
                                path_prefix: str = None,
                                top_k: int = 5,
                                semantic_weight: float = 0.7,
                                rerank_top_n: int = 50) -> List[VectorSearchResult]:
        """Optimized hybrid search with two-stage ranking."""

        # Stage 1: Fast semantic search to get candidate set
        semantic_results = self.semantic_search_optimized(
            query, path_prefix, rerank_top_n, include_contexts=True
        )

        if not semantic_results or len(semantic_results) < 2:
            return semantic_results[:top_k]

        # Stage 2: Re-rank with BM25 scores
        contexts = [r.context for r in semantic_results if r.context]
        if not contexts:
            return semantic_results[:top_k]

        documents = [ctx.content for ctx in contexts]

        # Compute BM25 scores
        try:
            if not self._tfidf_fitted:
                self.tfidf_vectorizer.fit(documents)
                self._tfidf_fitted = True

            doc_vectors = self.tfidf_vectorizer.transform(documents)
            query_vector = self.tfidf_vectorizer.transform([query])
            bm25_scores = cosine_similarity(query_vector, doc_vectors)[0]

        except Exception as e:
            logger.warning(f"BM25 scoring failed: {e}, using semantic only")
            return semantic_results[:top_k]

        # Combine scores
        for i, result in enumerate(semantic_results[:len(bm25_scores)]):
            semantic_score = result.score
            bm25_score = bm25_scores[i]

            combined_score = (semantic_weight * semantic_score +
                              (1 - semantic_weight) * bm25_score)

            result.score = float(combined_score)
            result.metadata = {
                "semantic_score": float(semantic_score),
                "bm25_score": float(bm25_score),
                "semantic_weight": semantic_weight
            }

        # Re-sort by combined score
        semantic_results.sort(key=lambda x: x.score, reverse=True)
        return semantic_results[:top_k]

    def build_embeddings_index(self, batch_size: int = 100) -> Dict[str, Any]:
        """Build embeddings for all contexts without embeddings."""
        start_time = time.time()

        # Get contexts without embeddings
        with self.context_db.get_session() as session:
            from .context_db import ContextBlob

            # Find contexts missing embeddings
            with self._get_vector_db() as vector_conn:
                vector_cursor = vector_conn.cursor()
                vector_cursor.execute('''
                    SELECT context_id FROM context_vectors
                    WHERE model_name = ?
                ''', (self.model_config.name,))

                existing_ids = {row[0] for row in vector_cursor.fetchall()}

            # Get contexts that need embeddings
            all_blobs = session.query(ContextBlob).all()
            missing_blobs = [blob for blob in all_blobs if blob.id not in existing_ids]

            if not missing_blobs:
                return {
                    "total_processed": 0,
                    "processing_time": 0,
                    "embeddings_per_second": 0,
                    "message": "All contexts already have embeddings"
                }

            logger.info(f"Building embeddings for {len(missing_blobs)} contexts")

            # Process in batches
            total_processed = 0
            for batch_start in range(0, len(missing_blobs), batch_size):
                batch_end = min(batch_start + batch_size, len(missing_blobs))
                batch_blobs = missing_blobs[batch_start:batch_end]

                # Generate embeddings for batch
                texts = [blob.content for blob in batch_blobs]
                embeddings = self.generate_embeddings_batch(texts, use_cache=False)

                # Store embeddings
                context_embeddings = [
                    (blob.id, embedding)
                    for blob, embedding in zip(batch_blobs, embeddings)
                ]
                self.store_embeddings_batch(context_embeddings)

                total_processed += len(batch_blobs)
                logger.info(f"Processed {total_processed}/{len(missing_blobs)} contexts")

        processing_time = time.time() - start_time
        embeddings_per_second = total_processed / processing_time if processing_time > 0 else 0

        return {
            "total_processed": total_processed,
            "processing_time": processing_time,
            "embeddings_per_second": embeddings_per_second,
            "model_used": self.model_config.name,
            "embedding_dimension": self.model_config.dimension
        }

    def get_statistics(self) -> Dict[str, Any]:
        """Get embedding manager statistics."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                SELECT
                    COUNT(*) as total_embeddings,
                    COUNT(DISTINCT model_name) as unique_models,
                    AVG(embedding_dimension) as avg_dimension
                FROM context_vectors
            ''')

            db_stats = cursor.fetchone()

            cursor.execute('''
                SELECT model_name, COUNT(*) as count
                FROM context_vectors
                GROUP BY model_name
            ''')

            model_counts = dict(cursor.fetchall())

            return {
                "database_stats": {
                    "total_embeddings": db_stats[0] if db_stats else 0,
                    "unique_models": db_stats[1] if db_stats else 0,
                    "average_dimension": db_stats[2] if db_stats else 0,
                    "model_counts": model_counts
                },
                "cache_stats": self.vector_cache.stats(),
                "current_model": asdict(self.model_config),
                "vector_db_path": self.vector_db_path,
                "batch_size": self.batch_size
            }

    def cleanup_old_embeddings(self, days_old: int = 30) -> int:
        """Remove old unused embeddings."""
        with self._get_vector_db() as conn:
            cursor = conn.cursor()

            cursor.execute('''
                DELETE FROM context_vectors
                WHERE updated_at < datetime('now', '-{} days')
                AND context_id NOT IN (
                    SELECT id FROM context_blobs
                )
            '''.format(days_old))

            deleted_count = cursor.rowcount
            conn.commit()

        logger.info(f"Cleaned up {deleted_count} old embeddings")
        return deleted_count
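
Typical flow for the optimized manager, assuming contexts already exist in the context database (a sketch; the file names match the defaults above, while the query string is invented for illustration):

    from hcfs.core.context_db import ContextDatabase
    from hcfs.core.embeddings_optimized import OptimizedEmbeddingManager

    db = ContextDatabase("hcfs_context.db")
    manager = OptimizedEmbeddingManager(db, model_name="mini", vector_db_path="hcfs_vectors.db")
    # Backfill vectors for any contexts that do not have one yet.
    stats = manager.build_embeddings_index(batch_size=100)
    print(stats["total_processed"], "embedded,", round(stats["embeddings_per_second"], 1), "per second")
    # Two-stage hybrid search over the vector store.
    for result in manager.hybrid_search_optimized("vector search tuning", top_k=5):
        label = result.context.path if result.context else result.context_id
        print(f"{result.score:.3f}  {label}")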

hcfs-python/hcfs/core/embeddings_trio.py (Normal file, 136 lines)
@@ -0,0 +1,136 @@
"""
Trio-compatible wrapper for OptimizedEmbeddingManager.

This module provides async compatibility for the optimized embedding system
to work with FUSE filesystem operations that require Trio async context.
"""

import trio
import numpy as np
from typing import List, Dict, Optional, Tuple, Any

from .embeddings_optimized import OptimizedEmbeddingManager, VectorSearchResult
from .context_db import Context


class TrioOptimizedEmbeddingManager:
    """
    Trio-compatible async wrapper for OptimizedEmbeddingManager.
    """

    def __init__(self, sync_embedding_manager: OptimizedEmbeddingManager):
        self.sync_manager = sync_embedding_manager

    async def generate_embedding(self, text: str, use_cache: bool = True) -> np.ndarray:
        """Generate embedding asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.generate_embedding,
            text,
            use_cache
        )

    async def generate_embeddings_batch(self, texts: List[str], use_cache: bool = True) -> List[np.ndarray]:
        """Generate embeddings for multiple texts asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.generate_embeddings_batch,
            texts,
            use_cache
        )

    async def store_embedding(self, context_id: int, embedding: np.ndarray) -> None:
        """Store embedding asynchronously."""
        await trio.to_thread.run_sync(
            self.sync_manager.store_embedding,
            context_id,
            embedding
        )

    async def store_embeddings_batch(self, context_embeddings: List[Tuple[int, np.ndarray]]) -> None:
        """Store multiple embeddings asynchronously."""
        await trio.to_thread.run_sync(
            self.sync_manager.store_embeddings_batch,
            context_embeddings
        )

    async def get_embedding(self, context_id: int) -> Optional[np.ndarray]:
        """Retrieve embedding asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.get_embedding,
            context_id
        )

    async def semantic_search_optimized(self,
                                        query: str,
                                        path_prefix: str = None,
                                        top_k: int = 5,
                                        include_contexts: bool = True) -> List[VectorSearchResult]:
        """Perform semantic search asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.semantic_search_optimized,
            query,
            path_prefix,
            top_k,
            include_contexts
        )

    async def hybrid_search_optimized(self,
                                      query: str,
                                      path_prefix: str = None,
                                      top_k: int = 5,
                                      semantic_weight: float = 0.7,
                                      rerank_top_n: int = 50) -> List[VectorSearchResult]:
        """Perform hybrid search asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.hybrid_search_optimized,
            query,
            path_prefix,
            top_k,
            semantic_weight,
            rerank_top_n
        )

    async def vector_similarity_search(self,
                                       query_embedding: np.ndarray,
                                       context_ids: Optional[List[int]] = None,
                                       top_k: int = 10,
                                       min_similarity: float = 0.0) -> List[VectorSearchResult]:
        """Perform vector similarity search asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.vector_similarity_search,
            query_embedding,
            context_ids,
            top_k,
            min_similarity
        )

    async def build_embeddings_index(self, batch_size: int = 100) -> Dict[str, Any]:
        """Build embeddings index asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.build_embeddings_index,
            batch_size
        )

    async def get_statistics(self) -> Dict[str, Any]:
        """Get statistics asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.get_statistics
        )

    async def cleanup_old_embeddings(self, days_old: int = 30) -> int:
        """Clean up old embeddings asynchronously."""
        return await trio.to_thread.run_sync(
            self.sync_manager.cleanup_old_embeddings,
            days_old
        )

    # Synchronous access to underlying manager properties
    @property
    def model_config(self):
        return self.sync_manager.model_config

    @property
    def vector_cache(self):
        return self.sync_manager.vector_cache

    @property
    def batch_size(self):
        return self.sync_manager.batch_size
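
Because every method defers to trio.to_thread.run_sync, the wrapper is meant to be called from inside a Trio event loop; a minimal sketch (the database path and query string are illustrative, not from the commit):

    import trio
    from hcfs.core.context_db import ContextDatabase
    from hcfs.core.embeddings_optimized import OptimizedEmbeddingManager
    from hcfs.core.embeddings_trio import TrioOptimizedEmbeddingManager

    async def main():
        db = ContextDatabase("hcfs_context.db")
        async_manager = TrioOptimizedEmbeddingManager(OptimizedEmbeddingManager(db))
        # Blocking model inference runs on a worker thread, keeping the Trio loop responsive.
        results = await async_manager.semantic_search_optimized("context retrieval", top_k=3)
        for r in results:
            print(r.context_id, r.score)

    trio.run(main)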

hcfs-python/hcfs/core/filesystem.py (Normal file, 179 lines)
@@ -0,0 +1,179 @@
"""
HCFS Filesystem - FUSE-based virtual filesystem layer.
"""

import os
import stat
import errno
import time
from typing import Dict, Optional
from pathlib import Path

import pyfuse3
from pyfuse3 import FUSEError

from .context_db import ContextDatabase, Context


class HCFSFilesystem(pyfuse3.Operations):
    """
    HCFS FUSE filesystem implementation.

    Maps directory navigation to context scope and provides
    virtual files for context access.
    """

    def __init__(self, context_db: ContextDatabase, mount_point: str):
        super().__init__()
        self.context_db = context_db
        self.mount_point = mount_point
        self._inode_counter = 1
        self._inode_to_path: Dict[int, str] = {1: "/"}  # Root inode
        self._path_to_inode: Dict[str, int] = {"/": 1}

        # Virtual files
        self.CONTEXT_FILE = ".context"
        self.CONTEXT_LIST_FILE = ".context_list"
        self.CONTEXT_PUSH_FILE = ".context_push"

    def _get_inode(self, path: str) -> int:
        """Get or create inode for path."""
        if path in self._path_to_inode:
            return self._path_to_inode[path]

        self._inode_counter += 1
        inode = self._inode_counter
        self._inode_to_path[inode] = path
        self._path_to_inode[path] = inode
        return inode

    def _get_path(self, inode: int) -> str:
        """Get path for inode."""
        return self._inode_to_path.get(inode, "/")

    def _is_virtual_file(self, path: str) -> bool:
        """Check if path is a virtual context file."""
        basename = os.path.basename(path)
        return basename in [self.CONTEXT_FILE, self.CONTEXT_LIST_FILE, self.CONTEXT_PUSH_FILE]

    async def getattr(self, inode: int, ctx=None) -> pyfuse3.EntryAttributes:
        """Get file attributes."""
        path = self._get_path(inode)
        entry = pyfuse3.EntryAttributes()
        entry.st_ino = inode
        entry.st_uid = os.getuid()
        entry.st_gid = os.getgid()
        entry.st_atime_ns = int(time.time() * 1e9)
        entry.st_mtime_ns = int(time.time() * 1e9)
        entry.st_ctime_ns = int(time.time() * 1e9)

        if self._is_virtual_file(path):
            # Virtual files are readable text files
            entry.st_mode = stat.S_IFREG | 0o644
            entry.st_size = 1024  # Placeholder size
        else:
            # Directories
            entry.st_mode = stat.S_IFDIR | 0o755
            entry.st_size = 0

        return entry

    async def lookup(self, parent_inode: int, name: bytes, ctx=None) -> pyfuse3.EntryAttributes:
        """Look up a directory entry."""
        parent_path = self._get_path(parent_inode)
        child_path = os.path.join(parent_path, name.decode('utf-8'))

        # Normalize path
        if child_path.startswith("//"):
            child_path = child_path[1:]

        child_inode = self._get_inode(child_path)
        return await self.getattr(child_inode, ctx)

    async def opendir(self, inode: int, ctx=None) -> int:
        """Open directory."""
        return inode

    async def readdir(self, inode: int, start_id: int, token) -> None:
        """Read directory contents."""
        path = self._get_path(inode)

        # Always show virtual context files in every directory
        entries = [
            (self.CONTEXT_FILE, await self.getattr(self._get_inode(os.path.join(path, self.CONTEXT_FILE)))),
            (self.CONTEXT_LIST_FILE, await self.getattr(self._get_inode(os.path.join(path, self.CONTEXT_LIST_FILE)))),
            (self.CONTEXT_PUSH_FILE, await self.getattr(self._get_inode(os.path.join(path, self.CONTEXT_PUSH_FILE)))),
        ]

        # Add subdirectories (you might want to make this dynamic based on context paths)
        # For now, allowing any directory to be created by navigation

        for i, (name, attr) in enumerate(entries):
            if i >= start_id:
                if not pyfuse3.readdir_reply(token, name.encode('utf-8'), attr, i + 1):
                    break

    async def open(self, inode: int, flags: int, ctx=None) -> int:
        """Open file."""
        path = self._get_path(inode)
        if not self._is_virtual_file(path):
            raise FUSEError(errno.EISDIR)
        return inode

    async def read(self, fh: int, offset: int, size: int) -> bytes:
        """Read from virtual files."""
        path = self._get_path(fh)
        basename = os.path.basename(path)
        dir_path = os.path.dirname(path)

        if basename == self.CONTEXT_FILE:
            # Return aggregated context for current directory
            contexts = self.context_db.get_context_by_path(dir_path, depth=1)
            content = "\n".join(f"[{ctx.path}] {ctx.content}" for ctx in contexts)

        elif basename == self.CONTEXT_LIST_FILE:
            # List contexts at current path
            contexts = self.context_db.list_contexts_at_path(dir_path)
            content = "\n".join(f"ID: {ctx.id}, Path: {ctx.path}, Author: {ctx.author}, Created: {ctx.created_at}"
                                for ctx in contexts)

        elif basename == self.CONTEXT_PUSH_FILE:
            # Instructions for pushing context
            content = f"Write to this file to push context to path: {dir_path}\nFormat: <content>"

        else:
            content = "Unknown virtual file"

        content_bytes = content.encode('utf-8')
        return content_bytes[offset:offset + size]

    async def write(self, fh: int, offset: int, data: bytes) -> int:
        """Write to virtual files (context_push only)."""
        path = self._get_path(fh)
        basename = os.path.basename(path)
        dir_path = os.path.dirname(path)

        if basename == self.CONTEXT_PUSH_FILE:
            # Push new context to current directory
            content = data.decode('utf-8').strip()
            context = Context(
                id=None,
                path=dir_path,
                content=content,
                author="fuse_user"
            )
            self.context_db.store_context(context)
            return len(data)
        else:
            raise FUSEError(errno.EACCES)

    async def mkdir(self, parent_inode: int, name: bytes, mode: int, ctx=None) -> pyfuse3.EntryAttributes:
        """Create directory (virtual - just for navigation)."""
        parent_path = self._get_path(parent_inode)
        new_path = os.path.join(parent_path, name.decode('utf-8'))

        if new_path.startswith("//"):
            new_path = new_path[1:]

        new_inode = self._get_inode(new_path)
        return await self.getattr(new_inode, ctx)
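
Mounting the filesystem follows the standard pyfuse3/Trio pattern; a minimal sketch (the mount point /mnt/hcfs is an assumption for illustration, not something this commit sets up):

    import trio
    import pyfuse3
    from hcfs.core.context_db import ContextDatabase
    from hcfs.core.filesystem import HCFSFilesystem

    async def run_fs(mountpoint: str):
        fs = HCFSFilesystem(ContextDatabase("hcfs_context.db"), mountpoint)
        options = set(pyfuse3.default_options)
        options.add("fsname=hcfs")
        pyfuse3.init(fs, mountpoint, options)
        try:
            await pyfuse3.main()          # serve requests until interrupted
        finally:
            pyfuse3.close(unmount=True)   # detach the mount on shutdown

    trio.run(run_fs, "/mnt/hcfs")

Once mounted, reading .context in any directory returns the aggregated context for that path, and writing to .context_push stores a new context blob there.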