"""
|
|
HCFS SDK Utility Functions
|
|
|
|
Common utilities for text processing, caching, and data manipulation.
|
|
"""

import hashlib
import json
import math
import os
import re
import time
from typing import List, Dict, Any, Optional, Tuple, Iterator, Callable, Union
from datetime import datetime, timedelta
from collections import defaultdict, OrderedDict
from threading import Lock
import asyncio
from functools import lru_cache, wraps

from .models import Context, SearchResult, CacheStrategy
from .exceptions import HCFSError, HCFSCacheError


def context_similarity(context1: Context, context2: Context, method: str = "jaccard") -> float:
    """
    Calculate similarity between two contexts.

    Args:
        context1: First context
        context2: Second context
        method: Similarity method ("jaccard", "cosine", "levenshtein")

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if method == "jaccard":
        return _jaccard_similarity(context1.content, context2.content)
    elif method == "cosine":
        return _cosine_similarity(context1.content, context2.content)
    elif method == "levenshtein":
        return _levenshtein_similarity(context1.content, context2.content)
    else:
        raise ValueError(f"Unknown similarity method: {method}")


def _jaccard_similarity(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between two texts."""
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    intersection = words1.intersection(words2)
    union = words1.union(words2)

    return len(intersection) / len(union) if union else 0.0


def _cosine_similarity(text1: str, text2: str) -> float:
    """Calculate cosine similarity between two texts."""
    words1 = text1.lower().split()
    words2 = text2.lower().split()

    # Create word frequency vectors
    all_words = set(words1 + words2)
    vector1 = [words1.count(word) for word in all_words]
    vector2 = [words2.count(word) for word in all_words]

    # Calculate dot product and magnitudes
    dot_product = sum(a * b for a, b in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(a * a for a in vector1))
    magnitude2 = math.sqrt(sum(a * a for a in vector2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)


def _levenshtein_similarity(text1: str, text2: str) -> float:
    """Calculate normalized Levenshtein similarity."""
    def levenshtein_distance(s1: str, s2: str) -> int:
        if len(s1) < len(s2):
            return levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    max_len = max(len(text1), len(text2))
    if max_len == 0:
        return 1.0

    distance = levenshtein_distance(text1.lower(), text2.lower())
    return 1.0 - (distance / max_len)


def text_chunker(text: str, chunk_size: int = 512, overlap: int = 50, preserve_sentences: bool = True) -> List[str]:
    """
    Split text into overlapping chunks.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size in characters
        overlap: Overlap between chunks
        preserve_sentences: Try to preserve sentence boundaries

    Returns:
        List of text chunks
    """
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end >= len(text):
            chunks.append(text[start:])
            break

        # Try to find a good break point
        chunk = text[start:end]

        if preserve_sentences and '.' in chunk:
            # Find the last sentence boundary
            last_period = chunk.rfind('.')
            if last_period > chunk_size // 2:  # Don't make chunks too small
                end = start + last_period + 1
                chunk = text[start:end]

        chunks.append(chunk.strip())
        # Ensure the window always advances, even when overlap >= chunk length.
        start = max(end - overlap, start + 1)

    return [chunk for chunk in chunks if chunk.strip()]
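

# Example (illustrative sketch): chunking a long document before embedding.
# With preserve_sentences=True the function prefers to break at the last "."
# in each window, as long as the resulting chunk stays above chunk_size // 2.
#
#   long_text = " ".join(f"Sentence number {i}." for i in range(200))
#   chunks = text_chunker(long_text, chunk_size=256, overlap=32)
#   # Each chunk is at most 256 characters; consecutive chunks overlap by ~32.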


def extract_keywords(text: str, max_keywords: int = 10, min_length: int = 3) -> List[str]:
    """
    Extract keywords from text using simple frequency analysis.

    Args:
        text: Input text
        max_keywords: Maximum number of keywords
        min_length: Minimum keyword length

    Returns:
        List of keywords ordered by frequency
    """
    # Simple stopwords
    stopwords = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
        'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
    }

    # Extract words and count frequencies
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    word_freq = defaultdict(int)

    for word in words:
        if len(word) >= min_length and word not in stopwords:
            word_freq[word] += 1

    # Sort by frequency and return top keywords
    return sorted(word_freq.keys(), key=lambda x: word_freq[x], reverse=True)[:max_keywords]
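

# Example (illustrative sketch): pulling rough topic keywords from raw text.
# This is plain frequency counting against a small stopword list, not TF-IDF,
# so results are best treated as a cheap first-pass signal.
#
#   text = "Context storage and context retrieval are core HCFS features."
#   extract_keywords(text, max_keywords=3)
#   # -> ["context", "storage", "retrieval"]  (tie order may vary)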


def format_content_preview(content: str, max_length: int = 200) -> str:
    """
    Format content for preview display.

    Args:
        content: Full content
        max_length: Maximum preview length

    Returns:
        Formatted preview string
    """
    if len(content) <= max_length:
        return content

    # Try to cut at word boundary
    preview = content[:max_length]
    last_space = preview.rfind(' ')

    if last_space > max_length * 0.8:  # Don't cut too much
        preview = preview[:last_space]

    return preview + "..."
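

# Example (illustrative sketch): building a short listing preview.
#
#   format_content_preview("A" * 50, max_length=200)   # short content is returned unchanged
#   format_content_preview("word " * 100, max_length=40)
#   # -> roughly the first 40 characters, cut at a word boundary, plus "..."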


def validate_path(path: str) -> bool:
    """
    Validate context path format.

    Args:
        path: Path to validate

    Returns:
        True if valid, False otherwise
    """
    if not path or not isinstance(path, str):
        return False

    if not path.startswith('/'):
        return False

    # Check for invalid characters
    invalid_chars = set('<>"|?*')
    if any(char in path for char in invalid_chars):
        return False

    # Check path components
    components = path.split('/')
    for component in components[1:]:  # Skip empty first component
        if not component or component in ['.', '..']:
            return False

    return True
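

# Example (illustrative sketch): typical accept/reject cases.
#
#   validate_path("/projects/hcfs/docs")   # True
#   validate_path("relative/path")         # False (must start with "/")
#   validate_path("/a/../b")               # False (".." components rejected)
#   validate_path("/a/b/")                 # False (empty trailing component)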


def normalize_path(path: str) -> str:
    """
    Normalize context path.

    Args:
        path: Path to normalize

    Returns:
        Normalized path
    """
    if not path.startswith('/'):
        path = '/' + path

    # Remove duplicate slashes and normalize
    components = [c for c in path.split('/') if c]
    return '/' + '/'.join(components) if components else '/'
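

# Example (illustrative sketch): normalize_path collapses duplicate slashes and
# guarantees a leading "/", but unlike validate_path it does not reject "." or
# ".." components.
#
#   normalize_path("projects//hcfs/")   # -> "/projects/hcfs"
#   normalize_path("///")               # -> "/"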


def hash_content(content: str, algorithm: str = "sha256") -> str:
    """
    Generate hash of content for deduplication.

    Args:
        content: Content to hash
        algorithm: Hash algorithm

    Returns:
        Hex digest of content hash
    """
    if algorithm == "md5":
        hasher = hashlib.md5()
    elif algorithm == "sha1":
        hasher = hashlib.sha1()
    elif algorithm == "sha256":
        hasher = hashlib.sha256()
    else:
        raise ValueError(f"Unsupported hash algorithm: {algorithm}")

    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
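

# Example (illustrative sketch): using the digest as a deduplication key.
#
#   digest = hash_content("same text")
#   digest == hash_content("same text")        # True: deterministic
#   digest == hash_content("different text")   # False (barring hash collisions)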


def merge_contexts(contexts: List[Context], strategy: str = "latest") -> Context:
    """
    Merge multiple contexts into one.

    Args:
        contexts: List of contexts to merge
        strategy: Merge strategy ("latest", "longest", "combined")

    Returns:
        Merged context
    """
    if not contexts:
        raise ValueError("No contexts to merge")

    if len(contexts) == 1:
        return contexts[0]

    if strategy == "latest":
        return max(contexts, key=lambda c: c.updated_at or c.created_at or datetime.min)
    elif strategy == "longest":
        return max(contexts, key=lambda c: len(c.content))
    elif strategy == "combined":
        # Combine content and metadata
        merged = contexts[0].copy()
        merged.content = "\n\n".join(c.content for c in contexts)
        merged.tags = list(set(tag for c in contexts for tag in c.tags))

        # Merge metadata
        merged_metadata = {}
        for context in contexts:
            merged_metadata.update(context.metadata)
        merged.metadata = merged_metadata

        return merged
    else:
        raise ValueError(f"Unknown merge strategy: {strategy}")


class MemoryCache:
    """Thread-safe in-memory cache with configurable eviction strategies."""

    def __init__(self, max_size: int = 1000, strategy: CacheStrategy = CacheStrategy.LRU, ttl_seconds: Optional[int] = None):
        self.max_size = max_size
        self.strategy = strategy
        self.ttl_seconds = ttl_seconds
        self._cache = OrderedDict()
        self._access_counts = defaultdict(int)
        self._timestamps = {}
        self._lock = Lock()

    def get(self, key: str) -> Optional[Any]:
        """Get value from cache."""
        with self._lock:
            if key not in self._cache:
                return None

            # Check TTL
            if self.ttl_seconds and key in self._timestamps:
                if time.time() - self._timestamps[key] > self.ttl_seconds:
                    self._remove(key)
                    return None

            # Update access patterns
            if self.strategy == CacheStrategy.LRU:
                # Move to end (most recently used)
                self._cache.move_to_end(key)
            elif self.strategy == CacheStrategy.LFU:
                self._access_counts[key] += 1

            return self._cache[key]

    def put(self, key: str, value: Any) -> None:
        """Put value in cache."""
        with self._lock:
            # Remove if already exists
            if key in self._cache:
                self._remove(key)

            # Evict if necessary
            while len(self._cache) >= self.max_size:
                self._evict_one()

            # Add new entry
            self._cache[key] = value
            self._timestamps[key] = time.time()
            if self.strategy == CacheStrategy.LFU:
                self._access_counts[key] = 1

    def remove(self, key: str) -> bool:
        """Remove key from cache."""
        with self._lock:
            return self._remove(key)

    def clear(self) -> None:
        """Clear all cache entries."""
        with self._lock:
            self._cache.clear()
            self._access_counts.clear()
            self._timestamps.clear()

    def size(self) -> int:
        """Get current cache size."""
        return len(self._cache)

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        with self._lock:
            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                "strategy": self.strategy.value,
                "ttl_seconds": self.ttl_seconds,
                "keys": list(self._cache.keys())
            }

    def _remove(self, key: str) -> bool:
        """Remove key without lock (internal use)."""
        if key in self._cache:
            del self._cache[key]
            self._access_counts.pop(key, None)
            self._timestamps.pop(key, None)
            return True
        return False

    def _evict_one(self) -> None:
        """Evict one item based on strategy."""
        if not self._cache:
            return

        if self.strategy == CacheStrategy.LRU:
            # Remove least recently used (first item)
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.LFU:
            # Remove least frequently used
            if self._access_counts:
                key = min(self._access_counts.keys(), key=lambda k: self._access_counts[k])
                self._remove(key)
        elif self.strategy == CacheStrategy.FIFO:
            # Remove first in, first out
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.TTL:
            # Remove expired items first, then oldest
            current_time = time.time()
            expired_keys = [
                key for key, timestamp in self._timestamps.items()
                if current_time - timestamp > (self.ttl_seconds or 0)
            ]

            if expired_keys:
                self._remove(expired_keys[0])
            else:
                # Remove oldest
                key = min(self._timestamps.keys(), key=lambda k: self._timestamps[k])
                self._remove(key)
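

# Example (illustrative sketch): a small LRU cache with a 60-second TTL.
#
#   cache = MemoryCache(max_size=100, strategy=CacheStrategy.LRU, ttl_seconds=60)
#   cache.put("ctx:/docs/readme", {"id": 1})
#   cache.get("ctx:/docs/readme")   # -> {"id": 1} while fresh, None after expiry
#   cache.stats()                   # size, max_size, strategy, ttl_seconds, current keys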


class BatchProcessor:
    """Utility for processing items in batches with error handling."""

    def __init__(self, batch_size: int = 10, max_workers: Optional[int] = None):
        self.batch_size = batch_size
        # Mirror ThreadPoolExecutor's default sizing; os.cpu_count() is portable,
        # unlike os.sched_getaffinity(), which is Linux-only.
        self.max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4)

    async def process_async(self,
                            items: List[Any],
                            processor: Callable[[Any], Any],
                            on_success: Optional[Callable[[Any, Any], None]] = None,
                            on_error: Optional[Callable[[Any, Exception], None]] = None) -> Dict[str, Any]:
        """
        Process items asynchronously in batches.

        Args:
            items: Items to process
            processor: Sync or async function to process each item
            on_success: Callback for successful processing
            on_error: Callback for processing errors

        Returns:
            Processing results summary
        """
        results = {
            "success_count": 0,
            "error_count": 0,
            "total_items": len(items),
            "successful_items": [],
            "failed_items": [],
            "execution_time": 0
        }

        start_time = time.time()

        # Process in batches
        for i in range(0, len(items), self.batch_size):
            batch = items[i:i + self.batch_size]

            # Create tasks for this batch
            tasks = []
            for item in batch:
                task = asyncio.create_task(self._process_item_async(item, processor))
                tasks.append((item, task))

            # Wait for batch completion
            for item, task in tasks:
                try:
                    result = await task
                    results["success_count"] += 1
                    results["successful_items"].append(result)

                    if on_success:
                        on_success(item, result)

                except Exception as e:
                    results["error_count"] += 1
                    results["failed_items"].append({"item": item, "error": str(e)})

                    if on_error:
                        on_error(item, e)

        results["execution_time"] = time.time() - start_time
        return results

    async def _process_item_async(self, item: Any, processor: Callable) -> Any:
        """Process a single item asynchronously."""
        if asyncio.iscoroutinefunction(processor):
            return await processor(item)
        else:
            # Run synchronous processor in the default thread pool
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, processor, item)
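

# Example (illustrative sketch): processing items in batches with callbacks.
# `fetch_context` is a hypothetical async callable; sync callables are also
# accepted and run in the default thread pool.
#
#   async def fetch_context(path: str) -> str:
#       return path.upper()
#
#   processor = BatchProcessor(batch_size=2)
#   summary = asyncio.run(
#       processor.process_async(
#           ["/a", "/b", "/c"],
#           fetch_context,
#           on_error=lambda item, exc: print(f"failed {item}: {exc}"),
#       )
#   )
#   # summary -> {"success_count": 3, "error_count": 0, "total_items": 3, ...}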


# Global instances
embedding_cache = MemoryCache(max_size=2000, strategy=CacheStrategy.LRU, ttl_seconds=3600)
batch_processor = BatchProcessor(batch_size=10)


def cache_key(*args, **kwargs) -> str:
    """Generate cache key from arguments."""
    key_parts = []

    # Add positional arguments
    for arg in args:
        if isinstance(arg, (str, int, float, bool)):
            key_parts.append(str(arg))
        else:
            key_parts.append(str(hash(str(arg))))

    # Add keyword arguments
    for k, v in sorted(kwargs.items()):
        if isinstance(v, (str, int, float, bool)):
            key_parts.append(f"{k}={v}")
        else:
            key_parts.append(f"{k}={hash(str(v))}")

    return ":".join(key_parts)


def timing_decorator(func):
    """Decorator to measure function execution time."""
    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return await func(*args, **kwargs)
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here

    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return func(*args, **kwargs)
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here

    if asyncio.iscoroutinefunction(func):
        return async_wrapper
    else:
        return sync_wrapper
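

# Example (illustrative sketch): timing_decorator wraps both sync and async
# callables; the measured duration is currently discarded, so wire logging or
# metrics into the `finally` blocks above if timing data is needed.
#
#   @timing_decorator
#   async def slow_search(query: str) -> list:
#       await asyncio.sleep(0.1)
#       return []
#
#   @timing_decorator
#   def quick_hash(content: str) -> str:
#       return hash_content(content)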