# HCFS/hcfs-python/hcfs/sdk/utils.py
"""
HCFS SDK Utility Functions
Common utilities for text processing, caching, and data manipulation.
"""
import hashlib
import json
import math
import os
import re
import time
from typing import List, Dict, Any, Optional, Tuple, Iterator, Callable, Union
from datetime import datetime, timedelta
from collections import defaultdict, OrderedDict
from threading import Lock
import asyncio
from functools import lru_cache, wraps
from .models import Context, SearchResult, CacheStrategy
from .exceptions import HCFSError, HCFSCacheError


def context_similarity(context1: Context, context2: Context, method: str = "jaccard") -> float:
    """
    Calculate similarity between two contexts.

    Args:
        context1: First context
        context2: Second context
        method: Similarity method ("jaccard", "cosine", "levenshtein")

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if method == "jaccard":
        return _jaccard_similarity(context1.content, context2.content)
    elif method == "cosine":
        return _cosine_similarity(context1.content, context2.content)
    elif method == "levenshtein":
        return _levenshtein_similarity(context1.content, context2.content)
    else:
        raise ValueError(f"Unknown similarity method: {method}")
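
# Usage sketch (illustrative, not taken from the HCFS docs): context_similarity
# only relies on each Context carrying a plain-text `content` field, so a
# near-duplicate check could look like this; the 0.8 threshold is an assumption.
#
#     score = context_similarity(ctx_a, ctx_b, method="jaccard")
#     if score > 0.8:
#         ...  # treat ctx_b as a near-duplicate of ctx_a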


def _jaccard_similarity(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between two texts."""
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    intersection = words1.intersection(words2)
    union = words1.union(words2)

    return len(intersection) / len(union) if union else 0.0


def _cosine_similarity(text1: str, text2: str) -> float:
    """Calculate cosine similarity between two texts."""
    words1 = text1.lower().split()
    words2 = text2.lower().split()

    # Create word frequency vectors
    all_words = set(words1 + words2)
    vector1 = [words1.count(word) for word in all_words]
    vector2 = [words2.count(word) for word in all_words]

    # Calculate dot product and magnitudes
    dot_product = sum(a * b for a, b in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(a * a for a in vector1))
    magnitude2 = math.sqrt(sum(a * a for a in vector2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)


def _levenshtein_similarity(text1: str, text2: str) -> float:
    """Calculate normalized Levenshtein similarity."""
    def levenshtein_distance(s1: str, s2: str) -> int:
        if len(s1) < len(s2):
            return levenshtein_distance(s2, s1)
        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    max_len = max(len(text1), len(text2))
    if max_len == 0:
        return 1.0

    distance = levenshtein_distance(text1.lower(), text2.lower())
    return 1.0 - (distance / max_len)
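
# Worked example (hand-checked against the Jaccard helper above): "the quick
# brown fox" vs. "the lazy brown dog" share {"the", "brown"} out of a six-word
# union, so _jaccard_similarity returns 2 / 6 ≈ 0.33. The cosine and Levenshtein
# variants score the same pair differently, which is why context_similarity
# exposes the method as a parameter.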


def text_chunker(text: str, chunk_size: int = 512, overlap: int = 50, preserve_sentences: bool = True) -> List[str]:
    """
    Split text into overlapping chunks.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size in characters
        overlap: Overlap between chunks
        preserve_sentences: Try to preserve sentence boundaries

    Returns:
        List of text chunks
    """
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end >= len(text):
            chunks.append(text[start:])
            break

        # Try to find a good break point
        chunk = text[start:end]

        if preserve_sentences and '.' in chunk:
            # Find the last sentence boundary
            last_period = chunk.rfind('.')
            if last_period > chunk_size // 2:  # Don't make chunks too small
                end = start + last_period + 1
                chunk = text[start:end]

        chunks.append(chunk.strip())
        start = end - overlap

    return [chunk for chunk in chunks if chunk.strip()]
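
# Usage sketch (illustrative): chunking a long document before embedding or
# indexing it. Chunk boundaries depend on sentence punctuation, so only the
# size bound is guaranteed by the implementation above.
#
#     chunks = text_chunker(long_text, chunk_size=512, overlap=50)
#     assert all(len(c) <= 512 for c in chunks)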


def extract_keywords(text: str, max_keywords: int = 10, min_length: int = 3) -> List[str]:
    """
    Extract keywords from text using simple frequency analysis.

    Args:
        text: Input text
        max_keywords: Maximum number of keywords
        min_length: Minimum keyword length

    Returns:
        List of keywords ordered by frequency
    """
    # Simple stopwords
    stopwords = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
        'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
    }

    # Extract words and count frequencies
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    word_freq = defaultdict(int)

    for word in words:
        if len(word) >= min_length and word not in stopwords:
            word_freq[word] += 1

    # Sort by frequency and return top keywords
    return sorted(word_freq.keys(), key=lambda x: word_freq[x], reverse=True)[:max_keywords]
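
# Usage sketch (illustrative): deriving tags from a context body. The output is
# ordered by raw term frequency, so results for any particular text are only
# indicative.
#
#     tags = extract_keywords(ctx.content, max_keywords=5)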


def format_content_preview(content: str, max_length: int = 200) -> str:
    """
    Format content for preview display.

    Args:
        content: Full content
        max_length: Maximum preview length

    Returns:
        Formatted preview string
    """
    if len(content) <= max_length:
        return content

    # Try to cut at word boundary
    preview = content[:max_length]
    last_space = preview.rfind(' ')

    if last_space > max_length * 0.8:  # Don't cut too much
        preview = preview[:last_space]

    return preview + "..."


def validate_path(path: str) -> bool:
    """
    Validate context path format.

    Args:
        path: Path to validate

    Returns:
        True if valid, False otherwise
    """
    if not path or not isinstance(path, str):
        return False

    if not path.startswith('/'):
        return False

    # Check for invalid characters
    invalid_chars = set('<>"|?*')
    if any(char in path for char in invalid_chars):
        return False

    # Check path components
    components = path.split('/')
    for component in components[1:]:  # Skip empty first component
        if not component or component in ['.', '..']:
            return False

    return True


def normalize_path(path: str) -> str:
    """
    Normalize context path.

    Args:
        path: Path to normalize

    Returns:
        Normalized path
    """
    if not path.startswith('/'):
        path = '/' + path

    # Remove duplicate slashes and normalize
    components = [c for c in path.split('/') if c]
    return '/' + '/'.join(components) if components else '/'
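
# Worked examples (checked against the two path helpers above):
#
#     normalize_path("docs//guides/")   # -> "/docs/guides"
#     validate_path("/docs/guides")     # -> True
#     validate_path("/docs/../secret")  # -> False ('..' components are rejected)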


def hash_content(content: str, algorithm: str = "sha256") -> str:
    """
    Generate hash of content for deduplication.

    Args:
        content: Content to hash
        algorithm: Hash algorithm

    Returns:
        Hex digest of content hash
    """
    if algorithm == "md5":
        hasher = hashlib.md5()
    elif algorithm == "sha1":
        hasher = hashlib.sha1()
    elif algorithm == "sha256":
        hasher = hashlib.sha256()
    else:
        raise ValueError(f"Unsupported hash algorithm: {algorithm}")

    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
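
# Usage sketch (illustrative): deduplicating by content hash before writing.
#
#     if hash_content(new_content) != hash_content(existing.content):
#         ...  # only push an update when the content actually changed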


def merge_contexts(contexts: List[Context], strategy: str = "latest") -> Context:
    """
    Merge multiple contexts into one.

    Args:
        contexts: List of contexts to merge
        strategy: Merge strategy ("latest", "longest", "combined")

    Returns:
        Merged context
    """
    if not contexts:
        raise ValueError("No contexts to merge")

    if len(contexts) == 1:
        return contexts[0]

    if strategy == "latest":
        return max(contexts, key=lambda c: c.updated_at or c.created_at or datetime.min)
    elif strategy == "longest":
        return max(contexts, key=lambda c: len(c.content))
    elif strategy == "combined":
        # Combine content and metadata
        merged = contexts[0].copy()
        merged.content = "\n\n".join(c.content for c in contexts)
        merged.tags = list(set(tag for c in contexts for tag in c.tags))

        # Merge metadata
        merged_metadata = {}
        for context in contexts:
            merged_metadata.update(context.metadata)
        merged.metadata = merged_metadata

        return merged
    else:
        raise ValueError(f"Unknown merge strategy: {strategy}")
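
# Usage sketch (illustrative, relying only on the copy()/tags/metadata access used above):
#
#     merged = merge_contexts(duplicates, strategy="combined")
#     # merged.content joins every body with blank lines; for metadata keys that
#     # collide, the later context in the list wins.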


class MemoryCache:
    """Thread-safe in-memory cache with configurable eviction strategies."""

    def __init__(self, max_size: int = 1000, strategy: CacheStrategy = CacheStrategy.LRU, ttl_seconds: Optional[int] = None):
        self.max_size = max_size
        self.strategy = strategy
        self.ttl_seconds = ttl_seconds

        self._cache = OrderedDict()
        self._access_counts = defaultdict(int)
        self._timestamps = {}
        self._lock = Lock()

    def get(self, key: str) -> Optional[Any]:
        """Get value from cache."""
        with self._lock:
            if key not in self._cache:
                return None

            # Check TTL
            if self.ttl_seconds and key in self._timestamps:
                if time.time() - self._timestamps[key] > self.ttl_seconds:
                    self._remove(key)
                    return None

            # Update access patterns
            if self.strategy == CacheStrategy.LRU:
                # Move to end (most recently used)
                self._cache.move_to_end(key)
            elif self.strategy == CacheStrategy.LFU:
                self._access_counts[key] += 1

            return self._cache[key]

    def put(self, key: str, value: Any) -> None:
        """Put value in cache."""
        with self._lock:
            # Remove if already exists
            if key in self._cache:
                self._remove(key)

            # Evict if necessary
            while len(self._cache) >= self.max_size:
                self._evict_one()

            # Add new entry
            self._cache[key] = value
            self._timestamps[key] = time.time()

            if self.strategy == CacheStrategy.LFU:
                self._access_counts[key] = 1

    def remove(self, key: str) -> bool:
        """Remove key from cache."""
        with self._lock:
            return self._remove(key)

    def clear(self) -> None:
        """Clear all cache entries."""
        with self._lock:
            self._cache.clear()
            self._access_counts.clear()
            self._timestamps.clear()

    def size(self) -> int:
        """Get current cache size."""
        return len(self._cache)

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        with self._lock:
            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                "strategy": self.strategy.value,
                "ttl_seconds": self.ttl_seconds,
                "keys": list(self._cache.keys())
            }

    def _remove(self, key: str) -> bool:
        """Remove key without lock (internal use)."""
        if key in self._cache:
            del self._cache[key]
            self._access_counts.pop(key, None)
            self._timestamps.pop(key, None)
            return True
        return False

    def _evict_one(self) -> None:
        """Evict one item based on strategy."""
        if not self._cache:
            return

        if self.strategy == CacheStrategy.LRU:
            # Remove least recently used (first item)
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.LFU:
            # Remove least frequently used
            if self._access_counts:
                key = min(self._access_counts.keys(), key=lambda k: self._access_counts[k])
                self._remove(key)
        elif self.strategy == CacheStrategy.FIFO:
            # Remove first in, first out
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.TTL:
            # Remove expired items first, then oldest
            current_time = time.time()
            expired_keys = [
                key for key, timestamp in self._timestamps.items()
                if current_time - timestamp > (self.ttl_seconds or 0)
            ]
            if expired_keys:
                self._remove(expired_keys[0])
            else:
                # Remove oldest
                key = min(self._timestamps.keys(), key=lambda k: self._timestamps[k])
                self._remove(key)
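
# Usage sketch (illustrative): a small LRU cache for rendered previews.
#
#     preview_cache = MemoryCache(max_size=256, strategy=CacheStrategy.LRU, ttl_seconds=600)
#     preview_cache.put("/docs/guide", format_content_preview(ctx.content))
#     cached = preview_cache.get("/docs/guide")  # None once expired or evicted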


class BatchProcessor:
    """Utility for processing items in batches with error handling."""

    def __init__(self, batch_size: int = 10, max_workers: Optional[int] = None):
        self.batch_size = batch_size
        # Mirror concurrent.futures' default worker sizing; os.cpu_count() is
        # portable, unlike os.sched_getaffinity(), which is unavailable on some platforms.
        self.max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4)

    async def process_async(self,
                            items: List[Any],
                            processor: Callable[[Any], Any],
                            on_success: Optional[Callable[[Any, Any], None]] = None,
                            on_error: Optional[Callable[[Any, Exception], None]] = None) -> Dict[str, Any]:
        """
        Process items asynchronously in batches.

        Args:
            items: Items to process
            processor: Sync or async function applied to each item
            on_success: Callback for successful processing
            on_error: Callback for processing errors

        Returns:
            Processing results summary
        """
        results = {
            "success_count": 0,
            "error_count": 0,
            "total_items": len(items),
            "successful_items": [],
            "failed_items": [],
            "execution_time": 0
        }

        start_time = time.time()

        # Process in batches
        for i in range(0, len(items), self.batch_size):
            batch = items[i:i + self.batch_size]

            # Create tasks for this batch
            tasks = []
            for item in batch:
                task = asyncio.create_task(self._process_item_async(item, processor))
                tasks.append((item, task))

            # Wait for batch completion
            for item, task in tasks:
                try:
                    result = await task
                    results["success_count"] += 1
                    results["successful_items"].append(result)
                    if on_success:
                        on_success(item, result)
                except Exception as e:
                    results["error_count"] += 1
                    results["failed_items"].append({"item": item, "error": str(e)})
                    if on_error:
                        on_error(item, e)

        results["execution_time"] = time.time() - start_time
        return results

    async def _process_item_async(self, item: Any, processor: Callable) -> Any:
        """Process a single item asynchronously."""
        if asyncio.iscoroutinefunction(processor):
            return await processor(item)
        else:
            # Run synchronous processor in thread pool
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, processor, item)
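
# Usage sketch (illustrative): uploading contexts in batches, with a hypothetical
# async client.create_context call standing in for the processor.
#
#     summary = await BatchProcessor(batch_size=10).process_async(
#         contexts,
#         client.create_context,
#         on_error=lambda item, exc: print(f"failed {item.path}: {exc}"),
#     )
#     print(summary["success_count"], "ok /", summary["error_count"], "failed")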


# Global instances
embedding_cache = MemoryCache(max_size=2000, strategy=CacheStrategy.LRU, ttl_seconds=3600)
batch_processor = BatchProcessor(batch_size=10)


def cache_key(*args, **kwargs) -> str:
    """Generate cache key from arguments."""
    key_parts = []

    # Add positional arguments
    for arg in args:
        if isinstance(arg, (str, int, float, bool)):
            key_parts.append(str(arg))
        else:
            key_parts.append(str(hash(str(arg))))

    # Add keyword arguments
    for k, v in sorted(kwargs.items()):
        if isinstance(v, (str, int, float, bool)):
            key_parts.append(f"{k}={v}")
        else:
            key_parts.append(f"{k}={hash(str(v))}")

    return ":".join(key_parts)


def timing_decorator(func):
    """Decorator to measure function execution time."""
    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = await func(*args, **kwargs)
            return result
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here
            pass

    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = func(*args, **kwargs)
            return result
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here
            pass

    if asyncio.iscoroutinefunction(func):
        return async_wrapper
    else:
        return sync_wrapper
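
# Usage sketch (illustrative): timing_decorator currently computes execution_time
# and discards it, so it is a hook point for logging rather than a logger itself.
#
#     @timing_decorator
#     async def reindex(paths):
#         ...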