"""
|
|
HCFS SDK Utility Functions
|
|
|
|
Common utilities for text processing, caching, and data manipulation.
|
|
"""

import hashlib
import json
import math
import os
import re
import time
from typing import List, Dict, Any, Optional, Tuple, Iterator, Callable, Union
from datetime import datetime, timedelta
from collections import defaultdict, OrderedDict
from threading import Lock
import asyncio
from functools import lru_cache, wraps

from .models import Context, SearchResult, CacheStrategy
from .exceptions import HCFSError, HCFSCacheError


def context_similarity(context1: Context, context2: Context, method: str = "jaccard") -> float:
    """
    Calculate similarity between two contexts.

    Args:
        context1: First context
        context2: Second context
        method: Similarity method ("jaccard", "cosine", "levenshtein")

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if method == "jaccard":
        return _jaccard_similarity(context1.content, context2.content)
    elif method == "cosine":
        return _cosine_similarity(context1.content, context2.content)
    elif method == "levenshtein":
        return _levenshtein_similarity(context1.content, context2.content)
    else:
        raise ValueError(f"Unknown similarity method: {method}")


def _jaccard_similarity(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between two texts."""
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    intersection = words1.intersection(words2)
    union = words1.union(words2)

    return len(intersection) / len(union) if union else 0.0


def _cosine_similarity(text1: str, text2: str) -> float:
    """Calculate cosine similarity between two texts."""
    words1 = text1.lower().split()
    words2 = text2.lower().split()

    # Create word frequency vectors
    all_words = set(words1 + words2)
    vector1 = [words1.count(word) for word in all_words]
    vector2 = [words2.count(word) for word in all_words]

    # Calculate dot product and magnitudes
    dot_product = sum(a * b for a, b in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(a * a for a in vector1))
    magnitude2 = math.sqrt(sum(a * a for a in vector2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)


def _levenshtein_similarity(text1: str, text2: str) -> float:
    """Calculate normalized Levenshtein similarity."""
    def levenshtein_distance(s1: str, s2: str) -> int:
        if len(s1) < len(s2):
            return levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    max_len = max(len(text1), len(text2))
    if max_len == 0:
        return 1.0

    distance = levenshtein_distance(text1.lower(), text2.lower())
    return 1.0 - (distance / max_len)


def text_chunker(text: str, chunk_size: int = 512, overlap: int = 50, preserve_sentences: bool = True) -> List[str]:
    """
    Split text into overlapping chunks.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size in characters
        overlap: Overlap between chunks
        preserve_sentences: Try to preserve sentence boundaries

    Returns:
        List of text chunks
    """
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end >= len(text):
            chunks.append(text[start:])
            break

        # Try to find a good break point
        chunk = text[start:end]

        if preserve_sentences and '.' in chunk:
            # Find the last sentence boundary
            last_period = chunk.rfind('.')
            if last_period > chunk_size // 2:  # Don't make chunks too small
                end = start + last_period + 1
                chunk = text[start:end]

        chunks.append(chunk.strip())
        # Ensure the window always advances, even when overlap >= chunk length.
        start = max(end - overlap, start + 1)

    return [chunk for chunk in chunks if chunk.strip()]
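

# Example (illustrative sketch): chunking a long document before embedding.
# With preserve_sentences=True the function prefers to break at the last "."
# in each window, as long as the resulting chunk stays above chunk_size // 2.
#
#   long_text = " ".join(f"Sentence number {i}." for i in range(200))
#   chunks = text_chunker(long_text, chunk_size=256, overlap=32)
#   # Each chunk is at most 256 characters; consecutive chunks overlap by ~32.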


def extract_keywords(text: str, max_keywords: int = 10, min_length: int = 3) -> List[str]:
    """
    Extract keywords from text using simple frequency analysis.

    Args:
        text: Input text
        max_keywords: Maximum number of keywords
        min_length: Minimum keyword length

    Returns:
        List of keywords ordered by frequency
    """
    # Simple stopwords
    stopwords = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
        'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
    }

    # Extract words and count frequencies
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    word_freq = defaultdict(int)

    for word in words:
        if len(word) >= min_length and word not in stopwords:
            word_freq[word] += 1

    # Sort by frequency and return top keywords
    return sorted(word_freq.keys(), key=lambda x: word_freq[x], reverse=True)[:max_keywords]
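

# Example (illustrative sketch): pulling rough topic keywords from raw text.
# This is plain frequency counting against a small stopword list, not TF-IDF,
# so results are best treated as a cheap first-pass signal.
#
#   text = "Context storage and context retrieval are core HCFS features."
#   extract_keywords(text, max_keywords=3)
#   # -> ["context", "storage", "retrieval"]  (tie order may vary)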


def format_content_preview(content: str, max_length: int = 200) -> str:
    """
    Format content for preview display.

    Args:
        content: Full content
        max_length: Maximum preview length

    Returns:
        Formatted preview string
    """
    if len(content) <= max_length:
        return content

    # Try to cut at word boundary
    preview = content[:max_length]
    last_space = preview.rfind(' ')

    if last_space > max_length * 0.8:  # Don't cut too much
        preview = preview[:last_space]

    return preview + "..."
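

# Example (illustrative sketch): building a short listing preview.
#
#   format_content_preview("A" * 50, max_length=200)   # short content is returned unchanged
#   format_content_preview("word " * 100, max_length=40)
#   # -> roughly the first 40 characters, cut at a word boundary, plus "..."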


def validate_path(path: str) -> bool:
    """
    Validate context path format.

    Args:
        path: Path to validate

    Returns:
        True if valid, False otherwise
    """
    if not path or not isinstance(path, str):
        return False

    if not path.startswith('/'):
        return False

    # Check for invalid characters
    invalid_chars = set('<>"|?*')
    if any(char in path for char in invalid_chars):
        return False

    # Check path components
    components = path.split('/')
    for component in components[1:]:  # Skip empty first component
        if not component or component in ['.', '..']:
            return False

    return True
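

# Example (illustrative sketch): typical accept/reject cases.
#
#   validate_path("/projects/hcfs/docs")   # True
#   validate_path("relative/path")         # False (must start with "/")
#   validate_path("/a/../b")               # False (".." components rejected)
#   validate_path("/a/b/")                 # False (empty trailing component)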


def normalize_path(path: str) -> str:
    """
    Normalize context path.

    Args:
        path: Path to normalize

    Returns:
        Normalized path
    """
    if not path.startswith('/'):
        path = '/' + path

    # Remove duplicate slashes and normalize
    components = [c for c in path.split('/') if c]
    return '/' + '/'.join(components) if components else '/'
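

# Example (illustrative sketch): normalize_path collapses duplicate slashes and
# guarantees a leading "/", but unlike validate_path it does not reject "." or
# ".." components.
#
#   normalize_path("projects//hcfs/")   # -> "/projects/hcfs"
#   normalize_path("///")               # -> "/"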


def hash_content(content: str, algorithm: str = "sha256") -> str:
    """
    Generate hash of content for deduplication.

    Args:
        content: Content to hash
        algorithm: Hash algorithm

    Returns:
        Hex digest of content hash
    """
    if algorithm == "md5":
        hasher = hashlib.md5()
    elif algorithm == "sha1":
        hasher = hashlib.sha1()
    elif algorithm == "sha256":
        hasher = hashlib.sha256()
    else:
        raise ValueError(f"Unsupported hash algorithm: {algorithm}")

    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
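

# Example (illustrative sketch): using the digest as a deduplication key.
#
#   digest = hash_content("same text")
#   digest == hash_content("same text")        # True: deterministic
#   digest == hash_content("different text")   # False (barring hash collisions)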


def merge_contexts(contexts: List[Context], strategy: str = "latest") -> Context:
    """
    Merge multiple contexts into one.

    Args:
        contexts: List of contexts to merge
        strategy: Merge strategy ("latest", "longest", "combined")

    Returns:
        Merged context
    """
    if not contexts:
        raise ValueError("No contexts to merge")

    if len(contexts) == 1:
        return contexts[0]

    if strategy == "latest":
        return max(contexts, key=lambda c: c.updated_at or c.created_at or datetime.min)
    elif strategy == "longest":
        return max(contexts, key=lambda c: len(c.content))
    elif strategy == "combined":
        # Combine content and metadata
        merged = contexts[0].copy()
        merged.content = "\n\n".join(c.content for c in contexts)
        merged.tags = list(set(tag for c in contexts for tag in c.tags))

        # Merge metadata
        merged_metadata = {}
        for context in contexts:
            merged_metadata.update(context.metadata)
        merged.metadata = merged_metadata

        return merged
    else:
        raise ValueError(f"Unknown merge strategy: {strategy}")


class MemoryCache:
    """Thread-safe in-memory cache with configurable eviction strategies."""

    def __init__(self, max_size: int = 1000, strategy: CacheStrategy = CacheStrategy.LRU, ttl_seconds: Optional[int] = None):
        self.max_size = max_size
        self.strategy = strategy
        self.ttl_seconds = ttl_seconds
        self._cache = OrderedDict()
        self._access_counts = defaultdict(int)
        self._timestamps = {}
        self._lock = Lock()

    def get(self, key: str) -> Optional[Any]:
        """Get value from cache."""
        with self._lock:
            if key not in self._cache:
                return None

            # Check TTL
            if self.ttl_seconds and key in self._timestamps:
                if time.time() - self._timestamps[key] > self.ttl_seconds:
                    self._remove(key)
                    return None

            # Update access patterns
            if self.strategy == CacheStrategy.LRU:
                # Move to end (most recently used)
                self._cache.move_to_end(key)
            elif self.strategy == CacheStrategy.LFU:
                self._access_counts[key] += 1

            return self._cache[key]

    def put(self, key: str, value: Any) -> None:
        """Put value in cache."""
        with self._lock:
            # Remove if already exists
            if key in self._cache:
                self._remove(key)

            # Evict if necessary
            while len(self._cache) >= self.max_size:
                self._evict_one()

            # Add new entry
            self._cache[key] = value
            self._timestamps[key] = time.time()
            if self.strategy == CacheStrategy.LFU:
                self._access_counts[key] = 1

    def remove(self, key: str) -> bool:
        """Remove key from cache."""
        with self._lock:
            return self._remove(key)

    def clear(self) -> None:
        """Clear all cache entries."""
        with self._lock:
            self._cache.clear()
            self._access_counts.clear()
            self._timestamps.clear()

    def size(self) -> int:
        """Get current cache size."""
        return len(self._cache)

    def stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        with self._lock:
            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                "strategy": self.strategy.value,
                "ttl_seconds": self.ttl_seconds,
                "keys": list(self._cache.keys())
            }

    def _remove(self, key: str) -> bool:
        """Remove key without lock (internal use)."""
        if key in self._cache:
            del self._cache[key]
            self._access_counts.pop(key, None)
            self._timestamps.pop(key, None)
            return True
        return False

    def _evict_one(self) -> None:
        """Evict one item based on strategy."""
        if not self._cache:
            return

        if self.strategy == CacheStrategy.LRU:
            # Remove least recently used (first item)
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.LFU:
            # Remove least frequently used
            if self._access_counts:
                key = min(self._access_counts.keys(), key=lambda k: self._access_counts[k])
                self._remove(key)
        elif self.strategy == CacheStrategy.FIFO:
            # Remove first in, first out
            key = next(iter(self._cache))
            self._remove(key)
        elif self.strategy == CacheStrategy.TTL:
            # Remove expired items first, then oldest
            current_time = time.time()
            expired_keys = [
                key for key, timestamp in self._timestamps.items()
                if current_time - timestamp > (self.ttl_seconds or 0)
            ]

            if expired_keys:
                self._remove(expired_keys[0])
            else:
                # Remove oldest
                key = min(self._timestamps.keys(), key=lambda k: self._timestamps[k])
                self._remove(key)
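

# Example (illustrative sketch): a small LRU cache with a 60-second TTL.
#
#   cache = MemoryCache(max_size=100, strategy=CacheStrategy.LRU, ttl_seconds=60)
#   cache.put("ctx:/docs/readme", {"id": 1})
#   cache.get("ctx:/docs/readme")   # -> {"id": 1} while fresh, None after expiry
#   cache.stats()                   # size, max_size, strategy, ttl_seconds, current keys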


class BatchProcessor:
    """Utility for processing items in batches with error handling."""

    def __init__(self, batch_size: int = 10, max_workers: Optional[int] = None):
        self.batch_size = batch_size
        # Mirror ThreadPoolExecutor's default sizing; os.cpu_count() is portable,
        # unlike os.sched_getaffinity(), which is Linux-only.
        self.max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4)

    async def process_async(self,
                            items: List[Any],
                            processor: Callable[[Any], Any],
                            on_success: Optional[Callable[[Any, Any], None]] = None,
                            on_error: Optional[Callable[[Any, Exception], None]] = None) -> Dict[str, Any]:
        """
        Process items asynchronously in batches.

        Args:
            items: Items to process
            processor: Sync or async function to process each item
            on_success: Callback for successful processing
            on_error: Callback for processing errors

        Returns:
            Processing results summary
        """
        results = {
            "success_count": 0,
            "error_count": 0,
            "total_items": len(items),
            "successful_items": [],
            "failed_items": [],
            "execution_time": 0
        }

        start_time = time.time()

        # Process in batches
        for i in range(0, len(items), self.batch_size):
            batch = items[i:i + self.batch_size]

            # Create tasks for this batch
            tasks = []
            for item in batch:
                task = asyncio.create_task(self._process_item_async(item, processor))
                tasks.append((item, task))

            # Wait for batch completion
            for item, task in tasks:
                try:
                    result = await task
                    results["success_count"] += 1
                    results["successful_items"].append(result)

                    if on_success:
                        on_success(item, result)

                except Exception as e:
                    results["error_count"] += 1
                    results["failed_items"].append({"item": item, "error": str(e)})

                    if on_error:
                        on_error(item, e)

        results["execution_time"] = time.time() - start_time
        return results

    async def _process_item_async(self, item: Any, processor: Callable) -> Any:
        """Process a single item asynchronously."""
        if asyncio.iscoroutinefunction(processor):
            return await processor(item)
        else:
            # Run synchronous processor in the default thread pool
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, processor, item)
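

# Example (illustrative sketch): processing items in batches with callbacks.
# `fetch_context` is a hypothetical async callable; sync callables are also
# accepted and run in the default thread pool.
#
#   async def fetch_context(path: str) -> str:
#       return path.upper()
#
#   processor = BatchProcessor(batch_size=2)
#   summary = asyncio.run(
#       processor.process_async(
#           ["/a", "/b", "/c"],
#           fetch_context,
#           on_error=lambda item, exc: print(f"failed {item}: {exc}"),
#       )
#   )
#   # summary -> {"success_count": 3, "error_count": 0, "total_items": 3, ...}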


# Global instances
embedding_cache = MemoryCache(max_size=2000, strategy=CacheStrategy.LRU, ttl_seconds=3600)
batch_processor = BatchProcessor(batch_size=10)


def cache_key(*args, **kwargs) -> str:
    """Generate cache key from arguments."""
    key_parts = []

    # Add positional arguments
    for arg in args:
        if isinstance(arg, (str, int, float, bool)):
            key_parts.append(str(arg))
        else:
            key_parts.append(str(hash(str(arg))))

    # Add keyword arguments
    for k, v in sorted(kwargs.items()):
        if isinstance(v, (str, int, float, bool)):
            key_parts.append(f"{k}={v}")
        else:
            key_parts.append(f"{k}={hash(str(v))}")

    return ":".join(key_parts)


def timing_decorator(func):
    """Decorator to measure function execution time."""
    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return await func(*args, **kwargs)
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here

    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return func(*args, **kwargs)
        finally:
            execution_time = time.time() - start_time
            # Could log or store timing data here

    if asyncio.iscoroutinefunction(func):
        return async_wrapper
    else:
        return sync_wrapper
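

# Example (illustrative sketch): timing_decorator wraps both sync and async
# callables; the measured duration is currently discarded, so wire logging or
# metrics into the `finally` blocks above if timing data is needed.
#
#   @timing_decorator
#   async def slow_search(query: str) -> list:
#       await asyncio.sleep(0.1)
#       return []
#
#   @timing_decorator
#   def quick_hash(content: str) -> str:
#       return hash_content(content)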