#!/usr/bin/env python3
"""
Cascading Hierarchical Metadata Generator

This system implements CSS-like cascading for contextual metadata:
1. Context flows DOWN the directory tree (inheritance)
2. More specific contexts override parent contexts
3. Only unique/different metadata is stored per level
4. Lookups resolve by walking UP the tree to find applicable context
5. Massive space savings by avoiding redundant metadata

Key Concepts:
- Context Inheritance: Child directories inherit parent context unless overridden
- Context Specificity: More specific paths can override parent context
- Context Consolidation: Similar contexts are merged/consolidated
- Lazy Resolution: Context is resolved at query time by walking the hierarchy

Usage:
    python3 cascading_metadata_generator.py [--bzzz-path PATH] [--metadata-base PATH] [--demo]
"""

import os
import json
import argparse
import hashlib
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from collections import defaultdict

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@dataclass
class ContextNode:
    """Represents a context node in the hierarchical tree"""
    path: str
    ucxl_address: str
    summary: str
    purpose: str
    technologies: List[str]
    tags: List[str]
    insights: List[str]
    overrides_parent: bool = False      # Does this context override parent?
    context_specificity: int = 0        # Higher = more specific
    applies_to_children: bool = True    # Does this context cascade down?
    generated_at: str = ""
    rag_confidence: float = 0.0


@dataclass
class PathMetadata:
    """Lightweight path metadata - most context comes from hierarchy"""
    ucxl_address: str
    filesystem_path: str
    file_type: str
    size_bytes: Optional[int]
    extension: Optional[str]
    language: Optional[str]
    content_hash: Optional[str]
    last_modified: Optional[str]
    has_local_context: bool = False     # Does this path have its own context node?

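# Illustrative sketch (comments only, not executed): how the two records relate.
# A single ContextNode attached to a directory serves every PathMetadata entry
# beneath it; the paths and values below are hypothetical examples, not output
# of this script.
#
#   fonts_ctx = ContextNode(
#       path="/repo/assets/fonts",
#       ucxl_address="ucxl://any:any@BZZZ:RUSTLE-testing/assets/fonts",
#       summary="Typography assets",
#       purpose="Brand fonts for the UI",
#       technologies=["Typography"],
#       tags=["fonts"],
#       insights=[],
#       context_specificity=2,
#   )
#   glyph_meta = PathMetadata(
#       ucxl_address="ucxl://any:any@BZZZ:RUSTLE-testing/assets/fonts/Inter.woff2",
#       filesystem_path="/repo/assets/fonts/Inter.woff2",
#       file_type="file",
#       size_bytes=1024,
#       extension=".woff2",
#       language=None,
#       content_hash=None,
#       last_modified=None,
#   )
#   # glyph_meta stores no context of its own (has_local_context stays False);
#   # queries against it resolve up the tree to fonts_ctx.
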
class CascadingMetadataSystem:
    def __init__(self, bzzz_path: str, rag_endpoint: str, metadata_base: str):
        self.bzzz_path = Path(bzzz_path)
        self.rag_endpoint = rag_endpoint
        self.metadata_base = Path(metadata_base)
        self.project_name = "BZZZ"
        self.project_metadata_dir = self.metadata_base / self.project_name

        # Context hierarchy storage
        self.context_tree: Dict[str, ContextNode] = {}
        self.path_metadata: Dict[str, PathMetadata] = {}

        # Context consolidation data
        self.context_patterns = defaultdict(list)  # Similar contexts grouped
        self.directory_purposes = {}                # Common directory purposes

        # Ensure metadata directory exists
        self.project_metadata_dir.mkdir(parents=True, exist_ok=True)

    def analyze_directory_structure(self) -> Dict[str, Any]:
        """Analyze the entire directory structure to identify patterns and hierarchy"""
        logger.info("šŸ” Analyzing directory structure for context patterns...")

        directory_analysis = {
            'common_purposes': defaultdict(list),
            'technology_clusters': defaultdict(set),
            'pattern_directories': defaultdict(list),
            'depth_analysis': defaultdict(int)
        }

        for item in self.bzzz_path.rglob('*'):
            if not self.should_process_path(item):
                continue

            rel_path = item.relative_to(self.bzzz_path)
            depth = len(rel_path.parts)
            directory_analysis['depth_analysis'][depth] += 1

            # Analyze directory patterns
            if item.is_dir():
                dir_name = item.name.lower()

                # Common directory patterns
                if dir_name in ['src', 'source', 'lib']:
                    directory_analysis['common_purposes']['source_code'].append(str(rel_path))
                elif dir_name in ['test', 'tests', 'spec', 'specs']:
                    directory_analysis['common_purposes']['testing'].append(str(rel_path))
                elif dir_name in ['doc', 'docs', 'documentation']:
                    directory_analysis['common_purposes']['documentation'].append(str(rel_path))
                elif dir_name in ['config', 'configuration', 'settings']:
                    directory_analysis['common_purposes']['configuration'].append(str(rel_path))
                elif dir_name in ['asset', 'assets', 'static', 'public']:
                    directory_analysis['common_purposes']['assets'].append(str(rel_path))
                elif dir_name in ['font', 'fonts']:
                    directory_analysis['common_purposes']['fonts'].append(str(rel_path))
                elif dir_name in ['image', 'images', 'img']:
                    directory_analysis['common_purposes']['images'].append(str(rel_path))
                elif dir_name in ['style', 'styles', 'css']:
                    directory_analysis['common_purposes']['styling'].append(str(rel_path))
                elif dir_name in ['script', 'scripts', 'js']:
                    directory_analysis['common_purposes']['scripts'].append(str(rel_path))
                elif dir_name in ['build', 'dist', 'output', 'target']:
                    directory_analysis['common_purposes']['build_output'].append(str(rel_path))
                elif dir_name in ['vendor', 'third_party', 'external']:
                    directory_analysis['common_purposes']['third_party'].append(str(rel_path))
                elif dir_name in ['util', 'utils', 'helper', 'helpers', 'common']:
                    directory_analysis['common_purposes']['utilities'].append(str(rel_path))
                elif dir_name in ['api', 'endpoint', 'service', 'services']:
                    directory_analysis['common_purposes']['api_services'].append(str(rel_path))
                elif dir_name in ['model', 'models', 'entity', 'entities']:
                    directory_analysis['common_purposes']['data_models'].append(str(rel_path))
                elif dir_name in ['component', 'components', 'widget', 'widgets']:
                    directory_analysis['common_purposes']['ui_components'].append(str(rel_path))
                elif dir_name in ['template', 'templates', 'layout', 'layouts']:
                    directory_analysis['common_purposes']['templates'].append(str(rel_path))

            # Analyze technology clusters by file extensions
            if item.is_file():
                ext = item.suffix.lower()
                parent_dir = str(rel_path.parent) if rel_path.parent != Path('.') else 'root'
                directory_analysis['technology_clusters'][parent_dir].add(ext)

        logger.info(f"šŸ“Š Found {len(directory_analysis['common_purposes'])} common directory patterns")
        logger.info(f"šŸ”§ Identified {len(directory_analysis['technology_clusters'])} technology clusters")

        return directory_analysis

    def create_context_hierarchy(self) -> None:
        """Create the cascading context hierarchy based on directory analysis"""
        logger.info("šŸ—ļø Building cascading context hierarchy...")

        # First, analyze the structure
        structure_analysis = self.analyze_directory_structure()

        # Create context nodes for significant directories
        contexts_created = 0
        for purpose, directories in structure_analysis['common_purposes'].items():
            for dir_path in directories:
                full_path = self.bzzz_path / dir_path
                if full_path.exists() and full_path.is_dir():
                    context_node = self.create_directory_context(full_path, purpose)
                    if context_node:
                        self.context_tree[str(full_path)] = context_node
                        contexts_created += 1

        # Create root project context
        root_context = self.create_root_context()
        self.context_tree[str(self.bzzz_path)] = root_context
        contexts_created += 1

        logger.info(f"āœ… Created {contexts_created} context nodes in hierarchy")

    def create_root_context(self) -> ContextNode:
        """Create the root context for the entire project"""
        return ContextNode(
            path=str(self.bzzz_path),
            ucxl_address="ucxl://any:any@BZZZ:RUSTLE-testing",
            summary="BZZZ distributed system project root",
            purpose="Core distributed system implementing contextual metadata architecture with 1:1 filesystem mapping",
            technologies=["Rust", "Go", "Distributed Systems", "P2P", "DHT", "UCXL Protocol"],
            tags=["project-root", "distributed-system", "bzzz", "ucxl", "rust", "go"],
            insights=[
                "Main project implementing distributed contextual metadata system",
                "Uses UCXL protocol for unified contextual exchange",
                "Implements 1:1 mapping between filesystem and UCXL addresses",
                "Part of larger CHORUS ecosystem for AI development"
            ],
            overrides_parent=False,
            context_specificity=0,
            applies_to_children=True,
            generated_at=datetime.now(timezone.utc).isoformat(),
            rag_confidence=0.9
        )

    def create_directory_context(self, dir_path: Path, purpose_type: str) -> Optional[ContextNode]:
        """Create context for a specific directory based on its purpose"""
        rel_path = dir_path.relative_to(self.bzzz_path)
        ucxl_address = f"ucxl://any:any@BZZZ:RUSTLE-testing/{str(rel_path).replace(os.sep, '/')}"

        # Context templates based on directory purpose
        context_templates = {
            'source_code': {
                'summary': f"Source code directory: {dir_path.name}",
                'purpose': "Implementation of core system functionality and business logic",
                'technologies': ["Rust", "Go", "Source Code"],
                'tags': ["source-code", "implementation", "core-logic"],
                'insights': [
                    "Contains primary implementation files",
                    "Houses main business logic and algorithms",
                    "Critical for system functionality"
                ]
            },
            'testing': {
                'summary': f"Testing directory: {dir_path.name}",
                'purpose': "Quality assurance, validation, and testing infrastructure",
                'technologies': ["Testing Frameworks", "Unit Tests", "Integration Tests"],
                'tags': ["testing", "qa", "validation", "quality-assurance"],
                'insights': [
                    "Ensures code quality and correctness",
                    "Provides regression testing capabilities",
                    "Critical for maintaining system reliability"
                ]
            },
            'documentation': {
                'summary': f"Documentation directory: {dir_path.name}",
                'purpose': "Project documentation, guides, and knowledge resources",
                'technologies': ["Markdown", "Documentation"],
                'tags': ["documentation", "guides", "knowledge", "reference"],
"guides", "knowledge", "reference"], 'insights': [ "Provides user and developer guidance", "Contains architectural decisions and design docs", "Essential for project maintainability" ] }, 'configuration': { 'summary': f"Configuration directory: {dir_path.name}", 'purpose': "System configuration, settings, and environment management", 'technologies': ["TOML", "YAML", "JSON", "Configuration"], 'tags': ["configuration", "settings", "environment", "deployment"], 'insights': [ "Manages system behavior and parameters", "Controls deployment and runtime settings", "Centralizes configuration management" ] }, 'assets': { 'summary': f"Assets directory: {dir_path.name}", 'purpose': "Static assets, resources, and multimedia content", 'technologies': ["Static Assets", "Resources"], 'tags': ["assets", "resources", "static", "content"], 'insights': [ "Houses non-code project resources", "Supports user interface and experience", "Manages static content delivery" ] }, 'fonts': { 'summary': f"Fonts directory: {dir_path.name}", 'purpose': "Typography assets implementing design system specifications", 'technologies': ["Typography", "Fonts", "Design System"], 'tags': ["fonts", "typography", "design-system", "ui"], 'insights': [ "Implements brand typography guidelines", "Ensures consistent visual identity", "Supports responsive design requirements" ] }, 'api_services': { 'summary': f"API services directory: {dir_path.name}", 'purpose': "API endpoints, service interfaces, and external communication", 'technologies': ["REST API", "HTTP", "Service Layer"], 'tags': ["api", "services", "endpoints", "communication"], 'insights': [ "Defines external system interfaces", "Handles inter-service communication", "Critical for system integration" ] }, 'utilities': { 'summary': f"Utilities directory: {dir_path.name}", 'purpose': "Shared utilities, helpers, and common functionality", 'technologies': ["Utilities", "Helper Functions", "Common Code"], 'tags': ["utilities", "helpers", "shared", "common"], 'insights': [ "Provides reusable functionality", "Reduces code duplication", "Supports DRY principles" ] } } if purpose_type not in context_templates: return None template = context_templates[purpose_type] return ContextNode( path=str(dir_path), ucxl_address=ucxl_address, summary=template['summary'], purpose=template['purpose'], technologies=template['technologies'], tags=template['tags'], insights=template['insights'], overrides_parent=False, context_specificity=len(rel_path.parts), applies_to_children=True, generated_at=datetime.now(timezone.utc).isoformat(), rag_confidence=0.8 ) def resolve_context_for_path(self, file_path: Path) -> ContextNode: """Resolve context for a path by walking UP the hierarchy (CSS-like cascading)""" # Start from the file's directory and walk up to find applicable context current_path = file_path if file_path.is_dir() else file_path.parent contexts = [] # Walk up the directory tree collecting contexts while current_path >= self.bzzz_path: if str(current_path) in self.context_tree: context = self.context_tree[str(current_path)] if context.applies_to_children: contexts.append(context) if context.overrides_parent: break current_path = current_path.parent # If no contexts found, use root context if not contexts: return self.context_tree.get(str(self.bzzz_path), self.create_root_context()) # Merge contexts (more specific overrides less specific) return self.merge_contexts(contexts, file_path) def merge_contexts(self, contexts: List[ContextNode], file_path: Path) -> ContextNode: """Merge multiple contexts 
using CSS-like specificity rules""" if len(contexts) == 1: return contexts[0] # Sort by specificity (higher = more specific) contexts.sort(key=lambda c: c.context_specificity, reverse=True) # Start with most specific context merged = contexts[0] # Merge in less specific contexts where not overridden for context in contexts[1:]: # Tags are additive (union) merged.tags = list(set(merged.tags + context.tags)) # Technologies are additive (union) merged.technologies = list(set(merged.technologies + context.technologies)) # Insights are additive (append unique) for insight in context.insights: if insight not in merged.insights: merged.insights.append(insight) # Summary and purpose use most specific unless empty if not merged.summary: merged.summary = context.summary if not merged.purpose: merged.purpose = context.purpose # Update path-specific information rel_path = file_path.relative_to(self.bzzz_path) merged.ucxl_address = f"ucxl://any:any@BZZZ:RUSTLE-testing/{str(rel_path).replace(os.sep, '/')}" merged.path = str(file_path) return merged def should_process_path(self, path: Path) -> bool: """Determine if a path should be processed""" if any(part.startswith('.') for part in path.parts): return False ignore_patterns = [ 'target/', 'node_modules/', '__pycache__/', '.git/', 'vendor/', 'build/', 'dist/', '.cache/', 'tmp/' ] path_str = str(path).lower() return not any(pattern in path_str for pattern in ignore_patterns) def create_path_metadata(self, file_path: Path) -> PathMetadata: """Create lightweight metadata for a path (context comes from hierarchy)""" is_dir = file_path.is_dir() rel_path = file_path.relative_to(self.bzzz_path) ucxl_address = f"ucxl://any:any@BZZZ:RUSTLE-testing/{str(rel_path).replace(os.sep, '/')}" # Basic file information only size_bytes = None content_hash = None last_modified = None if not is_dir: try: stat = file_path.stat() size_bytes = stat.st_size last_modified = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat() # Only hash small text files if size_bytes < 50000: # 50KB limit try: content = file_path.read_text(encoding='utf-8') content_hash = hashlib.sha256(content.encode('utf-8')).hexdigest() except: pass except: pass # Determine language/type language = None if not is_dir: ext = file_path.suffix.lower() lang_map = { '.rs': 'rust', '.go': 'go', '.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.md': 'markdown', '.toml': 'toml', '.yaml': 'yaml', '.yml': 'yaml', '.json': 'json' } language = lang_map.get(ext) return PathMetadata( ucxl_address=ucxl_address, filesystem_path=str(file_path), file_type="directory" if is_dir else "file", size_bytes=size_bytes, extension=file_path.suffix if not is_dir else None, language=language, content_hash=content_hash, last_modified=last_modified, has_local_context=str(file_path) in self.context_tree ) def save_cascading_metadata(self) -> Dict[str, Any]: """Save the cascading metadata system to files""" # Save context hierarchy hierarchy_file = self.project_metadata_dir / "context_hierarchy.json" hierarchy_data = { path: asdict(context) for path, context in self.context_tree.items() } with open(hierarchy_file, 'w', encoding='utf-8') as f: json.dump(hierarchy_data, f, indent=2, ensure_ascii=False) # Save path metadata (lightweight) paths_file = self.project_metadata_dir / "path_metadata.json" paths_data = { path: asdict(metadata) for path, metadata in self.path_metadata.items() } with open(paths_file, 'w', encoding='utf-8') as f: json.dump(paths_data, f, indent=2, ensure_ascii=False) # Generate lookup index for 
    def save_cascading_metadata(self) -> Dict[str, Any]:
        """Save the cascading metadata system to files"""
        # Save context hierarchy
        hierarchy_file = self.project_metadata_dir / "context_hierarchy.json"
        hierarchy_data = {path: asdict(context) for path, context in self.context_tree.items()}
        with open(hierarchy_file, 'w', encoding='utf-8') as f:
            json.dump(hierarchy_data, f, indent=2, ensure_ascii=False)

        # Save path metadata (lightweight)
        paths_file = self.project_metadata_dir / "path_metadata.json"
        paths_data = {path: asdict(metadata) for path, metadata in self.path_metadata.items()}
        with open(paths_file, 'w', encoding='utf-8') as f:
            json.dump(paths_data, f, indent=2, ensure_ascii=False)

        # Generate lookup index for fast context resolution
        lookup_index = {}
        for path, metadata in self.path_metadata.items():
            file_path = Path(path)
            resolved_context = self.resolve_context_for_path(file_path)
            lookup_index[metadata.ucxl_address] = {
                'context_path': resolved_context.path,
                'specificity': resolved_context.context_specificity,
                'has_local_context': metadata.has_local_context
            }

        index_file = self.project_metadata_dir / "context_lookup_index.json"
        with open(index_file, 'w', encoding='utf-8') as f:
            json.dump(lookup_index, f, indent=2, ensure_ascii=False)

        return {
            'context_nodes': len(self.context_tree),
            'path_entries': len(self.path_metadata),
            'hierarchy_file': str(hierarchy_file),
            'paths_file': str(paths_file),
            'index_file': str(index_file)
        }

    def generate_context_demo(self, demo_paths: List[str]) -> Dict[str, Any]:
        """Generate a demo showing how context cascades for specific paths"""
        demo_results = {}

        for path_str in demo_paths:
            file_path = Path(path_str)
            if not file_path.exists():
                continue

            resolved_context = self.resolve_context_for_path(file_path)
            path_metadata = self.path_metadata.get(str(file_path), {})

            demo_results[path_str] = {
                'ucxl_address': resolved_context.ucxl_address,
                'resolved_context': {
                    'summary': resolved_context.summary,
                    'purpose': resolved_context.purpose,
                    'technologies': resolved_context.technologies,
                    'tags': resolved_context.tags,
                    'context_source': resolved_context.path,
                    'specificity': resolved_context.context_specificity
                },
                'path_metadata': asdict(path_metadata) if isinstance(path_metadata, PathMetadata) else path_metadata,
                'inheritance_chain': self.get_inheritance_chain(file_path)
            }

        return demo_results

    def get_inheritance_chain(self, file_path: Path) -> List[str]:
        """Get the chain of context inheritance for a path"""
        chain = []
        current_path = file_path if file_path.is_dir() else file_path.parent

        # Walk up until we leave the project root
        while current_path == self.bzzz_path or self.bzzz_path in current_path.parents:
            if str(current_path) in self.context_tree:
                chain.append(str(current_path))
            current_path = current_path.parent

        return chain

    def process_repository(self) -> Dict[str, Any]:
        """Process the entire repository with cascading context system"""
        logger.info("šŸš€ Processing repository with cascading context system...")

        # Step 1: Create context hierarchy
        self.create_context_hierarchy()

        # Step 2: Create lightweight metadata for all paths
        paths_processed = 0
        for item in self.bzzz_path.rglob('*'):
            if not self.should_process_path(item):
                continue
            metadata = self.create_path_metadata(item)
            self.path_metadata[str(item)] = metadata
            paths_processed += 1

        logger.info(f"šŸ“Š Processed {paths_processed} paths with {len(self.context_tree)} context nodes")

        # Step 3: Save the system
        save_results = self.save_cascading_metadata()

        # Step 4: Estimate space savings versus one metadata file per path
        traditional_size = paths_processed * 2000  # Estimate 2KB per traditional metadata file
        actual_size = len(self.context_tree) * 2000 + paths_processed * 500  # Context nodes + lightweight metadata
        space_savings = ((traditional_size - actual_size) / traditional_size) * 100 if traditional_size else 0.0

        return {
            'paths_processed': paths_processed,
            'context_nodes': len(self.context_tree),
            'space_savings_percent': space_savings,
            'estimated_traditional_size_kb': traditional_size // 1024,
            'actual_size_kb': actual_size // 1024,
            **save_results
        }
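# Illustrative sketch (comments only) of consuming the emitted files after a run.
# The file names match save_cascading_metadata() above; the UCXL address is a
# hypothetical example:
#
#   meta_dir = Path(os.path.expanduser("~/chorus/project-metadata")) / "BZZZ"
#   index = json.loads((meta_dir / "context_lookup_index.json").read_text())
#   hierarchy = json.loads((meta_dir / "context_hierarchy.json").read_text())
#   entry = index["ucxl://any:any@BZZZ:RUSTLE-testing/src/main.rs"]
#   context = hierarchy[entry["context_path"]]   # cascaded ContextNode fields as a dict
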
help="Base directory for metadata storage") parser.add_argument("--demo", action="store_true", help="Run demonstration with sample paths") args = parser.parse_args() # Check if BZZZ path exists, create demo if not bzzz_path = Path(args.bzzz_path) if not bzzz_path.exists(): logger.warning(f"BZZZ repository not found at: {bzzz_path}") logger.info("Creating demo structure...") demo_path = Path("/tmp/demo-bzzz-cascading") demo_path.mkdir(exist_ok=True) # Create comprehensive demo structure directories = [ "src", "src/api", "src/core", "src/utils", "tests", "tests/unit", "tests/integration", "docs", "docs/api", "docs/user", "config", "config/dev", "config/prod", "assets", "assets/fonts", "assets/images", "scripts", "build" ] for dir_path in directories: (demo_path / dir_path).mkdir(parents=True, exist_ok=True) # Create demo files files = { "README.md": "# BZZZ Project\n\nDistributed contextual metadata system", "Cargo.toml": "[package]\nname = \"bzzz\"\nversion = \"0.1.0\"", "src/main.rs": "fn main() { println!(\"BZZZ!\"); }", "src/lib.rs": "//! BZZZ core library", "src/api/handlers.rs": "//! HTTP request handlers", "src/core/engine.rs": "//! Core processing engine", "src/utils/helpers.rs": "//! Utility functions", "tests/unit/core_tests.rs": "//! Unit tests for core", "tests/integration/api_tests.rs": "//! API integration tests", "docs/README.md": "# Documentation\n\nProject documentation", "docs/api/endpoints.md": "# API Endpoints", "config/settings.toml": "[server]\nport = 8080", "assets/fonts/README.md": "# Fonts\n\nTypography assets for UI", "scripts/build.sh": "#!/bin/bash\ncargo build --release" } for file_path, content in files.items(): full_path = demo_path / file_path full_path.write_text(content) bzzz_path = demo_path logger.info(f"Demo structure created at: {demo_path}") # Initialize the cascading system system = CascadingMetadataSystem( bzzz_path=str(bzzz_path), rag_endpoint="http://localhost:8000/query", # Not used in this version metadata_base=args.metadata_base ) # Process the repository results = system.process_repository() logger.info("āœ… Cascading metadata system complete!") logger.info(f"šŸ“Š Results:") logger.info(f" šŸ“ Paths processed: {results['paths_processed']}") logger.info(f" šŸ—ļø Context nodes: {results['context_nodes']}") logger.info(f" šŸ’¾ Space savings: {results['space_savings_percent']:.1f}%") logger.info(f" šŸ“ Traditional size: {results['estimated_traditional_size_kb']} KB") logger.info(f" šŸŽÆ Actual size: {results['actual_size_kb']} KB") logger.info(f" šŸ“‚ Files saved:") logger.info(f" šŸ—ļø Hierarchy: {results['hierarchy_file']}") logger.info(f" šŸ“„ Paths: {results['paths_file']}") logger.info(f" šŸ” Index: {results['index_file']}") # Run demo if requested if args.demo: logger.info("\nšŸŽ¬ Running context resolution demo...") demo_paths = [ str(bzzz_path / "src" / "main.rs"), str(bzzz_path / "src" / "api" / "handlers.rs"), str(bzzz_path / "tests" / "unit" / "core_tests.rs"), str(bzzz_path / "assets" / "fonts" / "README.md"), str(bzzz_path / "config" / "settings.toml") ] demo_results = system.generate_context_demo(demo_paths) for path, info in demo_results.items(): logger.info(f"\nšŸ“„ {path}:") logger.info(f" šŸ”— UCXL: {info['ucxl_address']}") logger.info(f" šŸ“ Summary: {info['resolved_context']['summary']}") logger.info(f" šŸŽÆ Purpose: {info['resolved_context']['purpose']}") logger.info(f" šŸ·ļø Tags: {', '.join(info['resolved_context']['tags'][:5])}") logger.info(f" šŸ“Š Context from: 
            logger.info(f"  šŸ“Š Context from: {Path(info['resolved_context']['context_source']).name}")
            logger.info(f"  šŸ”— Inheritance: {' → '.join([Path(p).name for p in info['inheritance_chain']])}")


if __name__ == "__main__":
    main()