package intelligence

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
)

// DefaultFileAnalyzer provides comprehensive file analysis capabilities
type DefaultFileAnalyzer struct {
	config            *EngineConfig
	languageDetector  *LanguageDetector
	structureAnalyzer *CodeStructureAnalyzer
	metadataExtractor *MetadataExtractor
}

// LanguageDetector detects programming languages from file content and extensions
type LanguageDetector struct {
	extensionMap    map[string]string
	signatureRegexs map[string][]*regexp.Regexp
}

// CodeStructureAnalyzer analyzes code structure and patterns
type CodeStructureAnalyzer struct {
	languagePatterns map[string]*LanguagePatterns
}

// LanguagePatterns contains regex patterns for different language constructs
type LanguagePatterns struct {
	Functions []*regexp.Regexp
	Classes   []*regexp.Regexp
	Variables []*regexp.Regexp
	Imports   []*regexp.Regexp
	Comments  []*regexp.Regexp
	TODOs     []*regexp.Regexp
}

// MetadataExtractor extracts file system metadata
type MetadataExtractor struct {
	mimeTypes map[string]string
}

// NewDefaultFileAnalyzer creates a new file analyzer with comprehensive language support
func NewDefaultFileAnalyzer(config *EngineConfig) *DefaultFileAnalyzer {
	return &DefaultFileAnalyzer{
		config:            config,
		languageDetector:  NewLanguageDetector(),
		structureAnalyzer: NewCodeStructureAnalyzer(),
		metadataExtractor: NewMetadataExtractor(),
	}
}

// NewLanguageDetector creates a language detector with extensive language support
func NewLanguageDetector() *LanguageDetector {
	detector := &LanguageDetector{
		extensionMap:    make(map[string]string),
		signatureRegexs: make(map[string][]*regexp.Regexp),
	}

	// Map file extensions to languages
	extensions := map[string]string{
		".go": "go", ".py": "python", ".js": "javascript", ".jsx": "javascript",
		".ts": "typescript", ".tsx": "typescript", ".java": "java", ".c": "c",
		".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp", ".h": "c", ".hpp": "cpp",
		".cs": "csharp", ".php": "php", ".rb": "ruby", ".rs": "rust", ".kt": "kotlin",
		".swift": "swift", ".m": "objective-c", ".mm": "objective-c", ".scala": "scala",
		".clj": "clojure", ".hs": "haskell", ".ex": "elixir", ".exs": "elixir",
		".erl": "erlang", ".lua": "lua", ".pl": "perl", ".r": "r",
		".sh": "shell", ".bash": "shell", ".zsh": "shell", ".fish": "shell",
		".sql": "sql", ".html": "html", ".htm": "html", ".css": "css",
		".scss": "scss", ".sass": "sass", ".less": "less", ".xml": "xml",
		".json": "json", ".yaml": "yaml", ".yml": "yaml", ".toml": "toml",
		".ini": "ini", ".cfg": "ini", ".conf": "config", ".md": "markdown",
		".rst": "rst", ".tex": "latex", ".proto": "protobuf", ".tf": "terraform",
		".hcl": "hcl", ".dockerfile": "dockerfile", ".dockerignore": "dockerignore",
		".gitignore": "gitignore", ".vim": "vim", ".emacs": "emacs",
	}
	for ext, lang := range extensions {
		detector.extensionMap[ext] = lang
	}

	// Language signature patterns
	signatures := map[string][]string{
		"go": {
			`^package\s+\w+`,
			`^import\s*\(`,
			`func\s+\w+\s*\(`,
		},
		"python": {
			`^#!/usr/bin/env python`,
			`^#!/usr/bin/python`,
			`^import\s+\w+`,
			`^from\s+\w+\s+import`,
			`^def\s+\w+\s*\(`,
			`^class\s+\w+`,
		},
		"javascript": {
			`^#!/usr/bin/env node`,
			`function\s+\w+\s*\(`,
			`const\s+\w+\s*=`,
			`let\s+\w+\s*=`,
			`var\s+\w+\s*=`,
			`require\s*\(`,
			`import\s+.*from`,
		},
		"typescript": {
			`interface\s+\w+`,
			`type\s+\w+\s*=`,
			`class\s+\w+`,
			`import\s+.*from.*\.ts`,
		},
		"java": {
			`^package\s+[\w\.]+;`,
			`^import\s+[\w\.]+;`,
			`public\s+class\s+\w+`,
			`public\s+static\s+void\s+main`,
		},
		"rust": {
			`^use\s+\w+`,
			`fn\s+\w+\s*\(`,
			`struct\s+\w+`,
			`impl\s+\w+`,
			`extern\s+crate`,
		},
		"cpp": {
			`^#include\s*<.*>`,
			`^#include\s*".*"`,
			`using\s+namespace`,
			`class\s+\w+`,
			`template\s*<`,
		},
	}
	for lang, patterns := range signatures {
		regexes := make([]*regexp.Regexp, len(patterns))
		for i, pattern := range patterns {
			regexes[i] = regexp.MustCompile(pattern)
		}
		detector.signatureRegexs[lang] = regexes
	}

	return detector
}

// NewCodeStructureAnalyzer creates a code structure analyzer
func NewCodeStructureAnalyzer() *CodeStructureAnalyzer {
	analyzer := &CodeStructureAnalyzer{
		languagePatterns: make(map[string]*LanguagePatterns),
	}

	// Define patterns for different languages. Go's regexp package (RE2) has no
	// lookahead, so block comments and docstrings are matched with non-greedy,
	// dot-all patterns.
	patterns := map[string]*LanguagePatterns{
		"go": {
			Functions: []*regexp.Regexp{
				regexp.MustCompile(`func\s+(\w+)\s*\(`),
				regexp.MustCompile(`func\s+\(\w+\s+\*?\w+\)\s+(\w+)\s*\(`),
			},
			Classes: []*regexp.Regexp{
				regexp.MustCompile(`type\s+(\w+)\s+struct`),
				regexp.MustCompile(`type\s+(\w+)\s+interface`),
			},
			Variables: []*regexp.Regexp{
				regexp.MustCompile(`var\s+(\w+)`),
				regexp.MustCompile(`(\w+)\s*:=`),
			},
			Imports: []*regexp.Regexp{
				regexp.MustCompile(`import\s+"([^"]+)"`),
				regexp.MustCompile(`import\s+\w+\s+"([^"]+)"`),
			},
			Comments: []*regexp.Regexp{
				regexp.MustCompile(`//\s*(.*)`),
				regexp.MustCompile(`(?s)/\*.*?\*/`),
			},
			TODOs: []*regexp.Regexp{
				regexp.MustCompile(`//\s*TODO:?\s*(.*)`),
				regexp.MustCompile(`//\s*FIXME:?\s*(.*)`),
				regexp.MustCompile(`//\s*HACK:?\s*(.*)`),
			},
		},
		"python": {
			Functions: []*regexp.Regexp{
				regexp.MustCompile(`def\s+(\w+)\s*\(`),
				regexp.MustCompile(`async\s+def\s+(\w+)\s*\(`),
			},
			Classes: []*regexp.Regexp{
				regexp.MustCompile(`class\s+(\w+)\s*[\(:]`),
			},
			Variables: []*regexp.Regexp{
				regexp.MustCompile(`(\w+)\s*=`),
			},
			Imports: []*regexp.Regexp{
				regexp.MustCompile(`import\s+(\w+)`),
				regexp.MustCompile(`from\s+(\w+)\s+import`),
			},
			Comments: []*regexp.Regexp{
				regexp.MustCompile(`#\s*(.*)`),
				regexp.MustCompile(`(?s)""".*?"""`),
				regexp.MustCompile(`(?s)'''.*?'''`),
			},
			TODOs: []*regexp.Regexp{
				regexp.MustCompile(`#\s*TODO:?\s*(.*)`),
				regexp.MustCompile(`#\s*FIXME:?\s*(.*)`),
			},
		},
		"javascript": {
			Functions: []*regexp.Regexp{
				regexp.MustCompile(`function\s+(\w+)\s*\(`),
				regexp.MustCompile(`(\w+)\s*:\s*function\s*\(`),
				regexp.MustCompile(`const\s+(\w+)\s*=\s*\([^)]*\)\s*=>`),
				regexp.MustCompile(`(\w+)\s*=\s*\([^)]*\)\s*=>`),
			},
			Classes: []*regexp.Regexp{
				regexp.MustCompile(`class\s+(\w+)`),
			},
			Variables: []*regexp.Regexp{
				regexp.MustCompile(`var\s+(\w+)`),
				regexp.MustCompile(`let\s+(\w+)`),
				regexp.MustCompile(`const\s+(\w+)`),
			},
			Imports: []*regexp.Regexp{
				regexp.MustCompile(`import\s+.*from\s+['"]([^'"]+)['"]`),
				regexp.MustCompile(`require\s*\(\s*['"]([^'"]+)['"]`),
			},
			Comments: []*regexp.Regexp{
				regexp.MustCompile(`//\s*(.*)`),
				regexp.MustCompile(`(?s)/\*.*?\*/`),
			},
			TODOs: []*regexp.Regexp{
				regexp.MustCompile(`//\s*TODO:?\s*(.*)`),
				regexp.MustCompile(`//\s*FIXME:?\s*(.*)`),
			},
		},
		"java": {
			Functions: []*regexp.Regexp{
				regexp.MustCompile(`(?:public|private|protected|static|\s)*\w+\s+(\w+)\s*\(`),
			},
			Classes: []*regexp.Regexp{
				regexp.MustCompile(`(?:public|private|protected|\s)*class\s+(\w+)`),
				regexp.MustCompile(`(?:public|private|protected|\s)*interface\s+(\w+)`),
			},
			Variables: []*regexp.Regexp{
				regexp.MustCompile(`(?:public|private|protected|static|final|\s)*\w+\s+(\w+)\s*[=;]`),
			},
			Imports: []*regexp.Regexp{
				regexp.MustCompile(`import\s+([\w\.]+);`),
			},
			Comments: []*regexp.Regexp{
				regexp.MustCompile(`//\s*(.*)`),
				regexp.MustCompile(`(?s)/\*.*?\*/`),
			},
			TODOs: []*regexp.Regexp{
				regexp.MustCompile(`//\s*TODO:?\s*(.*)`),
				regexp.MustCompile(`//\s*FIXME:?\s*(.*)`),
			},
		},
	}
	for lang, pattern := range patterns {
		analyzer.languagePatterns[lang] = pattern
	}

	return analyzer
}

// NewMetadataExtractor creates a metadata extractor
func NewMetadataExtractor() *MetadataExtractor {
	return &MetadataExtractor{
		mimeTypes: map[string]string{
			".txt": "text/plain", ".md": "text/markdown", ".json": "application/json",
			".xml": "application/xml", ".html": "text/html", ".css": "text/css",
			".js": "application/javascript", ".pdf": "application/pdf",
			".png": "image/png", ".jpg": "image/jpeg", ".gif": "image/gif",
		},
	}
}

// AnalyzeContent performs comprehensive analysis of file content
func (fa *DefaultFileAnalyzer) AnalyzeContent(ctx context.Context, filePath string, content []byte) (*FileAnalysis, error) {
	analysis := &FileAnalysis{
		FilePath:     filePath,
		Size:         int64(len(content)),
		LineCount:    countLines(content),
		Dependencies: []string{},
		Exports:      []string{},
		Imports:      []string{},
		Functions:    []string{},
		Classes:      []string{},
		Variables:    []string{},
		Comments:     []string{},
		TODOs:        []string{},
		Metadata:     make(map[string]interface{}),
		AnalyzedAt:   time.Now(),
	}

	// Detect language
	language, confidence, err := fa.DetectLanguage(ctx, filePath, content)
	if err != nil {
		language = "unknown"
		confidence = 0.0
	}
	analysis.Language = language
	analysis.LanguageConf = confidence

	// Extract metadata
	metadata, err := fa.ExtractMetadata(ctx, filePath)
	if err == nil {
		analysis.FileType = metadata.Extension
		analysis.Metadata["mime_type"] = metadata.MimeType
		analysis.Metadata["permissions"] = metadata.Permissions
		analysis.Metadata["mod_time"] = metadata.ModTime
	}

	// Analyze structure if it's a known programming language
	if patterns, exists := fa.structureAnalyzer.languagePatterns[language]; exists {
		fa.analyzeCodeStructure(analysis, content, patterns)
	}

	// Calculate complexity
	analysis.Complexity = fa.calculateComplexity(analysis)

	return analysis, nil
}

// DetectLanguage detects programming language from content and file extension
func (fa *DefaultFileAnalyzer) DetectLanguage(ctx context.Context, filePath string, content []byte) (string, float64, error) {
	ext := strings.ToLower(filepath.Ext(filePath))

	// First try extension-based detection
	if lang, exists := fa.languageDetector.extensionMap[ext]; exists {
		confidence := 0.8 // High confidence for extension-based detection

		// Verify with content signatures
		if signatures, hasSignatures := fa.languageDetector.signatureRegexs[lang]; hasSignatures {
			matches := 0
			for _, regex := range signatures {
				if regex.Match(content) {
					matches++
				}
			}
			// Adjust confidence based on signature matches
			if matches > 0 {
				confidence = 0.9 + float64(matches)/float64(len(signatures))*0.1
			} else {
				confidence = 0.6 // Lower confidence if no signatures match
			}
		}
		return lang, confidence, nil
	}

	// Fall back to content-based detection
	bestLang := "unknown"
	bestScore := 0
	for lang, signatures := range fa.languageDetector.signatureRegexs {
		score := 0
		for _, regex := range signatures {
			if regex.Match(content) {
				score++
			}
		}
		if score > bestScore {
			bestScore = score
			bestLang = lang
		}
	}

	confidence := float64(bestScore) / 5.0 // Normalize to 0-1 range
	if confidence > 1.0 {
		confidence = 1.0
	}

	return bestLang, confidence, nil
}
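// exampleDetectLanguageConfidence is an illustrative sketch, not part of the
// analyzer's API: it shows how DetectLanguage combines extension- and
// signature-based evidence. The file name and content are hypothetical.
func exampleDetectLanguageConfidence(ctx context.Context, fa *DefaultFileAnalyzer) {
	content := []byte("package main\n\nfunc main() {}\n")
	lang, conf, err := fa.DetectLanguage(ctx, "main.go", content)
	if err != nil {
		fmt.Println("detection failed:", err)
		return
	}
	// With the ".go" extension and at least one matching Go signature, the
	// confidence lands in the 0.9-1.0 range; with no signature matches it drops
	// to 0.6, and unknown extensions fall back to pure signature scoring.
	fmt.Printf("language=%s confidence=%.2f\n", lang, conf)
}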
// ExtractMetadata extracts file system metadata
func (fa *DefaultFileAnalyzer) ExtractMetadata(ctx context.Context, filePath string) (*FileMetadata, error) {
	info, err := os.Stat(filePath)
	if err != nil {
		return nil, fmt.Errorf("failed to get file info: %w", err)
	}

	ext := filepath.Ext(filePath)
	mimeType := fa.metadataExtractor.mimeTypes[strings.ToLower(ext)]
	if mimeType == "" {
		mimeType = "application/octet-stream"
	}

	metadata := &FileMetadata{
		Path:        filePath,
		Size:        info.Size(),
		ModTime:     info.ModTime(),
		Mode:        uint32(info.Mode()),
		IsDir:       info.IsDir(),
		Extension:   ext,
		MimeType:    mimeType,
		Permissions: info.Mode().String(),
	}
	return metadata, nil
}

// AnalyzeStructure analyzes code structure and organization
func (fa *DefaultFileAnalyzer) AnalyzeStructure(ctx context.Context, filePath string, content []byte) (*StructureAnalysis, error) {
	analysis := &StructureAnalysis{
		Architecture:   "unknown",
		Patterns:       []string{},
		Components:     []*Component{},
		Relationships:  []*Relationship{},
		Complexity:     &ComplexityMetrics{},
		QualityMetrics: &QualityMetrics{},
		TestCoverage:   0.0,
		Documentation:  &DocMetrics{},
		AnalyzedAt:     time.Now(),
	}

	// Detect language
	language, _, err := fa.DetectLanguage(ctx, filePath, content)
	if err != nil {
		return analysis, fmt.Errorf("failed to detect language: %w", err)
	}

	// Analyze based on language patterns
	if patterns, exists := fa.structureAnalyzer.languagePatterns[language]; exists {
		fa.analyzeArchitecturalPatterns(analysis, content, patterns, language)
	}

	return analysis, nil
}

// IdentifyPurpose identifies the primary purpose of the file
func (fa *DefaultFileAnalyzer) IdentifyPurpose(ctx context.Context, analysis *FileAnalysis) (string, float64, error) {
	purpose := "General purpose file"
	confidence := 0.5

	// Purpose identification based on file patterns
	filename := filepath.Base(analysis.FilePath)
	filenameUpper := strings.ToUpper(filename)

	// Configuration files
	if strings.Contains(filenameUpper, "CONFIG") ||
		strings.Contains(filenameUpper, "CONF") ||
		analysis.FileType == ".ini" ||
		analysis.FileType == ".toml" {
		purpose = "Configuration management"
		confidence = 0.9
		return purpose, confidence, nil
	}

	// Test files
	if strings.Contains(filenameUpper, "TEST") ||
		strings.Contains(filenameUpper, "SPEC") ||
		strings.HasSuffix(filenameUpper, "_TEST.GO") ||
		strings.HasSuffix(filenameUpper, "_TEST.PY") {
		purpose = "Testing and quality assurance"
		confidence = 0.9
		return purpose, confidence, nil
	}

	// Documentation files
	if analysis.FileType == ".md" || analysis.FileType == ".rst" ||
		strings.Contains(filenameUpper, "README") ||
		strings.Contains(filenameUpper, "DOC") {
		purpose = "Documentation and guidance"
		confidence = 0.9
		return purpose, confidence, nil
	}

	// API files
	if strings.Contains(filenameUpper, "API") ||
		strings.Contains(filenameUpper, "ROUTER") ||
		strings.Contains(filenameUpper, "HANDLER") {
		purpose = "API endpoint management"
		confidence = 0.8
		return purpose, confidence, nil
	}

	// Database files
	if strings.Contains(filenameUpper, "DB") ||
		strings.Contains(filenameUpper, "DATABASE") ||
		strings.Contains(filenameUpper, "MODEL") ||
		strings.Contains(filenameUpper, "SCHEMA") {
		purpose = "Data storage and management"
		confidence = 0.8
		return purpose, confidence, nil
	}

	// UI/Frontend files
	if analysis.Language == "javascript" || analysis.Language == "typescript" ||
		strings.Contains(filenameUpper, "COMPONENT") ||
		strings.Contains(filenameUpper, "VIEW") ||
		strings.Contains(filenameUpper, "UI") {
		purpose = "User interface component"
		confidence = 0.7
		return purpose, confidence, nil
	}

	// Service/Business logic
	if strings.Contains(filenameUpper, "SERVICE") ||
		strings.Contains(filenameUpper, "BUSINESS") ||
		strings.Contains(filenameUpper, "LOGIC") {
		purpose = "Business logic implementation"
		confidence = 0.7
		return purpose, confidence, nil
	}

	// Utility files
	if strings.Contains(filenameUpper, "UTIL") ||
		strings.Contains(filenameUpper, "HELPER") ||
		strings.Contains(filenameUpper, "COMMON") {
		purpose = "Utility and helper functions"
		confidence = 0.7
		return purpose, confidence, nil
	}

	// Based on functions and structure
	if len(analysis.Functions) > 5 {
		purpose = "Multi-function module"
		confidence = 0.6
	} else if len(analysis.Classes) > 0 {
		purpose = "Class-based component"
		confidence = 0.6
	} else if len(analysis.Functions) > 0 {
		purpose = "Functional implementation"
		confidence = 0.6
	}

	return purpose, confidence, nil
}

// GenerateSummary generates a concise summary of file content
func (fa *DefaultFileAnalyzer) GenerateSummary(ctx context.Context, analysis *FileAnalysis) (string, error) {
	summary := strings.Builder{}

	// Language and type
	if analysis.Language != "unknown" {
		summary.WriteString(strings.Title(analysis.Language))
	} else {
		summary.WriteString("File")
	}

	// Size information
	if analysis.Size > 0 {
		summary.WriteString(fmt.Sprintf(" (%s)", formatFileSize(analysis.Size)))
	}

	// Content summary
	if len(analysis.Functions) > 0 {
		summary.WriteString(fmt.Sprintf(" with %d function(s)", len(analysis.Functions)))
	}
	if len(analysis.Classes) > 0 {
		summary.WriteString(fmt.Sprintf(" and %d class(es)", len(analysis.Classes)))
	}
	if len(analysis.Dependencies) > 0 {
		summary.WriteString(fmt.Sprintf(", imports %d dependencies", len(analysis.Dependencies)))
	}

	// Complexity note
	if analysis.Complexity > 10 {
		summary.WriteString(" (high complexity)")
	} else if analysis.Complexity > 5 {
		summary.WriteString(" (medium complexity)")
	}

	return summary.String(), nil
}

// ExtractTechnologies identifies technologies used in the file
func (fa *DefaultFileAnalyzer) ExtractTechnologies(ctx context.Context, analysis *FileAnalysis) ([]string, error) {
	technologies := []string{}

	// Add primary language
	if analysis.Language != "unknown" && analysis.Language != "" {
		technologies = append(technologies, analysis.Language)
	}

	// Extract from imports/dependencies
	for _, dep := range analysis.Imports {
		if tech := fa.mapImportToTechnology(dep, analysis.Language); tech != "" {
			technologies = append(technologies, tech)
		}
	}

	// Extract from file patterns
	filename := strings.ToLower(filepath.Base(analysis.FilePath))

	// Framework detection
	frameworks := map[string]string{
		"react": "React", "vue": "Vue.js", "angular": "Angular",
		"express": "Express.js", "django": "Django", "flask": "Flask",
		"spring": "Spring", "gin": "Gin", "echo": "Echo", "fastapi": "FastAPI",
		"bootstrap": "Bootstrap", "tailwind": "Tailwind CSS",
		"material": "Material UI", "antd": "Ant Design",
	}
	for pattern, tech := range frameworks {
		if strings.Contains(filename, pattern) {
			technologies = append(technologies, tech)
		}
	}

	// Database detection from file content or names
	if strings.Contains(filename, "sql") || strings.Contains(filename, "db") {
		technologies = append(technologies, "SQL")
	}
	if strings.Contains(filename, "mongo") {
		technologies = append(technologies, "MongoDB")
	}
	if strings.Contains(filename, "redis") {
		technologies = append(technologies, "Redis")
	}

	// Remove duplicates
	seen := make(map[string]bool)
	uniqueTech := []string{}
	for _, tech := range technologies {
		if !seen[tech] {
			seen[tech] = true
			uniqueTech = append(uniqueTech, tech)
		}
	}

	return uniqueTech, nil
}

// Helper methods

func countLines(content []byte) int {
	return bytes.Count(content, []byte("\n")) + 1
}
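// examplePurposeAndTech is an illustrative sketch, not part of the analyzer's
// API: it shows how the filename-driven heuristics in IdentifyPurpose and the
// import mapping in ExtractTechnologies combine. The path and field values are
// hypothetical; only FilePath, Language, FileType and Imports are consulted.
func examplePurposeAndTech(ctx context.Context, fa *DefaultFileAnalyzer) {
	analysis := &FileAnalysis{
		FilePath: "internal/user_service.go",
		Language: "go",
		FileType: ".go",
		Imports:  []string{"gorm.io/gorm", "github.com/gin-gonic/gin"},
	}
	purpose, conf, _ := fa.IdentifyPurpose(ctx, analysis)
	tech, _ := fa.ExtractTechnologies(ctx, analysis)
	// "user_service.go" matches the SERVICE heuristic, giving
	// "Business logic implementation" with confidence 0.7; the imports map to
	// GORM and Gin via mapImportToTechnology, alongside the "go" language tag.
	fmt.Println(purpose, conf, tech)
}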
func formatFileSize(size int64) string {
	const unit = 1024
	if size < unit {
		return fmt.Sprintf("%d B", size)
	}
	div, exp := int64(unit), 0
	for n := size / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", float64(size)/float64(div), "KMGTPE"[exp])
}

func (fa *DefaultFileAnalyzer) analyzeCodeStructure(analysis *FileAnalysis, content []byte, patterns *LanguagePatterns) {
	contentStr := string(content)

	// Extract functions
	for _, regex := range patterns.Functions {
		matches := regex.FindAllStringSubmatch(contentStr, -1)
		for _, match := range matches {
			if len(match) > 1 {
				analysis.Functions = append(analysis.Functions, match[1])
			}
		}
	}

	// Extract classes
	for _, regex := range patterns.Classes {
		matches := regex.FindAllStringSubmatch(contentStr, -1)
		for _, match := range matches {
			if len(match) > 1 {
				analysis.Classes = append(analysis.Classes, match[1])
			}
		}
	}

	// Extract variables
	for _, regex := range patterns.Variables {
		matches := regex.FindAllStringSubmatch(contentStr, -1)
		for _, match := range matches {
			if len(match) > 1 {
				analysis.Variables = append(analysis.Variables, match[1])
			}
		}
	}

	// Extract imports
	for _, regex := range patterns.Imports {
		matches := regex.FindAllStringSubmatch(contentStr, -1)
		for _, match := range matches {
			if len(match) > 1 {
				analysis.Imports = append(analysis.Imports, match[1])
				analysis.Dependencies = append(analysis.Dependencies, match[1])
			}
		}
	}

	// Extract comments
	for _, regex := range patterns.Comments {
		matches := regex.FindAllString(contentStr, -1)
		for _, match := range matches {
			if len(strings.TrimSpace(match)) > 2 {
				analysis.Comments = append(analysis.Comments, strings.TrimSpace(match))
			}
		}
	}

	// Extract TODOs
	for _, regex := range patterns.TODOs {
		matches := regex.FindAllStringSubmatch(contentStr, -1)
		for _, match := range matches {
			if len(match) > 1 {
				analysis.TODOs = append(analysis.TODOs, strings.TrimSpace(match[1]))
			}
		}
	}
}

func (fa *DefaultFileAnalyzer) calculateComplexity(analysis *FileAnalysis) float64 {
	complexity := 0.0

	// Base complexity from structure
	complexity += float64(len(analysis.Functions)) * 1.5
	complexity += float64(len(analysis.Classes)) * 2.0
	complexity += float64(len(analysis.Variables)) * 0.5
	complexity += float64(len(analysis.Dependencies)) * 1.0

	// Line count factor
	if analysis.LineCount > 500 {
		complexity += 5.0
	} else if analysis.LineCount > 200 {
		complexity += 2.0
	} else if analysis.LineCount > 100 {
		complexity += 1.0
	}

	return complexity
}

func (fa *DefaultFileAnalyzer) analyzeArchitecturalPatterns(analysis *StructureAnalysis, content []byte, patterns *LanguagePatterns, language string) {
	contentStr := string(content)

	// Detect common architectural patterns
	if strings.Contains(contentStr, "interface") && language == "go" {
		analysis.Patterns = append(analysis.Patterns, "Interface Segregation")
	}
	if strings.Contains(contentStr, "Factory") {
		analysis.Patterns = append(analysis.Patterns, "Factory Pattern")
	}
	if strings.Contains(contentStr, "Singleton") {
		analysis.Patterns = append(analysis.Patterns, "Singleton Pattern")
	}
	if strings.Contains(contentStr, "Observer") {
		analysis.Patterns = append(analysis.Patterns, "Observer Pattern")
	}

	// Architectural style detection. The functional/object-oriented fallback is
	// decided by what the language patterns actually match in the content, not
	// by how many patterns happen to be defined for the language.
	functionCount := 0
	for _, regex := range patterns.Functions {
		functionCount += len(regex.FindAllString(contentStr, -1))
	}
	classCount := 0
	for _, regex := range patterns.Classes {
		classCount += len(regex.FindAllString(contentStr, -1))
	}

	if strings.Contains(contentStr, "http.") || strings.Contains(contentStr, "router") {
		analysis.Architecture = "REST API"
	} else if strings.Contains(contentStr, "graphql") {
		analysis.Architecture = "GraphQL"
	} else if strings.Contains(contentStr, "grpc") || strings.Contains(contentStr, "proto") {
		analysis.Architecture = "gRPC"
	} else if functionCount > 0 && classCount == 0 {
		analysis.Architecture = "Functional"
	} else if classCount > 0 {
		analysis.Architecture = "Object-Oriented"
	}
}
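// exampleComplexityScore is an illustrative sketch, not part of the analyzer's
// API: it shows how calculateComplexity weights structure counts. The field
// values are hypothetical, and FileAnalysis field types are assumed from how
// AnalyzeContent populates them.
func exampleComplexityScore(fa *DefaultFileAnalyzer) float64 {
	analysis := &FileAnalysis{
		Functions:    []string{"Open", "Close", "Read", "Write"},
		Classes:      []string{"Buffer"},
		Variables:    make([]string, 10),
		Dependencies: make([]string, 3),
		LineCount:    250,
	}
	// 4*1.5 + 1*2.0 + 10*0.5 + 3*1.0 = 16.0, plus 2.0 for exceeding 200 lines.
	return fa.calculateComplexity(analysis) // 18.0 with the weights above
}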
func (fa *DefaultFileAnalyzer) mapImportToTechnology(importPath, language string) string {
	// Technology mapping based on common imports
	techMap := map[string]string{
		// Go
		"gin-gonic/gin": "Gin", "labstack/echo": "Echo", "gorilla/mux": "Gorilla Mux",
		"gorm.io/gorm": "GORM", "github.com/redis": "Redis", "go.mongodb.org": "MongoDB",
		// Python
		"django": "Django", "flask": "Flask", "fastapi": "FastAPI",
		"requests": "HTTP Client", "sqlalchemy": "SQLAlchemy", "pandas": "Pandas",
		"numpy": "NumPy", "tensorflow": "TensorFlow", "torch": "PyTorch",
		// JavaScript/TypeScript
		"react": "React", "vue": "Vue.js", "angular": "Angular",
		"express": "Express.js", "axios": "Axios", "lodash": "Lodash",
		"moment": "Moment.js", "socket.io": "Socket.IO",
	}

	for pattern, tech := range techMap {
		if strings.Contains(strings.ToLower(importPath), pattern) {
			return tech
		}
	}

	return ""
}
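// exampleAnalyzeFile is an illustrative end-to-end sketch, not part of the
// analyzer's API: read a file, analyze it, and print the generated summary.
// It assumes a nil EngineConfig is tolerated here (a real caller would pass a
// populated config) and that the path is readable.
func exampleAnalyzeFile(ctx context.Context, path string) error {
	fa := NewDefaultFileAnalyzer(nil)

	content, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read %s: %w", path, err)
	}

	analysis, err := fa.AnalyzeContent(ctx, path, content)
	if err != nil {
		return fmt.Errorf("analyze %s: %w", path, err)
	}

	summary, err := fa.GenerateSummary(ctx, analysis)
	if err != nil {
		return err
	}

	fmt.Printf("%s: %s (complexity %.1f)\n", path, summary, analysis.Complexity)
	return nil
}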