#!/usr/bin/env python3 """ Codebase Analyzer - Phase 1: Tech stack detection and metrics calculation This module provides comprehensive codebase analysis: - Technology stack detection (Python, JavaScript, Go, Rust, Java, etc.) - File organization analysis (src/, tests/, docs/) - Code metrics (LOC, file counts, language distribution) - Testing framework detection - CI/CD configuration detection - Documentation detection Features: - Multi-language project support - Extensible tech stack detection - Detailed metrics and reporting - Empty project handling - Security: Path validation and audit logging Usage: from codebase_analyzer import CodebaseAnalyzer, TechStack analyzer = CodebaseAnalyzer(project_root="/path/to/project") report = analyzer.analyze() print(f"Primary language: {report.primary_language}") print(f"Tech stacks: {report.tech_stacks}") print(f"Total lines: {report.total_lines}") Date: 2025-11-11 Feature: /align-project-retrofit command (Phase 1) Agent: implementer Design Patterns: See library-design-patterns skill for standardized design patterns. """ import sys from collections import defaultdict from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional, Set # Import with fallback for both dev (plugins/) and installed (.claude/lib/) environments try: # Development environment sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) from plugins.autonomous_dev.lib import security_utils except ImportError: # Installed environment (.claude/lib/) import security_utils class TechStack(Enum): """ See error-handling-patterns skill for exception hierarchy and error handling best practices. Supported technology stacks.""" PYTHON = "python" JAVASCRIPT = "javascript" TYPESCRIPT = "typescript" GO = "go" RUST = "rust" JAVA = "java" RUBY = "ruby" PHP = "php" CSHARP = "csharp" CPP = "cpp" UNKNOWN = "unknown" # Tech stack detection patterns TECH_STACK_INDICATORS = { TechStack.PYTHON: { "files": ["requirements.txt", "setup.py", "pyproject.toml", "Pipfile", "setup.cfg", "tox.ini"], "extensions": [".py"], "dirs": ["__pycache__", ".venv", "venv"], }, TechStack.JAVASCRIPT: { "files": ["package.json", "package-lock.json", "yarn.lock", ".npmrc"], "extensions": [".js", ".jsx", ".mjs"], "dirs": ["node_modules"], }, TechStack.TYPESCRIPT: { "files": ["tsconfig.json"], "extensions": [".ts", ".tsx"], "dirs": ["node_modules"], }, TechStack.GO: { "files": ["go.mod", "go.sum"], "extensions": [".go"], "dirs": ["vendor"], }, TechStack.RUST: { "files": ["Cargo.toml", "Cargo.lock"], "extensions": [".rs"], "dirs": ["target"], }, TechStack.JAVA: { "files": ["pom.xml", "build.gradle", "build.gradle.kts"], "extensions": [".java"], "dirs": ["target", "build"], }, TechStack.RUBY: { "files": ["Gemfile", "Gemfile.lock", ".ruby-version"], "extensions": [".rb"], "dirs": [], }, TechStack.PHP: { "files": ["composer.json", "composer.lock"], "extensions": [".php"], "dirs": ["vendor"], }, } # Testing framework detection TESTING_FRAMEWORKS = { "pytest": ["pytest.ini", "pyproject.toml", "setup.cfg"], "unittest": ["test_*.py", "*_test.py"], "jest": ["jest.config.js", "jest.config.ts"], "mocha": ["mocha.opts", ".mocharc.js"], "go test": ["*_test.go"], "cargo test": ["Cargo.toml"], "junit": ["pom.xml", "build.gradle"], "rspec": ["spec/spec_helper.rb", ".rspec"], "phpunit": ["phpunit.xml", "phpunit.xml.dist"], } # CI/CD detection CI_CD_INDICATORS = { "github_actions": [".github/workflows"], "gitlab_ci": [".gitlab-ci.yml"], "travis": [".travis.yml"], "circle_ci": [".circleci/config.yml"], "jenkins": ["Jenkinsfile"], "azure_pipelines": ["azure-pipelines.yml"], } # Standard directory patterns STANDARD_DIRECTORIES = { "source": ["src", "lib", "app", "pkg"], "tests": ["tests", "test", "__tests__", "spec"], "docs": ["docs", "doc", "documentation"], "config": ["config", "conf", "cfg"], "scripts": ["scripts", "bin"], "build": ["build", "dist", "target", "out"], } # Files to skip SKIP_PATTERNS = { ".git", ".hg", ".svn", "__pycache__", "node_modules", ".venv", "venv", ".pytest_cache", ".mypy_cache", ".tox", "dist", "build", "*.egg-info", ".DS_Store", "Thumbs.db", } @dataclass class AnalysisReport: """Comprehensive codebase analysis report. Attributes: project_root: Path to analyzed project tech_stacks: Detected technology stacks primary_language: Primary programming language detected_files: Key files detected (config, manifest, etc.) testing_frameworks: Detected testing frameworks ci_cd_providers: Detected CI/CD providers has_ci_cd: Whether CI/CD is configured has_tests: Whether project has test files directory_structure: Directory organization analysis has_source_directory: Whether project has dedicated source directory has_test_directory: Whether project has dedicated test directory has_docs_directory: Whether project has documentation directory structure_type: Structure type (organized, flat, monorepo, etc.) file_distribution: File count distribution by directory total_files: Total number of files total_lines: Total lines of code lines_by_language: Lines of code by language (language names, not extensions) language_percentages: Language percentage distribution file_types: File type distribution estimated_test_coverage: Estimated test coverage percentage patterns_found: Patterns detected in codebase recommendations: Actionable recommendations warnings: Warnings about potential issues agent_analysis: Analysis from brownfield-analyzer agent architecture_style: Architecture style (monolithic, microservices, etc.) design_patterns: Detected design patterns quality_indicators: Code quality indicators metadata: Additional metadata """ project_root: Optional[Path] = None tech_stacks: List[TechStack] = field(default_factory=list) primary_language: Optional[str] = None detected_files: List[str] = field(default_factory=list) testing_frameworks: List[str] = field(default_factory=list) ci_cd_providers: List[str] = field(default_factory=list) has_ci_cd: bool = False has_tests: bool = False directory_structure: List[str] = field(default_factory=list) has_source_directory: bool = False has_test_directory: bool = False has_docs_directory: bool = False structure_type: str = "unknown" file_distribution: Dict[str, int] = field(default_factory=dict) total_files: int = 0 total_lines: int = 0 lines_by_language: Dict[str, int] = field(default_factory=dict) language_percentages: Dict[str, float] = field(default_factory=dict) file_types: Dict[str, int] = field(default_factory=dict) estimated_test_coverage: float = 0.0 patterns_found: List[str] = field(default_factory=list) recommendations: List[str] = field(default_factory=list) warnings: List[str] = field(default_factory=list) agent_analysis: Optional[Dict[str, Any]] = None architecture_style: Optional[str] = None design_patterns: List[str] = field(default_factory=list) quality_indicators: Dict[str, Any] = field(default_factory=dict) metadata: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): """Auto-generate recommendations and warnings after initialization.""" # Only generate if not already provided if not self.recommendations: self._auto_generate_recommendations() if not self.warnings: self._auto_generate_warnings() def _auto_generate_recommendations(self) -> None: """Generate actionable recommendations based on analysis data.""" recommendations = [] # CI/CD recommendations if not self.has_ci_cd: recommendations.append("Add CI/CD: Configure automated testing and deployment") # Documentation recommendations if not self.has_docs_directory: recommendations.append("Improve docs: Add documentation directory with README and guides") # Testing recommendations if not self.has_tests: recommendations.append("Add tests: Create test directory and add test coverage") elif self.estimated_test_coverage < 50: recommendations.append(f"Increase test coverage: Current estimate {self.estimated_test_coverage:.0f}%") # Structure recommendations if self.structure_type == "flat": recommendations.append("Organize structure: Consider organizing code into src/ and tests/ directories") self.recommendations = recommendations def _auto_generate_warnings(self) -> None: """Generate warnings for potential issues.""" warnings = [] # Test warnings if not self.has_tests: warnings.append("No test directory found - consider adding automated tests") # Structure warnings if self.structure_type == "flat": warnings.append("Flat structure detected - may be difficult to maintain as project grows") # CI/CD warnings if not self.has_ci_cd: warnings.append("No CI/CD configuration found - consider adding automated workflows") self.warnings = warnings def to_dict(self) -> Dict[str, Any]: """Serialize report to dictionary.""" return { "project_root": str(self.project_root), "tech_stacks": [stack.value for stack in self.tech_stacks], "primary_language": self.primary_language, "detected_files": self.detected_files, "testing_frameworks": self.testing_frameworks, "ci_cd_providers": self.ci_cd_providers, "has_ci_cd": self.has_ci_cd, "has_tests": self.has_tests, "directory_structure": self.directory_structure, "has_source_directory": self.has_source_directory, "has_test_directory": self.has_test_directory, "has_docs_directory": self.has_docs_directory, "structure_type": self.structure_type, "file_distribution": self.file_distribution, "total_files": self.total_files, "total_lines": self.total_lines, "lines_by_language": self.lines_by_language, "language_percentages": self.language_percentages, "file_types": self.file_types, "estimated_test_coverage": self.estimated_test_coverage, "patterns_found": self.patterns_found, "recommendations": self.recommendations, "warnings": self.warnings, "agent_analysis": self.agent_analysis, "architecture_style": self.architecture_style, "design_patterns": self.design_patterns, "quality_indicators": self.quality_indicators, "metadata": self.metadata, } def to_json(self) -> str: """Serialize report to JSON string. Returns: JSON string representation of report """ import json return json.dumps(self.to_dict(), indent=2) @property def summary(self) -> str: """Generate human-readable summary of analysis. Returns: Human-readable summary string """ lines = [ f"=== Codebase Analysis Report ===", f"Project: {self.project_root}", f"", f"Tech Stack:", ] if self.tech_stacks: for stack in self.tech_stacks: # Capitalize language name for display lang_name = stack.value.capitalize() lines.append(f" - {lang_name}") else: lines.append(" - None detected") # Capitalize primary language for display primary_lang = self.primary_language.capitalize() if self.primary_language else 'Unknown' lines.extend([ f"", f"Primary Language: {primary_lang}", f"", f"Metrics:", f" - {self.total_files} files", f" - {self.total_lines} lines", f" - Estimated Test Coverage: {self.estimated_test_coverage:.1f}%", f"", f"Structure: {self.structure_type}", f" - Source Directory: {'Yes' if self.has_source_directory else 'No'}", f" - Test Directory: {'Yes' if self.has_test_directory else 'No'}", f" - Docs Directory: {'Yes' if self.has_docs_directory else 'No'}", ]) if self.recommendations: lines.append(f"") lines.append(f"Recommendations:") for rec in self.recommendations: lines.append(f" - {rec}") if self.warnings: lines.append(f"") lines.append(f"Warnings:") for warning in self.warnings: lines.append(f" - {warning}") return "\n".join(lines) def generate_summary(self) -> str: """Generate human-readable summary of analysis (alias for summary property). Returns: Human-readable summary string """ return self.summary class CodebaseAnalyzer: """Analyze codebase for tech stack, structure, and metrics. This class performs comprehensive codebase analysis including: - Technology stack detection - File organization analysis - Code metrics calculation - Testing and CI/CD detection Attributes: project_root: Path to project root directory """ def __init__(self, project_root: Path): """Initialize codebase analyzer. Args: project_root: Path to project root directory Raises: ValueError: If project_root is invalid or doesn't exist """ self.project_root = Path(project_root).resolve() # Validate project root try: security_utils.validate_path( str(project_root), purpose="codebase analysis project root", allow_missing=False, ) except ValueError as e: # Re-raise with clearer message for tests raise ValueError(f"Invalid project root: {project_root}") from e security_utils.audit_log( "codebase_analyzer_init", "success", {"project_root": str(self.project_root)}, ) def analyze(self) -> AnalysisReport: """Perform comprehensive codebase analysis. Returns: AnalysisReport with complete analysis results """ report = AnalysisReport(project_root=self.project_root) # Detect tech stacks self._detect_tech_stacks(report) # Analyze directory structure self._analyze_directory_structure(report) # Calculate metrics self._calculate_metrics(report) # Detect testing frameworks self._detect_testing_frameworks(report) # Detect CI/CD self._detect_ci_cd(report) # Determine primary language self._determine_primary_language(report) # Determine structure type self._determine_structure_type(report) # Recommendations and warnings are auto-generated by __post_init__ # No need to call explicitly here # Invoke agent for enhanced analysis (optional) try: self._invoke_agent(report) except Exception: # Agent invocation is optional - don't fail analysis pass security_utils.audit_log( "codebase_analysis_complete", "success", { "project_root": str(self.project_root), "tech_stacks": [stack.value for stack in report.tech_stacks], "total_files": report.total_files, "total_lines": report.total_lines, }, ) return report def _detect_tech_stacks(self, report: AnalysisReport) -> None: """Detect technology stacks in project. Args: report: AnalysisReport to update """ detected_stacks: Set[TechStack] = set() for stack, indicators in TECH_STACK_INDICATORS.items(): # Check for indicator files for file_name in indicators["files"]: if (self.project_root / file_name).exists(): detected_stacks.add(stack) report.detected_files.append(file_name) # Check for file extensions (sample files) for ext in indicators["extensions"]: if list(self.project_root.rglob(f"*{ext}")): detected_stacks.add(stack) report.tech_stacks = list(detected_stacks) def _analyze_directory_structure(self, report: AnalysisReport) -> None: """Analyze project directory structure. Args: report: AnalysisReport to update """ directories = [] for item in self.project_root.iterdir(): if item.is_dir() and item.name not in SKIP_PATTERNS: directories.append(item.name) report.directory_structure = directories # Check for standard directories for dir_name in STANDARD_DIRECTORIES["source"]: if dir_name in directories: report.has_source_directory = True break for dir_name in STANDARD_DIRECTORIES["tests"]: if dir_name in directories: report.has_test_directory = True break for dir_name in STANDARD_DIRECTORIES["docs"]: if dir_name in directories: report.has_docs_directory = True break def _calculate_metrics(self, report: AnalysisReport) -> None: """Calculate code metrics. Args: report: AnalysisReport to update """ file_counts: Dict[str, int] = defaultdict(int) line_counts_by_ext: Dict[str, int] = defaultdict(int) file_type_counts: Dict[str, int] = defaultdict(int) total_files = 0 total_lines = 0 source_files = 0 test_files = 0 # Extension to language mapping ext_to_lang = { ".py": "python", ".js": "javascript", ".jsx": "javascript", ".ts": "typescript", ".tsx": "typescript", ".go": "go", ".rs": "rust", ".java": "java", ".rb": "ruby", ".php": "php", ".cs": "csharp", ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp", ".c": "c", ".h": "c", } # Walk project directory for file_path in self._walk_project(): # Check if file is binary first if self._is_binary_file(file_path): continue total_files += 1 # Count by directory relative_path = file_path.relative_to(self.project_root) if len(relative_path.parts) > 1: top_dir = relative_path.parts[0] file_counts[top_dir] += 1 # Track test files if "test" in top_dir.lower(): test_files += 1 elif "test" not in str(relative_path).lower(): source_files += 1 else: file_counts["root"] += 1 if "test" in file_path.name.lower(): test_files += 1 else: source_files += 1 # Count lines try: content = file_path.read_text(errors="ignore") lines = content.count("\n") # Only count non-empty files if lines > 0: total_lines += lines # Count by file extension (language) ext = file_path.suffix.lower() if ext: file_type_counts[ext] += 1 line_counts_by_ext[ext] += lines except Exception: # Skip files that can't be read pass # Convert extension counts to language counts line_counts_by_language: Dict[str, int] = defaultdict(int) for ext, lines in line_counts_by_ext.items(): lang = ext_to_lang.get(ext, ext.lstrip(".")) line_counts_by_language[lang] += lines report.total_files = total_files report.total_lines = total_lines report.file_distribution = dict(file_counts) report.file_types = dict(file_type_counts) report.lines_by_language = dict(line_counts_by_language) report.has_tests = test_files > 0 # Calculate test coverage estimate if source_files > 0: report.estimated_test_coverage = (test_files / source_files) * 100 # Cap at 100% if report.estimated_test_coverage > 100: report.estimated_test_coverage = 100.0 else: report.estimated_test_coverage = 0.0 # Calculate language percentages if total_lines > 0: report.language_percentages = { lang: (lines / total_lines) * 100 for lang, lines in line_counts_by_language.items() } def _detect_testing_frameworks(self, report: AnalysisReport) -> None: """Detect testing frameworks. Args: report: AnalysisReport to update """ detected_frameworks = [] for framework, patterns in TESTING_FRAMEWORKS.items(): for pattern in patterns: # Check for config files if "/" not in pattern: if (self.project_root / pattern).exists(): detected_frameworks.append(framework) break # Check for glob patterns if "*" in pattern: if list(self.project_root.rglob(pattern)): detected_frameworks.append(framework) break report.testing_frameworks = detected_frameworks def _detect_ci_cd(self, report: AnalysisReport) -> None: """Detect CI/CD configuration. Args: report: AnalysisReport to update """ detected_providers = [] for provider, paths in CI_CD_INDICATORS.items(): for path in paths: if "/" in path: # Directory path if (self.project_root / path).exists(): detected_providers.append(provider) break else: # File path if (self.project_root / path).exists(): detected_providers.append(provider) break report.ci_cd_providers = detected_providers report.has_ci_cd = len(detected_providers) > 0 def _determine_primary_language(self, report: AnalysisReport) -> None: """Determine primary programming language. Args: report: AnalysisReport to update """ if not report.lines_by_language: report.primary_language = None return # Find language with most lines of code primary_ext = max(report.lines_by_language.items(), key=lambda x: x[1])[0] # Map extension to language name extension_map = { ".py": "python", ".js": "javascript", ".jsx": "javascript", ".ts": "typescript", ".tsx": "typescript", ".go": "go", ".rs": "rust", ".java": "java", ".rb": "ruby", ".php": "php", ".cs": "csharp", ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp", } report.primary_language = extension_map.get(primary_ext, primary_ext.lstrip(".")) def _determine_structure_type(self, report: AnalysisReport) -> None: """Determine project structure type. Args: report: AnalysisReport to update """ if report.has_source_directory and report.has_test_directory: report.structure_type = "organized" elif report.total_files == 0: report.structure_type = "empty" elif len(report.directory_structure) == 0 and report.total_files > 0: # Files exist but no subdirectories = flat structure report.structure_type = "flat" elif not report.has_source_directory and not report.has_test_directory: report.structure_type = "flat" else: report.structure_type = "mixed" def _invoke_agent(self, report: AnalysisReport) -> None: """Invoke brownfield-analyzer agent for enhanced analysis. Args: report: AnalysisReport to update """ try: # Invoke agent (uses module-level function for testability) result = invoke_agent( agent_name="brownfield-analyzer", task="Analyze codebase structure and patterns", context={"project_root": str(self.project_root)}, ) if result.get("success"): analysis = result.get("analysis", {}) report.agent_analysis = analysis # Extract agent insights if "patterns_found" in analysis: report.patterns_found = analysis["patterns_found"] if "architecture_style" in analysis: report.architecture_style = analysis["architecture_style"] if "design_patterns" in analysis: report.design_patterns = analysis["design_patterns"] if "quality_indicators" in analysis: report.quality_indicators = analysis["quality_indicators"] if "recommendations" in analysis: # Merge with existing recommendations report.recommendations.extend(analysis["recommendations"]) else: # Agent failed - add warning error = result.get("error", "Unknown error") report.warnings.append(error) except Exception as e: # Agent invocation failed - log but don't fail analysis report.warnings.append(f"Agent invocation failed: {str(e)}") security_utils.audit_log( "codebase_analyzer_agent_failed", "warning", { "project_root": str(self.project_root), "error": str(e), }, ) def _is_binary_file(self, file_path: Path) -> bool: """Check if file is binary (non-text). Args: file_path: Path to file Returns: True if binary, False if text """ # Binary file extensions binary_extensions = { ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".pdf", ".zip", ".tar", ".gz", ".bz2", ".xz", ".exe", ".dll", ".so", ".dylib", ".pyc", ".pyo", ".class", ".woff", ".woff2", ".ttf", ".eot", } if file_path.suffix.lower() in binary_extensions: return True # Check first few bytes for binary content try: with open(file_path, "rb") as f: chunk = f.read(1024) # Check for null bytes (strong indicator of binary) if b"\x00" in chunk: return True except Exception: # If we can't read it, assume binary return True return False def _walk_project(self) -> List[Path]: """Walk project directory, skipping ignored patterns. Returns: List of file paths """ files = [] for item in self.project_root.rglob("*"): # Skip if any path component matches skip patterns # Check against path parts, not full path string (to avoid false positives like "dist" in "distribution") skip_item = False for part in item.parts: # Skip hidden files and directories (starting with .) if part.startswith("."): skip_item = True break # Check exact match for directory names if part in SKIP_PATTERNS: skip_item = True break # Check glob patterns (e.g., "*.egg-info") for pattern in SKIP_PATTERNS: if "*" in pattern: import fnmatch if fnmatch.fnmatch(part, pattern): skip_item = True break if skip_item: break if skip_item: continue if item.is_file(): files.append(item) return files # Module-level agent invocation (for mocking in tests) def invoke_agent(agent_name: str, task: str, context: Dict[str, Any]) -> Dict[str, Any]: """Invoke agent for analysis (wrapper for testing). Args: agent_name: Name of agent to invoke task: Task description context: Context dictionary Returns: Agent result dictionary """ from plugins.autonomous_dev.lib.agent_invoker import invoke_agent as _invoke_agent return _invoke_agent(agent_name=agent_name, task=task, context=context) # Convenience function def analyze_codebase(project_root: Path) -> AnalysisReport: """Analyze codebase and return report. Args: project_root: Path to project root Returns: AnalysisReport with analysis results """ analyzer = CodebaseAnalyzer(project_root=project_root) return analyzer.analyze()