#!/usr/bin/env python3 """ Documentation Parity Validator - Validate documentation consistency DEPRECATED: This regex-based validator is deprecated as of v3.44.0. Use hybrid_validator.py instead, which provides GenAI-powered semantic validation with automatic fallback to regex if no API key is available. Migration: # Old (deprecated): from validate_documentation_parity import validate_documentation_parity report = validate_documentation_parity(project_root) # New (recommended): from hybrid_validator import validate_manifest_alignment report = validate_manifest_alignment(repo_root) Removal planned: v3.45.0 --- This module validates documentation consistency across CLAUDE.md, PROJECT.md, README.md, and CHANGELOG.md to prevent documentation drift and ensure accuracy. Validation Categories: 1. Version consistency - Detect when CLAUDE.md date != PROJECT.md date 2. Count discrepancies - Detect when documented counts != actual counts (agents, commands, skills, hooks) 3. Cross-references - Detect when documented features don't exist in codebase (or vice versa) 4. CHANGELOG parity - Detect when plugin.json version missing from CHANGELOG 5. Security documentation - Detect missing or incomplete security docs Security Features: - Path validation via security_utils (CWE-22, CWE-59 prevention) - File size limits to prevent DoS (max 10MB per file) - Safe file reading (no execution of file content) - Audit logging for validation operations Usage: from validate_documentation_parity import validate_documentation_parity # Validate documentation report = validate_documentation_parity(project_root) if report.has_errors: print(report.generate_report()) sys.exit(report.exit_code) CLI Usage: python validate_documentation_parity.py --project-root /path/to/project python validate_documentation_parity.py --verbose python validate_documentation_parity.py --json Date: 2025-11-09 Related: Documentation parity validation feature Agent: implementer See error-handling-patterns skill for exception hierarchy and error handling best practices. Design Patterns: See library-design-patterns skill for standardized design patterns. """ import json import re import sys import warnings from dataclasses import dataclass, field from datetime import datetime from enum import Enum from pathlib import Path from typing import List, Optional, Dict, Any # Emit deprecation warning on module import warnings.warn( "validate_documentation_parity is deprecated as of v3.44.0. " "Use hybrid_validator.validate_manifest_alignment() instead. " "This module will be removed in v3.45.0.", DeprecationWarning, stacklevel=2, ) # Import security utilities try: from plugins.autonomous_dev.lib.security_utils import ( validate_path, audit_log, PROJECT_ROOT, ) except ImportError: # Fallback for testing PROJECT_ROOT = Path(__file__).parent.parent.parent.parent.resolve() def validate_path(path: Path, context: str) -> Path: """Fallback path validation for testing.""" if not path.exists(): raise ValueError(f"Path does not exist: {path}") resolved = path.resolve() if not str(resolved).startswith(str(PROJECT_ROOT)): raise ValueError(f"Path outside project root: {resolved}") return resolved def audit_log(event_type: str, status: str, context: Dict[str, Any]) -> None: """Fallback audit logging for testing.""" pass # File size limit to prevent DoS attacks (10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 class ValidationLevel(Enum): """Validation issue severity levels.""" ERROR = "ERROR" WARNING = "WARNING" INFO = "INFO" @dataclass class ParityIssue: """Represents a single documentation parity issue.""" level: ValidationLevel message: str details: str = "" def __str__(self) -> str: """Human-readable string representation.""" if self.details: return f"[{self.level.value}] {self.message}\n Details: {self.details}" return f"[{self.level.value}] {self.message}" @dataclass class ParityReport: """Comprehensive documentation parity validation report.""" version_issues: List[ParityIssue] = field(default_factory=list) count_issues: List[ParityIssue] = field(default_factory=list) cross_reference_issues: List[ParityIssue] = field(default_factory=list) changelog_issues: List[ParityIssue] = field(default_factory=list) security_issues: List[ParityIssue] = field(default_factory=list) @property def total_issues(self) -> int: """Total number of issues across all categories.""" return ( len(self.version_issues) + len(self.count_issues) + len(self.cross_reference_issues) + len(self.changelog_issues) + len(self.security_issues) ) @property def error_count(self) -> int: """Count of ERROR level issues.""" all_issues = ( self.version_issues + self.count_issues + self.cross_reference_issues + self.changelog_issues + self.security_issues ) return sum(1 for issue in all_issues if issue.level == ValidationLevel.ERROR) @property def warning_count(self) -> int: """Count of WARNING level issues.""" all_issues = ( self.version_issues + self.count_issues + self.cross_reference_issues + self.changelog_issues + self.security_issues ) return sum(1 for issue in all_issues if issue.level == ValidationLevel.WARNING) @property def info_count(self) -> int: """Count of INFO level issues.""" all_issues = ( self.version_issues + self.count_issues + self.cross_reference_issues + self.changelog_issues + self.security_issues ) return sum(1 for issue in all_issues if issue.level == ValidationLevel.INFO) @property def has_errors(self) -> bool: """True if any ERROR level issues exist.""" return self.error_count > 0 @property def has_warnings(self) -> bool: """True if any WARNING level issues exist.""" return self.warning_count > 0 @property def exit_code(self) -> int: """Exit code for CLI integration (0=success, 1=errors).""" return 1 if self.has_errors else 0 def generate_report(self) -> str: """Generate human-readable markdown report.""" lines = ["# Documentation Parity Validation Report", ""] # Summary lines.append(f"**Total Issues**: {self.total_issues}") lines.append(f"- Errors: {self.error_count}") lines.append(f"- Warnings: {self.warning_count}") lines.append(f"- Info: {self.info_count}") lines.append("") # Version issues if self.version_issues: lines.append("## Version Consistency Issues") lines.append("") for issue in self.version_issues: lines.append(f"- {issue}") lines.append("") # Count issues if self.count_issues: lines.append("## Count Discrepancy Issues") lines.append("") for issue in self.count_issues: lines.append(f"- {issue}") lines.append("") # Cross-reference issues if self.cross_reference_issues: lines.append("## Cross-Reference Issues") lines.append("") for issue in self.cross_reference_issues: lines.append(f"- {issue}") lines.append("") # CHANGELOG issues if self.changelog_issues: lines.append("## CHANGELOG Parity Issues") lines.append("") for issue in self.changelog_issues: lines.append(f"- {issue}") lines.append("") # Security documentation issues if self.security_issues: lines.append("## Security Documentation Issues") lines.append("") for issue in self.security_issues: lines.append(f"- {issue}") lines.append("") # Status if self.total_issues == 0: lines.append("**Status**: ✓ All documentation checks passed") elif self.has_errors: lines.append("**Status**: ✗ Documentation has errors that must be fixed") else: lines.append("**Status**: ⚠ Documentation has warnings") return "\n".join(lines) class DocumentationParityValidator: """Validates documentation consistency across project files.""" def __init__(self, project_root: Path): """Initialize validator with project root path. Args: project_root: Path to project root directory Raises: ValueError: If path validation fails (CWE-22, CWE-59 prevention) """ # Validate project root path self.project_root = validate_path(Path(project_root), "project root") # Define documentation file paths self.claude_md = self.project_root / "CLAUDE.md" self.project_md = self.project_root / ".claude" / "PROJECT.md" self.readme_md = self.project_root / "README.md" self.changelog_md = self.project_root / "CHANGELOG.md" self.security_md = self.project_root / "docs" / "SECURITY.md" # Define plugin paths self.plugin_dir = self.project_root / "plugins" / "autonomous-dev" self.agents_dir = self.plugin_dir / "agents" self.commands_dir = self.plugin_dir / "commands" self.skills_dir = self.plugin_dir / "skills" self.hooks_dir = self.plugin_dir / "hooks" self.lib_dir = self.plugin_dir / "lib" self.plugin_json = self.plugin_dir / "plugin.json" # Audit log initialization audit_log( "documentation_validation", "initialized", {"project_root": str(self.project_root)}, ) def _read_file_safe(self, file_path: Path) -> Optional[str]: """Safely read file content with size limit. Args: file_path: Path to file to read Returns: File content as string, or None if file doesn't exist or exceeds size limit Security: - Checks file size to prevent DoS attacks - Reads file as text (no execution) - Returns None for oversized files """ if not file_path.exists(): return None # Check file size file_size = file_path.stat().st_size if file_size > MAX_FILE_SIZE: audit_log( "documentation_validation", "file_too_large", {"file": str(file_path), "size": file_size}, ) return None try: return file_path.read_text(encoding="utf-8") except Exception as e: audit_log( "documentation_validation", "read_error", {"file": str(file_path), "error": str(e)}, ) return None def _parse_date(self, date_str: str) -> Optional[datetime]: """Parse date string in YYYY-MM-DD format. Args: date_str: Date string to parse Returns: datetime object or None if parsing fails """ try: return datetime.strptime(date_str.strip(), "%Y-%m-%d") except ValueError: return None def _has_malformed_date(self, content: str) -> Optional[str]: """Check if content has Last Updated field with malformed date. Args: content: Markdown file content Returns: The malformed date string if found, None otherwise """ # Pattern: **Last Updated**: anything that's not YYYY-MM-DD match = re.search(r"\*\*Last Updated:?\*\*:?\s*([^\n]+)", content) if match: date_str = match.group(1).strip() # Check if it's NOT in YYYY-MM-DD format if not re.match(r'^\d{4}-\d{2}-\d{2}$', date_str): return date_str return None def _extract_version_date(self, content: str, filename: str) -> Optional[str]: """Extract version date from markdown content. Args: content: Markdown file content filename: Filename for error reporting Returns: Date string in YYYY-MM-DD format, or None if not found """ # Pattern: **Last Updated**: YYYY-MM-DD or **Last Updated:** YYYY-MM-DD # Support both single colon (:) and double colon (::) after "Last Updated" match = re.search(r"\*\*Last Updated:?\*\*:?\s*(\d{4}-\d{2}-\d{2})", content) if match: return match.group(1) return None def validate_version_consistency(self) -> List[ParityIssue]: """Validate version consistency between CLAUDE.md and PROJECT.md. Returns: List of validation issues Checks: - CLAUDE.md has version date - PROJECT.md has version date - Dates are in sync (no drift) """ issues = [] # Read files claude_content = self._read_file_safe(self.claude_md) project_content = self._read_file_safe(self.project_md) # Check files exist if claude_content is None: issues.append( ParityIssue( ValidationLevel.ERROR, "CLAUDE.md is missing", f"Expected at: {self.claude_md}", ) ) if project_content is None: issues.append( ParityIssue( ValidationLevel.ERROR, "PROJECT.md is missing", f"Expected at: {self.project_md}", ) ) if not claude_content or not project_content: return issues # Extract version dates claude_date_str = self._extract_version_date(claude_content, "CLAUDE.md") project_date_str = self._extract_version_date(project_content, "PROJECT.md") # Check for malformed dates claude_malformed = self._has_malformed_date(claude_content) project_malformed = self._has_malformed_date(project_content) # Check version dates exist or are malformed if claude_date_str is None: if claude_malformed: issues.append( ParityIssue( ValidationLevel.ERROR, "CLAUDE.md has malformed date format", f"Found: {claude_malformed}, Expected format: YYYY-MM-DD", ) ) else: issues.append( ParityIssue( ValidationLevel.ERROR, "CLAUDE.md is missing version date", "Expected format: **Last Updated**: YYYY-MM-DD", ) ) if project_date_str is None: if project_malformed: issues.append( ParityIssue( ValidationLevel.ERROR, "PROJECT.md has malformed date format", f"Found: {project_malformed}, Expected format: YYYY-MM-DD", ) ) else: issues.append( ParityIssue( ValidationLevel.ERROR, "PROJECT.md is missing version date", "Expected format: **Last Updated**: YYYY-MM-DD", ) ) if not claude_date_str or not project_date_str: return issues # Parse dates claude_date = self._parse_date(claude_date_str) project_date = self._parse_date(project_date_str) if claude_date is None: issues.append( ParityIssue( ValidationLevel.ERROR, "CLAUDE.md has malformed date format", f"Found: {claude_date_str}, Expected: YYYY-MM-DD", ) ) if project_date is None: issues.append( ParityIssue( ValidationLevel.ERROR, "PROJECT.md has malformed date format", f"Found: {project_date_str}, Expected: YYYY-MM-DD", ) ) if not claude_date or not project_date: return issues # Compare dates if claude_date < project_date: issues.append( ParityIssue( ValidationLevel.ERROR, "CLAUDE.md is outdated relative to PROJECT.md", f"CLAUDE.md: {claude_date_str}, PROJECT.md: {project_date_str}", ) ) elif project_date < claude_date: issues.append( ParityIssue( ValidationLevel.WARNING, "PROJECT.md is outdated relative to CLAUDE.md", f"PROJECT.md: {project_date_str}, CLAUDE.md: {claude_date_str}", ) ) return issues def _count_files_in_dir(self, directory: Path, extension: str) -> int: """Count files with given extension in directory. Args: directory: Directory to search extension: File extension (e.g., '.md', '.py') Returns: Count of files with extension """ if not directory.exists(): return 0 return len(list(directory.glob(f"*{extension}"))) def _extract_count_from_text( self, content: str, pattern: str ) -> Optional[int]: """Extract count from text using regex pattern. Args: content: Text to search pattern: Regex pattern with count capture group Returns: Extracted count or None if not found """ match = re.search(pattern, content) if match: try: return int(match.group(1)) except (ValueError, IndexError): return None return None def validate_count_discrepancies(self) -> List[ParityIssue]: """Validate documented counts match actual counts. Returns: List of validation issues Checks: - Agent count (documented vs actual) - Command count (documented vs actual) - Skill count (documented vs actual) - Hook count (documented vs actual) """ issues = [] # Read CLAUDE.md claude_content = self._read_file_safe(self.claude_md) if claude_content is None: return issues # Already flagged in version validation # Count actual files actual_agents = self._count_files_in_dir(self.agents_dir, ".md") actual_commands = self._count_files_in_dir(self.commands_dir, ".md") actual_skills = self._count_files_in_dir(self.skills_dir, ".md") actual_hooks = self._count_files_in_dir(self.hooks_dir, ".py") # Extract documented counts # Pattern: "### Agents (5 specialists)" or "Agents (5)" doc_agents = self._extract_count_from_text( claude_content, r"Agents?\s*\((\d+)\s+(?:specialists?|active)?\)" ) # Pattern: "**Commands (10 active)**:" or "Commands (10)" doc_commands = self._extract_count_from_text( claude_content, r"Commands?\s*\((\d+)\s+(?:active|total)?\)" ) # Pattern: "### Skills (19 Active)" or "Skills (19)" doc_skills = self._extract_count_from_text( claude_content, r"Skills?\s*\((\d+)\s+(?:Active|active|total)?\)" ) # Pattern: "### Hooks (29 total automation)" or "Hooks (29)" doc_hooks = self._extract_count_from_text( claude_content, r"Hooks?\s*\((\d+)\s+(?:total|active)?\s*(?:automation)?\)" ) # Validate agent count if doc_agents is not None and doc_agents != actual_agents: issues.append( ParityIssue( ValidationLevel.ERROR, f"Agent count mismatch: documented {doc_agents}, actual {actual_agents}", f"Found {actual_agents} agent files in {self.agents_dir}", ) ) # Validate command count if doc_commands is not None and doc_commands != actual_commands: issues.append( ParityIssue( ValidationLevel.ERROR, f"Command count mismatch: documented {doc_commands}, actual {actual_commands}", f"Found {actual_commands} command files in {self.commands_dir}", ) ) # Validate skill count (WARNING level - less critical) if doc_skills is not None and doc_skills != actual_skills: issues.append( ParityIssue( ValidationLevel.WARNING, f"Skill count mismatch: documented {doc_skills}, actual {actual_skills}", f"Found {actual_skills} skill files in {self.skills_dir}", ) ) # Validate hook count (WARNING level - less critical) if doc_hooks is not None and doc_hooks != actual_hooks: issues.append( ParityIssue( ValidationLevel.WARNING, f"Hook count mismatch: documented {doc_hooks}, actual {actual_hooks}", f"Found {actual_hooks} hook files in {self.hooks_dir}", ) ) return issues def _extract_documented_features( self, content: str, feature_type: str ) -> List[str]: """Extract documented feature names from markdown content. Args: content: Markdown content to parse feature_type: Type of feature ('agent', 'command', 'library') Returns: List of feature names """ features = [] if feature_type == "agent": # Pattern: "- **researcher**: Web research for patterns" # Pattern: "**researcher**: Web research" matches = re.findall(r"\*\*([a-z-]+)\*\*:\s*[A-Z]", content) features.extend(matches) elif feature_type == "command": # Pattern: "- `/auto-implement` - Autonomous feature development" # Pattern: "`/auto-implement`" matches = re.findall(r"`/([a-z-]+)`", content) # Exclude built-in CLI commands (not part of plugin) built_in_commands = {"clear", "exit", "help"} features.extend([m for m in matches if m not in built_in_commands]) elif feature_type == "library": # Pattern: "1. **security_utils.py** - Centralized security validation" # Pattern: "**security_utils.py**" matches = re.findall(r"\*\*([a-z_]+\.py)\*\*", content) features.extend(matches) return list(set(features)) # Remove duplicates def validate_cross_references(self) -> List[ParityIssue]: """Validate documented features exist in codebase. Returns: List of validation issues Checks: - Documented agents exist as files - Documented commands exist as files - Documented libraries exist as files - Undocumented features in codebase (reverse check) """ issues = [] # Read CLAUDE.md claude_content = self._read_file_safe(self.claude_md) if claude_content is None: return issues # Extract documented features doc_agents = self._extract_documented_features(claude_content, "agent") doc_commands = self._extract_documented_features(claude_content, "command") doc_libraries = self._extract_documented_features(claude_content, "library") # Get actual features actual_agents = ( [f.stem for f in self.agents_dir.glob("*.md")] if self.agents_dir.exists() else [] ) actual_commands = ( [f.stem for f in self.commands_dir.glob("*.md")] if self.commands_dir.exists() else [] ) actual_libraries = ( [f.name for f in self.lib_dir.glob("*.py")] if self.lib_dir.exists() else [] ) # Check documented agents exist for agent in doc_agents: if agent not in actual_agents: issues.append( ParityIssue( ValidationLevel.ERROR, f"Documented agent '{agent}' not found in codebase", f"Expected file: {self.agents_dir / agent}.md", ) ) # Check documented commands exist for command in doc_commands: if command not in actual_commands: issues.append( ParityIssue( ValidationLevel.ERROR, f"Documented command '{command}' not found in codebase", f"Expected file: {self.commands_dir / command}.md", ) ) # Check documented libraries exist for library in doc_libraries: if library not in actual_libraries: issues.append( ParityIssue( ValidationLevel.WARNING, f"Documented library '{library}' not found in codebase", f"Expected file: {self.lib_dir / library}", ) ) # Reverse check: undocumented features for agent in actual_agents: if agent not in doc_agents and not agent.startswith("_"): issues.append( ParityIssue( ValidationLevel.INFO, f"Agent '{agent}' exists in codebase but not documented", f"Consider adding to CLAUDE.md", ) ) for command in actual_commands: if command not in doc_commands and not command.startswith("_"): issues.append( ParityIssue( ValidationLevel.INFO, f"Command '{command}' exists in codebase but not documented", f"Consider adding to CLAUDE.md", ) ) return issues def validate_changelog_parity(self) -> List[ParityIssue]: """Validate CHANGELOG contains current plugin version. Returns: List of validation issues Checks: - CHANGELOG.md exists - Current version from plugin.json is documented in CHANGELOG """ issues = [] # Read plugin.json for current version plugin_json_content = self._read_file_safe(self.plugin_json) if plugin_json_content is None: # plugin.json missing is not critical for this check return issues try: plugin_data = json.loads(plugin_json_content) current_version = plugin_data.get("version", "") except json.JSONDecodeError: issues.append( ParityIssue( ValidationLevel.WARNING, "plugin.json is malformed", f"Could not parse JSON from {self.plugin_json}", ) ) return issues if not current_version: return issues # Read CHANGELOG.md changelog_content = self._read_file_safe(self.changelog_md) if changelog_content is None: issues.append( ParityIssue( ValidationLevel.WARNING, "CHANGELOG.md is missing", f"Expected at: {self.changelog_md}", ) ) return issues # Check if current version is documented in CHANGELOG # Pattern: ## [3.8.0] or ## [3.8.0-beta.1] version_pattern = re.escape(current_version) if not re.search(rf"##\s*\[{version_pattern}\]", changelog_content): issues.append( ParityIssue( ValidationLevel.WARNING, f"Version {current_version} not found in CHANGELOG.md", f"Add entry for version {current_version} to CHANGELOG.md", ) ) return issues def validate_security_documentation(self) -> List[ParityIssue]: """Validate security documentation completeness. Returns: List of validation issues Checks: - Security practices mentioned in CLAUDE.md - SECURITY.md exists - CWE coverage documented """ issues = [] # Read CLAUDE.md claude_content = self._read_file_safe(self.claude_md) security_md_content = self._read_file_safe(self.security_md) # Check if security is mentioned in CLAUDE.md if claude_content: if ( "security" not in claude_content.lower() and security_md_content is None ): issues.append( ParityIssue( ValidationLevel.WARNING, "Security documentation is missing", "No security section in CLAUDE.md and SECURITY.md not found", ) ) # Check SECURITY.md exists if security_md_content is None: # Only flag if CLAUDE.md mentions security but SECURITY.md missing if claude_content and "security" in claude_content.lower(): issues.append( ParityIssue( ValidationLevel.WARNING, "SECURITY.md is missing", f"Expected at: {self.security_md}", ) ) return issues def validate(self) -> ParityReport: """Run all validation checks and generate comprehensive report. Returns: ParityReport with all validation results """ audit_log( "documentation_validation", "started", {"project_root": str(self.project_root)}, ) report = ParityReport( version_issues=self.validate_version_consistency(), count_issues=self.validate_count_discrepancies(), cross_reference_issues=self.validate_cross_references(), changelog_issues=self.validate_changelog_parity(), security_issues=self.validate_security_documentation(), ) audit_log( "documentation_validation", "completed", { "project_root": str(self.project_root), "total_issues": report.total_issues, "errors": report.error_count, "warnings": report.warning_count, }, ) return report def validate_documentation_parity(project_root: Path) -> ParityReport: """Convenience function for documentation validation. Args: project_root: Path to project root directory Returns: ParityReport with all validation results """ validator = DocumentationParityValidator(project_root) return validator.validate() def main(): """CLI entry point for documentation parity validation.""" import argparse parser = argparse.ArgumentParser( description="Validate documentation parity across project files" ) parser.add_argument( "--project-root", type=Path, default=Path.cwd(), help="Path to project root (default: current directory)", ) parser.add_argument( "--verbose", action="store_true", help="Enable verbose output" ) parser.add_argument( "--json", action="store_true", help="Output JSON for scripting" ) args = parser.parse_args() try: # Validate documentation report = validate_documentation_parity(args.project_root) if args.json: # JSON output for scripting output = { "total_issues": report.total_issues, "errors": report.error_count, "warnings": report.warning_count, "info": report.info_count, "exit_code": report.exit_code, "version_issues": [str(i) for i in report.version_issues], "count_issues": [str(i) for i in report.count_issues], "cross_reference_issues": [ str(i) for i in report.cross_reference_issues ], "changelog_issues": [str(i) for i in report.changelog_issues], "security_issues": [str(i) for i in report.security_issues], } print(json.dumps(output, indent=2)) else: # Human-readable output print(report.generate_report()) sys.exit(report.exit_code) except ValueError as e: print(f"Error: {e}", file=sys.stderr) sys.exit(1) except Exception as e: print(f"Unexpected error: {e}", file=sys.stderr) sys.exit(1) if __name__ == "__main__": main()