TradingAgents/.claude/lib/validate_manifest_doc_align...

#!/usr/bin/env python3
"""
Manifest-Documentation Alignment Validator.

DEPRECATED: This regex-based validator is deprecated as of v3.44.0.
Use hybrid_validator.py instead, which provides GenAI-powered semantic
validation with automatic fallback to regex if no API key is available.

Migration:
    # Old (deprecated):
    from validate_manifest_doc_alignment import validate_alignment
    result = validate_alignment(manifest_path)

    # New (recommended):
    from hybrid_validator import validate_manifest_alignment
    report = validate_manifest_alignment(repo_root)

Removal planned: v3.45.0

---

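Validates that the component counts and versions stated in CLAUDE.md and
PROJECT.md match install_manifest.json (the single source of truth).
health_check.py is deliberately not compared: it lists only a core subset of
components rather than everything in the manifest.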

This prevents documentation drift by failing loudly when counts mismatch.

Usage:
    python validate_manifest_doc_alignment.py
    python validate_manifest_doc_alignment.py --fix  # Show fix instructions
    python validate_manifest_doc_alignment.py --manifest path/to/manifest.json

Issue #159: Prevent documentation drift after manifest completeness audit
Issue #160: GenAI-powered validation replaces regex-based approach
"""

import argparse
import json
import re
import sys
import warnings
from pathlib import Path
from typing import Dict, Any, Optional, List

# Emit a deprecation warning as a side effect of loading this module. The call
# sits at module level, so it runs on import and also when the file is executed
# as a script.
# NOTE(review): stacklevel=2 is presumably meant to attribute the warning to the
# importing code rather than to this file — confirm it surfaces where expected.
warnings.warn(
    "validate_manifest_doc_alignment is deprecated as of v3.44.0. "
    "Use hybrid_validator.validate_manifest_alignment() instead. "
    "This module will be removed in v3.45.0.",
    DeprecationWarning,
    stacklevel=2,
)


class DocumentationDriftError(Exception):
    """Signal that a document's layout made component counts impossible to extract."""


def find_project_root() -> Path:
    """
    Locate the project root, starting from the current working directory.

    Walks from the working directory up through its ancestors and returns the
    first directory that contains either a ``CLAUDE.md`` file or a
    ``plugins/autonomous-dev`` entry.

    Returns:
        The first matching directory, or the working directory itself when no
        ancestor matches.
    """
    start = Path.cwd()
    for directory in (start, *start.parents):
        has_claude_md = (directory / "CLAUDE.md").exists()
        if has_claude_md or (directory / "plugins" / "autonomous-dev").exists():
            return directory
    # Nothing recognisable on the way up: fall back to where we started.
    return start


def load_manifest_counts(manifest_path: Path) -> Dict[str, Any]:
    """
    Load component counts from install_manifest.json.

    The manifest may nest its sections under a top-level "components" key
    (actual manifest format) or keep them at the top level (test fixtures).

    Args:
        manifest_path: Path to install_manifest.json

    Returns:
        Dict with keys "version", "agents", "commands", "hooks", "libs" and
        "skills". "version" falls back to "unknown" when the manifest has none.
        "skills" counts distinct skill directories, not individual files.

    Raises:
        FileNotFoundError: If manifest doesn't exist
        json.JSONDecodeError: If manifest is invalid JSON
    """
    if not manifest_path.exists():
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")

    # JSON is UTF-8 by specification; decode explicitly instead of relying on
    # the platform's locale encoding (which is not UTF-8 everywhere).
    with open(manifest_path, encoding="utf-8") as manifest_file:
        manifest = json.load(manifest_file)

    # Handle nested "components" structure (actual manifest format)
    # or flat structure (test fixtures)
    components = manifest.get("components", manifest)

    def _file_list(section: str) -> List[str]:
        """Return the "files" list of a manifest section, or [] if absent."""
        return components.get(section, {}).get("files", [])

    # The manifest uses "lib"; test fixtures use "libs".
    lib_files = _file_list("lib") or _file_list("libs")

    # Count skill packages (directories), not individual files:
    # "plugins/.../skills/skill-name/file.md" -> "skill-name"
    skill_files = _file_list("skills")
    skill_dirs = set()
    for skill_path in skill_files:
        parts = skill_path.split("/")
        if "skills" in parts:
            after_skills = parts.index("skills") + 1
            if after_skills < len(parts):
                skill_dirs.add(parts[after_skills])

    return {
        "version": manifest.get("version", "unknown"),
        "agents": len(_file_list("agents")),
        "commands": len(_file_list("commands")),
        "hooks": len(_file_list("hooks")),
        "libs": len(lib_files),
        # Paths without a "skills/<dir>" segment fall back to a plain file count.
        "skills": len(skill_dirs) if skill_dirs else len(skill_files),
    }


def extract_claude_md_counts(claude_md_path: Path) -> Dict[str, int]:
    """
    Extract component counts from CLAUDE.md table format.

    Looks for table rows like:
    | Component | Version | Count | Status |
    | Agents | 1.0.0 | 21 | ✅ |

    Only rows whose first cell is Skills, Commands, Agents or Hooks
    (case-insensitive) are recognised. If a component appears in several
    rows, the last row wins.

    Args:
        claude_md_path: Path to CLAUDE.md

    Returns:
        Dict mapping the lower-cased component name to its count

    Raises:
        DocumentationDriftError: If no matching table row is found
    """
    # The status column holds emoji, so the file must be decoded as UTF-8;
    # the locale default is not UTF-8 on every platform.
    content = claude_md_path.read_text(encoding="utf-8")

    # Match: | Agents | 1.0.0 | 21 | ✅ Compliant |
    # Group 1 is the component name, group 2 the count.
    table_pattern = r'\|\s*(Skills|Commands|Agents|Hooks)\s*\|\s*[\d.]+\s*\|\s*(\d+)\s*\|'
    matches = re.findall(table_pattern, content, re.IGNORECASE)

    if not matches:
        raise DocumentationDriftError(
            f"Component table not found in {claude_md_path}. "
            "Expected format: | Component | Version | Count | Status |"
        )

    return {component.lower(): int(count) for component, count in matches}


def extract_claude_md_version(claude_md_path: Path) -> str:
    """
    Extract version from CLAUDE.md header.

    Looks for the first occurrence of: **Version**: v3.44.0
    (the leading "v" is optional).

    Args:
        claude_md_path: Path to CLAUDE.md

    Returns:
        Version string (without 'v' prefix), or "unknown" if no version
        marker is present
    """
    # Decode explicitly as UTF-8 rather than with the locale default, which
    # can fail or garble the text on platforms whose default is not UTF-8.
    content = claude_md_path.read_text(encoding="utf-8")

    # Match: **Version**: v3.44.0
    match = re.search(r'\*\*Version\*\*:\s*v?([\d.]+)', content)
    return match.group(1) if match else "unknown"


def extract_project_md_counts(project_md_path: Path) -> Dict[str, int]:
    """
    Extract component counts from PROJECT.md table format.

    Looks for table rows like:
    | Component | Count | Purpose |
    | Agents | 21 | Specialized AI assistants |

    Rows for Agents, Skills, Commands, Hooks and Libraries are recognised
    (case-insensitive). "Libraries" is reported under the key "libs". If a
    component appears in several rows, the last row wins.

    Args:
        project_md_path: Path to PROJECT.md

    Returns:
        Dict mapping the lower-cased component name to its count; empty when
        no matching row exists
    """
    # Decode explicitly as UTF-8 rather than with the locale default, which
    # can fail or garble the text on platforms whose default is not UTF-8.
    content = project_md_path.read_text(encoding="utf-8")

    # Match: | Agents | 21 | Purpose text |
    table_pattern = r'\|\s*(Agents|Skills|Commands|Hooks|Libraries)\s*\|\s*(\d+)\s*\|'

    counts: Dict[str, int] = {}
    for component, count in re.findall(table_pattern, content, re.IGNORECASE):
        key = component.lower()
        # Normalize "Libraries" to the "libs" key used for manifest counts
        counts["libs" if key == "libraries" else key] = int(count)

    return counts


def extract_project_md_version(project_md_path: Path) -> str:
    """
    Extract version from PROJECT.md header.

    Looks for the first occurrence of: **Version**: v3.44.0
    (the leading "v" is optional).

    Args:
        project_md_path: Path to PROJECT.md

    Returns:
        Version string (without 'v' prefix), or "unknown" if no version
        marker is present
    """
    # Decode explicitly as UTF-8 rather than with the locale default, which
    # can fail or garble the text on platforms whose default is not UTF-8.
    content = project_md_path.read_text(encoding="utf-8")

    # Match: **Version**: v3.44.0
    match = re.search(r'\*\*Version\*\*:\s*v?([\d.]+)', content)
    return match.group(1) if match else "unknown"


def extract_health_check_counts(health_check_path: Path) -> Dict[str, int]:
    """
    Extract expected component counts from health_check.py lists.

    Looks for EXPECTED_AGENTS, EXPECTED_HOOKS and EXPECTED_COMMANDS list
    assignments and counts the double-quoted strings inside each list. The
    source is scanned with regular expressions, not imported or executed.

    Args:
        health_check_path: Path to health_check.py

    Returns:
        Dict with a count for each of "agents", "hooks" and "commands" whose
        list was found; lists that are absent are simply left out
    """
    # Decode explicitly as UTF-8 (Python's default source encoding) rather
    # than with the locale default.
    content = health_check_path.read_text(encoding="utf-8")

    counts: Dict[str, int] = {}
    for component in ("agents", "hooks", "commands"):
        # Non-greedy capture up to the first closing bracket; DOTALL lets the
        # list span several lines.
        list_match = re.search(
            rf'EXPECTED_{component.upper()}\s*=\s*\[(.*?)\]', content, re.DOTALL
        )
        if list_match:
            counts[component] = len(re.findall(r'"([^"]+)"', list_match.group(1)))

    return counts


def detect_mismatches(
    expected: Dict[str, Any],
    actual: Dict[str, Any],
) -> Dict[str, Dict[str, Any]]:
    """
    Compare manifest counts against the counts found in one document.

    The "version" key is ignored here (versions are compared separately), and
    components that the document does not mention at all are not reported.

    Args:
        expected: Counts from manifest (source of truth)
        actual: Counts from documentation file

    Returns:
        Dict keyed by component name; each value holds the differing
        "expected" and "actual" values
    """
    return {
        name: {"expected": wanted, "actual": actual[name]}
        for name, wanted in expected.items()
        if name != "version" and name in actual and wanted != actual[name]
    }


def detect_version_mismatch(expected: str, actual: str) -> Dict[str, Dict[str, str]]:
    """
    Report a version difference between the manifest and a document.

    A side whose version is "unknown" is never treated as a mismatch.

    Args:
        expected: Version from manifest
        actual: Version from document

    Returns:
        {"version": {"expected": ..., "actual": ...}} when both versions are
        known and differ, otherwise an empty dict
    """
    # Nothing to report when either side is unknown or the two agree.
    if expected == "unknown" or actual == "unknown":
        return {}
    if expected == actual:
        return {}

    difference = {"expected": expected, "actual": actual}
    return {"version": difference}


def validate_alignment(
    manifest_path: Path,
    claude_md_path: Optional[Path] = None,
    project_md_path: Optional[Path] = None,
    health_check_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Validate alignment between manifest and documentation files.

    A document is only checked when its path is given and the file exists.

    Args:
        manifest_path: Path to install_manifest.json
        claude_md_path: Optional path to CLAUDE.md
        project_md_path: Optional path to PROJECT.md
        health_check_path: Optional path to health_check.py. Accepted for
            interface compatibility but not used (see the note at the end of
            the function body).

    Returns:
        Dict with:
          - "status": "ALIGNED", "DRIFTED" (a count or version differs), or
            "ERROR" (CLAUDE.md's component table could not be found). "ERROR"
            takes precedence over "DRIFTED".
          - "mismatches": entries keyed "<doc>_<component>", each carrying the
            "file" it belongs to.
          - "details": the counts/versions read from each source.

    Raises:
        FileNotFoundError: If the manifest doesn't exist
        json.JSONDecodeError: If the manifest is invalid JSON
    """
    result: Dict[str, Any] = {
        "status": "ALIGNED",
        "mismatches": {},
        "details": {},
    }

    # Load manifest counts (source of truth)
    manifest_counts = load_manifest_counts(manifest_path)
    result["details"]["manifest"] = manifest_counts

    def _compare(prefix: str, file_name: str, counts: Dict[str, int], version: str) -> None:
        """Record count/version drift of one document against the manifest."""
        mismatches = detect_mismatches(manifest_counts, counts)
        version_mismatch = detect_version_mismatch(manifest_counts["version"], version)

        for key, value in mismatches.items():
            value["file"] = file_name
            result["mismatches"][f"{prefix}_{key}"] = value
        if version_mismatch:
            version_mismatch["version"]["file"] = file_name
            result["mismatches"][f"{prefix}_version"] = version_mismatch["version"]

        # Never downgrade an earlier ERROR to DRIFTED.
        if (mismatches or version_mismatch) and result["status"] != "ERROR":
            result["status"] = "DRIFTED"

        result["details"][prefix] = {"counts": counts, "version": version}

    # Validate CLAUDE.md
    if claude_md_path and claude_md_path.exists():
        try:
            claude_counts = extract_claude_md_counts(claude_md_path)
        except DocumentationDriftError as e:
            result["status"] = "ERROR"
            # Tag the entry with its file so reports group it under CLAUDE.md
            # instead of "unknown".
            result["mismatches"]["claude_md_format"] = {
                "error": str(e),
                "file": "CLAUDE.md",
            }
        else:
            _compare(
                "claude_md",
                "CLAUDE.md",
                claude_counts,
                extract_claude_md_version(claude_md_path),
            )

    # Validate PROJECT.md
    if project_md_path and project_md_path.exists():
        _compare(
            "project_md",
            "PROJECT.md",
            extract_project_md_counts(project_md_path),
            extract_project_md_version(project_md_path),
        )

    # Note: health_check.py validates "core" components (8 agents, 12 hooks, 8 commands)
    # not ALL installed components. So we don't compare it to manifest counts.
    # health_check.py is intentionally a subset for essential pipeline validation.

    return result


def generate_fix_instructions(mismatches: Dict[str, Dict[str, Any]]) -> str:
    """
    Render detected mismatches as a human-readable remediation report.

    Entries are grouped by their "file" value ("unknown" when absent), in the
    order the files are first seen. An entry with an "error" key is shown as
    an error line; any other entry is shown as an expected/found pair labelled
    with the last underscore-separated part of its key.

    Args:
        mismatches: Dict of detected mismatches

    Returns:
        A success message when there are no mismatches, otherwise the
        multi-line fix instructions
    """
    if not mismatches:
        return "✅ All documentation is aligned with manifest."

    # Bucket the issue lines per file, preserving first-seen order.
    grouped: Dict[str, List[str]] = {}
    for mismatch_key, info in mismatches.items():
        bucket = grouped.setdefault(info.get("file", "unknown"), [])
        if "error" in info:
            bucket.append(f"  - ERROR: {info['error']}")
            continue
        component = mismatch_key.rsplit("_", 1)[-1]  # e.g. "claude_md_agents" -> "agents"
        bucket.append(
            f"  - {component}: expected {info['expected']}, found {info['actual']}"
        )

    report = [
        "❌ Documentation drift detected!",
        "",
        "The following files need updates to match install_manifest.json:",
        "",
    ]
    for file_name, issues in grouped.items():
        report += [f"**{file_name}**:", *issues, ""]
    report += [
        "To fix:",
        "1. Update the counts in the affected files to match install_manifest.json",
        "2. Update version numbers to match manifest version",
        "3. Run this validator again to confirm alignment",
    ]

    return "\n".join(report)


def should_block_commit(result: Dict[str, Any]) -> bool:
    """
    Decide whether a validation result should stop a commit.

    Args:
        result: Validation result from validate_alignment()

    Returns:
        True when the result's status is "DRIFTED" or "ERROR", False for any
        other status
    """
    status = result["status"]
    return status == "DRIFTED" or status == "ERROR"


def main(args: Optional[List[str]] = None) -> int:
    """
    Run the validator from the command line.

    Paths not supplied as options default to locations under the detected
    project root. With --json the raw result is printed as JSON; otherwise a
    success line or the fix instructions are printed. The --fix option is
    accepted, but its value is not consulted by this function.

    Args:
        args: Command line arguments (defaults to sys.argv)

    Returns:
        Exit code: 0 when the status is "ALIGNED", 1 for any other status,
        2 when the manifest is missing or is not valid JSON
    """
    parser = argparse.ArgumentParser(
        description="Validate manifest-documentation alignment"
    )
    path_options = (
        ("--manifest", "Path to install_manifest.json"),
        ("--claude-md", "Path to CLAUDE.md"),
        ("--project-md", "Path to PROJECT.md"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, help=help_text)
    switch_options = (
        ("--fix", "Show fix instructions"),
        ("--json", "Output as JSON"),
    )
    for flag, help_text in switch_options:
        parser.add_argument(flag, action="store_true", help=help_text)

    options = parser.parse_args(args)

    # Explicit options win; everything else is resolved against the project root.
    project_root = find_project_root()
    default_manifest = (
        project_root / "plugins" / "autonomous-dev" / "config" / "install_manifest.json"
    )

    try:
        result = validate_alignment(
            manifest_path=options.manifest or default_manifest,
            claude_md_path=options.claude_md or (project_root / "CLAUDE.md"),
            project_md_path=options.project_md or (project_root / "PROJECT.md"),
        )
        aligned = result["status"] == "ALIGNED"

        if options.json:
            print(json.dumps(result, indent=2))
        elif aligned:
            print("✅ Documentation is aligned with install_manifest.json")
        else:
            print(generate_fix_instructions(result["mismatches"]))
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        return 2
    except json.JSONDecodeError as e:
        print(f"❌ Invalid JSON in manifest: {e}")
        return 2

    return 0 if aligned else 1


if __name__ == "__main__":
    sys.exit(main())