#!/usr/bin/env python3 """ Unified GenAI Quality Validator All quality validation in one place using Claude Sonnet 4.5. Consolidates 4 separate validator files into a single tool. Usage: # PROJECT.md alignment python genai_validate.py alignment --feature "Add OAuth" # Documentation consistency python genai_validate.py docs --full # Code review python genai_validate.py code-review --diff # Test quality python genai_validate.py test-quality --test-file tests/test_foo.py --source-file src/foo.py # Security scan python genai_validate.py security --file src/api.py # Issue classification python genai_validate.py classify-issue --description "Login fails" # Commit message generation python genai_validate.py commit-msg --use-git-diff # Version consistency python genai_validate.py version-sync --check # Run all validations python genai_validate.py all Design Patterns: See library-design-patterns skill for standardized design patterns. """ import json import os import re import subprocess import sys from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional # ============================================================================ # Configuration # ============================================================================ PROJECT_ROOT = Path(__file__).parent.parent.parent.parent PROJECT_MD = PROJECT_ROOT / ".claude" / "PROJECT.md" VERSION_FILE = PROJECT_ROOT / "VERSION" DOCS_TO_VALIDATE = [ PROJECT_ROOT / "README.md", PROJECT_ROOT / "plugins" / "autonomous-dev" / "README.md", PROJECT_ROOT / ".claude" / "PROJECT.md", ] COMMANDS_DIR = PROJECT_ROOT / "plugins" / "autonomous-dev" / "commands" AGENTS_DIR = PROJECT_ROOT / "plugins" / "autonomous-dev" / "agents" HOOKS_DIR = PROJECT_ROOT / "plugins" / "autonomous-dev" / "hooks" VERSION_EXCLUDE_PATTERNS = [ "**/UPDATES.md", "**/CHANGELOG.md", "**/.git/**", "**/node_modules/**", "**/__pycache__/**", "**/venv/**", "**/docs/sessions/**", ] # ============================================================================ # Shared GenAI Client # ============================================================================ def get_llm_client(): """Get LLM client (prefer Anthropic for accuracy).""" anthropic_key = os.getenv("ANTHROPIC_API_KEY") openrouter_key = os.getenv("OPENROUTER_API_KEY") if anthropic_key: try: import anthropic except ImportError: print("❌ anthropic package not installed!") print("Install with: pip install anthropic") sys.exit(1) client = anthropic.Anthropic(api_key=anthropic_key) model = "claude-sonnet-4-5-20250929" # Latest Sonnet 4.5 return client, model, "anthropic" elif openrouter_key: try: import openai except ImportError: print("❌ openai package not installed!") print("Install with: pip install openai") sys.exit(1) client = openai.OpenAI( base_url="https://openrouter.ai/api/v1", api_key=openrouter_key, ) model = "anthropic/claude-sonnet-4.5" return client, model, "openrouter" else: print("❌ No API key found!") print() print("Set one of:") print(" export ANTHROPIC_API_KEY=sk-ant-...") print(" export OPENROUTER_API_KEY=sk-or-v1-...") sys.exit(1) def call_llm(prompt: str) -> str: """Call LLM with prompt, return response.""" client, model, provider = get_llm_client() if provider == "anthropic": response = client.messages.create( model=model, max_tokens=4000, messages=[{"role": "user", "content": prompt}] ) return response.content[0].text else: # openrouter response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}] ) return response.choices[0].message.content def parse_json_response(response_text: str) -> dict: """Parse JSON from LLM response (handles markdown formatting).""" json_match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL) if json_match: json_str = json_match.group(1) else: json_str = response_text try: return json.loads(json_str) except json.JSONDecodeError as e: print(f"❌ Failed to parse GenAI response: {e}") print(f"Response: {response_text[:500]}") sys.exit(1) # ============================================================================ # 1. PROJECT.md Alignment Validator # ============================================================================ @dataclass class AlignmentResult: """Result of alignment validation.""" feature_description: str aligned: bool confidence: str reasoning: str alignment_score: int concerns: List[str] suggestions: List[str] relevant_goals: List[str] scope_violations: List[str] constraint_violations: List[str] def is_acceptable(self) -> bool: has_critical_violations = ( len(self.scope_violations) > 0 or len(self.constraint_violations) > 0 ) return self.alignment_score >= 7 and not has_critical_violations def read_project_md() -> Dict[str, str]: """Read and parse PROJECT.md into sections.""" if not PROJECT_MD.exists(): print(f"❌ PROJECT.md not found at: {PROJECT_MD}") sys.exit(1) content = PROJECT_MD.read_text() sections = {} for section_name in ['GOALS', 'SCOPE', 'CONSTRAINTS', 'CURRENT_SPRINT']: match = re.search( rf'## {section_name}\s*\n(.*?)(?=\n##|\Z)', content, re.DOTALL ) if match: sections[section_name] = match.group(1).strip() return sections def validate_alignment(feature_description: str) -> AlignmentResult: """Validate feature alignment with PROJECT.md.""" _, _, provider = get_llm_client() print(f"🤖 Validating alignment with {provider} GenAI...") project_sections = read_project_md() prompt = f"""You are validating whether a proposed feature aligns with a project's strategic goals and constraints. **PROJECT CONTEXT** **GOALS** (What success looks like): {project_sections.get('GOALS', 'Not specified')} **SCOPE** (What's included/excluded): {project_sections.get('SCOPE', 'Not specified')} **CONSTRAINTS** (Technical, resource, philosophical limits): {project_sections.get('CONSTRAINTS', 'Not specified')} **CURRENT SPRINT** (Active focus): {project_sections.get('CURRENT_SPRINT', 'Not specified')} --- **PROPOSED FEATURE**: {feature_description} --- **VALIDATION TASK**: Analyze whether this feature aligns with the project's strategic direction. Consider: 1. **Goal Alignment**: Does this serve the stated goals? Which ones? How directly? 2. **Scope Fit**: Is this within declared scope? Or is it scope creep disguised as enhancement? 3. **Constraint Compliance**: Does it violate any constraints (technical, resource, philosophical)? 4. **Strategic Value**: Is this solving the right problem? Or a distraction? 5. **Sprint Relevance**: Does it align with current sprint focus? If not, should it wait? Provide your analysis in JSON format: ```json {{ "aligned": true/false, "confidence": "high/medium/low", "alignment_score": 0-10, "reasoning": "Detailed explanation of why this aligns or doesn't", "relevant_goals": ["Goal 1 that this serves", "Goal 2..."], "concerns": ["Concern 1 if any", "Concern 2..."], "scope_violations": ["Violation 1 if any", "Violation 2..."], "constraint_violations": ["Violation 1 if any", "Violation 2..."], "suggestions": ["How to make it better align", "Alternative approach..."] }} ``` Be strict but fair. If it's borderline, say so (medium confidence). """ response = call_llm(prompt) data = parse_json_response(response) return AlignmentResult( feature_description=feature_description, aligned=data.get("aligned", False), confidence=data.get("confidence", "low"), reasoning=data.get("reasoning", "No reasoning provided"), alignment_score=data.get("alignment_score", 0), concerns=data.get("concerns", []), suggestions=data.get("suggestions", []), relevant_goals=data.get("relevant_goals", []), scope_violations=data.get("scope_violations", []), constraint_violations=data.get("constraint_violations", []) ) # ============================================================================ # 2. Documentation Consistency Validator # ============================================================================ @dataclass class InconsistencyFound: """A documentation inconsistency.""" file_path: str claim: str reality: str severity: str reasoning: str line_number: Optional[int] = None @dataclass class ValidationResult: """Result of documentation validation.""" file_path: str is_consistent: bool confidence: str summary: str inconsistencies: List[InconsistencyFound] verified_claims: List[str] def gather_code_context() -> Dict: """Gather code context for validation.""" def list_dir(dir_path, pattern): if not dir_path.exists(): return [] return [f.stem for f in dir_path.glob(pattern)] return { "commands": list_dir(COMMANDS_DIR, "*.md"), "agents": list_dir(AGENTS_DIR, "*.md"), "hooks": list_dir(HOOKS_DIR, "*.py"), } def validate_docs(doc_file: Path) -> ValidationResult: """Validate documentation against code reality.""" _, _, provider = get_llm_client() print(f"🤖 Validating {doc_file.name} with {provider} GenAI...") code_context = gather_code_context() doc_content = doc_file.read_text() prompt = f"""You are validating whether documentation accurately describes code reality. **DOCUMENTATION CONTENT** ({doc_file.name}): ``` {doc_content[:8000]} ``` **CODE REALITY**: Available commands: {len(code_context['commands'])} total {', '.join(code_context['commands'][:20])} Available agents: {len(code_context['agents'])} total {', '.join(code_context['agents'])} Available hooks: {len(code_context['hooks'])} total {', '.join(code_context['hooks'])} --- **VALIDATION TASK**: Check if the documentation makes claims that don't match code reality. **Common Issues to Detect**: 1. **Overpromising**: Claims features that don't exist 2. **Count Mismatches**: Claims wrong numbers 3. **Misleading Descriptions**: Technically true but misleading 4. **Outdated Behavior**: Describes old implementation 5. **Missing Caveats**: Doesn't mention limitations Provide analysis in JSON: ```json {{ "is_consistent": true/false, "confidence": "high/medium/low", "summary": "Brief summary of validation", "inconsistencies": [ {{ "claim": "What the doc claims", "reality": "What the code actually does", "severity": "critical/high/medium/low", "reasoning": "Why this is inconsistent", "line_number": null }} ], "verified_claims": ["Claim 1 that IS accurate", "Claim 2 that IS accurate"] }} ``` Focus on critical and high severity issues. """ response = call_llm(prompt) data = parse_json_response(response) inconsistencies = [ InconsistencyFound( file_path=str(doc_file.relative_to(PROJECT_ROOT)), claim=inc.get("claim", ""), reality=inc.get("reality", ""), severity=inc.get("severity", "low"), reasoning=inc.get("reasoning", ""), line_number=inc.get("line_number") ) for inc in data.get("inconsistencies", []) ] return ValidationResult( file_path=str(doc_file.relative_to(PROJECT_ROOT)), is_consistent=data.get("is_consistent", True), confidence=data.get("confidence", "low"), summary=data.get("summary", ""), inconsistencies=inconsistencies, verified_claims=data.get("verified_claims", []) ) # ============================================================================ # 3. Code Review Quality Gate # ============================================================================ @dataclass class CodeReviewResult: approved: bool score: int issues: List[Dict] strengths: List[str] suggestions: List[str] reasoning: str def code_review(diff_content: str) -> CodeReviewResult: """Deep code review with architectural awareness.""" print("🤖 Performing code review with GenAI...") prompt = f"""You are performing a deep code review with architectural awareness. **CODE CHANGES**: ``` {diff_content[:6000]} ``` **REVIEW CHECKLIST**: 1. **Logic & Correctness**: Edge cases, off-by-one errors, race conditions, resource leaks 2. **Code Quality**: Semantic names, single-responsibility, reasonable complexity, DRY principle 3. **Architecture**: Follows patterns, modularity, coupling 4. **Security**: Input validation, injection risks, XSS, sensitive data exposure 5. **Testing**: Tests included, edge cases tested, test quality adequate 6. **Performance**: Algorithm complexity, memory leaks, unnecessary queries Respond JSON: ```json {{ "approved": true/false, "score": 0-10, "reasoning": "Overall assessment", "issues": [ {{"severity": "critical/high/medium/low", "description": "...", "suggestion": "..."}} ], "strengths": ["What's good about this code"], "suggestions": ["How to improve"] }} ``` Approve (score 7+) if no critical issues. """ response = call_llm(prompt) data = parse_json_response(response) return CodeReviewResult( approved=data.get("approved", False), score=data.get("score", 0), issues=data.get("issues", []), strengths=data.get("strengths", []), suggestions=data.get("suggestions", []), reasoning=data.get("reasoning", "") ) # ============================================================================ # 4. Test Quality Assessment # ============================================================================ @dataclass class TestQualityResult: score: int coverage_meaningful: bool gaps: List[str] strengths: List[str] recommendations: List[str] def assess_test_quality(test_code: str, source_code: str) -> TestQualityResult: """Assess test quality beyond coverage %.""" print("🤖 Assessing test quality with GenAI...") prompt = f"""Assess test quality (not just coverage %). **SOURCE CODE**: ``` {source_code[:3000]} ``` **TEST CODE**: ``` {test_code[:3000]} ``` **ASSESSMENT CRITERIA**: 1. **Edge Cases**: null, empty, negative, boundary, max values 2. **Error Conditions**: exceptions, invalid input, timeouts 3. **Independence**: no shared state, order-independent 4. **Assertions**: meaningful (not just "assert True") 5. **Test Names**: descriptive of what's being tested 6. **Setup/Teardown**: proper resource cleanup 7. **Mocking**: appropriate use of mocks/stubs Respond JSON: ```json {{ "score": 0-10, "coverage_meaningful": true/false, "gaps": ["Missing edge case: null input", "No error condition tests"], "strengths": ["Good test independence", "Clear test names"], "recommendations": ["Add boundary value tests", "Test concurrent access"] }} ``` Score 7+ = good tests. Be strict. """ response = call_llm(prompt) data = parse_json_response(response) return TestQualityResult( score=data.get("score", 0), coverage_meaningful=data.get("coverage_meaningful", False), gaps=data.get("gaps", []), strengths=data.get("strengths", []), recommendations=data.get("recommendations", []) ) # ============================================================================ # 5. Security Vulnerability Detection # ============================================================================ @dataclass class SecurityScanResult: vulnerabilities: List[Dict] risk_score: int safe: bool def security_scan(code: str) -> SecurityScanResult: """Context-aware security vulnerability detection.""" print("🤖 Scanning for security vulnerabilities with GenAI...") prompt = f"""Perform context-aware security analysis. **CODE**: ``` {code[:4000]} ``` **SECURITY CHECKS**: 1. **Injection Attacks**: SQL, command, LDAP, XML injection 2. **XSS Vulnerabilities**: Output escaping, Content-Type headers 3. **Authentication/Authorization**: Auth bypasses, privilege escalation 4. **Data Exposure**: Sensitive data in logs, PII handling, secrets hardcoded 5. **Crypto Issues**: Weak algorithms, hardcoded keys, insecure random 6. **Race Conditions**: TOCTOU, concurrent access issues 7. **Resource Exhaustion**: Unbounded loops, memory/file descriptor leaks Respond JSON: ```json {{ "vulnerabilities": [ {{"severity": "critical/high/medium/low", "type": "SQL Injection", "description": "...", "line": 42, "fix": "Use parameterized queries"}} ], "risk_score": 0-10, "safe": true/false }} ``` Mark safe=false if any critical/high vulnerabilities found. """ response = call_llm(prompt) data = parse_json_response(response) return SecurityScanResult( vulnerabilities=data.get("vulnerabilities", []), risk_score=data.get("risk_score", 0), safe=data.get("safe", True) ) # ============================================================================ # 6. GitHub Issue Classification # ============================================================================ @dataclass class IssueClassification: type: str priority: str component: str labels: List[str] goal_alignment: str def classify_issue(description: str) -> IssueClassification: """Intelligent issue classification.""" print("🤖 Classifying issue with GenAI...") prompt = f"""Classify this GitHub issue. **ISSUE DESCRIPTION**: {description} **CLASSIFICATION TASK**: Determine: 1. **Type**: bug/feature/enhancement/refactoring/documentation/question 2. **Priority**: critical (blocks release) / high (important) / medium (nice to have) / low (backlog) 3. **Component**: Which part of codebase affected 4. **Labels**: Suggested GitHub labels 5. **Goal Alignment**: Which PROJECT.md goal does this relate to? Respond JSON: ```json {{ "type": "bug", "priority": "high", "component": "authentication", "labels": ["bug", "security", "P1"], "goal_alignment": "Security and quality" }} ``` """ response = call_llm(prompt) data = parse_json_response(response) return IssueClassification( type=data.get("type", "question"), priority=data.get("priority", "low"), component=data.get("component", "general"), labels=data.get("labels", []), goal_alignment=data.get("goal_alignment", "") ) # ============================================================================ # 7. Commit Message Generation # ============================================================================ def generate_commit_message(diff: str) -> str: """Generate semantic commit message following conventions.""" print("🤖 Generating commit message with GenAI...") prompt = f"""Generate a semantic commit message following conventional commits. **GIT DIFF**: ``` {diff[:3000]} ``` **COMMIT MESSAGE FORMAT**: ``` ():