TradingAgents/.claude/hooks/security_scan.py

273 lines
8.6 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Language-agnostic security scanning hook with GenAI context analysis.
Scans for:
- Hardcoded API keys and secrets
- Common security vulnerabilities
- Sensitive data in code
Features:
- Pattern matching (regex-based detection)
- GenAI context analysis (Claude determines if real vs test data)
- Graceful degradation (works without Anthropic SDK)
Works across Python, JavaScript, Go, and other languages.
"""
import re
import sys
import os
from pathlib import Path
from typing import List, Tuple, Optional
from genai_utils import GenAIAnalyzer, parse_binary_response
from genai_prompts import SECRET_ANALYSIS_PROMPT
# Secret patterns to detect
SECRET_PATTERNS = [
# API keys
(r"sk-[a-zA-Z0-9]{20,}", "Anthropic API key"),
(r"sk-proj-[a-zA-Z0-9]{20,}", "OpenAI API key"),
(r"xoxb-[a-zA-Z0-9-]{40,}", "Slack bot token"),
(r"ghp_[a-zA-Z0-9]{36,}", "GitHub personal access token"),
(r"gho_[a-zA-Z0-9]{36,}", "GitHub OAuth token"),
# AWS keys
(r"AKIA[0-9A-Z]{16}", "AWS access key ID"),
(r"(?i)aws_secret_access_key.*[=:].*[a-zA-Z0-9/+=]{40}", "AWS secret key"),
# Generic patterns
(r'(?i)(api[_-]?key|apikey).*[=:].*["\'][a-zA-Z0-9]{20,}["\']', "Generic API key"),
(r'(?i)(secret|password|passwd|pwd).*[=:].*["\'][^"\']{8,}["\']', "Generic secret"),
(r'(?i)token.*[=:].*["\'][a-zA-Z0-9]{20,}["\']', "Generic token"),
# Database URLs with credentials
(r"(?i)(mongodb|mysql|postgres)://[^:]+:[^@]+@", "Database URL with credentials"),
]
# File patterns to ignore
IGNORE_PATTERNS = [
r"\.git/",
r"__pycache__/",
r"node_modules/",
r"\.env\.example$",
r"\.env\.template$",
r"test_.*\.py$", # Test files often have fake secrets
r".*_test\.go$",
]
# Initialize GenAI analyzer (with feature flag support)
analyzer = GenAIAnalyzer(
use_genai=os.environ.get("GENAI_SECURITY_SCAN", "true").lower() == "true"
)
def should_scan_file(file_path: Path) -> bool:
"""Determine if file should be scanned."""
path_str = str(file_path)
# Ignore patterns
for pattern in IGNORE_PATTERNS:
if re.search(pattern, path_str):
return False
# Only scan code files
code_extensions = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".java", ".rb", ".php", ".cs"}
return file_path.suffix in code_extensions
def is_comment_or_docstring(line: str, language: str) -> bool:
"""Check if line is a comment or docstring."""
line = line.strip()
if language == "python":
return line.startswith("#") or line.startswith('"""') or line.startswith("'''")
elif language in ["javascript", "typescript", "go", "java"]:
return line.startswith("//") or line.startswith("/*") or line.startswith("*")
return False
def analyze_secret_context(line: str, secret_type: str, variable_name: Optional[str] = None) -> bool:
"""Use GenAI to determine if a matched secret is real or test data.
Delegates to shared GenAI utility with graceful fallback to heuristics.
Returns:
True if it appears to be a real secret, False if likely test data
"""
# Extract variable context from line
var_context = ""
if "=" in line:
var_context = line.split("=")[0].strip()
# Call shared GenAI analyzer
response = analyzer.analyze(
SECRET_ANALYSIS_PROMPT,
line=line,
secret_type=secret_type,
variable_name=var_context or "N/A"
)
# Parse response using shared utility
if response:
is_real = parse_binary_response(
response,
true_keywords=["REAL", "LIKELY_REAL"],
false_keywords=["FAKE"]
)
if is_real is not None:
return is_real
# Fallback to heuristics if GenAI unavailable or ambiguous
return _heuristic_secret_check(line, secret_type, variable_name)
def _heuristic_secret_check(line: str, secret_type: str, variable_name: Optional[str] = None) -> bool:
"""Fallback heuristic check if GenAI unavailable.
Returns:
True if likely real secret, False if likely test data
"""
# Common test data indicators
test_indicators = [
"test_", "fake_", "mock_", "example_", "dummy_",
"test123", "fake123", "mock123",
"sk-test", "pk_test", "rk_test",
"00000000", "11111111", "aaaaaaa", "99999999",
"placeholder", "sample", "demo", "xxx",
]
line_lower = line.lower()
for indicator in test_indicators:
if indicator in line_lower:
return False
# If no obvious test indicators, assume real (conservative approach)
return True
def get_language(file_path: Path) -> str:
"""Get language from file extension."""
ext_map = {
".py": "python",
".js": "javascript",
".jsx": "javascript",
".ts": "typescript",
".tsx": "typescript",
".go": "go",
".java": "java",
}
return ext_map.get(file_path.suffix, "unknown")
def scan_file(file_path: Path) -> List[Tuple[int, str, str]]:
"""Scan a file for secrets with GenAI context analysis.
Returns:
List of (line_number, secret_type, matched_text) tuples
"""
violations = []
language = get_language(file_path)
try:
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
for line_num, line in enumerate(f, 1):
# Skip comments and docstrings
if is_comment_or_docstring(line, language):
continue
# Check each pattern
for pattern, secret_type in SECRET_PATTERNS:
if re.search(pattern, line):
# Extract matched text (redacted)
match = re.search(pattern, line)
matched = match.group(0)
# Redact middle part
if len(matched) > 10:
redacted = matched[:5] + "***" + matched[-5:]
else:
redacted = "***"
# Use GenAI to determine if this is a real secret or test data
is_real_secret = analyze_secret_context(line, secret_type)
if is_real_secret:
violations.append((line_num, secret_type, redacted))
elif os.environ.get("DEBUG_SECURITY_SCAN"):
print(f" Skipped test data in {file_path}:{line_num} ({secret_type})",
file=sys.stderr)
except Exception as e:
print(f"⚠️ Error scanning {file_path}: {e}", file=sys.stderr)
return violations
def scan_directory(directory: Path = Path(".")) -> dict:
"""Scan directory for secrets.
Returns:
Dictionary mapping file paths to violations
"""
all_violations = {}
# Scan source directories
for source_dir in ["src", "lib", "pkg", "app"]:
dir_path = directory / source_dir
if not dir_path.exists():
continue
for file_path in dir_path.rglob("*"):
if not file_path.is_file():
continue
if not should_scan_file(file_path):
continue
violations = scan_file(file_path)
if violations:
all_violations[file_path] = violations
return all_violations
def main():
"""Run security scan with GenAI context analysis."""
use_genai = os.environ.get("GENAI_SECURITY_SCAN", "true").lower() == "true"
genai_status = "🤖 (with GenAI context analysis)" if use_genai else ""
print(f"🔒 Running security scan... {genai_status}")
violations = scan_directory()
if not violations:
print("✅ No secrets or sensitive data detected")
if use_genai:
print(" (GenAI context analysis reduced false positives)")
sys.exit(0)
# Report violations
print("\n❌ SECURITY ISSUES DETECTED:\n")
for file_path, issues in violations.items():
print(f"📄 {file_path}")
for line_num, secret_type, redacted in issues:
print(f" Line {line_num}: {secret_type}")
print(f" Found: {redacted}")
print()
print("⚠️ Fix these issues before committing:")
print(" 1. Move secrets to .env file (add to .gitignore)")
print(" 2. Use environment variables: os.getenv('API_KEY')")
print(" 3. Never commit real API keys or passwords")
print()
if use_genai:
print("💡 Tip: GenAI analysis reduces false positives by understanding context")
print(" Disable with: export GENAI_SECURITY_SCAN=false")
print()
sys.exit(1)
if __name__ == "__main__":
main()