273 lines
8.6 KiB
Python
Executable File
273 lines
8.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Language-agnostic security scanning hook with GenAI context analysis.
|
||
|
||
Scans for:
|
||
- Hardcoded API keys and secrets
|
||
- Common security vulnerabilities
|
||
- Sensitive data in code
|
||
|
||
Features:
|
||
- Pattern matching (regex-based detection)
|
||
- GenAI context analysis (Claude determines if real vs test data)
|
||
- Graceful degradation (works without Anthropic SDK)
|
||
|
||
Works across Python, JavaScript, Go, and other languages.
|
||
"""
|
||
|
||
import re
|
||
import sys
|
||
import os
|
||
from pathlib import Path
|
||
from typing import List, Tuple, Optional
|
||
|
||
from genai_utils import GenAIAnalyzer, parse_binary_response
|
||
from genai_prompts import SECRET_ANALYSIS_PROMPT
|
||
|
||
# Secret patterns to detect
|
||
SECRET_PATTERNS = [
|
||
# API keys
|
||
(r"sk-[a-zA-Z0-9]{20,}", "Anthropic API key"),
|
||
(r"sk-proj-[a-zA-Z0-9]{20,}", "OpenAI API key"),
|
||
(r"xoxb-[a-zA-Z0-9-]{40,}", "Slack bot token"),
|
||
(r"ghp_[a-zA-Z0-9]{36,}", "GitHub personal access token"),
|
||
(r"gho_[a-zA-Z0-9]{36,}", "GitHub OAuth token"),
|
||
# AWS keys
|
||
(r"AKIA[0-9A-Z]{16}", "AWS access key ID"),
|
||
(r"(?i)aws_secret_access_key.*[=:].*[a-zA-Z0-9/+=]{40}", "AWS secret key"),
|
||
# Generic patterns
|
||
(r'(?i)(api[_-]?key|apikey).*[=:].*["\'][a-zA-Z0-9]{20,}["\']', "Generic API key"),
|
||
(r'(?i)(secret|password|passwd|pwd).*[=:].*["\'][^"\']{8,}["\']', "Generic secret"),
|
||
(r'(?i)token.*[=:].*["\'][a-zA-Z0-9]{20,}["\']', "Generic token"),
|
||
# Database URLs with credentials
|
||
(r"(?i)(mongodb|mysql|postgres)://[^:]+:[^@]+@", "Database URL with credentials"),
|
||
]
|
||
|
||
# File patterns to ignore
|
||
IGNORE_PATTERNS = [
|
||
r"\.git/",
|
||
r"__pycache__/",
|
||
r"node_modules/",
|
||
r"\.env\.example$",
|
||
r"\.env\.template$",
|
||
r"test_.*\.py$", # Test files often have fake secrets
|
||
r".*_test\.go$",
|
||
]
|
||
|
||
# Initialize GenAI analyzer (with feature flag support)
|
||
analyzer = GenAIAnalyzer(
|
||
use_genai=os.environ.get("GENAI_SECURITY_SCAN", "true").lower() == "true"
|
||
)
|
||
|
||
|
||
def should_scan_file(file_path: Path) -> bool:
|
||
"""Determine if file should be scanned."""
|
||
path_str = str(file_path)
|
||
|
||
# Ignore patterns
|
||
for pattern in IGNORE_PATTERNS:
|
||
if re.search(pattern, path_str):
|
||
return False
|
||
|
||
# Only scan code files
|
||
code_extensions = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".java", ".rb", ".php", ".cs"}
|
||
return file_path.suffix in code_extensions
|
||
|
||
|
||
def is_comment_or_docstring(line: str, language: str) -> bool:
|
||
"""Check if line is a comment or docstring."""
|
||
line = line.strip()
|
||
|
||
if language == "python":
|
||
return line.startswith("#") or line.startswith('"""') or line.startswith("'''")
|
||
elif language in ["javascript", "typescript", "go", "java"]:
|
||
return line.startswith("//") or line.startswith("/*") or line.startswith("*")
|
||
|
||
return False
|
||
|
||
|
||
def analyze_secret_context(line: str, secret_type: str, variable_name: Optional[str] = None) -> bool:
|
||
"""Use GenAI to determine if a matched secret is real or test data.
|
||
|
||
Delegates to shared GenAI utility with graceful fallback to heuristics.
|
||
|
||
Returns:
|
||
True if it appears to be a real secret, False if likely test data
|
||
"""
|
||
# Extract variable context from line
|
||
var_context = ""
|
||
if "=" in line:
|
||
var_context = line.split("=")[0].strip()
|
||
|
||
# Call shared GenAI analyzer
|
||
response = analyzer.analyze(
|
||
SECRET_ANALYSIS_PROMPT,
|
||
line=line,
|
||
secret_type=secret_type,
|
||
variable_name=var_context or "N/A"
|
||
)
|
||
|
||
# Parse response using shared utility
|
||
if response:
|
||
is_real = parse_binary_response(
|
||
response,
|
||
true_keywords=["REAL", "LIKELY_REAL"],
|
||
false_keywords=["FAKE"]
|
||
)
|
||
if is_real is not None:
|
||
return is_real
|
||
|
||
# Fallback to heuristics if GenAI unavailable or ambiguous
|
||
return _heuristic_secret_check(line, secret_type, variable_name)
|
||
|
||
|
||
def _heuristic_secret_check(line: str, secret_type: str, variable_name: Optional[str] = None) -> bool:
|
||
"""Fallback heuristic check if GenAI unavailable.
|
||
|
||
Returns:
|
||
True if likely real secret, False if likely test data
|
||
"""
|
||
# Common test data indicators
|
||
test_indicators = [
|
||
"test_", "fake_", "mock_", "example_", "dummy_",
|
||
"test123", "fake123", "mock123",
|
||
"sk-test", "pk_test", "rk_test",
|
||
"00000000", "11111111", "aaaaaaa", "99999999",
|
||
"placeholder", "sample", "demo", "xxx",
|
||
]
|
||
|
||
line_lower = line.lower()
|
||
for indicator in test_indicators:
|
||
if indicator in line_lower:
|
||
return False
|
||
|
||
# If no obvious test indicators, assume real (conservative approach)
|
||
return True
|
||
|
||
|
||
def get_language(file_path: Path) -> str:
|
||
"""Get language from file extension."""
|
||
ext_map = {
|
||
".py": "python",
|
||
".js": "javascript",
|
||
".jsx": "javascript",
|
||
".ts": "typescript",
|
||
".tsx": "typescript",
|
||
".go": "go",
|
||
".java": "java",
|
||
}
|
||
return ext_map.get(file_path.suffix, "unknown")
|
||
|
||
|
||
def scan_file(file_path: Path) -> List[Tuple[int, str, str]]:
|
||
"""Scan a file for secrets with GenAI context analysis.
|
||
|
||
Returns:
|
||
List of (line_number, secret_type, matched_text) tuples
|
||
"""
|
||
violations = []
|
||
language = get_language(file_path)
|
||
|
||
try:
|
||
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
||
for line_num, line in enumerate(f, 1):
|
||
# Skip comments and docstrings
|
||
if is_comment_or_docstring(line, language):
|
||
continue
|
||
|
||
# Check each pattern
|
||
for pattern, secret_type in SECRET_PATTERNS:
|
||
if re.search(pattern, line):
|
||
# Extract matched text (redacted)
|
||
match = re.search(pattern, line)
|
||
matched = match.group(0)
|
||
# Redact middle part
|
||
if len(matched) > 10:
|
||
redacted = matched[:5] + "***" + matched[-5:]
|
||
else:
|
||
redacted = "***"
|
||
|
||
# Use GenAI to determine if this is a real secret or test data
|
||
is_real_secret = analyze_secret_context(line, secret_type)
|
||
|
||
if is_real_secret:
|
||
violations.append((line_num, secret_type, redacted))
|
||
elif os.environ.get("DEBUG_SECURITY_SCAN"):
|
||
print(f"ℹ️ Skipped test data in {file_path}:{line_num} ({secret_type})",
|
||
file=sys.stderr)
|
||
|
||
except Exception as e:
|
||
print(f"⚠️ Error scanning {file_path}: {e}", file=sys.stderr)
|
||
|
||
return violations
|
||
|
||
|
||
def scan_directory(directory: Path = Path(".")) -> dict:
|
||
"""Scan directory for secrets.
|
||
|
||
Returns:
|
||
Dictionary mapping file paths to violations
|
||
"""
|
||
all_violations = {}
|
||
|
||
# Scan source directories
|
||
for source_dir in ["src", "lib", "pkg", "app"]:
|
||
dir_path = directory / source_dir
|
||
if not dir_path.exists():
|
||
continue
|
||
|
||
for file_path in dir_path.rglob("*"):
|
||
if not file_path.is_file():
|
||
continue
|
||
|
||
if not should_scan_file(file_path):
|
||
continue
|
||
|
||
violations = scan_file(file_path)
|
||
if violations:
|
||
all_violations[file_path] = violations
|
||
|
||
return all_violations
|
||
|
||
|
||
def main():
|
||
"""Run security scan with GenAI context analysis."""
|
||
use_genai = os.environ.get("GENAI_SECURITY_SCAN", "true").lower() == "true"
|
||
genai_status = "🤖 (with GenAI context analysis)" if use_genai else ""
|
||
print(f"🔒 Running security scan... {genai_status}")
|
||
|
||
violations = scan_directory()
|
||
|
||
if not violations:
|
||
print("✅ No secrets or sensitive data detected")
|
||
if use_genai:
|
||
print(" (GenAI context analysis reduced false positives)")
|
||
sys.exit(0)
|
||
|
||
# Report violations
|
||
print("\n❌ SECURITY ISSUES DETECTED:\n")
|
||
|
||
for file_path, issues in violations.items():
|
||
print(f"📄 {file_path}")
|
||
for line_num, secret_type, redacted in issues:
|
||
print(f" Line {line_num}: {secret_type}")
|
||
print(f" Found: {redacted}")
|
||
print()
|
||
|
||
print("⚠️ Fix these issues before committing:")
|
||
print(" 1. Move secrets to .env file (add to .gitignore)")
|
||
print(" 2. Use environment variables: os.getenv('API_KEY')")
|
||
print(" 3. Never commit real API keys or passwords")
|
||
print()
|
||
|
||
if use_genai:
|
||
print("💡 Tip: GenAI analysis reduces false positives by understanding context")
|
||
print(" Disable with: export GENAI_SECURITY_SCAN=false")
|
||
print()
|
||
|
||
sys.exit(1)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|