TradingAgents/.claude/lib/github_issue_fetcher.py

485 lines
16 KiB
Python

#!/usr/bin/env python3
"""
GitHub Issue Fetcher - Fetch issue titles via gh CLI for batch processing.
Provides secure GitHub issue fetching functionality for /batch-implement --issues flag.
Uses gh CLI for GitHub operations with comprehensive security validation.
Security Features:
- CWE-20: Input validation (positive integers, max 100 issues)
- CWE-78: Command injection prevention (subprocess list args, shell=False)
- CWE-117: Log injection prevention (sanitize newlines, control characters)
- Audit logging for all gh CLI operations
Key Functions:
1. validate_issue_numbers() - Validate issue numbers before subprocess calls
2. fetch_issue_title() - Fetch single issue title via gh CLI
3. fetch_issue_titles() - Batch fetch multiple issue titles
4. format_feature_description() - Format issue as feature description
Workflow:
1. Parse --issues argument: "72,73,74" → [72, 73, 74]
2. Validate issue numbers (validate_issue_numbers)
3. Fetch titles from GitHub (fetch_issue_titles)
4. Format as features (format_feature_description)
5. Create batch state with issue_numbers
Usage:
from github_issue_fetcher import (
validate_issue_numbers,
fetch_issue_titles,
format_feature_description,
)
# Parse issue numbers
issue_numbers = [72, 73, 74]
# Validate
validate_issue_numbers(issue_numbers)
# Fetch titles
issue_titles = fetch_issue_titles(issue_numbers)
# Returns: {72: "Add logging", 73: "Fix bug"}
# Format as features
features = [
format_feature_description(num, title)
for num, title in issue_titles.items()
]
# Returns: ["Issue #72: Add logging", "Issue #73: Fix bug"]
Date: 2025-11-16
Issue: #77 (Add --issues flag to /batch-implement)
Agent: implementer
Phase: TDD Green (making tests pass)
See error-handling-patterns skill for exception hierarchy and error handling best practices.
Design Patterns:
See library-design-patterns skill for standardized design patterns.
See api-integration-patterns skill for standardized design patterns.
"""
import json
import subprocess
from pathlib import Path
from typing import List, Dict, Optional
from subprocess import TimeoutExpired
# Import security utilities for audit logging
import sys
sys.path.insert(0, str(Path(__file__).parent))
from security_utils import audit_log
# =============================================================================
# EXCEPTIONS
# =============================================================================
class GitHubAPIError(Exception):
    """Root exception type for GitHub API failures defined by this module."""
class IssueNotFoundError(GitHubAPIError):
    """GitHubAPIError subtype for a GitHub issue that cannot be found."""
# =============================================================================
# CONSTANTS
# =============================================================================
# Upper bound on the number of issues accepted in one batch; keeps a single
# request from exhausting resources.
MAX_ISSUES_PER_BATCH: int = 100
# How long (in seconds) a single gh CLI invocation may run before timing out.
GH_CLI_TIMEOUT: int = 10
# Titles longer than this many characters are truncated to limit log bloat.
MAX_TITLE_LENGTH: int = 200
# =============================================================================
# INPUT VALIDATION (CWE-20)
# =============================================================================
def validate_issue_numbers(
    issue_numbers: List[int],
    *,
    max_issues: Optional[int] = None,
) -> None:
    """Validate GitHub issue numbers before they are handed to the gh CLI.

    Security (CWE-20): Input Validation
    - Accept only positive integers (>0)
    - Reject zero, negative numbers, non-integers and booleans
    - Enforce a maximum batch size (MAX_ISSUES_PER_BATCH unless overridden)
    - Prevent resource exhaustion via oversized batches

    On success an audit log entry is written containing the batch size and
    the first 10 issue numbers.

    Args:
        issue_numbers: List of GitHub issue numbers.
        max_issues: Optional batch-size limit for this call. When None
            (the default) MAX_ISSUES_PER_BATCH is used.

    Raises:
        ValueError: If the list is empty, is longer than the batch limit, or
            contains anything other than a positive integer.

    Examples:
        >>> validate_issue_numbers([72, 73, 74])  # Valid, returns None
        >>> validate_issue_numbers([])  # Raises ValueError
        ValueError: Issue numbers list cannot be empty. Provide at least one issue number.
        >>> validate_issue_numbers([-1])  # Raises ValueError
        ValueError: Invalid issue number: -1. Issue numbers must be positive integers (>0).
        >>> validate_issue_numbers([True])  # Raises ValueError
        ValueError: Invalid issue number: True. Issue numbers must be positive integers (>0).
        >>> validate_issue_numbers(list(range(1, 102)))  # Raises ValueError
        ValueError: Too many issues: 101. Maximum allowed is 100 issues per batch. Consider splitting into multiple batches.

    Security Notes:
        - This function MUST be called BEFORE any subprocess calls
        - The checks run in order: empty list, batch size, then each number
    """
    # Check for empty list
    if not issue_numbers:
        raise ValueError(
            "Issue numbers list cannot be empty. "
            "Provide at least one issue number."
        )

    # Check maximum batch size (module default unless the caller overrides it)
    limit = MAX_ISSUES_PER_BATCH if max_issues is None else max_issues
    if len(issue_numbers) > limit:
        raise ValueError(
            f"Too many issues: {len(issue_numbers)}. "
            f"Maximum allowed is {limit} issues per batch. "
            f"Consider splitting into multiple batches."
        )

    # Validate each issue number
    for num in issue_numbers:
        # bool is a subclass of int, so True/False would otherwise slip past
        # the isinstance check and be passed to gh as "True"/"False".
        if isinstance(num, bool) or not isinstance(num, int) or num <= 0:
            raise ValueError(
                f"Invalid issue number: {num}. "
                f"Issue numbers must be positive integers (>0)."
            )

    # Audit log successful validation
    audit_log("github_issue_validation", "success", {
        "operation": "validate_issue_numbers",
        "count": len(issue_numbers),
        "issue_numbers": issue_numbers[:10],  # Log first 10 for audit trail
    })
# =============================================================================
# SINGLE ISSUE FETCHING (CWE-78)
# =============================================================================
def _log_fetch_error(issue_number: int, error: str) -> None:
    """Record a failed fetch_issue_title() attempt in the audit log."""
    audit_log("github_issue_fetch", "error", {
        "operation": "fetch_issue_title",
        "issue_number": issue_number,
        "error": error,
    })


def fetch_issue_title(issue_number: int) -> Optional[str]:
    """Fetch single GitHub issue title via gh CLI.

    Security (CWE-78): Command Injection Prevention
    - Use subprocess.run() with LIST arguments (not string)
    - shell=False (CRITICAL security requirement)
    - GH_CLI_TIMEOUT-second timeout to prevent hung processes
    - Audit log all gh CLI operations

    Args:
        issue_number: GitHub issue number

    Returns:
        The issue title on success. None (graceful degradation) when gh exits
        with a non-zero status -- issue not found, authentication required,
        rate limit exceeded, or any other CLI error -- and when gh's output
        is not valid JSON, is not a JSON object, or carries a non-string
        title. Every None outcome is audit logged.

    Raises:
        FileNotFoundError: If gh CLI is not installed
        TimeoutExpired: If gh CLI runs longer than GH_CLI_TIMEOUT seconds
        OSError: If launching gh fails for another system-level reason

    Examples:
        >>> fetch_issue_title(72)  # Existing issue
        'Add logging feature'
        >>> fetch_issue_title(9999)  # Non-existent issue
        None
        >>> fetch_issue_title(72)  # gh CLI not installed
        FileNotFoundError: gh CLI not found. Install from: https://cli.github.com

    Security Notes:
        - CRITICAL: Uses subprocess.run() with list args (prevents command injection)
        - CRITICAL: shell=False prevents shell metacharacter attacks
        - All operations are audit logged
    """
    # Only the subprocess launch sits inside the try block, so an OSError or
    # FileNotFoundError raised elsewhere (for example while audit logging) is
    # never misreported as a missing gh binary.
    try:
        # SECURITY CRITICAL: Use list arguments, shell=False
        # This prevents command injection via issue_number
        result = subprocess.run(
            ['gh', 'issue', 'view', str(issue_number), '--json', 'title'],
            capture_output=True,
            text=True,
            timeout=GH_CLI_TIMEOUT,
            shell=False,  # CRITICAL: Never use shell=True
        )
    except FileNotFoundError as err:
        # gh CLI not installed
        _log_fetch_error(issue_number, "gh CLI not found")
        raise FileNotFoundError(
            "gh CLI not found. Install from: https://cli.github.com\n"
            "After installing, authenticate with: gh auth login"
        ) from err
    except TimeoutExpired:
        # gh CLI timeout
        _log_fetch_error(issue_number, f"Timeout after {GH_CLI_TIMEOUT} seconds")
        raise
    except OSError as err:
        # Network or system error
        _log_fetch_error(issue_number, str(err))
        raise

    # Non-zero exit: classify by stderr text and degrade gracefully to None.
    if result.returncode != 0:
        stderr_lower = result.stderr.lower()

        # Issue not found (404)
        if 'no pull requests or issues found' in stderr_lower or 'not found' in stderr_lower:
            audit_log(f"Issue #{issue_number} not found (404)", "not_found", {
                "operation": "fetch_issue_title",
                "issue_number": issue_number,
                "error": "Issue not found (404)",
            })
            return None

        if 'authentication' in stderr_lower or 'unauthorized' in stderr_lower:
            _log_fetch_error(issue_number, "Authentication required")
        elif 'rate limit' in stderr_lower:
            _log_fetch_error(issue_number, "API rate limit exceeded")
        else:
            # Other errors: log only the first 200 chars of stderr
            _log_fetch_error(issue_number, result.stderr[:200])
        return None

    # Parse JSON response
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        _log_fetch_error(issue_number, f"JSON parse error: {err}")
        return None

    # Guard against valid JSON of an unexpected shape; without these checks
    # data.get() or len(title) would raise instead of degrading gracefully.
    if not isinstance(data, dict):
        _log_fetch_error(
            issue_number,
            f"Unexpected JSON payload type: {type(data).__name__}",
        )
        return None

    title = data.get('title', '')
    if not isinstance(title, str):
        _log_fetch_error(
            issue_number,
            f"Unexpected title type: {type(title).__name__}",
        )
        return None

    # Audit log success
    audit_log(f"Successfully fetched issue #{issue_number}", "success", {
        "operation": "fetch_issue_title",
        "issue_number": issue_number,
        "title_length": len(title),
    })
    return title
# =============================================================================
# BATCH ISSUE FETCHING
# =============================================================================
def fetch_issue_titles(issue_numbers: List[int]) -> Dict[int, str]:
    """Fetch titles for several GitHub issues, one gh CLI call per issue.

    Each number is passed to fetch_issue_title(). Issues for which that call
    returns None are skipped and reported in the audit log; the batch only
    fails when no title at all could be fetched.

    Args:
        issue_numbers: List of GitHub issue numbers

    Returns:
        Dict mapping issue_number -> title, containing only the issues whose
        title was fetched.

    Raises:
        ValueError: If no title could be fetched for any issue (this includes
            an empty issue_numbers list)
        FileNotFoundError: Propagated from fetch_issue_title() when gh CLI is
            not installed
        TimeoutExpired: Propagated from fetch_issue_title() when gh CLI hangs

    Examples:
        >>> fetch_issue_titles([72, 73, 74])
        {72: 'Add logging', 73: 'Fix bug', 74: 'Update docs'}
        >>> fetch_issue_titles([72, 9999, 74])  # 9999 doesn't exist
        {72: 'Add logging', 74: 'Update docs'}
        >>> fetch_issue_titles([9998, 9999])  # All missing
        ValueError: No issues found. All issue numbers are invalid or don't exist: [9998, 9999]

    Security Notes:
        - Input validation should be done BEFORE calling this function
        - Batch start, per-issue success, partial failure and completion are
          all audit logged
    """
    audit_log("github_issue_fetch_batch start", "info", {
        "operation": "fetch_issue_titles",
        "count": len(issue_numbers),
        "issue_numbers": issue_numbers[:10],  # only the first 10 are logged
    })

    titles_by_issue = {}
    unfetched = []

    for issue_no in issue_numbers:
        fetched = fetch_issue_title(issue_no)
        if fetched is None:
            # Remember the failure and move on; one bad issue must not
            # abort the rest of the batch.
            unfetched.append(issue_no)
            continue

        titles_by_issue[issue_no] = fetched
        audit_log(f"Batch: fetched issue #{issue_no}", "info", {
            "operation": "fetch_issue_titles_item",
            "issue_number": issue_no,
        })

    # Nothing fetched at all: treat the whole batch as failed.
    if not titles_by_issue:
        audit_log("github_issue_fetch_batch", "error", {
            "operation": "fetch_issue_titles",
            "error": "All issues failed to fetch",
            "missing_issues": unfetched,
        })
        message = (
            f"No issues found. All issue numbers are invalid or don't exist: {unfetched}\n"
            "Please verify the issue numbers and try again."
        )
        raise ValueError(message)

    # Partial success: record which issues were skipped.
    if unfetched:
        audit_log("github_issue_fetch_batch", "warning", {
            "operation": "fetch_issue_titles",
            "successful": len(titles_by_issue),
            "missing": len(unfetched),
            "missing_issues": unfetched,
        })

    audit_log("github_issue_fetch_batch complete", "info", {
        "operation": "fetch_issue_titles",
        "successful": len(titles_by_issue),
        "total": len(issue_numbers),
    })
    return titles_by_issue
# =============================================================================
# OUTPUT FORMATTING (CWE-117)
# =============================================================================
def format_feature_description(
    issue_number: int,
    title: str,
    *,
    max_length: Optional[int] = None,
) -> str:
    r"""Format issue as a single-line feature description.

    Security (CWE-117): Log Injection Prevention
    - Replace newlines (\n, \r) with spaces
    - Drop every character for which str.isprintable() is false: control
      characters (\t, \x00, \x1b, ...), format characters and all
      separators other than the plain space
    - Collapse runs of spaces and trim both ends
    - Truncate long titles and append "..."
    - Handle None, empty and whitespace-only titles

    Printable non-ASCII characters (accented letters, CJK, emoji) are kept,
    so international titles are not mangled; for pure-ASCII titles only
    codes 32-126 survive.

    Args:
        issue_number: GitHub issue number
        title: Issue title (may contain malicious characters). None is
            treated like an empty title.
        max_length: Maximum number of title characters kept before "..." is
            appended; expected to be non-negative. When None (the default)
            MAX_TITLE_LENGTH is used.

    Returns:
        Formatted feature description, e.g. "Issue #72: Add logging feature",
        or "Issue #72: (no title)" when nothing printable remains.

    Examples:
        >>> format_feature_description(72, "Add logging feature")
        'Issue #72: Add logging feature'
        >>> format_feature_description(72, "Title\nINJECTED\nLOG")
        'Issue #72: Title INJECTED LOG'
        >>> format_feature_description(72, "")
        'Issue #72: (no title)'
        >>> format_feature_description(72, "A" * 500)  # 200 A's then "..."
        'Issue #72: AAAA...AAA...'
    """
    # SECURITY: newlines become spaces first so adjacent words stay separated
    # (CWE-117 prevention). `or ""` tolerates a None title.
    text = (title or "").replace('\n', ' ').replace('\r', ' ')

    # SECURITY: drop everything that is not printable.
    text = ''.join(ch for ch in text if ch.isprintable())

    # Collapse multiple spaces; this also trims leading/trailing spaces.
    text = ' '.join(text.split())

    # Nothing left after sanitization (empty, None or control-only title)
    if not text:
        return f"Issue #{issue_number}: (no title)"

    # SECURITY: Truncate long titles (prevent log bloat)
    limit = MAX_TITLE_LENGTH if max_length is None else max_length
    if len(text) > limit:
        text = text[:limit] + "..."

    return f"Issue #{issue_number}: {text}"