TradingAgents/.claude/lib/failure_classifier.py

397 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Failure Classifier - Classify /auto-implement failures as transient vs permanent.
Classifies error messages to determine if a failed /auto-implement attempt should
be retried (transient errors like network issues) or marked as failed (permanent
errors like syntax errors).
Key Features:
1. Pattern-based classification (transient vs permanent)
2. Error message sanitization (CWE-117 log injection prevention)
3. Error context extraction for debugging
4. Case-insensitive pattern matching
5. Safe defaults (unknown errors → permanent, no retry)
Classification Strategy:
TRANSIENT (retriable):
- Network errors (ConnectionError, NetworkError)
- Timeout errors (TimeoutError, timeout)
- API rate limits (RateLimitError, HTTPError 429, "too many requests")
- Temporary service failures (HTTPError 502/503/504, TemporaryFailure)
PERMANENT (non-retriable):
- Syntax errors (SyntaxError, IndentationError)
- Import errors (ImportError, ModuleNotFoundError)
- Type errors (TypeError, AttributeError, NameError)
- Value errors (ValueError, KeyError, IndexError)
- Logic errors (AssertionError)
UNKNOWN → PERMANENT (safe default, don't retry)
Usage:
from failure_classifier import (
classify_failure,
is_transient_error,
is_permanent_error,
sanitize_error_message,
extract_error_context,
FailureType,
)
# Classify error
error_msg = "ConnectionError: Failed to connect to API"
failure_type = classify_failure(error_msg)
if failure_type == FailureType.TRANSIENT:
# Retry the operation
pass
# Check specific type
if is_transient_error(error_msg):
# Retry
pass
# Sanitize before logging
safe_msg = sanitize_error_message(error_msg)
log.error(safe_msg)
# Extract rich context
context = extract_error_context(error_msg, "Add user authentication")
Security:
- CWE-117: Log injection prevention via sanitization
- Max message length: 1000 chars (prevent resource exhaustion)
- Newline/carriage return removal
- Safe defaults (unknown → permanent)
Date: 2025-11-18
Issue: #89 (Automatic Failure Recovery for /batch-implement)
Agent: implementer
Phase: TDD Green (making tests pass)
See error-handling-patterns skill for exception hierarchy and error handling best practices.
"""
import re
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional
# =============================================================================
# Constants and Enums
# =============================================================================
class FailureType(Enum):
    """Result of classifying a failure: retry it, or give up.

    Members:
        TRANSIENT: Value ``"transient"``; the failure is considered retriable.
        PERMANENT: Value ``"permanent"``; the failure is not retried.
    """

    # Retriable (network, timeout, rate limit)
    TRANSIENT = "transient"

    # Non-retriable (syntax, import, type errors)
    PERMANENT = "permanent"
# Regex fragments that mark an error message as transient (retriable).
# Every fragment is lowercase: is_transient_error() lowercases the message
# before searching, which is what makes the match case-insensitive.
TRANSIENT_ERROR_PATTERNS: List[str] = [
    # Exception class names
    r"connectionerror",
    r"timeouterror",
    r"ratelimiterror",
    r"networkerror",
    # HTTP status codes, only when they follow "HTTPError" in the message
    r"httperror.*503",
    r"httperror.*502",
    r"httperror.*504",
    r"httperror.*429",
    # Free-text descriptions of temporary conditions
    r"temporaryfailure",
    r"service.*unavailable",
    r"bad.*gateway",
    r"gateway.*timeout",
    r"too.*many.*requests",
    r"connection.*refused",
    r"connection.*reset",
    r"operation.*timed.*out",
    r"timed.*out",
    r"network.*unreachable",
]
# Regex fragments that mark an error message as permanent (non-retriable).
# Every fragment is lowercase: is_permanent_error() lowercases the message
# before searching, which is what makes the match case-insensitive.
PERMANENT_ERROR_PATTERNS: List[str] = [
    # Code defects: syntax, names, types, values
    r"syntaxerror",
    r"importerror",
    r"typeerror",
    r"nameerror",
    r"attributeerror",
    r"valueerror",
    r"indentationerror",
    r"keyerror",
    r"indexerror",
    r"assertionerror",
    r"modulenotfounderror",
    # Filesystem and permission problems
    r"filenotfounderror",
    r"permissionerror",
    # Other runtime errors
    r"zerodivisionerror",
    r"notimplementederror",
    r"recursionerror",
]
# Upper bound, in characters, on a sanitized error message
# (guards against resource exhaustion from oversized messages).
MAX_ERROR_MESSAGE_LENGTH = 1_000
# =============================================================================
# Error Classification Functions
# =============================================================================
def is_transient_error(error_message: Optional[str]) -> bool:
    """Report whether an error message looks transient (retriable).

    The message is lowercased and searched against every regex in
    ``TRANSIENT_ERROR_PATTERNS``; a hit anywhere in the message counts.

    Args:
        error_message: Error message to inspect. ``None`` and the empty
            string are treated as "not transient".

    Returns:
        True if at least one transient pattern matches, False otherwise.
    """
    if not error_message:
        return False

    # The patterns are lowercase, so lowercasing the message once gives a
    # case-insensitive match.
    lowered = error_message.lower()
    return any(re.search(pattern, lowered) for pattern in TRANSIENT_ERROR_PATTERNS)
def is_permanent_error(error_message: Optional[str]) -> bool:
    """Report whether an error message looks permanent (non-retriable).

    The message is lowercased and searched against every regex in
    ``PERMANENT_ERROR_PATTERNS``; a hit anywhere in the message counts.

    Args:
        error_message: Error message to inspect. ``None`` and the empty
            string are treated as "not permanent".

    Returns:
        True if at least one permanent pattern matches, False otherwise.
    """
    if not error_message:
        return False

    # The patterns are lowercase, so lowercasing the message once gives a
    # case-insensitive match.
    lowered = error_message.lower()
    return any(re.search(pattern, lowered) for pattern in PERMANENT_ERROR_PATTERNS)
def classify_failure(error_message: Optional[str]) -> FailureType:
    """Decide whether a failure should be retried.

    A message is TRANSIENT only when ``is_transient_error`` recognises it.
    Everything else is PERMANENT: ``None`` or empty input, messages that
    match a permanent pattern, and messages that match nothing at all.
    Defaulting unknown errors to PERMANENT is the safe choice because it
    never triggers a retry.

    Transient patterns take precedence: a message matching both a transient
    and a permanent pattern is classified TRANSIENT.

    Args:
        error_message: Error message to classify (may be ``None``).

    Returns:
        ``FailureType.TRANSIENT`` or ``FailureType.PERMANENT``.

    Examples:
        >>> classify_failure("ConnectionError: Failed to connect")
        <FailureType.TRANSIENT: 'transient'>
        >>> classify_failure("SyntaxError: invalid syntax")
        <FailureType.PERMANENT: 'permanent'>
        >>> classify_failure("WeirdUnknownError: something happened")
        <FailureType.PERMANENT: 'permanent'>
    """
    if error_message and is_transient_error(error_message):
        return FailureType.TRANSIENT

    # Empty input, a recognised permanent error, or an unrecognised error:
    # none of these are retried.
    return FailureType.PERMANENT
# =============================================================================
# Error Message Sanitization
# =============================================================================
def sanitize_error_message(error_message: Optional[str]) -> str:
    """Make an error message safe to write to a log (CWE-117).

    Two transformations are applied, in this order:
        1. Every newline and carriage return becomes a single space, so the
           message cannot forge additional log lines.
        2. A result longer than ``MAX_ERROR_MESSAGE_LENGTH`` is cut so that,
           with the ``...[truncated]`` suffix appended, it is exactly
           ``MAX_ERROR_MESSAGE_LENGTH`` characters long.

    Args:
        error_message: Raw error message (may be ``None``).

    Returns:
        The sanitized message, or ``""`` for ``None``/empty input.

    Examples:
        >>> sanitize_error_message("Error\\nFAKE LOG: Admin access")
        'Error FAKE LOG: Admin access'
        >>> sanitize_error_message("Error: " + "X" * 10000).endswith("...[truncated]")
        True
    """
    if not error_message:
        return ""

    # One pass that maps both line-break characters to a space.
    line_breaks_to_space = str.maketrans({"\n": " ", "\r": " "})
    cleaned = error_message.translate(line_breaks_to_space)

    if len(cleaned) <= MAX_ERROR_MESSAGE_LENGTH:
        return cleaned

    # Reserve room for the suffix so the final length equals the maximum.
    suffix = "...[truncated]"
    return cleaned[:MAX_ERROR_MESSAGE_LENGTH - len(suffix)] + suffix
def sanitize_feature_name(feature_name: str) -> str:
    """Sanitize a feature name for safe storage and logging.

    Security measures, applied in this order:
        1. Newlines and carriage returns become spaces (CWE-117 log
           injection).
        2. If the name contains ``..``, every ``../`` and ``..\\`` sequence
           is removed (CWE-22 path traversal). Removal repeats until none
           remain, because a single pass can itself create a new sequence:
           ``....//`` collapses to ``../``. A ``" [sanitized]"`` marker is
           then appended unless the name already contains the word
           "sanitized" (case-insensitive).
        3. Names longer than 200 characters are cut so that, with the
           ``...[truncated]`` suffix, they are exactly 200 characters long.

    Args:
        feature_name: Raw feature name.

    Returns:
        The sanitized name, or ``""`` when ``feature_name`` is empty/falsy.

    Examples:
        >>> sanitize_feature_name("Add auth\\nFAKE LOG: Admin access")
        'Add auth FAKE LOG: Admin access'
        >>> sanitize_feature_name("../../etc/passwd")
        'etc/passwd [sanitized]'
        >>> sanitize_feature_name("....//etc/passwd")
        'etc/passwd [sanitized]'
        >>> sanitize_feature_name("Normal feature")
        'Normal feature'
    """
    if not feature_name:
        return ""

    # Flatten line breaks so the name cannot forge extra log lines (CWE-117).
    sanitized = feature_name.replace("\n", " ").replace("\r", " ")

    # Remove path traversal sequences (CWE-22).
    if ".." in sanitized:
        # Strip until stable. Each pass strictly shortens the string, so the
        # loop terminates; a single pass is not enough because removing the
        # inner "../" of "....//" leaves a fresh "../" behind.
        while "../" in sanitized or "..\\" in sanitized:
            sanitized = sanitized.replace("../", "").replace("..\\", "")

        # Add marker that this was sanitized
        if "sanitized" not in sanitized.lower():
            sanitized += " [sanitized]"

    # Truncate long names (prevent resource exhaustion); the slice leaves
    # room for the suffix so the result is exactly the maximum length.
    MAX_FEATURE_NAME_LENGTH = 200
    suffix = "...[truncated]"
    if len(sanitized) > MAX_FEATURE_NAME_LENGTH:
        sanitized = sanitized[:MAX_FEATURE_NAME_LENGTH - len(suffix)] + suffix

    return sanitized
# =============================================================================
# Error Context Extraction
# =============================================================================
def extract_error_context(
    error_message: Optional[str],
    feature_name: str,
) -> Dict[str, Any]:
    """Build a context dictionary describing a failure, for debugging/logging.

    The error type is derived from the *sanitized* message, so it can never
    contain newlines or exceed the sanitized length limit. It is:
        - the text before the first ``:`` (stripped), if the message
          contains a colon;
        - otherwise a leading ``<Word>Error`` token, if the message starts
          with one;
        - otherwise ``"Unknown"``.

    Args:
        error_message: Raw error message (may be ``None``).
        feature_name: Name of the feature being processed. It is stored in
            the context exactly as given.

    Returns:
        Dictionary with these keys:
            - error_type (str): Error class name, or ``"Unknown"``
            - error_message (str): Sanitized message
            - feature_name (str): Feature being processed
            - timestamp (str): Current UTC time, ISO 8601 with ``Z`` suffix
            - failure_type (str): ``"transient"`` or ``"permanent"``

    Examples:
        >>> context = extract_error_context(
        ...     "SyntaxError: invalid syntax",
        ...     "Add user authentication"
        ... )
        >>> context["error_type"]
        'SyntaxError'
        >>> context["failure_type"]
        'permanent'
    """
    # Sanitize first; everything written into the context except the
    # caller-supplied feature name is derived from this log-safe form.
    sanitized_message = sanitize_error_message(error_message)

    # Extract error type (text before the first colon).
    error_type = "Unknown"
    if ":" in sanitized_message:
        error_type = sanitized_message.split(":", 1)[0].strip()
    elif sanitized_message:
        # No colon: fall back to a leading "<Word>Error" class name.
        match = re.match(r"(\w+Error)", sanitized_message)
        if match:
            error_type = match.group(1)

    # Classify the original message; classification does not depend on
    # sanitization.
    failure_type = classify_failure(error_message)

    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
    # time and render the offset as "Z" to keep the same timestamp format.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # NOTE(review): feature_name is stored unsanitized, as in the original;
    # confirm whether callers are expected to run sanitize_feature_name first.
    context: Dict[str, Any] = {
        "error_type": error_type,
        "error_message": sanitized_message,
        "feature_name": feature_name,
        "timestamp": timestamp,
        "failure_type": failure_type.value,
    }
    return context
# =============================================================================
# Module Exports
# =============================================================================
# Public API of this module. sanitize_feature_name is included because it is
# a documented public helper defined alongside sanitize_error_message.
__all__ = [
    "FailureType",
    "classify_failure",
    "is_transient_error",
    "is_permanent_error",
    "sanitize_error_message",
    "sanitize_feature_name",
    "extract_error_context",
    "TRANSIENT_ERROR_PATTERNS",
    "PERMANENT_ERROR_PATTERNS",
]