TradingAgents/.claude/lib/error_analyzer.py

#!/usr/bin/env python3
"""
Error Analyzer Library - Analyze captured tool errors for GitHub issue creation.

Reads error registry from .claude/logs/errors/, classifies errors using
failure_classifier.py, deduplicates via fingerprinting, and returns
structured reports for actionable errors.

Key Features:
1. Error registry reading from JSONL files
2. Integration with failure_classifier.py for transient/permanent classification
3. Error fingerprinting for deduplication
4. Filtering for actionable errors (permanent only, not transient)
5. Structured error reports for issue creation

Security:
- CWE-117: Log injection prevention via existing sanitization
- CWE-532: Secret redaction for API keys, tokens
- CWE-22: Path validation via validation.py
- CWE-400: Resource limits (max errors per session)

Date: 2025-12-13
Issue: #124 (Automated error capture and analysis)
Agent: implementer

See error-handling-patterns skill for exception hierarchy and error handling best practices.

Design Patterns:
    See library-design-patterns skill for standardized design patterns.
"""

import hashlib
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Import security utilities
try:
    from .security_utils import audit_log
except ImportError:
    lib_dir = Path(__file__).parent.resolve()
    sys.path.insert(0, str(lib_dir))
    from security_utils import audit_log

# Import failure classifier
try:
    from .failure_classifier import (
        classify_failure,
        FailureType,
        sanitize_error_message,
    )
except ImportError:
    from failure_classifier import (
        classify_failure,
        FailureType,
        sanitize_error_message,
    )

# Import path utilities
try:
    from .path_utils import get_project_root
except ImportError:
    from path_utils import get_project_root


# =============================================================================
# Constants
# =============================================================================

# Maximum number of registry lines read per session file (CWE-400 resource limit)
MAX_ERRORS_PER_SESSION = 500

# Maximum error message length (prevent memory exhaustion)
MAX_ERROR_MESSAGE_LENGTH = 1000

# Secret patterns for redaction (CWE-532).
# Each entry is a regular expression; redact_secrets() applies them in order
# with re.IGNORECASE and replaces every match with "[REDACTED]".
SECRET_PATTERNS = [
    r"sk-[a-zA-Z0-9]{20,}",  # OpenAI API key (legacy, purely alphanumeric body)
    # Prefixed keys such as sk-proj-... / sk-ant-...: the body may contain "-"
    # and "_", which the legacy pattern above cannot match. The leading \b
    # keeps ordinary hyphenated words ending in "sk" (e.g. "risk-...") intact.
    r"\bsk-[a-zA-Z0-9]{2,16}-[a-zA-Z0-9_-]{20,}",
    r"anthropic_[a-zA-Z0-9_-]{20,}",  # Anthropic-style key with explicit prefix
    # GitHub tokens: ghp_ (PAT), gho_ (OAuth), ghu_ (user-to-server),
    # ghs_ (server-to-server), ghr_ (refresh)
    r"gh[pousr]_[a-zA-Z0-9]{20,}",
    r"github_pat_[a-zA-Z0-9_]{20,}",  # GitHub fine-grained PAT
    r"Bearer\s+[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+",  # JWT
    r"api[_-]?key[\"']?\s*[=:]\s*[\"']?[a-zA-Z0-9_-]{16,}",  # Generic API key
    r"password[\"']?\s*[=:]\s*[\"']?[^\s\"']+",  # Password assignments
    r"secret[\"']?\s*[=:]\s*[\"']?[a-zA-Z0-9_-]{16,}",  # Generic secret
]


# =============================================================================
# Data Classes
# =============================================================================

class ErrorEntry:
    """One captured tool failure, as stored in (or loaded from) the registry.

    ``failure_type`` and ``fingerprint`` start out as ``None``; they are
    filled in later by whoever classifies / fingerprints the entry.
    """

    def __init__(
        self,
        timestamp: str,
        tool_name: str,
        exit_code: Optional[int],
        error_message: str,
        context: Optional[Dict[str, Any]] = None,
    ):
        # Analysis results - unset until populated by the caller.
        self.failure_type: Optional[FailureType] = None
        self.fingerprint: Optional[str] = None

        # Raw captured fields.
        self.timestamp = timestamp
        self.tool_name = tool_name
        self.exit_code = exit_code
        self.error_message = error_message
        # A missing or empty context always becomes a fresh empty dict.
        self.context = context if context else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot suitable for serialization.

        The failure type is emitted as its enum ``value`` (or ``None`` when
        the entry has not been classified).
        """
        if self.failure_type:
            type_label = self.failure_type.value
        else:
            type_label = None

        payload: Dict[str, Any] = {}
        payload["timestamp"] = self.timestamp
        payload["tool_name"] = self.tool_name
        payload["exit_code"] = self.exit_code
        payload["error_message"] = self.error_message
        payload["context"] = self.context
        payload["failure_type"] = type_label
        payload["fingerprint"] = self.fingerprint
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ErrorEntry":
        """Build an entry from a dictionary, defaulting any missing keys.

        Missing keys fall back to: empty timestamp, tool name ``"unknown"``,
        no exit code, empty message and empty context. A truthy
        ``failure_type`` value is converted through ``FailureType(...)``.
        """
        init_kwargs = {
            "timestamp": data.get("timestamp", ""),
            "tool_name": data.get("tool_name", "unknown"),
            "exit_code": data.get("exit_code"),
            "error_message": data.get("error_message", ""),
            "context": data.get("context", {}),
        }
        restored = cls(**init_kwargs)

        stored_type = data.get("failure_type")
        if stored_type:
            restored.failure_type = FailureType(stored_type)

        restored.fingerprint = data.get("fingerprint")
        return restored


class ErrorReport:
    """Result bundle of one analysis run, ready to feed issue creation.

    Holds the entries judged actionable, the remaining (non-actionable)
    entries, the fingerprints that were seen more than once, the overall
    error count and the date the report covers.
    """

    def __init__(
        self,
        actionable_errors: List[ErrorEntry],
        transient_errors: List[ErrorEntry],
        duplicate_fingerprints: List[str],
        total_errors: int,
        session_date: str,
    ):
        # Identification / totals
        self.session_date = session_date
        self.total_errors = total_errors
        # Entry buckets
        self.actionable_errors = actionable_errors
        self.transient_errors = transient_errors
        self.duplicate_fingerprints = duplicate_fingerprints

    def to_dict(self) -> Dict[str, Any]:
        """Return a serializable dict, including per-bucket counts.

        Each contained entry is converted with its own ``to_dict()``.
        """
        actionable_payload = []
        for entry in self.actionable_errors:
            actionable_payload.append(entry.to_dict())

        transient_payload = []
        for entry in self.transient_errors:
            transient_payload.append(entry.to_dict())

        summary: Dict[str, Any] = {
            "actionable_errors": actionable_payload,
            "transient_errors": transient_payload,
            "duplicate_fingerprints": self.duplicate_fingerprints,
            "total_errors": self.total_errors,
            "session_date": self.session_date,
        }
        summary["actionable_count"] = len(self.actionable_errors)
        summary["transient_count"] = len(self.transient_errors)
        return summary


# =============================================================================
# Error Analyzer
# =============================================================================

class ErrorAnalyzer:
    """Analyzes captured errors for GitHub issue creation.

    The pipeline (see ``analyze``) reads one day's JSONL registry file from
    ``<project_root>/.claude/logs/errors/``, classifies every entry with
    ``classify_failure``, fingerprints and deduplicates the entries, and
    finally separates PERMANENT failures from everything else.
    """

    def __init__(self, project_root: Optional[Path] = None):
        """
        Initialize error analyzer.

        Args:
            project_root: Project root directory (auto-detected via
                get_project_root() if not provided)
        """
        if project_root is None:
            project_root = get_project_root()
        self.project_root = Path(project_root)
        self.errors_dir = self.project_root / ".claude" / "logs" / "errors"
        # Fingerprints already encountered by deduplicate_errors(); cleared at
        # the start of every analyze() run.
        self._seen_fingerprints: set = set()

    @staticmethod
    def _parse_registry_line(line: str) -> Optional["ErrorEntry"]:
        """Turn one non-empty registry line into an ErrorEntry, or None if unusable."""
        try:
            data = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None

        # Valid JSON that is not an object (list, number, string, null) has no
        # fields to read; treat it like any other malformed line.
        if not isinstance(data, dict):
            return None

        try:
            entry = ErrorEntry.from_dict(data)
        except (ValueError, TypeError):
            # e.g. a stored failure_type that is not a FailureType value
            return None

        # JSON null / non-string messages would break the text processing done
        # later in the pipeline, so normalise them to a string here.
        if not isinstance(entry.error_message, str):
            entry.error_message = (
                "" if entry.error_message is None else str(entry.error_message)
            )
        return entry

    def read_error_registry(self, date: Optional[str] = None) -> List["ErrorEntry"]:
        """
        Read errors from registry for a specific date.

        At most MAX_ERRORS_PER_SESSION lines of the file are examined (blank
        and malformed lines count towards that limit). Malformed lines are
        skipped. Read failures are audit-logged and whatever was parsed so
        far is returned.

        Args:
            date: Date string (YYYY-MM-DD). If None, uses today. Any value
                that does not have exactly this shape is rejected (audit-logged,
                empty result) because it is interpolated into a file name
                (CWE-22).

        Returns:
            List of ErrorEntry objects
        """
        if date is None:
            date_str = datetime.now().strftime("%Y-%m-%d")
        else:
            date_str = str(date)
            if re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", date_str) is None:
                audit_log(
                    "error_analyzer_invalid_date",
                    "warning",
                    {"date": date_str[:32]},
                )
                return []

        error_file = self.errors_dir / f"{date_str}.jsonl"

        if not error_file.exists():
            return []

        errors: List["ErrorEntry"] = []
        try:
            # errors="replace" keeps a stray undecodable byte from aborting the
            # whole read; the affected line then simply fails JSON parsing or
            # carries a replacement character.
            with open(error_file, "r", encoding="utf-8", errors="replace") as f:
                for i, line in enumerate(f):
                    if i >= MAX_ERRORS_PER_SESSION:
                        audit_log(
                            "error_analyzer_limit_reached",
                            "warning",
                            {"max": MAX_ERRORS_PER_SESSION, "file": str(error_file)},
                        )
                        break

                    line = line.strip()
                    if not line:
                        continue

                    entry = self._parse_registry_line(line)
                    if entry is not None:
                        errors.append(entry)

        except OSError as e:
            audit_log(
                "error_analyzer_read_failed",
                "failure",
                {"file": str(error_file), "error": str(e)},
            )

        return errors

    def classify_errors(self, errors: List["ErrorEntry"]) -> List["ErrorEntry"]:
        """
        Classify errors as transient or permanent.

        Each entry's ``failure_type`` is overwritten with the result of
        ``classify_failure(error_message)``.

        Args:
            errors: List of errors to classify

        Returns:
            Same list with failure_type populated
        """
        for error in errors:
            error.failure_type = classify_failure(error.error_message)
        return errors

    def create_fingerprint(self, error: "ErrorEntry") -> str:
        """
        Create unique fingerprint for error deduplication.

        Fingerprint = hash(tool_name + error_type + normalized_message)

        Args:
            error: Error to fingerprint

        Returns:
            SHA-256 fingerprint (first 16 chars)
        """
        raw_message = error.error_message
        if raw_message is None:
            raw_message = ""
        elif not isinstance(raw_message, str):
            raw_message = str(raw_message)

        # Normalize message: lowercase, remove numbers, collapse whitespace,
        # so that messages differing only in ids/ports/durations or layout
        # hash to the same value.
        normalized = raw_message.lower()
        normalized = re.sub(r"\d+", "N", normalized)  # Replace numbers
        normalized = re.sub(r"\s+", " ", normalized)  # Collapse whitespace
        normalized = normalized[:200]  # Cap length for hashing

        # Build fingerprint input
        type_label = error.failure_type.value if error.failure_type else "unknown"
        fingerprint_input = f"{error.tool_name}:{type_label}:{normalized}"

        # Hash and truncate
        hash_obj = hashlib.sha256(fingerprint_input.encode("utf-8"))
        return hash_obj.hexdigest()[:16]

    def deduplicate_errors(
        self, errors: List["ErrorEntry"]
    ) -> Tuple[List["ErrorEntry"], List[str]]:
        """
        Remove duplicate errors based on fingerprints.

        Every entry (kept or not) gets its ``fingerprint`` attribute set.
        Fingerprints are remembered on the instance, so repeated calls on the
        same analyzer also treat entries seen in earlier calls as duplicates
        until analyze() resets the state.

        Args:
            errors: List of errors to deduplicate

        Returns:
            Tuple of (unique errors, duplicate fingerprints); a fingerprint
            appears in the second list once per repeated occurrence
        """
        unique = []
        duplicates = []

        for error in errors:
            fingerprint = self.create_fingerprint(error)
            error.fingerprint = fingerprint

            if fingerprint in self._seen_fingerprints:
                duplicates.append(fingerprint)
            else:
                self._seen_fingerprints.add(fingerprint)
                unique.append(error)

        return unique, duplicates

    def filter_actionable(
        self, errors: List["ErrorEntry"]
    ) -> Tuple[List["ErrorEntry"], List["ErrorEntry"]]:
        """
        Filter for actionable errors (permanent only).

        Anything whose failure_type is not FailureType.PERMANENT - including
        entries that were never classified - lands in the second list.

        Args:
            errors: List of classified errors

        Returns:
            Tuple of (actionable errors, transient errors)
        """
        actionable = []
        transient = []

        for error in errors:
            if error.failure_type == FailureType.PERMANENT:
                actionable.append(error)
            else:
                transient.append(error)

        return actionable, transient

    def analyze(self, date: Optional[str] = None) -> "ErrorReport":
        """
        Full analysis pipeline: read, classify, deduplicate, filter.

        Args:
            date: Date to analyze (default: today)

        Returns:
            ErrorReport with actionable and transient errors
        """
        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")

        # Reset fingerprints for new analysis
        self._seen_fingerprints.clear()

        # Pipeline
        errors = self.read_error_registry(date)
        errors = self.classify_errors(errors)
        unique_errors, duplicates = self.deduplicate_errors(errors)
        actionable, transient = self.filter_actionable(unique_errors)

        # Unique entries plus dropped duplicates = number of entries read.
        total = len(unique_errors) + len(duplicates)

        audit_log(
            "error_analysis_complete",
            "success",
            {
                "date": date,
                "total": total,
                "actionable": len(actionable),
                "transient": len(transient),
                "duplicates": len(duplicates),
            },
        )

        return ErrorReport(
            actionable_errors=actionable,
            transient_errors=transient,
            duplicate_fingerprints=duplicates,
            total_errors=total,
            session_date=date,
        )


# =============================================================================
# Utility Functions
# =============================================================================

def redact_secrets(message: str) -> str:
    """
    Replace anything that looks like a credential with ``[REDACTED]``.

    Every expression in SECRET_PATTERNS is applied in list order,
    case-insensitively, each one operating on the output of the previous.

    Args:
        message: Text that may contain API keys, tokens or passwords

    Returns:
        The text with all pattern matches replaced
    """
    scrubbed = message
    for secret_regex in SECRET_PATTERNS:
        matcher = re.compile(secret_regex, re.IGNORECASE)
        scrubbed = matcher.sub("[REDACTED]", scrubbed)
    return scrubbed


def format_error_for_issue(error: ErrorEntry) -> str:
    """
    Format error for GitHub issue body.

    Both the error message and the context are passed through
    redact_secrets() before being embedded (CWE-532). Redaction happens
    BEFORE truncation: cutting first could shorten a secret that straddles
    the length limit so that it no longer matches any pattern and leaks
    partially.

    Args:
        error: Error to format

    Returns:
        Markdown-formatted error description
    """
    raw_message = error.error_message
    if raw_message is None:
        raw_message = ""
    elif not isinstance(raw_message, str):
        raw_message = str(raw_message)
    safe_message = redact_secrets(raw_message)[:MAX_ERROR_MESSAGE_LENGTH]

    lines = [
        "### Error Details",
        "",
        f"**Tool**: {error.tool_name}",
        f"**Exit Code**: {error.exit_code if error.exit_code is not None else 'N/A'}",
        f"**Type**: {error.failure_type.value if error.failure_type else 'unknown'}",
        f"**Fingerprint**: `{error.fingerprint}`",
        f"**Timestamp**: {error.timestamp}",
        "",
        "### Error Message",
        "```",
        safe_message,
        "```",
    ]

    if error.context:
        # default=str: this is display-only output, so a value JSON cannot
        # encode is shown as its string form instead of aborting the report.
        context_json = json.dumps(error.context, indent=2, default=str)
        lines.extend([
            "",
            "### Context",
            "```json",
            # Context can carry commands / environment values, so it gets the
            # same redaction as the message; cap at 500 chars afterwards.
            redact_secrets(context_json)[:500],
            "```",
        ])

    return "\n".join(lines)


def write_error_to_registry(
    tool_name: str,
    exit_code: Optional[int],
    error_message: str,
    context: Optional[Dict[str, Any]] = None,
    project_root: Optional[Path] = None,
) -> bool:
    """
    Write an error to the registry (JSONL format).

    The message is sanitized, secret-redacted and length-capped before it is
    appended to ``<project_root>/.claude/logs/errors/<YYYY-MM-DD>.jsonl``.
    The outcome is audit-logged either way.

    Args:
        tool_name: Name of the tool that failed
        exit_code: Exit code (None if not applicable)
        error_message: Error message
        context: Additional context
        project_root: Project root (auto-detected if not provided)

    Returns:
        True if written successfully, False otherwise (directory creation,
        serialization and file-write failures are all reported as False)
    """
    if project_root is None:
        project_root = get_project_root()

    # One clock reading for both the file name and the record timestamp, so a
    # call that straddles midnight cannot file an entry under the wrong day.
    now = datetime.now()
    date = now.strftime("%Y-%m-%d")

    errors_dir = Path(project_root) / ".claude" / "logs" / "errors"
    error_file = errors_dir / f"{date}.jsonl"

    # Sanitize and truncate message
    safe_message = sanitize_error_message(error_message)
    safe_message = redact_secrets(safe_message)
    if len(safe_message) > MAX_ERROR_MESSAGE_LENGTH:
        safe_message = safe_message[:MAX_ERROR_MESSAGE_LENGTH] + "...[truncated]"

    entry = {
        "timestamp": now.isoformat(),
        "tool_name": tool_name,
        "exit_code": exit_code,
        "error_message": safe_message,
        "context": context or {},
    }

    try:
        # default=str: keep the error record even when the context holds a
        # value JSON cannot encode (e.g. a Path); it is stored as its string
        # form. Circular structures still raise ValueError, handled below.
        record = json.dumps(entry, default=str)

        errors_dir.mkdir(parents=True, exist_ok=True)
        with open(error_file, "a", encoding="utf-8") as f:
            f.write(record + "\n")

    except (OSError, TypeError, ValueError) as e:
        audit_log(
            "error_write_failed",
            "failure",
            {"tool": tool_name, "error": str(e)},
        )
        return False

    audit_log(
        "error_written_to_registry",
        "success",
        {"tool": tool_name, "file": str(error_file)},
    )
    return True


# =============================================================================
# Module-level convenience functions
# =============================================================================

def analyze_errors(date: Optional[str] = None, project_root: Optional[Path] = None) -> ErrorReport:
    """
    Run the full analysis for one day with a throw-away ErrorAnalyzer.

    Args:
        date: Day to analyze; passed straight to ErrorAnalyzer.analyze()
            (default: today)
        project_root: Root directory handed to the ErrorAnalyzer constructor
            (auto-detected if not provided)

    Returns:
        The ErrorReport produced by the analyzer
    """
    return ErrorAnalyzer(project_root).analyze(date)


def get_actionable_errors(date: Optional[str] = None, project_root: Optional[Path] = None) -> List[ErrorEntry]:
    """
    Shortcut returning just the actionable part of a day's analysis.

    Args:
        date: Day to analyze (default: today)
        project_root: Project root (auto-detected if not provided)

    Returns:
        The ``actionable_errors`` list of the report from analyze_errors()
    """
    return analyze_errors(date, project_root).actionable_errors