TradingAgents/.claude/lib/batch_retry_manager.py

#!/usr/bin/env python3
"""
Batch Retry Manager - Orchestrate retry logic for /batch-implement workflows.
Manages automatic retry logic with max retries, circuit breaker, and global limits.
Features:
1. Per-feature retry tracking (max 3 retries)
2. Circuit breaker (pause after 5 consecutive failures)
3. Global retry limit (prevent resource exhaustion)
4. Retry state persistence (survive crashes)
5. Audit logging for all retry attempts
Retry Decision Logic:
1. Check circuit breaker (5 consecutive failures → block)
2. Check global retry limit (max total retries → block)
3. Check failure type (permanent → block)
4. Check per-feature retry count (3 retries → block)
5. If all checks pass → allow retry
Usage:
from batch_retry_manager import (
BatchRetryManager,
should_retry_feature,
MAX_RETRIES_PER_FEATURE,
)
# Create manager
manager = BatchRetryManager("batch-20251118-123456")
# Check if should retry
decision = manager.should_retry_feature(
feature_index=0,
failure_type=FailureType.TRANSIENT
)
if decision.should_retry:
# Record attempt
manager.record_retry_attempt(0, "ConnectionError: Failed")
# Retry feature...
Security:
- Audit logging for all retry attempts
- Global limits prevent resource exhaustion
- Circuit breaker prevents infinite loops
- State file validation and atomic writes
Date: 2025-11-18
Issue: #89 (Automatic Failure Recovery for /batch-implement)
Agent: implementer
Phase: TDD Green (making tests pass)
See error-handling-patterns skill for exception hierarchy and error handling best practices.
See state-management-patterns skill for state persistence patterns.
"""
import json
import os
import sys
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional

# Import failure classifier and consent checker: prefer the package-relative
# import, falling back to a path-based import when run as a loose script.
try:
    from .failure_classifier import FailureType, sanitize_error_message
    from . import batch_retry_consent
except ImportError:
    lib_dir = Path(__file__).parent.resolve()
    sys.path.insert(0, str(lib_dir))
    from failure_classifier import FailureType, sanitize_error_message
    import batch_retry_consent

# =============================================================================
# Constants
# =============================================================================

# Max retries per feature (3 attempts)
MAX_RETRIES_PER_FEATURE = 3

# Circuit breaker threshold (5 consecutive failures)
CIRCUIT_BREAKER_THRESHOLD = 5

# Global retry limit (prevent resource exhaustion)
MAX_TOTAL_RETRIES = 50
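
# Taken together, these limits cap a batch of N features at
# min(N * MAX_RETRIES_PER_FEATURE, MAX_TOTAL_RETRIES) retries overall;
# e.g. a 20-feature batch is capped at min(20 * 3, 50) = 50 retry attempts.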

# =============================================================================
# Exceptions
# =============================================================================

class CircuitBreakerError(Exception):
    """Exception raised when circuit breaker is triggered."""

# =============================================================================
# Data Classes
# =============================================================================

@dataclass
class RetryDecision:
    """Decision about whether to retry a failed feature."""

    should_retry: bool
    reason: str
    retry_count: int = 0


@dataclass
class RetryState:
    """Persistent retry state for a batch."""

    batch_id: str
    retry_counts: Dict[int, int] = field(default_factory=dict)  # feature_index → count
    global_retry_count: int = 0
    consecutive_failures: int = 0
    circuit_breaker_open: bool = False
    created_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")
    updated_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")
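
# RetryState is persisted by _save_state() as JSON. An illustrative example of
# the on-disk file (values are hypothetical):
#
#   {
#     "batch_id": "batch-20251118-123456",
#     "retry_counts": {"0": 2, "3": 1},
#     "global_retry_count": 3,
#     "consecutive_failures": 1,
#     "circuit_breaker_open": false,
#     "created_at": "2025-11-18T12:00:00Z",
#     "updated_at": "2025-11-18T12:34:56Z"
#   }
#
# JSON object keys are always strings, which is why _load_state() converts the
# retry_counts keys back to int on load.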

# =============================================================================
# Audit Logging
# =============================================================================

def log_audit_event(event_type: str, batch_id: str, details: Dict[str, Any]) -> None:
    """
    Log retry attempt to audit file.

    Audit Log Format (JSONL):
        Each line is a JSON object with:
        - timestamp (str): ISO 8601 timestamp (UTC)
        - event_type (str): "retry_attempt" or "circuit_breaker_triggered"
        - batch_id (str): Unique batch identifier
        - Additional fields from the details dict

    Example audit entry:
        {
            "timestamp": "2025-11-18T12:34:56.789Z",
            "event_type": "retry_attempt",
            "batch_id": "batch-20251118-123456",
            "feature_index": 0,
            "retry_count": 1,
            "global_retry_count": 1,
            "error_message": "ConnectionError: Failed to connect",
            "feature_name": "Add user authentication"
        }

    Args:
        event_type: Type of event (e.g., "retry_attempt", "circuit_breaker")
        batch_id: Batch ID for tracking
        details: Event details (merged into the audit entry)
    """
    # Create audit log directory
    audit_dir = Path.cwd() / ".claude" / "audit"
    audit_dir.mkdir(parents=True, exist_ok=True)

    # Audit log file
    audit_file = audit_dir / f"{batch_id}_retry_audit.jsonl"

    # Create audit entry
    audit_entry = {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "event_type": event_type,
        "batch_id": batch_id,
        **details,
    }

    # Append to audit log (JSONL format)
    try:
        with open(audit_file, "a") as f:
            f.write(json.dumps(audit_entry) + "\n")
    except OSError:
        # Non-blocking: warn on stderr but do not fail the batch
        print(f"Warning: Failed to write audit log: {audit_file}", file=sys.stderr)

# =============================================================================
# Batch Retry Manager
# =============================================================================

class BatchRetryManager:
    """
    Orchestrate retry logic for /batch-implement workflows.

    Manages:
    - Per-feature retry counts
    - Global retry limits
    - Circuit breaker logic
    - Retry state persistence
    """

    def __init__(self, batch_id: str, state_dir: Optional[Path] = None):
        """
        Initialize retry manager.

        Args:
            batch_id: Unique batch identifier
            state_dir: Directory for state files (default: ./.claude)

        Raises:
            ValueError: If batch_id contains path traversal or directory separators
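
        Example (illustrative; the exact error message is abbreviated):
            >>> BatchRetryManager("../escape")  # doctest: +IGNORE_EXCEPTION_DETAIL
            Traceback (most recent call last):
                ...
            ValueError: Invalid batch_id: ...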
"""
# Validate batch_id for path traversal (CWE-22)
if ".." in batch_id or "/" in batch_id or "\\" in batch_id:
raise ValueError(
f"Invalid batch_id: contains path traversal or directory separators. "
f"batch_id must be a simple identifier without path components. Got: {batch_id}"
)
self.batch_id = batch_id
self.state_dir = state_dir or Path.cwd() / ".claude"
self.state_file = self.state_dir / f"{batch_id}_retry_state.json"
# Load existing state or create new
self.state = self._load_state()

    def _load_state(self) -> RetryState:
        """
        Load retry state from file or create new state.

        Returns:
            RetryState object
        """
        if not self.state_file.exists():
            return RetryState(batch_id=self.batch_id)
        try:
            data = json.loads(self.state_file.read_text())
            return RetryState(
                batch_id=data.get("batch_id", self.batch_id),
                retry_counts={int(k): v for k, v in data.get("retry_counts", {}).items()},
                global_retry_count=data.get("global_retry_count", 0),
                consecutive_failures=data.get("consecutive_failures", 0),
                circuit_breaker_open=data.get("circuit_breaker_open", False),
                created_at=data.get("created_at", datetime.utcnow().isoformat() + "Z"),
                updated_at=data.get("updated_at", datetime.utcnow().isoformat() + "Z"),
            )
        except (json.JSONDecodeError, OSError):
            # Corrupted or unreadable file - start fresh
            return RetryState(batch_id=self.batch_id)

    def _save_state(self) -> None:
        """
        Save retry state to file (atomic write).
        """
        # Update timestamp
        self.state.updated_at = datetime.utcnow().isoformat() + "Z"

        # Convert to dict
        state_dict = {
            "batch_id": self.state.batch_id,
            "retry_counts": self.state.retry_counts,
            "global_retry_count": self.state.global_retry_count,
            "consecutive_failures": self.state.consecutive_failures,
            "circuit_breaker_open": self.state.circuit_breaker_open,
            "created_at": self.state.created_at,
            "updated_at": self.state.updated_at,
        }

        # Atomic write (temp + rename): the temp file lives in the same
        # directory as the target, so the rename never crosses filesystems and
        # readers see either the old state or the new state, never a partial write.
        self.state_dir.mkdir(parents=True, exist_ok=True)
        fd, temp_path = tempfile.mkstemp(
            dir=self.state_dir,
            prefix=".retry_state_",
            suffix=".tmp",
        )
        try:
            os.write(fd, json.dumps(state_dict, indent=2).encode())
            os.close(fd)
            Path(temp_path).replace(self.state_file)
        except Exception:
            # Best-effort cleanup of the temp file; re-raise the original error
            try:
                os.close(fd)
            except OSError:
                pass
            try:
                Path(temp_path).unlink()
            except OSError:
                pass
            raise

    def get_retry_count(self, feature_index: int) -> int:
        """
        Get retry count for a specific feature.

        Args:
            feature_index: Index of feature

        Returns:
            Number of retry attempts (0 if never retried)
        """
        return self.state.retry_counts.get(feature_index, 0)

    def get_global_retry_count(self) -> int:
        """
        Get total retry count across all features.

        Returns:
            Total number of retry attempts
        """
        return self.state.global_retry_count

    def record_retry_attempt(self, feature_index: int, error_message: str, feature_name: str = "") -> None:
        """
        Record a retry attempt.

        Updates:
        - Per-feature retry count
        - Global retry count
        - Consecutive failure count
        - Audit log

        Args:
            feature_index: Index of feature being retried
            error_message: Error message from failed attempt
            feature_name: Name of feature (optional, for audit logging)
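
        Example (illustrative values, assuming fresh state for this batch):
            >>> manager = BatchRetryManager("batch-20251118-123456")
            >>> manager.record_retry_attempt(0, "ConnectionError: Failed", "Add auth")
            >>> manager.get_retry_count(0)
            1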
"""
# Increment counters (with global limit enforcement)
self.state.retry_counts[feature_index] = self.get_retry_count(feature_index) + 1
# Enforce global retry limit (CWE-400 resource exhaustion prevention)
if self.state.global_retry_count < MAX_TOTAL_RETRIES:
self.state.global_retry_count += 1
# Note: If already at MAX_TOTAL_RETRIES, don't increment further
# This prevents counter overflow and enforces hard limit
self.state.consecutive_failures += 1
# Check circuit breaker
if self.state.consecutive_failures >= CIRCUIT_BREAKER_THRESHOLD:
self.state.circuit_breaker_open = True
# User-visible notification (CWE-400 protection)
print(
f"\n⚠️ Circuit breaker triggered after {self.state.consecutive_failures} "
f"consecutive failures.\n"
f"Automatic retries paused for safety.\n"
f"To resume, fix the underlying issue and run: /batch-implement --resume {self.batch_id}\n",
file=sys.stderr
)
log_audit_event(
"circuit_breaker_triggered",
self.batch_id,
{
"consecutive_failures": self.state.consecutive_failures,
"threshold": CIRCUIT_BREAKER_THRESHOLD,
}
)
# Save state
self._save_state()
# Log audit event with sanitized feature name (CWE-117 log injection prevention)
log_audit_event(
"retry_attempt",
self.batch_id,
{
"feature_index": feature_index,
"retry_count": self.get_retry_count(feature_index),
"global_retry_count": self.state.global_retry_count,
"error_message": sanitize_error_message(error_message),
"feature_name": sanitize_error_message(feature_name) if feature_name else "",
}
)

    def record_success(self, feature_index: int) -> None:
        """
        Record a successful feature completion.

        Resets the consecutive failure count (circuit breaker).

        Args:
            feature_index: Index of successful feature
        """
        # Reset consecutive failures and close the circuit breaker
        self.state.consecutive_failures = 0
        self.state.circuit_breaker_open = False

        # Save state
        self._save_state()

    def check_circuit_breaker(self) -> bool:
        """
        Check if circuit breaker is open.

        Returns:
            True if circuit breaker is open (retries blocked), False otherwise
        """
        return self.state.circuit_breaker_open

    def reset_circuit_breaker(self) -> None:
        """
        Manually reset circuit breaker.

        Use this after manual intervention to resume batch processing.
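
        Example:
            >>> manager = BatchRetryManager("batch-20251118-123456")
            >>> manager.reset_circuit_breaker()
            >>> manager.check_circuit_breaker()
            False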
"""
self.state.circuit_breaker_open = False
self.state.consecutive_failures = 0
self._save_state()

    def should_retry_feature(
        self,
        feature_index: int,
        failure_type: FailureType,
    ) -> RetryDecision:
        """
        Decide if a failed feature should be retried.

        Decision Logic:
        0. Check user consent (retry feature disabled → block)
        1. Check global retry limit (max total retries → block)
        2. Check circuit breaker (5 consecutive failures → block)
        3. Check failure type (permanent → block)
        4. Check per-feature retry count (3 retries → block)
        5. If all checks pass → allow retry

        Args:
            feature_index: Index of failed feature
            failure_type: Classification of failure (transient/permanent)

        Returns:
            RetryDecision with should_retry flag and reason

        Examples:
            >>> manager = BatchRetryManager("batch-123")
            >>> decision = manager.should_retry_feature(0, FailureType.TRANSIENT)
            >>> if decision.should_retry:
            ...     # Retry the feature
            ...     pass
        """
        retry_count = self.get_retry_count(feature_index)

        # 0. Check user consent (highest priority - respect user choice)
        if not batch_retry_consent.is_retry_enabled():
            return RetryDecision(
                should_retry=False,
                reason="consent_not_given",
                retry_count=retry_count,
            )

        # 1. Check global retry limit (hard cap on total retries)
        if self.state.global_retry_count >= MAX_TOTAL_RETRIES:
            return RetryDecision(
                should_retry=False,
                reason="global_retry_limit_reached",
                retry_count=retry_count,
            )

        # 2. Check circuit breaker
        if self.check_circuit_breaker():
            return RetryDecision(
                should_retry=False,
                reason="circuit_breaker_open",
                retry_count=retry_count,
            )

        # 3. Check failure type (permanent errors are not retried)
        if failure_type == FailureType.PERMANENT:
            return RetryDecision(
                should_retry=False,
                reason="permanent_failure",
                retry_count=retry_count,
            )

        # 4. Check per-feature retry limit
        if retry_count >= MAX_RETRIES_PER_FEATURE:
            return RetryDecision(
                should_retry=False,
                reason="max_retries_reached",
                retry_count=retry_count,
            )

        # All checks passed - allow retry
        return RetryDecision(
            should_retry=True,
            reason="under_retry_limit",
            retry_count=retry_count,
        )

# =============================================================================
# Convenience Functions
# =============================================================================

def should_retry_feature(
    batch_id: str,
    feature_index: int,
    failure_type: FailureType,
    state_dir: Optional[Path] = None,
) -> RetryDecision:
    """
    Convenience function to check if a feature should be retried.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of failed feature
        failure_type: Classification of failure
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        RetryDecision with should_retry flag and reason
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.should_retry_feature(feature_index, failure_type)


def record_retry_attempt(
    batch_id: str,
    feature_index: int,
    error_message: str,
    feature_name: str = "",
    state_dir: Optional[Path] = None,
) -> None:
    """
    Convenience function to record a retry attempt.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature being retried
        error_message: Error message from failed attempt
        feature_name: Name of feature (optional, for audit logging)
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.record_retry_attempt(feature_index, error_message, feature_name)


def check_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None,
) -> bool:
    """
    Convenience function to check circuit breaker status.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        True if circuit breaker is open, False otherwise
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.check_circuit_breaker()


def get_retry_count(
    batch_id: str,
    feature_index: int,
    state_dir: Optional[Path] = None,
) -> int:
    """
    Convenience function to get the retry count for a feature.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        Number of retry attempts
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.get_retry_count(feature_index)


def reset_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None,
) -> None:
    """
    Convenience function to reset the circuit breaker.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.reset_circuit_breaker()

# =============================================================================
# Module Exports
# =============================================================================

__all__ = [
    "BatchRetryManager",
    "RetryDecision",
    "CircuitBreakerError",
    "should_retry_feature",
    "record_retry_attempt",
    "check_circuit_breaker",
    "get_retry_count",
    "reset_circuit_breaker",
    "MAX_RETRIES_PER_FEATURE",
    "MAX_TOTAL_RETRIES",
    "CIRCUIT_BREAKER_THRESHOLD",
]
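
# =============================================================================
# Demo / Smoke Test
# =============================================================================
# A minimal sketch of the retry decision flow using this module's public API.
# Retry state is written to a temporary directory so the real ./.claude state
# file is untouched (audit logs still go to ./.claude/audit, per
# log_audit_event), and the printed decisions depend on whether
# batch_retry_consent.is_retry_enabled() returns True in this environment.
if __name__ == "__main__":
    from tempfile import TemporaryDirectory

    with TemporaryDirectory() as tmp:
        manager = BatchRetryManager("batch-demo", state_dir=Path(tmp))

        # Simulate repeated transient failures of feature 0: the first three
        # attempts are allowed, then max_retries_reached blocks the fourth.
        for attempt in range(MAX_RETRIES_PER_FEATURE + 1):
            decision = manager.should_retry_feature(0, FailureType.TRANSIENT)
            print(f"attempt {attempt}: should_retry={decision.should_retry} reason={decision.reason}")
            if not decision.should_retry:
                break
            manager.record_retry_attempt(0, "ConnectionError: Failed", "demo feature")

        # A success resets the consecutive-failure counter and closes the breaker
        manager.record_success(0)
        print(f"circuit breaker open: {manager.check_circuit_breaker()}")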