#!/usr/bin/env python3
"""
Batch Retry Manager - Orchestrate retry logic for /batch-implement workflows.

Manages automatic retry logic with max retries, circuit breaker, and global
limits.

Features:
1. Per-feature retry tracking (max 3 retries)
2. Circuit breaker (pause after 5 consecutive failures)
3. Global retry limit (prevent resource exhaustion)
4. Retry state persistence (survive crashes)
5. Audit logging for all retry attempts

Retry Decision Logic (in evaluation order):
1. Check user consent (retry feature disabled → block)
2. Check global retry limit (max total retries → block)
3. Check circuit breaker (5 consecutive failures → block)
4. Check failure type (permanent → block)
5. Check per-feature retry count (3 retries → block)
6. If all checks pass → allow retry

Usage:
    from batch_retry_manager import (
        BatchRetryManager,
        should_retry_feature,
        MAX_RETRIES_PER_FEATURE,
    )

    # Create manager
    manager = BatchRetryManager("batch-20251118-123456")

    # Check if should retry
    decision = manager.should_retry_feature(
        feature_index=0, failure_type=FailureType.TRANSIENT
    )
    if decision.should_retry:
        # Record attempt
        manager.record_retry_attempt(0, "ConnectionError: Failed")
        # Retry feature...

Security:
- Audit logging for all retry attempts
- Global limits prevent resource exhaustion
- Circuit breaker prevents infinite loops
- State file validation and atomic writes

Date: 2025-11-18
Issue: #89 (Automatic Failure Recovery for /batch-implement)
Agent: implementer
Phase: TDD Green (making tests pass)

See error-handling-patterns skill for exception hierarchy and error handling
best practices. See state-management-patterns skill for state persistence
patterns.
"""

import json
import os
import sys
import tempfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

# Import failure classifier and consent checker.  Prefer package-relative
# imports; fall back to sys.path-based imports when run as a loose script.
try:
    from .failure_classifier import FailureType, sanitize_error_message
    from . import batch_retry_consent
except ImportError:
    lib_dir = Path(__file__).parent.resolve()
    sys.path.insert(0, str(lib_dir))
    from failure_classifier import FailureType, sanitize_error_message
    import batch_retry_consent


# =============================================================================
# Constants
# =============================================================================

# Max retries per feature (3 attempts)
MAX_RETRIES_PER_FEATURE = 3

# Circuit breaker threshold (5 consecutive failures)
CIRCUIT_BREAKER_THRESHOLD = 5

# Global retry limit (prevent resource exhaustion)
MAX_TOTAL_RETRIES = 50


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO 8601 string with a 'Z' suffix."""
    # datetime.utcnow() is deprecated since Python 3.12; this aware-datetime
    # form yields the identical "YYYY-MM-DDTHH:MM:SS.ffffffZ" string.
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


# =============================================================================
# Exceptions
# =============================================================================

class CircuitBreakerError(Exception):
    """Exception raised when circuit breaker is triggered."""
    pass


# =============================================================================
# Data Classes
# =============================================================================

@dataclass
class RetryDecision:
    """Decision about whether to retry a failed feature."""

    should_retry: bool
    reason: str
    retry_count: int = 0


@dataclass
class RetryState:
    """Persistent retry state for a batch."""

    batch_id: str
    # feature_index → retry count for that feature
    retry_counts: Dict[int, int] = field(default_factory=dict)
    # Total retries recorded across all features (hard-capped)
    global_retry_count: int = 0
    # Failures since the last success; drives the circuit breaker
    consecutive_failures: int = 0
    # When True, all retries are blocked until manually reset
    circuit_breaker_open: bool = False
    created_at: str = field(default_factory=_utc_timestamp)
    updated_at: str = field(default_factory=_utc_timestamp)


# =============================================================================
# Audit Logging
# =============================================================================

def log_audit_event(event_type: str, batch_id: str, details: Dict[str, Any]) -> None:
    """
    Log retry attempt to audit file.

    Audit Log Format (JSONL):
        Each line is a JSON object with:
        - timestamp (str): ISO 8601 timestamp (UTC)
        - event_type (str): "retry_attempt" or "circuit_breaker_triggered"
        - batch_id (str): Unique batch identifier
        - Additional fields from details dict

    Example audit entry:
        {
            "timestamp": "2025-11-18T12:34:56.789Z",
            "event_type": "retry_attempt",
            "batch_id": "batch-20251118-123456",
            "feature_index": 0,
            "retry_count": 1,
            "global_retry_count": 1,
            "error_message": "ConnectionError: Failed to connect",
            "feature_name": "Add user authentication"
        }

    Args:
        event_type: Type of event (e.g., "retry_attempt", "circuit_breaker")
        batch_id: Batch ID for tracking
        details: Event details (will be merged into audit entry)
    """
    # Create audit log directory
    audit_dir = Path.cwd() / ".claude" / "audit"
    audit_dir.mkdir(parents=True, exist_ok=True)

    # Audit log file
    audit_file = audit_dir / f"{batch_id}_retry_audit.jsonl"

    # Create audit entry
    audit_entry = {
        "timestamp": _utc_timestamp(),
        "event_type": event_type,
        "batch_id": batch_id,
        **details,
    }

    # Append to audit log (JSONL format)
    try:
        with open(audit_file, "a") as f:
            f.write(json.dumps(audit_entry) + "\n")
    except OSError:
        # Non-blocking - log to stderr but don't fail
        print(f"Warning: Failed to write audit log: {audit_file}", file=sys.stderr)


# =============================================================================
# Batch Retry Manager
# =============================================================================

class BatchRetryManager:
    """
    Orchestrate retry logic for /batch-implement workflows.

    Manages:
    - Per-feature retry counts
    - Global retry limits
    - Circuit breaker logic
    - Retry state persistence
    """

    def __init__(self, batch_id: str, state_dir: Optional[Path] = None):
        """
        Initialize retry manager.

        Args:
            batch_id: Unique batch identifier
            state_dir: Directory for state files (default: ./.claude)

        Raises:
            ValueError: If batch_id contains path traversal or directory
                separators
        """
        # Validate batch_id for path traversal (CWE-22)
        if ".." in batch_id or "/" in batch_id or "\\" in batch_id:
            raise ValueError(
                f"Invalid batch_id: contains path traversal or directory separators. "
                f"batch_id must be a simple identifier without path components. Got: {batch_id}"
            )

        self.batch_id = batch_id
        self.state_dir = state_dir or Path.cwd() / ".claude"
        self.state_file = self.state_dir / f"{batch_id}_retry_state.json"

        # Load existing state or create new
        self.state = self._load_state()

    def _load_state(self) -> RetryState:
        """
        Load retry state from file or create new state.

        Returns:
            RetryState object
        """
        if not self.state_file.exists():
            return RetryState(batch_id=self.batch_id)

        try:
            data = json.loads(self.state_file.read_text())
            return RetryState(
                batch_id=data.get("batch_id", self.batch_id),
                # JSON object keys are strings; restore integer feature indexes
                retry_counts={int(k): v for k, v in data.get("retry_counts", {}).items()},
                global_retry_count=data.get("global_retry_count", 0),
                consecutive_failures=data.get("consecutive_failures", 0),
                circuit_breaker_open=data.get("circuit_breaker_open", False),
                created_at=data.get("created_at", _utc_timestamp()),
                updated_at=data.get("updated_at", _utc_timestamp()),
            )
        except (json.JSONDecodeError, OSError):
            # Corrupted file - start fresh
            return RetryState(batch_id=self.batch_id)

    def _save_state(self) -> None:
        """
        Save retry state to file (atomic write).
        """
        # Update timestamp
        self.state.updated_at = _utc_timestamp()

        # Convert to dict
        state_dict = {
            "batch_id": self.state.batch_id,
            "retry_counts": self.state.retry_counts,
            "global_retry_count": self.state.global_retry_count,
            "consecutive_failures": self.state.consecutive_failures,
            "circuit_breaker_open": self.state.circuit_breaker_open,
            "created_at": self.state.created_at,
            "updated_at": self.state.updated_at,
        }

        # Atomic write (temp + rename)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        fd, temp_path = tempfile.mkstemp(
            dir=self.state_dir, prefix=".retry_state_", suffix=".tmp"
        )
        try:
            os.write(fd, json.dumps(state_dict, indent=2).encode())
            os.close(fd)
            Path(temp_path).replace(self.state_file)
        except Exception:
            # Best-effort cleanup: fd may already be closed, temp file may
            # already be renamed; ignore secondary errors and re-raise.
            try:
                os.close(fd)
            except OSError:
                pass
            try:
                Path(temp_path).unlink()
            except OSError:
                pass
            raise

    def get_retry_count(self, feature_index: int) -> int:
        """
        Get retry count for a specific feature.

        Args:
            feature_index: Index of feature

        Returns:
            Number of retry attempts (0 if never retried)
        """
        return self.state.retry_counts.get(feature_index, 0)

    def get_global_retry_count(self) -> int:
        """
        Get total retry count across all features.

        Returns:
            Total number of retry attempts
        """
        return self.state.global_retry_count

    def record_retry_attempt(self, feature_index: int, error_message: str,
                             feature_name: str = "") -> None:
        """
        Record a retry attempt.

        Updates:
        - Per-feature retry count
        - Global retry count
        - Consecutive failure count
        - Audit log

        Args:
            feature_index: Index of feature being retried
            error_message: Error message from failed attempt
            feature_name: Name of feature (optional, for audit logging)
        """
        # Increment counters (with global limit enforcement)
        self.state.retry_counts[feature_index] = self.get_retry_count(feature_index) + 1

        # Enforce global retry limit (CWE-400 resource exhaustion prevention)
        if self.state.global_retry_count < MAX_TOTAL_RETRIES:
            self.state.global_retry_count += 1
        # Note: If already at MAX_TOTAL_RETRIES, don't increment further
        # This prevents counter overflow and enforces hard limit

        self.state.consecutive_failures += 1

        # Check circuit breaker
        if self.state.consecutive_failures >= CIRCUIT_BREAKER_THRESHOLD:
            self.state.circuit_breaker_open = True

            # User-visible notification (CWE-400 protection)
            print(
                f"\n⚠️ Circuit breaker triggered after {self.state.consecutive_failures} "
                f"consecutive failures.\n"
                f"Automatic retries paused for safety.\n"
                f"To resume, fix the underlying issue and run: /batch-implement --resume {self.batch_id}\n",
                file=sys.stderr
            )

            log_audit_event(
                "circuit_breaker_triggered",
                self.batch_id,
                {
                    "consecutive_failures": self.state.consecutive_failures,
                    "threshold": CIRCUIT_BREAKER_THRESHOLD,
                }
            )

        # Save state
        self._save_state()

        # Log audit event with sanitized feature name (CWE-117 log injection prevention)
        log_audit_event(
            "retry_attempt",
            self.batch_id,
            {
                "feature_index": feature_index,
                "retry_count": self.get_retry_count(feature_index),
                "global_retry_count": self.state.global_retry_count,
                "error_message": sanitize_error_message(error_message),
                "feature_name": sanitize_error_message(feature_name) if feature_name else "",
            }
        )

    def record_success(self, feature_index: int) -> None:
        """
        Record a successful feature completion.

        Resets consecutive failure count (circuit breaker).

        Args:
            feature_index: Index of successful feature
        """
        # Reset consecutive failures (circuit breaker)
        self.state.consecutive_failures = 0
        self.state.circuit_breaker_open = False

        # Save state
        self._save_state()

    def check_circuit_breaker(self) -> bool:
        """
        Check if circuit breaker is open.

        Returns:
            True if circuit breaker is open (retries blocked), False otherwise
        """
        return self.state.circuit_breaker_open

    def reset_circuit_breaker(self) -> None:
        """
        Manually reset circuit breaker.

        Use this after manual intervention to resume batch processing.
        """
        self.state.circuit_breaker_open = False
        self.state.consecutive_failures = 0
        self._save_state()

    def should_retry_feature(
        self, feature_index: int, failure_type: FailureType
    ) -> RetryDecision:
        """
        Decide if a failed feature should be retried.

        Decision Logic:
        0. Check user consent (retry feature disabled → block)
        1. Check global retry limit (max total retries → block)
        2. Check circuit breaker (5 consecutive failures → block)
        3. Check failure type (permanent → block)
        4. Check per-feature retry count (3 retries → block)
        5. If all checks pass → allow retry

        Args:
            feature_index: Index of failed feature
            failure_type: Classification of failure (transient/permanent)

        Returns:
            RetryDecision with should_retry flag and reason

        Examples:
            >>> manager = BatchRetryManager("batch-123")
            >>> decision = manager.should_retry_feature(0, FailureType.TRANSIENT)
            >>> if decision.should_retry:
            ...     # Retry the feature
            ...     pass
        """
        retry_count = self.get_retry_count(feature_index)

        # 0. Check user consent (highest priority - respect user choice)
        if not batch_retry_consent.is_retry_enabled():
            return RetryDecision(
                should_retry=False,
                reason="consent_not_given",
                retry_count=retry_count
            )

        # 1. Check global retry limit (highest priority - hard limit)
        if self.state.global_retry_count >= MAX_TOTAL_RETRIES:
            return RetryDecision(
                should_retry=False,
                reason="global_retry_limit_reached",
                retry_count=retry_count
            )

        # 2. Check circuit breaker
        if self.check_circuit_breaker():
            return RetryDecision(
                should_retry=False,
                reason="circuit_breaker_open",
                retry_count=retry_count
            )

        # 3. Check failure type (permanent errors not retried)
        if failure_type == FailureType.PERMANENT:
            return RetryDecision(
                should_retry=False,
                reason="permanent_failure",
                retry_count=retry_count
            )

        # 4. Check per-feature retry limit
        if retry_count >= MAX_RETRIES_PER_FEATURE:
            return RetryDecision(
                should_retry=False,
                reason="max_retries_reached",
                retry_count=retry_count
            )

        # All checks passed - allow retry
        return RetryDecision(
            should_retry=True,
            reason="under_retry_limit",
            retry_count=retry_count
        )


# =============================================================================
# Convenience Functions
# =============================================================================

def should_retry_feature(
    batch_id: str,
    feature_index: int,
    failure_type: FailureType,
    state_dir: Optional[Path] = None
) -> RetryDecision:
    """
    Convenience function to check if feature should be retried.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of failed feature
        failure_type: Classification of failure
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        RetryDecision with should_retry flag and reason
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.should_retry_feature(feature_index, failure_type)


def record_retry_attempt(
    batch_id: str,
    feature_index: int,
    error_message: str,
    feature_name: str = "",
    state_dir: Optional[Path] = None
) -> None:
    """
    Convenience function to record retry attempt.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature being retried
        error_message: Error message from failed attempt
        feature_name: Name of feature (optional, for audit logging)
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.record_retry_attempt(feature_index, error_message, feature_name)


def check_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None
) -> bool:
    """
    Convenience function to check circuit breaker status.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        True if circuit breaker is open, False otherwise
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.check_circuit_breaker()


def get_retry_count(
    batch_id: str,
    feature_index: int,
    state_dir: Optional[Path] = None
) -> int:
    """
    Convenience function to get retry count for feature.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        Number of retry attempts
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.get_retry_count(feature_index)


def reset_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None
) -> None:
    """
    Convenience function to reset circuit breaker.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.reset_circuit_breaker()


# =============================================================================
# Module Exports
# =============================================================================

__all__ = [
    "BatchRetryManager",
    "RetryDecision",
    "CircuitBreakerError",
    "should_retry_feature",
    "record_retry_attempt",
    "check_circuit_breaker",
    "get_retry_count",
    "reset_circuit_breaker",
    "MAX_RETRIES_PER_FEATURE",
    "MAX_TOTAL_RETRIES",
    "CIRCUIT_BREAKER_THRESHOLD",
]