TradingAgents/.claude/lib/batch_retry_manager.py

#!/usr/bin/env python3
"""
Batch Retry Manager - Orchestrate retry logic for /batch-implement workflows.
Manages automatic retry logic with max retries, circuit breaker, and global limits.
Features:
1. Per-feature retry tracking (max 3 retries)
2. Circuit breaker (pause after 5 consecutive failures)
3. Global retry limit (prevent resource exhaustion)
4. Retry state persistence (survive crashes)
5. Audit logging for all retry attempts
Retry Decision Logic:
1. Check circuit breaker (5 consecutive failures → block)
2. Check global retry limit (max total retries → block)
3. Check failure type (permanent → block)
4. Check per-feature retry count (3 retries → block)
5. If all checks pass → allow retry
Usage:
from batch_retry_manager import (
BatchRetryManager,
should_retry_feature,
MAX_RETRIES_PER_FEATURE,
)
# Create manager
manager = BatchRetryManager("batch-20251118-123456")
# Check if should retry
decision = manager.should_retry_feature(
feature_index=0,
failure_type=FailureType.TRANSIENT
)
if decision.should_retry:
# Record attempt
manager.record_retry_attempt(0, "ConnectionError: Failed")
# Retry feature...
Security:
- Audit logging for all retry attempts
- Global limits prevent resource exhaustion
- Circuit breaker prevents infinite loops
- State file validation and atomic writes
Date: 2025-11-18
Issue: #89 (Automatic Failure Recovery for /batch-implement)
Agent: implementer
Phase: TDD Green (making tests pass)
See error-handling-patterns skill for exception hierarchy and error handling best practices.
See state-management-patterns skill for state persistence patterns.
"""
import json
import os
import sys
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional

# Import failure classifier and consent checker: prefer the package-relative
# import, falling back to a path-based import when run as a loose script.
try:
    from .failure_classifier import FailureType, sanitize_error_message
    from . import batch_retry_consent
except ImportError:
    lib_dir = Path(__file__).parent.resolve()
    sys.path.insert(0, str(lib_dir))
    from failure_classifier import FailureType, sanitize_error_message
    import batch_retry_consent

# =============================================================================
# Constants
# =============================================================================

# Max retries per feature (3 attempts)
MAX_RETRIES_PER_FEATURE = 3

# Circuit breaker threshold (5 consecutive failures)
CIRCUIT_BREAKER_THRESHOLD = 5

# Global retry limit (prevent resource exhaustion)
MAX_TOTAL_RETRIES = 50
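
# Taken together, these limits cap a batch of N features at
# min(N * MAX_RETRIES_PER_FEATURE, MAX_TOTAL_RETRIES) retries overall;
# e.g. a 20-feature batch is capped at min(20 * 3, 50) = 50 retry attempts.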

# =============================================================================
# Exceptions
# =============================================================================

class CircuitBreakerError(Exception):
    """Exception raised when circuit breaker is triggered."""

# =============================================================================
# Data Classes
# =============================================================================

@dataclass
class RetryDecision:
    """Decision about whether to retry a failed feature."""

    should_retry: bool
    reason: str
    retry_count: int = 0


@dataclass
class RetryState:
    """Persistent retry state for a batch."""

    batch_id: str
    retry_counts: Dict[int, int] = field(default_factory=dict)  # feature_index → count
    global_retry_count: int = 0
    consecutive_failures: int = 0
    circuit_breaker_open: bool = False
    created_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")
    updated_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")
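
# RetryState is persisted by _save_state() as JSON. An illustrative example of
# the on-disk file (values are hypothetical):
#
#   {
#     "batch_id": "batch-20251118-123456",
#     "retry_counts": {"0": 2, "3": 1},
#     "global_retry_count": 3,
#     "consecutive_failures": 1,
#     "circuit_breaker_open": false,
#     "created_at": "2025-11-18T12:00:00Z",
#     "updated_at": "2025-11-18T12:34:56Z"
#   }
#
# JSON object keys are always strings, which is why _load_state() converts the
# retry_counts keys back to int on load.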

# =============================================================================
# Audit Logging
# =============================================================================

def log_audit_event(event_type: str, batch_id: str, details: Dict[str, Any]) -> None:
    """
    Log retry attempt to audit file.

    Audit Log Format (JSONL):
        Each line is a JSON object with:
        - timestamp (str): ISO 8601 timestamp (UTC)
        - event_type (str): "retry_attempt" or "circuit_breaker_triggered"
        - batch_id (str): Unique batch identifier
        - Additional fields from the details dict

    Example audit entry:
        {
            "timestamp": "2025-11-18T12:34:56.789Z",
            "event_type": "retry_attempt",
            "batch_id": "batch-20251118-123456",
            "feature_index": 0,
            "retry_count": 1,
            "global_retry_count": 1,
            "error_message": "ConnectionError: Failed to connect",
            "feature_name": "Add user authentication"
        }

    Args:
        event_type: Type of event (e.g., "retry_attempt", "circuit_breaker")
        batch_id: Batch ID for tracking
        details: Event details (merged into the audit entry)
    """
    # Create audit log directory
    audit_dir = Path.cwd() / ".claude" / "audit"
    audit_dir.mkdir(parents=True, exist_ok=True)

    # Audit log file
    audit_file = audit_dir / f"{batch_id}_retry_audit.jsonl"

    # Create audit entry
    audit_entry = {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "event_type": event_type,
        "batch_id": batch_id,
        **details,
    }

    # Append to audit log (JSONL format)
    try:
        with open(audit_file, "a") as f:
            f.write(json.dumps(audit_entry) + "\n")
    except OSError:
        # Non-blocking: warn on stderr but do not fail the batch
        print(f"Warning: Failed to write audit log: {audit_file}", file=sys.stderr)

# =============================================================================
# Batch Retry Manager
# =============================================================================

class BatchRetryManager:
    """
    Orchestrate retry logic for /batch-implement workflows.

    Manages:
    - Per-feature retry counts
    - Global retry limits
    - Circuit breaker logic
    - Retry state persistence
    """

    def __init__(self, batch_id: str, state_dir: Optional[Path] = None):
        """
        Initialize retry manager.

        Args:
            batch_id: Unique batch identifier
            state_dir: Directory for state files (default: ./.claude)

        Raises:
            ValueError: If batch_id contains path traversal or directory separators
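
        Example (illustrative; the exact error message is abbreviated):
            >>> BatchRetryManager("../escape")  # doctest: +IGNORE_EXCEPTION_DETAIL
            Traceback (most recent call last):
                ...
            ValueError: Invalid batch_id: ...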
"""
# Validate batch_id for path traversal (CWE-22)
if ".." in batch_id or "/" in batch_id or "\\" in batch_id:
raise ValueError(
f"Invalid batch_id: contains path traversal or directory separators. "
f"batch_id must be a simple identifier without path components. Got: {batch_id}"
)
self.batch_id = batch_id
self.state_dir = state_dir or Path.cwd() / ".claude"
self.state_file = self.state_dir / f"{batch_id}_retry_state.json"
# Load existing state or create new
self.state = self._load_state()

    def _load_state(self) -> RetryState:
        """
        Load retry state from file or create new state.

        Returns:
            RetryState object
        """
        if not self.state_file.exists():
            return RetryState(batch_id=self.batch_id)
        try:
            data = json.loads(self.state_file.read_text())
            return RetryState(
                batch_id=data.get("batch_id", self.batch_id),
                retry_counts={int(k): v for k, v in data.get("retry_counts", {}).items()},
                global_retry_count=data.get("global_retry_count", 0),
                consecutive_failures=data.get("consecutive_failures", 0),
                circuit_breaker_open=data.get("circuit_breaker_open", False),
                created_at=data.get("created_at", datetime.utcnow().isoformat() + "Z"),
                updated_at=data.get("updated_at", datetime.utcnow().isoformat() + "Z"),
            )
        except (json.JSONDecodeError, OSError):
            # Corrupted or unreadable file - start fresh
            return RetryState(batch_id=self.batch_id)

    def _save_state(self) -> None:
        """
        Save retry state to file (atomic write).
        """
        # Update timestamp
        self.state.updated_at = datetime.utcnow().isoformat() + "Z"

        # Convert to dict
        state_dict = {
            "batch_id": self.state.batch_id,
            "retry_counts": self.state.retry_counts,
            "global_retry_count": self.state.global_retry_count,
            "consecutive_failures": self.state.consecutive_failures,
            "circuit_breaker_open": self.state.circuit_breaker_open,
            "created_at": self.state.created_at,
            "updated_at": self.state.updated_at,
        }

        # Atomic write (temp + rename): the temp file lives in the same
        # directory as the target, so the rename never crosses filesystems and
        # readers see either the old state or the new state, never a partial write.
        self.state_dir.mkdir(parents=True, exist_ok=True)
        fd, temp_path = tempfile.mkstemp(
            dir=self.state_dir,
            prefix=".retry_state_",
            suffix=".tmp",
        )
        try:
            os.write(fd, json.dumps(state_dict, indent=2).encode())
            os.close(fd)
            Path(temp_path).replace(self.state_file)
        except Exception:
            # Best-effort cleanup of the temp file; re-raise the original error
            try:
                os.close(fd)
            except OSError:
                pass
            try:
                Path(temp_path).unlink()
            except OSError:
                pass
            raise

    def get_retry_count(self, feature_index: int) -> int:
        """
        Get retry count for a specific feature.

        Args:
            feature_index: Index of feature

        Returns:
            Number of retry attempts (0 if never retried)
        """
        return self.state.retry_counts.get(feature_index, 0)

    def get_global_retry_count(self) -> int:
        """
        Get total retry count across all features.

        Returns:
            Total number of retry attempts
        """
        return self.state.global_retry_count

    def record_retry_attempt(self, feature_index: int, error_message: str, feature_name: str = "") -> None:
        """
        Record a retry attempt.

        Updates:
        - Per-feature retry count
        - Global retry count
        - Consecutive failure count
        - Audit log

        Args:
            feature_index: Index of feature being retried
            error_message: Error message from failed attempt
            feature_name: Name of feature (optional, for audit logging)
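
        Example (illustrative values, assuming fresh state for this batch):
            >>> manager = BatchRetryManager("batch-20251118-123456")
            >>> manager.record_retry_attempt(0, "ConnectionError: Failed", "Add auth")
            >>> manager.get_retry_count(0)
            1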
"""
# Increment counters (with global limit enforcement)
self.state.retry_counts[feature_index] = self.get_retry_count(feature_index) + 1
# Enforce global retry limit (CWE-400 resource exhaustion prevention)
if self.state.global_retry_count < MAX_TOTAL_RETRIES:
self.state.global_retry_count += 1
# Note: If already at MAX_TOTAL_RETRIES, don't increment further
# This prevents counter overflow and enforces hard limit
self.state.consecutive_failures += 1
# Check circuit breaker
if self.state.consecutive_failures >= CIRCUIT_BREAKER_THRESHOLD:
self.state.circuit_breaker_open = True
# User-visible notification (CWE-400 protection)
print(
f"\n⚠️ Circuit breaker triggered after {self.state.consecutive_failures} "
f"consecutive failures.\n"
f"Automatic retries paused for safety.\n"
f"To resume, fix the underlying issue and run: /batch-implement --resume {self.batch_id}\n",
file=sys.stderr
)
log_audit_event(
"circuit_breaker_triggered",
self.batch_id,
{
"consecutive_failures": self.state.consecutive_failures,
"threshold": CIRCUIT_BREAKER_THRESHOLD,
}
)
# Save state
self._save_state()
# Log audit event with sanitized feature name (CWE-117 log injection prevention)
log_audit_event(
"retry_attempt",
self.batch_id,
{
"feature_index": feature_index,
"retry_count": self.get_retry_count(feature_index),
"global_retry_count": self.state.global_retry_count,
"error_message": sanitize_error_message(error_message),
"feature_name": sanitize_error_message(feature_name) if feature_name else "",
}
)

    def record_success(self, feature_index: int) -> None:
        """
        Record a successful feature completion.

        Resets the consecutive failure count (circuit breaker).

        Args:
            feature_index: Index of successful feature
        """
        # Reset consecutive failures and close the circuit breaker
        self.state.consecutive_failures = 0
        self.state.circuit_breaker_open = False

        # Save state
        self._save_state()

    def check_circuit_breaker(self) -> bool:
        """
        Check if circuit breaker is open.

        Returns:
            True if circuit breaker is open (retries blocked), False otherwise
        """
        return self.state.circuit_breaker_open

    def reset_circuit_breaker(self) -> None:
        """
        Manually reset circuit breaker.

        Use this after manual intervention to resume batch processing.
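
        Example:
            >>> manager = BatchRetryManager("batch-20251118-123456")
            >>> manager.reset_circuit_breaker()
            >>> manager.check_circuit_breaker()
            False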
"""
self.state.circuit_breaker_open = False
self.state.consecutive_failures = 0
self._save_state()

    def should_retry_feature(
        self,
        feature_index: int,
        failure_type: FailureType,
    ) -> RetryDecision:
        """
        Decide if a failed feature should be retried.

        Decision Logic:
        0. Check user consent (retry feature disabled → block)
        1. Check global retry limit (max total retries → block)
        2. Check circuit breaker (5 consecutive failures → block)
        3. Check failure type (permanent → block)
        4. Check per-feature retry count (3 retries → block)
        5. If all checks pass → allow retry

        Args:
            feature_index: Index of failed feature
            failure_type: Classification of failure (transient/permanent)

        Returns:
            RetryDecision with should_retry flag and reason

        Examples:
            >>> manager = BatchRetryManager("batch-123")
            >>> decision = manager.should_retry_feature(0, FailureType.TRANSIENT)
            >>> if decision.should_retry:
            ...     # Retry the feature
            ...     pass
        """
        retry_count = self.get_retry_count(feature_index)

        # 0. Check user consent (highest priority - respect user choice)
        if not batch_retry_consent.is_retry_enabled():
            return RetryDecision(
                should_retry=False,
                reason="consent_not_given",
                retry_count=retry_count,
            )

        # 1. Check global retry limit (hard cap on total retries)
        if self.state.global_retry_count >= MAX_TOTAL_RETRIES:
            return RetryDecision(
                should_retry=False,
                reason="global_retry_limit_reached",
                retry_count=retry_count,
            )

        # 2. Check circuit breaker
        if self.check_circuit_breaker():
            return RetryDecision(
                should_retry=False,
                reason="circuit_breaker_open",
                retry_count=retry_count,
            )

        # 3. Check failure type (permanent errors are not retried)
        if failure_type == FailureType.PERMANENT:
            return RetryDecision(
                should_retry=False,
                reason="permanent_failure",
                retry_count=retry_count,
            )

        # 4. Check per-feature retry limit
        if retry_count >= MAX_RETRIES_PER_FEATURE:
            return RetryDecision(
                should_retry=False,
                reason="max_retries_reached",
                retry_count=retry_count,
            )

        # All checks passed - allow retry
        return RetryDecision(
            should_retry=True,
            reason="under_retry_limit",
            retry_count=retry_count,
        )

# =============================================================================
# Convenience Functions
# =============================================================================

def should_retry_feature(
    batch_id: str,
    feature_index: int,
    failure_type: FailureType,
    state_dir: Optional[Path] = None,
) -> RetryDecision:
    """
    Convenience function to check if a feature should be retried.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of failed feature
        failure_type: Classification of failure
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        RetryDecision with should_retry flag and reason
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.should_retry_feature(feature_index, failure_type)


def record_retry_attempt(
    batch_id: str,
    feature_index: int,
    error_message: str,
    feature_name: str = "",
    state_dir: Optional[Path] = None,
) -> None:
    """
    Convenience function to record a retry attempt.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature being retried
        error_message: Error message from failed attempt
        feature_name: Name of feature (optional, for audit logging)
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.record_retry_attempt(feature_index, error_message, feature_name)


def check_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None,
) -> bool:
    """
    Convenience function to check circuit breaker status.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        True if circuit breaker is open, False otherwise
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.check_circuit_breaker()


def get_retry_count(
    batch_id: str,
    feature_index: int,
    state_dir: Optional[Path] = None,
) -> int:
    """
    Convenience function to get the retry count for a feature.

    Args:
        batch_id: Unique batch identifier
        feature_index: Index of feature
        state_dir: Directory for state files (default: ./.claude)

    Returns:
        Number of retry attempts
    """
    manager = BatchRetryManager(batch_id, state_dir)
    return manager.get_retry_count(feature_index)


def reset_circuit_breaker(
    batch_id: str,
    state_dir: Optional[Path] = None,
) -> None:
    """
    Convenience function to reset the circuit breaker.

    Args:
        batch_id: Unique batch identifier
        state_dir: Directory for state files (default: ./.claude)
    """
    manager = BatchRetryManager(batch_id, state_dir)
    manager.reset_circuit_breaker()

# =============================================================================
# Module Exports
# =============================================================================

__all__ = [
    "BatchRetryManager",
    "RetryDecision",
    "CircuitBreakerError",
    "should_retry_feature",
    "record_retry_attempt",
    "check_circuit_breaker",
    "get_retry_count",
    "reset_circuit_breaker",
    "MAX_RETRIES_PER_FEATURE",
    "MAX_TOTAL_RETRIES",
    "CIRCUIT_BREAKER_THRESHOLD",
]
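
# =============================================================================
# Demo / Smoke Test
# =============================================================================
# A minimal sketch of the retry decision flow using this module's public API.
# Retry state is written to a temporary directory so the real ./.claude state
# file is untouched (audit logs still go to ./.claude/audit, per
# log_audit_event), and the printed decisions depend on whether
# batch_retry_consent.is_retry_enabled() returns True in this environment.
if __name__ == "__main__":
    from tempfile import TemporaryDirectory

    with TemporaryDirectory() as tmp:
        manager = BatchRetryManager("batch-demo", state_dir=Path(tmp))

        # Simulate repeated transient failures of feature 0: the first three
        # attempts are allowed, then max_retries_reached blocks the fourth.
        for attempt in range(MAX_RETRIES_PER_FEATURE + 1):
            decision = manager.should_retry_feature(0, FailureType.TRANSIENT)
            print(f"attempt {attempt}: should_retry={decision.should_retry} reason={decision.reason}")
            if not decision.should_retry:
                break
            manager.record_retry_attempt(0, "ConnectionError: Failed", "demo feature")

        # A success resets the consecutive-failure counter and closes the breaker
        manager.record_success(0)
        print(f"circuit breaker open: {manager.check_circuit_breaker()}")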