TradingAgents/.claude/lib/checkpoint.py

358 lines
11 KiB
Python

"""
Checkpoint/Resume System for autonomous-dev v2.0
Allows workflows to be saved and resumed after interruptions or failures.
Design Patterns:
See library-design-patterns skill for standardized design patterns.
See state-management-patterns skill for standardized design patterns.
"""
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional, List
class CheckpointManager:
"""
Manages workflow checkpoints for resume capability
Checkpoints allow workflows to be interrupted and resumed later without
starting over from the beginning.
"""
def __init__(self, artifacts_dir: Optional[Path] = None):
"""
Initialize checkpoint manager
Args:
artifacts_dir: Base directory for artifacts (default: .claude/artifacts)
"""
if artifacts_dir is None:
artifacts_dir = Path(".claude/artifacts")
self.artifacts_dir = artifacts_dir
def create_checkpoint(
self,
workflow_id: str,
completed_agents: List[str],
current_agent: str,
artifacts_created: List[str],
metadata: Optional[Dict[str, Any]] = None
) -> Path:
"""
Create a checkpoint after an agent completes
Args:
workflow_id: Workflow identifier
completed_agents: List of agents that have completed
current_agent: Agent that just completed (or next to run)
artifacts_created: List of artifact files created so far
metadata: Additional checkpoint metadata
Returns:
Path to checkpoint file
"""
checkpoint = {
'version': '2.0',
'workflow_id': workflow_id,
'created_at': datetime.utcnow().isoformat(),
'checkpoint_type': 'agent_completion',
'completed_agents': completed_agents,
'current_agent': current_agent,
'artifacts_created': artifacts_created,
'metadata': metadata or {}
}
checkpoint_path = self._get_checkpoint_path(workflow_id)
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
checkpoint_path.write_text(json.dumps(checkpoint, indent=2))
return checkpoint_path
def load_checkpoint(self, workflow_id: str) -> Optional[Dict[str, Any]]:
"""
Load checkpoint for a workflow
Args:
workflow_id: Workflow identifier
Returns:
Checkpoint data or None if not found
"""
checkpoint_path = self._get_checkpoint_path(workflow_id)
if not checkpoint_path.exists():
return None
return json.loads(checkpoint_path.read_text())
def checkpoint_exists(self, workflow_id: str) -> bool:
"""Check if checkpoint exists for workflow"""
return self._get_checkpoint_path(workflow_id).exists()
def delete_checkpoint(self, workflow_id: str):
"""Delete checkpoint (after workflow completes)"""
checkpoint_path = self._get_checkpoint_path(workflow_id)
if checkpoint_path.exists():
checkpoint_path.unlink()
def list_resumable_workflows(self) -> List[Dict[str, Any]]:
"""
List all workflows that can be resumed
Returns:
List of workflow summaries with checkpoint info
"""
resumable = []
if not self.artifacts_dir.exists():
return resumable
for workflow_dir in self.artifacts_dir.iterdir():
if not workflow_dir.is_dir():
continue
checkpoint_path = workflow_dir / "checkpoint.json"
if not checkpoint_path.exists():
continue
try:
checkpoint = json.loads(checkpoint_path.read_text())
resumable.append({
'workflow_id': checkpoint['workflow_id'],
'created_at': checkpoint['created_at'],
'current_agent': checkpoint['current_agent'],
'completed_agents': checkpoint['completed_agents'],
'progress': f"{len(checkpoint['completed_agents'])}/8 agents"
})
except Exception:
continue
return sorted(resumable, key=lambda x: x['created_at'], reverse=True)
def validate_checkpoint(self, workflow_id: str) -> tuple[bool, Optional[str]]:
"""
Validate checkpoint integrity
Args:
workflow_id: Workflow identifier
Returns:
(is_valid, error_message)
"""
checkpoint = self.load_checkpoint(workflow_id)
if checkpoint is None:
return False, "Checkpoint not found"
# Check required fields
required_fields = ['version', 'workflow_id', 'completed_agents', 'current_agent']
for field in required_fields:
if field not in checkpoint:
return False, f"Missing required field: {field}"
# Check artifacts exist
artifacts_created = checkpoint.get('artifacts_created', [])
workflow_dir = self.artifacts_dir / workflow_id
for artifact in artifacts_created:
artifact_path = workflow_dir / artifact
if not artifact_path.exists():
return False, f"Artifact missing: {artifact}"
return True, None
def get_resume_plan(self, workflow_id: str) -> Dict[str, Any]:
"""
Get plan for resuming workflow
Args:
workflow_id: Workflow identifier
Returns:
Resume plan with next steps
"""
checkpoint = self.load_checkpoint(workflow_id)
if checkpoint is None:
return {'error': 'Checkpoint not found'}
# Agent pipeline
all_agents = [
'orchestrator',
'researcher',
'planner',
'test-master',
'implementer',
'reviewer',
'security-auditor',
'doc-master'
]
completed = checkpoint.get('completed_agents', [])
remaining = [agent for agent in all_agents if agent not in completed]
return {
'workflow_id': workflow_id,
'checkpoint_valid': True,
'completed_agents': completed,
'remaining_agents': remaining,
'next_agent': remaining[0] if remaining else None,
'progress_percentage': int((len(completed) / len(all_agents)) * 100),
'can_resume': len(remaining) > 0
}
def _get_checkpoint_path(self, workflow_id: str) -> Path:
"""Get path to checkpoint file"""
return self.artifacts_dir / workflow_id / "checkpoint.json"
class CheckpointError(Exception):
"""Raised when checkpoint operations fail"""
pass
class WorkflowResumer:
"""
Resume interrupted workflows
Handles the logic of loading checkpoints and continuing from where
the workflow was interrupted.
"""
def __init__(
self,
checkpoint_manager: CheckpointManager,
artifact_manager: Any # Avoid circular import
):
"""
Initialize workflow resumer
Args:
checkpoint_manager: CheckpointManager instance
artifact_manager: ArtifactManager instance
"""
self.checkpoint_manager = checkpoint_manager
self.artifact_manager = artifact_manager
def can_resume(self, workflow_id: str) -> bool:
"""Check if workflow can be resumed"""
if not self.checkpoint_manager.checkpoint_exists(workflow_id):
return False
is_valid, _ = self.checkpoint_manager.validate_checkpoint(workflow_id)
return is_valid
def resume_workflow(self, workflow_id: str) -> tuple[bool, str, Dict[str, Any]]:
"""
Resume a workflow from checkpoint
Args:
workflow_id: Workflow identifier
Returns:
(success, message, resume_context)
"""
# Validate checkpoint
is_valid, error = self.checkpoint_manager.validate_checkpoint(workflow_id)
if not is_valid:
return False, f"Cannot resume: {error}", {}
# Load checkpoint
checkpoint = self.checkpoint_manager.load_checkpoint(workflow_id)
# Get resume plan
resume_plan = self.checkpoint_manager.get_resume_plan(workflow_id)
if not resume_plan.get('can_resume'):
return False, "Workflow already completed", {}
# Load workflow manifest
try:
manifest = self.artifact_manager.read_artifact(workflow_id, 'manifest')
except Exception as e:
return False, f"Cannot load manifest: {e}", {}
# Prepare resume context
resume_context = {
'workflow_id': workflow_id,
'original_request': manifest.get('request'),
'completed_agents': checkpoint['completed_agents'],
'next_agent': resume_plan['next_agent'],
'remaining_agents': resume_plan['remaining_agents'],
'progress': resume_plan['progress_percentage'],
'artifacts_available': checkpoint.get('artifacts_created', []),
'checkpoint_timestamp': checkpoint['created_at']
}
success_msg = f"""
✅ **Workflow Resumed**
Workflow ID: {workflow_id}
Original Request: {resume_context['original_request']}
Progress: {resume_context['progress']}% complete
Completed: {', '.join(resume_context['completed_agents'])}
Next: {resume_context['next_agent']}
Checkpoint from: {resume_context['checkpoint_timestamp']}
Continuing workflow...
"""
return True, success_msg, resume_context
if __name__ == '__main__':
# Example usage
import tempfile
with tempfile.TemporaryDirectory() as tmpdir:
tmppath = Path(tmpdir)
artifacts_dir = tmppath / ".claude" / "artifacts"
# Create checkpoint manager
manager = CheckpointManager(artifacts_dir)
# Create a checkpoint
workflow_id = "20251023_093456"
checkpoint_path = manager.create_checkpoint(
workflow_id=workflow_id,
completed_agents=['orchestrator', 'researcher', 'planner'],
current_agent='test-master',
artifacts_created=['manifest.json', 'research.json', 'architecture.json'],
metadata={'error': None, 'retry_count': 0}
)
print(f"Created checkpoint: {checkpoint_path}")
print()
# Load checkpoint
checkpoint = manager.load_checkpoint(workflow_id)
print("Loaded checkpoint:")
print(json.dumps(checkpoint, indent=2))
print()
# Validate checkpoint
is_valid, error = manager.validate_checkpoint(workflow_id)
print(f"Checkpoint valid: {is_valid}")
if error:
print(f"Error: {error}")
print()
# Get resume plan
resume_plan = manager.get_resume_plan(workflow_id)
print("Resume plan:")
print(json.dumps(resume_plan, indent=2))
print()
# List resumable workflows
resumable = manager.list_resumable_workflows()
print(f"Resumable workflows: {len(resumable)}")
for workflow in resumable:
print(f" - {workflow['workflow_id']}: {workflow['progress']}, next: {workflow['current_agent']}")