# TradingAgents/.claude/lib/health_check.py
#
# 276 lines
# 8.9 KiB
# Python

"""
Health check system for autonomous-dev v2.0 agents.

Monitors agent execution to detect whether the agent:
- started successfully
- is making progress (file updates, log activity)
- hung or crashed (no activity for the timeout period)
- completed successfully (expected artifacts created)

Design Patterns:
    See the library-design-patterns skill and the api-integration-patterns
    skill for standardized design patterns.
"""
import json
import time
from collections import deque
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List
class AgentHealthCheck:
    """Monitor agent health and execution progress.

    Every check is derived from the filesystem: the agent's log file at
    ``<artifacts_dir>/logs/<agent_name>.log`` and the artifact files inside
    ``.claude/artifacts/<workflow_id>`` (a path relative to the current
    working directory).
    """

    # Number of trailing log lines inspected by check_progress().
    _TAIL_LINES = 5

    def __init__(self, workflow_id: str, agent_name: str):
        """
        Args:
            workflow_id: Workflow whose artifacts directory is inspected
            agent_name: Agent whose log file is inspected
        """
        self.workflow_id = workflow_id
        self.agent_name = agent_name
        self.artifacts_dir = Path(f".claude/artifacts/{workflow_id}")
        self.log_file = self.artifacts_dir / "logs" / f"{agent_name}.log"

    @staticmethod
    def _stat_or_none(path: Path) -> Any:
        """Return ``path.stat()``, or None when the path does not exist.

        A single stat() call replaces the exists()-then-stat() sequence so a
        file that disappears between the two calls cannot raise.
        """
        try:
            return path.stat()
        except (FileNotFoundError, NotADirectoryError):
            return None

    def _read_last_events(self) -> List[Dict[str, Any]]:
        """Parse the trailing log lines into event summaries (best effort).

        Lines that are not valid JSON, or whose JSON value is not an object,
        are skipped. An unreadable log yields an empty list.
        """
        try:
            # deque(maxlen=...) keeps only the tail instead of holding the
            # whole log in memory; errors='replace' keeps one bad byte from
            # discarding every event.
            with open(self.log_file, 'r', encoding='utf-8', errors='replace') as f:
                tail = deque(f, maxlen=self._TAIL_LINES)
        except OSError:
            return []

        events = []
        for line in tail:
            try:
                entry = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if not isinstance(entry, dict):
                continue
            events.append({
                'timestamp': entry.get('timestamp', 'unknown'),
                'event': entry.get('event_type', 'unknown'),
                'message': entry.get('message', '')
            })
        return events

    def check_started(self, timeout_seconds: int = 60) -> Dict[str, Any]:
        """
        Check if agent has started (log file exists and is non-empty)

        The log is always inspected at least once, so ``timeout_seconds=0``
        performs a single non-blocking check. Otherwise the log is re-checked
        about once per second until the timeout elapses.

        Args:
            timeout_seconds: How long to wait for agent to start
        Returns:
            Dict with status and details. On success: 'started' (True),
            'log_file', 'log_size', 'last_modified', 'age_seconds'.
            On timeout: 'started' (False), 'error', 'log_file', 'log_exists'.
        """
        deadline = time.monotonic() + timeout_seconds
        while True:
            info = self._stat_or_none(self.log_file)
            # An empty log does not count as started.
            if info is not None and info.st_size > 0:
                mtime = datetime.fromtimestamp(info.st_mtime)
                return {
                    'started': True,
                    'log_file': str(self.log_file),
                    'log_size': info.st_size,
                    'last_modified': mtime.isoformat(),
                    'age_seconds': (datetime.now() - mtime).total_seconds()
                }
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(1.0, remaining))

        return {
            'started': False,
            'error': f'Agent {self.agent_name} did not start within {timeout_seconds}s',
            'log_file': str(self.log_file),
            'log_exists': info is not None
        }

    def check_progress(self, max_idle_seconds: int = 300) -> Dict[str, Any]:
        """
        Check if agent is making progress (recent log activity)

        Args:
            max_idle_seconds: Maximum seconds without log updates before considering hung
        Returns:
            Dict with progress status. When the log file is missing only
            'active' (False) and 'error' are present; otherwise 'active',
            'last_modified', 'age_seconds', 'max_idle_seconds', 'log_size',
            'last_events' and 'status' ('active' or 'possibly_hung').
        """
        info = self._stat_or_none(self.log_file)
        if info is None:
            return {
                'active': False,
                'error': f'Log file does not exist: {self.log_file}'
            }

        mtime = datetime.fromtimestamp(info.st_mtime)
        age_seconds = (datetime.now() - mtime).total_seconds()
        is_active = age_seconds < max_idle_seconds
        return {
            'active': is_active,
            'last_modified': mtime.isoformat(),
            'age_seconds': age_seconds,
            'max_idle_seconds': max_idle_seconds,
            'log_size': info.st_size,
            'last_events': self._read_last_events(),
            'status': 'active' if is_active else 'possibly_hung'
        }

    def check_completion(self, expected_artifacts: List[str]) -> Dict[str, Any]:
        """
        Check if agent completed successfully (expected artifacts exist)

        Args:
            expected_artifacts: List of artifact filenames that should exist
                (resolved relative to the artifacts directory)
        Returns:
            Dict with completion status. 'completed' is True when no expected
            artifact is missing, which includes an empty expected list.
        """
        missing_artifacts = []
        existing_artifacts = []
        for artifact in expected_artifacts:
            artifact_path = self.artifacts_dir / artifact
            info = self._stat_or_none(artifact_path)
            if info is None:
                missing_artifacts.append(artifact)
                continue
            existing_artifacts.append({
                'name': artifact,
                'path': str(artifact_path),
                'size': info.st_size,
                'modified': datetime.fromtimestamp(info.st_mtime).isoformat()
            })

        return {
            'completed': not missing_artifacts,
            'existing_artifacts': existing_artifacts,
            'missing_artifacts': missing_artifacts,
            'total_expected': len(expected_artifacts),
            'total_found': len(existing_artifacts)
        }

    def full_health_check(
        self,
        expected_artifacts: List[str],
        start_timeout: int = 60,
        max_idle: int = 300
    ) -> Dict[str, Any]:
        """
        Comprehensive health check

        Args:
            expected_artifacts: Artifacts that should be created
            start_timeout: Seconds to wait for agent to start
            max_idle: Seconds without activity before considering hung
        Returns:
            Dict with complete health status. 'status' is one of
            'not_started', 'completed', 'running' or 'hung'. The
            'not_started' result carries only 'status' and 'details'.
        """
        started = self.check_started(start_timeout)
        if not started['started']:
            return {
                'status': 'not_started',
                'details': started
            }

        progress = self.check_progress(max_idle)
        completion = self.check_completion(expected_artifacts)

        # Completion wins over activity: a finished agent stops logging.
        if completion['completed']:
            status = 'completed'
        elif progress['active']:
            status = 'running'
        else:
            status = 'hung'

        return {
            'status': status,
            'workflow_id': self.workflow_id,
            'agent': self.agent_name,
            'started': started,
            'progress': progress,
            'completion': completion,
            'timestamp': datetime.now().isoformat()
        }
def monitor_agent_execution(
    workflow_id: str,
    agent_name: str,
    expected_artifacts: List[str],
    poll_interval: int = 5,
    max_wait: int = 900
) -> Dict[str, Any]:
    """
    Monitor agent execution until completion or timeout

    Polls the agent's health and prints a status line per poll. Returns as
    soon as the agent is reported 'completed' or 'hung'; otherwise keeps
    polling until ``max_wait`` seconds have elapsed and returns one last
    health check.

    Args:
        workflow_id: Workflow ID
        agent_name: Agent being monitored
        expected_artifacts: Artifacts that should be created
        poll_interval: Seconds between health checks
        max_wait: Maximum seconds to wait
    Returns:
        Final health check result
    """
    health = AgentHealthCheck(workflow_id, agent_name)
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start_time = time.monotonic()
    deadline = start_time + max_wait

    print(f"Monitoring {agent_name} agent for workflow {workflow_id}...")
    print(f"Expected artifacts: {', '.join(expected_artifacts)}")
    print(f"Max wait time: {max_wait}s\n")

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Bound the start-up wait by the poll interval and the remaining
        # budget; the default start timeout (60s) would stall every poll and
        # overshoot max_wait. Keep it >= 1 so the start check always gets a
        # chance to look at the log at least once.
        start_timeout = max(1, min(poll_interval, int(remaining)))
        check = health.full_health_check(expected_artifacts, start_timeout=start_timeout)
        elapsed = int(time.monotonic() - start_time)
        status = check['status']
        print(f"[{elapsed}s] Status: {status}")

        if status == 'completed':
            print("✓ Agent completed successfully!")
            return check
        if status == 'hung':
            print(f"✗ Agent appears to be hung (no activity for {int(check['progress']['age_seconds'])}s)")
            return check
        if status == 'not_started':
            # A 'not_started' result has no 'progress' entry.
            print("⏳ Waiting for agent to start...")
        else:  # running
            print(f"⏺ Agent running (last activity {int(check['progress']['age_seconds'])}s ago)")
            last_events = check['progress'].get('last_events')
            if last_events:
                last_event = last_events[-1]
                # str() guards against non-string messages in the log.
                print(f" Latest: {last_event['event']} - {str(last_event['message'])[:60]}")

        # Do not sleep past the deadline (and never pass a negative value).
        pause = min(poll_interval, deadline - time.monotonic())
        if pause > 0:
            time.sleep(pause)

    print(f"\n✗ Timeout after {max_wait}s")
    # Final snapshot with a minimal start wait so the timeout is not extended.
    return health.full_health_check(expected_artifacts, start_timeout=1)
if __name__ == '__main__':
    # Command-line entry point:
    #   health_check.py <workflow_id> <agent_name> [artifact1 artifact2 ...]
    # With artifacts, the agent is monitored until it finishes, hangs or
    # times out; without any, a single health check is performed.
    import sys

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python health_check.py <workflow_id> <agent_name> [artifact1 artifact2 ...]")
        sys.exit(1)

    # Everything after the two required arguments is an artifact name.
    workflow_id, agent_name, *expected_artifacts = cli_args

    if not expected_artifacts:
        health = AgentHealthCheck(workflow_id, agent_name)
        result = health.full_health_check([])
    else:
        result = monitor_agent_execution(workflow_id, agent_name, expected_artifacts)

    print("\n=== FINAL STATUS ===")
    print(json.dumps(result, indent=2))