276 lines
8.9 KiB
Python
276 lines
8.9 KiB
Python
"""
|
|
Health check system for autonomous-dev v2.0 agents

Monitors agent execution to detect:
- Agent started successfully (its log file exists and is non-empty)
- Agent making progress (recent log file activity)
- Agent hung/crashed (no log activity for the idle timeout period)
- Agent completed successfully (expected artifacts created)

Design Patterns:
    See library-design-patterns skill for standardized design patterns.
    See api-integration-patterns skill for standardized design patterns.
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List
|
|
|
|
|
|
class AgentHealthCheck:
    """Monitor agent health and execution progress.

    All checks are based on the filesystem: the agent's log file at
    ``.claude/artifacts/<workflow_id>/logs/<agent_name>.log`` and artifact
    files located under ``.claude/artifacts/<workflow_id>``.
    """

    # Number of trailing log lines inspected when reporting recent events.
    _LAST_EVENTS_LIMIT = 5

    def __init__(self, workflow_id: str, agent_name: str):
        """
        Args:
            workflow_id: Workflow ID; selects the artifacts directory
            agent_name: Agent name; selects the log file inside ``logs/``
        """
        self.workflow_id = workflow_id
        self.agent_name = agent_name
        self.artifacts_dir = Path(f".claude/artifacts/{workflow_id}")
        self.log_file = self.artifacts_dir / "logs" / f"{agent_name}.log"

    def _stat_log(self) -> Any:
        """Return a single stat snapshot of the log file, or None if it is missing."""
        try:
            return self.log_file.stat()
        except (FileNotFoundError, NotADirectoryError):
            return None

    def _read_last_events(self) -> List[Dict[str, Any]]:
        """Parse the trailing log lines as JSON objects; unparsable lines are skipped."""
        from collections import deque

        try:
            with open(self.log_file, 'r', encoding='utf-8', errors='replace') as f:
                # A bounded deque keeps only the tail while streaming the file,
                # so a large log is never held in memory in full.
                tail = deque(f, maxlen=self._LAST_EVENTS_LIMIT)
        except OSError:
            # Best effort: the log may be removed or unreadable between checks.
            return []

        events = []
        for line in tail:
            try:
                entry = json.loads(line)
            except ValueError:
                # Blank or non-JSON line (json.JSONDecodeError is a ValueError).
                continue
            if not isinstance(entry, dict):
                # Valid JSON that is not an object carries no event fields.
                continue
            events.append({
                'timestamp': entry.get('timestamp', 'unknown'),
                'event': entry.get('event_type', 'unknown'),
                'message': entry.get('message', ''),
            })
        return events

    def check_started(self, timeout_seconds: int = 60) -> Dict[str, Any]:
        """
        Check if agent has started (log file exists and is non-empty)

        The log file is inspected at least once, even when timeout_seconds is
        zero or negative, and then roughly once per second until the timeout
        elapses.

        Args:
            timeout_seconds: How long to wait for agent to start

        Returns:
            Dict with status and details. On success: 'started' (True),
            'log_file', 'log_size', 'last_modified', 'age_seconds'.
            On timeout: 'started' (False), 'error', 'log_file', 'log_exists'.
        """
        # Monotonic clock: the wait is not affected by wall-clock adjustments.
        deadline = time.monotonic() + timeout_seconds

        while True:
            st = self._stat_log()
            if st is not None and st.st_size > 0:
                return {
                    'started': True,
                    'log_file': str(self.log_file),
                    'log_size': st.st_size,
                    'last_modified': datetime.fromtimestamp(st.st_mtime).isoformat(),
                    # Epoch arithmetic avoids DST errors of naive local datetimes.
                    'age_seconds': time.time() - st.st_mtime
                }

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(1, remaining))

        return {
            'started': False,
            'error': f'Agent {self.agent_name} did not start within {timeout_seconds}s',
            'log_file': str(self.log_file),
            'log_exists': self.log_file.exists()
        }

    def check_progress(self, max_idle_seconds: int = 300) -> Dict[str, Any]:
        """
        Check if agent is making progress (recent log activity)

        Args:
            max_idle_seconds: Maximum seconds without log updates before considering hung

        Returns:
            Dict with progress status. When the log file is missing only
            'active' (False) and 'error' are present. Otherwise: 'active',
            'last_modified', 'age_seconds', 'max_idle_seconds', 'log_size',
            'last_events' and 'status' ('active' or 'possibly_hung').
        """
        st = self._stat_log()
        if st is None:
            return {
                'active': False,
                'error': f'Log file does not exist: {self.log_file}'
            }

        age_seconds = time.time() - st.st_mtime
        is_active = age_seconds < max_idle_seconds

        return {
            'active': is_active,
            'last_modified': datetime.fromtimestamp(st.st_mtime).isoformat(),
            'age_seconds': age_seconds,
            'max_idle_seconds': max_idle_seconds,
            'log_size': st.st_size,
            'last_events': self._read_last_events(),
            'status': 'active' if is_active else 'possibly_hung'
        }

    def check_completion(self, expected_artifacts: List[str]) -> Dict[str, Any]:
        """
        Check if agent completed successfully (expected artifacts exist)

        Args:
            expected_artifacts: List of artifact filenames that should exist
                (resolved relative to the workflow's artifacts directory)

        Returns:
            Dict with completion status: 'completed', 'existing_artifacts',
            'missing_artifacts', 'total_expected', 'total_found'.
            An empty expected_artifacts list yields 'completed': True because
            nothing is missing.
        """
        missing_artifacts = []
        existing_artifacts = []

        for artifact in expected_artifacts:
            artifact_path = self.artifacts_dir / artifact
            try:
                # One stat call serves as the existence test and supplies
                # size and mtime from the same snapshot.
                st = artifact_path.stat()
            except (FileNotFoundError, NotADirectoryError):
                missing_artifacts.append(artifact)
                continue

            existing_artifacts.append({
                'name': artifact,
                'path': str(artifact_path),
                'size': st.st_size,
                'modified': datetime.fromtimestamp(st.st_mtime).isoformat()
            })

        return {
            'completed': not missing_artifacts,
            'existing_artifacts': existing_artifacts,
            'missing_artifacts': missing_artifacts,
            'total_expected': len(expected_artifacts),
            'total_found': len(existing_artifacts)
        }

    def full_health_check(
        self,
        expected_artifacts: List[str],
        start_timeout: int = 60,
        max_idle: int = 300
    ) -> Dict[str, Any]:
        """
        Comprehensive health check

        Args:
            expected_artifacts: Artifacts that should be created
            start_timeout: Seconds to wait for agent to start
            max_idle: Seconds without activity before considering hung

        Returns:
            Dict with complete health status. 'status' is one of
            'not_started', 'completed', 'running' or 'hung'. For
            'not_started' the dict only carries 'status' and 'details';
            otherwise it also carries 'workflow_id', 'agent', 'started',
            'progress', 'completion' and 'timestamp'.
        """
        started = self.check_started(start_timeout)

        if not started['started']:
            return {
                'status': 'not_started',
                'details': started
            }

        progress = self.check_progress(max_idle)
        completion = self.check_completion(expected_artifacts)

        # Completion takes precedence over log activity.
        if completion['completed']:
            status = 'completed'
        elif progress['active']:
            status = 'running'
        else:
            status = 'hung'

        return {
            'status': status,
            'workflow_id': self.workflow_id,
            'agent': self.agent_name,
            'started': started,
            'progress': progress,
            'completion': completion,
            'timestamp': datetime.now().isoformat()
        }
|
|
|
|
|
|
def monitor_agent_execution(
    workflow_id: str,
    agent_name: str,
    expected_artifacts: List[str],
    poll_interval: int = 5,
    max_wait: int = 900
) -> Dict[str, Any]:
    """
    Monitor agent execution until completion or timeout

    Runs a full health check every poll_interval seconds and prints a status
    line for each one. Returns as soon as the agent is reported 'completed'
    or 'hung'; otherwise keeps polling until max_wait seconds have elapsed
    and then returns one final health check.

    Args:
        workflow_id: Workflow ID
        agent_name: Agent being monitored
        expected_artifacts: Artifacts that should be created
        poll_interval: Seconds between health checks
        max_wait: Maximum seconds to wait

    Returns:
        Final health check result
    """
    health = AgentHealthCheck(workflow_id, agent_name)
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start_time = time.monotonic()

    print(f"Monitoring {agent_name} agent for workflow {workflow_id}...")
    print(f"Expected artifacts: {', '.join(expected_artifacts)}")
    print(f"Max wait time: {max_wait}s\n")

    while True:
        remaining = max_wait - (time.monotonic() - start_time)
        if remaining <= 0:
            break

        # Cap the blocking "wait for start" phase by the remaining budget so
        # a never-starting agent cannot push the loop far beyond max_wait.
        # 60 mirrors full_health_check's default; the floor of 1 guarantees
        # the log is actually inspected.
        start_timeout = max(1, min(60, int(remaining)))
        check = health.full_health_check(expected_artifacts, start_timeout=start_timeout)
        elapsed = int(time.monotonic() - start_time)
        status = check['status']
        # 'not_started' results carry no 'progress' entry.
        progress = check.get('progress', {})

        print(f"[{elapsed}s] Status: {status}")

        if status == 'completed':
            print("✓ Agent completed successfully!")
            return check
        elif status == 'hung':
            age = progress.get('age_seconds')
            if age is None:
                # The log file disappeared after the start check, so the
                # progress result only holds an error description.
                reason = progress.get('error', 'no log activity information')
                print(f"✗ Agent appears to be hung ({reason})")
            else:
                print(f"✗ Agent appears to be hung (no activity for {int(age)}s)")
            return check
        elif status == 'not_started':
            print("⏳ Waiting for agent to start...")
        else:  # running
            print(f"⏺ Agent running (last activity {int(progress['age_seconds'])}s ago)")
            if progress.get('last_events'):
                last_event = progress['last_events'][-1]
                # The message comes from parsed JSON and may be null or a
                # non-string value; coerce before truncating.
                message = str(last_event.get('message', ''))[:60]
                print(f"  Latest: {last_event['event']} - {message}")

        remaining = max_wait - (time.monotonic() - start_time)
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))

    print(f"\n✗ Timeout after {max_wait}s")
    # The budget is spent: take one last snapshot with a minimal start wait
    # instead of blocking for the default start timeout again.
    return health.full_health_check(expected_artifacts, start_timeout=1)
|
|
|
|
|
|
if __name__ == '__main__':
    import sys

    # CLI entry point: <workflow_id> <agent_name> are mandatory, every
    # further argument names an artifact the agent is expected to create.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python health_check.py <workflow_id> <agent_name> [artifact1 artifact2 ...]")
        sys.exit(1)

    workflow_id, agent_name, *expected_artifacts = cli_args

    if not expected_artifacts:
        # No artifacts given: run a single one-shot health check.
        health = AgentHealthCheck(workflow_id, agent_name)
        result = health.full_health_check([])
    else:
        # Artifacts given: poll until the agent completes, hangs or times out.
        result = monitor_agent_execution(workflow_id, agent_name, expected_artifacts)

    print("\n=== FINAL STATUS ===")
    print(json.dumps(result, indent=2))
|