325 lines
8.6 KiB
Bash
Executable File
325 lines
8.6 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# TradingAgents SLURM Job Management Script
|
|
# This script provides convenience functions for managing TradingAgents jobs on SLURM
|
|
|
|
# Colors for output
|
|
RED='\033[0;31m'
|
|
GREEN='\033[0;32m'
|
|
YELLOW='\033[1;33m'
|
|
BLUE='\033[0;34m'
|
|
NC='\033[0m' # No Color
|
|
|
|
# Print colored output
|
|
print_status() {
|
|
echo -e "${GREEN}[INFO]${NC} $1"
|
|
}
|
|
|
|
print_warning() {
|
|
echo -e "${YELLOW}[WARNING]${NC} $1"
|
|
}
|
|
|
|
print_error() {
|
|
echo -e "${RED}[ERROR]${NC} $1"
|
|
}
|
|
|
|
print_header() {
|
|
echo -e "${BLUE}=== $1 ===${NC}"
|
|
}
|
|
|
|
# Check if required files exist
|
|
check_prerequisites() {
|
|
print_header "Checking Prerequisites"
|
|
|
|
local missing_files=()
|
|
|
|
if [ ! -f "requirements.txt" ]; then
|
|
missing_files+=("requirements.txt")
|
|
fi
|
|
|
|
if [ ! -f "slurm_setup.sh" ]; then
|
|
missing_files+=("slurm_setup.sh")
|
|
fi
|
|
|
|
if [ ! -f ".env.slurm.template" ]; then
|
|
missing_files+=(".env.slurm.template")
|
|
fi
|
|
|
|
if [ ${#missing_files[@]} -ne 0 ]; then
|
|
print_error "Missing required files: ${missing_files[*]}"
|
|
return 1
|
|
fi
|
|
|
|
print_status "All required files found"
|
|
return 0
|
|
}
|
|
|
|
# Setup environment
|
|
setup_environment() {
|
|
print_header "Setting Up Environment"
|
|
|
|
# Create necessary directories
|
|
mkdir -p logs results data_cache
|
|
|
|
# Copy environment template if .env doesn't exist
|
|
if [ ! -f ".env" ]; then
|
|
cp .env.slurm.template .env
|
|
print_status "Created .env file from template. Please customize it for your environment."
|
|
fi
|
|
|
|
# Make scripts executable
|
|
chmod +x slurm_*.sh
|
|
|
|
print_status "Environment setup completed"
|
|
}
|
|
|
|
# Submit setup job
|
|
submit_setup() {
|
|
print_header "Submitting Setup Job"
|
|
|
|
if [ ! -f "slurm_setup.sh" ]; then
|
|
print_error "slurm_setup.sh not found"
|
|
return 1
|
|
fi
|
|
|
|
local job_id=$(sbatch slurm_setup.sh | grep -o '[0-9]*')
|
|
print_status "Setup job submitted with ID: $job_id"
|
|
echo "$job_id"
|
|
}
|
|
|
|
# Submit single analysis job
|
|
submit_single_analysis() {
|
|
local symbol=${1:-"SPY"}
|
|
local date=${2:-$(date +%Y-%m-%d)}
|
|
|
|
print_header "Submitting Single Analysis Job"
|
|
print_status "Symbol: $symbol, Date: $date"
|
|
|
|
if [ ! -f "slurm_single_analysis.sh" ]; then
|
|
print_error "slurm_single_analysis.sh not found"
|
|
return 1
|
|
fi
|
|
|
|
local job_id=$(sbatch slurm_single_analysis.sh "$symbol" "$date" | grep -o '[0-9]*')
|
|
print_status "Single analysis job submitted with ID: $job_id"
|
|
echo "$job_id"
|
|
}
|
|
|
|
# Submit batch analysis job
|
|
submit_batch_analysis() {
|
|
print_header "Submitting Batch Analysis Job"
|
|
|
|
if [ ! -f "slurm_batch_analysis.sh" ]; then
|
|
print_error "slurm_batch_analysis.sh not found"
|
|
return 1
|
|
fi
|
|
|
|
local job_id=$(sbatch slurm_batch_analysis.sh | grep -o '[0-9]*')
|
|
print_status "Batch analysis job submitted with ID: $job_id"
|
|
echo "$job_id"
|
|
}
|
|
|
|
# Submit GPU analysis job
|
|
submit_gpu_analysis() {
|
|
local symbol=${1:-"SPY"}
|
|
local date=${2:-$(date +%Y-%m-%d)}
|
|
|
|
print_header "Submitting GPU Analysis Job"
|
|
print_status "Symbol: $symbol, Date: $date"
|
|
|
|
if [ ! -f "slurm_gpu_analysis.sh" ]; then
|
|
print_error "slurm_gpu_analysis.sh not found"
|
|
return 1
|
|
fi
|
|
|
|
local job_id=$(sbatch slurm_gpu_analysis.sh "$symbol" "$date" | grep -o '[0-9]*')
|
|
print_status "GPU analysis job submitted with ID: $job_id"
|
|
echo "$job_id"
|
|
}
|
|
|
|
# Check job status
|
|
check_job_status() {
|
|
local job_id=$1
|
|
|
|
if [ -z "$job_id" ]; then
|
|
print_error "Job ID required"
|
|
return 1
|
|
fi
|
|
|
|
print_header "Job Status for ID: $job_id"
|
|
squeue -j "$job_id" --format="%.18i %.9P %.20j %.8u %.8T %.10M %.6D %R"
|
|
}
|
|
|
|
# Show recent jobs
|
|
show_recent_jobs() {
|
|
print_header "Recent TradingAgents Jobs"
|
|
squeue -u $USER --name=trading-agents* --format="%.18i %.9P %.20j %.8u %.8T %.10M %.6D %R"
|
|
}
|
|
|
|
# Cancel job
|
|
cancel_job() {
|
|
local job_id=$1
|
|
|
|
if [ -z "$job_id" ]; then
|
|
print_error "Job ID required"
|
|
return 1
|
|
fi
|
|
|
|
print_header "Cancelling Job: $job_id"
|
|
scancel "$job_id"
|
|
print_status "Job $job_id cancelled"
|
|
}
|
|
|
|
# View job output
|
|
view_job_output() {
|
|
local job_id=$1
|
|
local output_type=${2:-"out"} # "out" or "err"
|
|
|
|
if [ -z "$job_id" ]; then
|
|
print_error "Job ID required"
|
|
return 1
|
|
fi
|
|
|
|
local output_file
|
|
if [ "$output_type" == "err" ]; then
|
|
output_file="logs/trading_${job_id}.err"
|
|
else
|
|
output_file="logs/trading_${job_id}.out"
|
|
fi
|
|
|
|
if [ -f "$output_file" ]; then
|
|
print_header "Job $job_id Output ($output_type)"
|
|
tail -f "$output_file"
|
|
else
|
|
print_error "Output file not found: $output_file"
|
|
fi
|
|
}
|
|
|
|
# Check for failed jobs and show errors
|
|
check_failed_jobs() {
|
|
print_header "Checking for Failed Jobs"
|
|
|
|
# Get failed jobs from sacct
|
|
local failed_jobs=$(sacct -u $USER --name=trading-agents* --state=FAILED --format=JobID,State,ExitCode --noheader --parsable2 | cut -d'|' -f1)
|
|
|
|
if [ -z "$failed_jobs" ]; then
|
|
print_status "No failed jobs found"
|
|
return 0
|
|
fi
|
|
|
|
echo "$failed_jobs" | while read -r job_id; do
|
|
if [ -n "$job_id" ]; then
|
|
print_warning "Failed job: $job_id"
|
|
|
|
# Look for error files
|
|
local error_files=$(find results -name "error_${job_id}.json" 2>/dev/null)
|
|
if [ -n "$error_files" ]; then
|
|
echo "$error_files" | while read -r error_file; do
|
|
echo "Error details from: $error_file"
|
|
if command -v jq >/dev/null 2>&1; then
|
|
jq '.' "$error_file" 2>/dev/null || cat "$error_file"
|
|
else
|
|
cat "$error_file"
|
|
fi
|
|
done
|
|
else
|
|
echo "No error details found for job $job_id"
|
|
fi
|
|
echo ""
|
|
fi
|
|
done
|
|
}
|
|
|
|
# Collect results
|
|
collect_results() {
|
|
local symbol=${1:-"*"}
|
|
local date=${2:-$(date +%Y-%m-%d)}
|
|
|
|
print_header "Collecting Results"
|
|
print_status "Symbol: $symbol, Date: $date"
|
|
|
|
find results -name "*.json" -path "*/$symbol/$date/*" | while read -r file; do
|
|
echo "Found result: $file"
|
|
if command -v jq >/dev/null 2>&1; then
|
|
jq '.decision' "$file" 2>/dev/null || echo " (Could not parse decision)"
|
|
fi
|
|
done
|
|
}
|
|
|
|
# Main function
|
|
main() {
|
|
case "$1" in
|
|
"setup")
|
|
check_prerequisites && setup_environment
|
|
;;
|
|
"submit-setup")
|
|
submit_setup
|
|
;;
|
|
"submit-single")
|
|
submit_single_analysis "$2" "$3"
|
|
;;
|
|
"submit-batch")
|
|
submit_batch_analysis
|
|
;;
|
|
"submit-gpu")
|
|
submit_gpu_analysis "$2" "$3"
|
|
;;
|
|
"status")
|
|
if [ -n "$2" ]; then
|
|
check_job_status "$2"
|
|
else
|
|
show_recent_jobs
|
|
fi
|
|
;;
|
|
"cancel")
|
|
cancel_job "$2"
|
|
;;
|
|
"output")
|
|
view_job_output "$2" "$3"
|
|
;;
|
|
"results")
|
|
collect_results "$2" "$3"
|
|
;;
|
|
"check-failed")
|
|
check_failed_jobs
|
|
;;
|
|
"help"|"--help"|"-h"|"")
|
|
cat << EOF
|
|
TradingAgents SLURM Job Manager
|
|
|
|
Usage: $0 <command> [arguments]
|
|
|
|
Commands:
|
|
setup - Setup environment and create necessary directories
|
|
submit-setup - Submit environment setup job
|
|
submit-single [SYM] [DATE] - Submit single analysis job (default: SPY, today)
|
|
submit-batch - Submit batch analysis job for multiple symbols
|
|
submit-gpu [SYM] [DATE] - Submit GPU-accelerated analysis job
|
|
status [JOB_ID] - Show job status (specific job or all recent jobs)
|
|
cancel <JOB_ID> - Cancel a specific job
|
|
output <JOB_ID> [err] - View job output (stdout or stderr)
|
|
results [SYM] [DATE] - Collect and display results
|
|
check-failed - Check for failed jobs and show error details
|
|
help - Show this help message
|
|
|
|
Examples:
|
|
$0 setup # Initial setup
|
|
$0 submit-single AAPL # Analyze AAPL for today
|
|
$0 submit-batch # Analyze multiple stocks
|
|
$0 status 12345 # Check status of job 12345
|
|
$0 output 12345 # View output of job 12345
|
|
$0 results AAPL # Show results for AAPL
|
|
|
|
EOF
|
|
;;
|
|
*)
|
|
print_error "Unknown command: $1"
|
|
print_status "Use '$0 help' for usage information"
|
|
exit 1
|
|
;;
|
|
esac
|
|
}
|
|
|
|
# Run main function with all arguments
|
|
main "$@"
|