From 43ae0453aac23b793ac25ed5441218da79192eb0 Mon Sep 17 00:00:00 2001 From: Ivan Lee <84584280+ivanleekk@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:39:59 +0800 Subject: [PATCH] slurm init --- .env.slurm.template | 39 +++++ .gitignore | 3 +- SLURM_GUIDE.md | 341 +++++++++++++++++++++++++++++++++++++++ slurm_batch_analysis.sh | 140 ++++++++++++++++ slurm_gpu_analysis.sh | 165 +++++++++++++++++++ slurm_manager.sh | 321 ++++++++++++++++++++++++++++++++++++ slurm_setup.sh | 53 ++++++ slurm_single_analysis.sh | 154 ++++++++++++++++++ 8 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 .env.slurm.template create mode 100644 SLURM_GUIDE.md create mode 100755 slurm_batch_analysis.sh create mode 100755 slurm_gpu_analysis.sh create mode 100755 slurm_manager.sh create mode 100755 slurm_setup.sh create mode 100755 slurm_single_analysis.sh diff --git a/.env.slurm.template b/.env.slurm.template new file mode 100644 index 00000000..02a36bfa --- /dev/null +++ b/.env.slurm.template @@ -0,0 +1,39 @@ +# TradingAgents SLURM Configuration +# Copy this file to .env and customize for your cluster environment + +# LLM Configuration +LLM_PROVIDER=ollama +LLM_BACKEND_URL=http://localhost:11434/v1 +DEEP_THINK_LLM=llama3.2 +QUICK_THINK_LLM=llama3.2 + +# Alternative: OpenAI/OpenRouter Configuration +# LLM_PROVIDER=openai +# LLM_BACKEND_URL=https://api.openai.com/v1 +# DEEP_THINK_LLM=gpt-4 +# QUICK_THINK_LLM=gpt-3.5-turbo +# OPENAI_API_KEY=your_openai_api_key_here + +# Alternative: Anthropic Configuration +# LLM_PROVIDER=anthropic +# DEEP_THINK_LLM=claude-3-sonnet-20240229 +# QUICK_THINK_LLM=claude-3-haiku-20240307 +# ANTHROPIC_API_KEY=your_anthropic_api_key_here + +# TradingAgents Configuration +TRADINGAGENTS_RESULTS_DIR=./results +MAX_DEBATE_ROUNDS=2 +MAX_RISK_DISCUSS_ROUNDS=2 +ONLINE_TOOLS=true + +# Data Sources API Keys (optional but recommended) +FINNHUB_API_KEY=your_finnhub_api_key +REDDIT_CLIENT_ID=your_reddit_client_id +REDDIT_CLIENT_SECRET=your_reddit_client_secret +REDDIT_USER_AGENT=TradingAgents/1.0 + +# SLURM Cluster Specific +SLURM_PARTITION=cpu +SLURM_GPU_PARTITION=gpu +SLURM_MAX_TIME=08:00:00 +SLURM_DEFAULT_MEMORY=16G diff --git a/.gitignore b/.gitignore index c7327cdd..5cd1c383 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ eval_data/ *.egg-info/ .env *.log -results \ No newline at end of file +results +.idea \ No newline at end of file diff --git a/SLURM_GUIDE.md b/SLURM_GUIDE.md new file mode 100644 index 00000000..67dbf289 --- /dev/null +++ b/SLURM_GUIDE.md @@ -0,0 +1,341 @@ +# TradingAgents SLURM Cluster Guide + +This guide explains how to run the TradingAgents framework on a SLURM cluster environment. + +## Overview + +The TradingAgents framework has been configured to run efficiently on SLURM clusters with the following features: + +- **Multi-job support**: Single analysis, batch processing, and GPU-accelerated runs +- **Resource management**: Optimized CPU, memory, and GPU allocation +- **Environment isolation**: Python virtual environments and dependency management +- **Result collection**: Structured output and error handling +- **LLM flexibility**: Support for various LLM providers (OpenAI, Anthropic, Ollama, etc.) 
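+
+For orientation, a completed run leaves its decision in a JSON file under
+`results/` (the paths below use a hypothetical symbol, date, and job ID; the
+naming follows the job scripts in this patch):
+
+```bash
+# Inspect the decision written by a finished single-analysis job
+cat results/AAPL/2024-01-15/analysis_results_12345.json
+```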
+ +## Files Created + +| File | Purpose | +| -------------------------- | --------------------------------------------- | +| `slurm_setup.sh` | Environment setup and dependency installation | +| `slurm_single_analysis.sh` | Single stock analysis job | +| `slurm_batch_analysis.sh` | Batch analysis for multiple stocks | +| `slurm_gpu_analysis.sh` | GPU-accelerated analysis with local models | +| `slurm_manager.sh` | Job management and utility script | +| `.env.slurm.template` | Environment configuration template | + +## Quick Start + +### 1. Initial Setup + +```bash +# Make the manager script executable +chmod +x slurm_manager.sh + +# Setup environment and create directories +./slurm_manager.sh setup + +# Submit setup job to install dependencies +./slurm_manager.sh submit-setup +``` + +### 2. Configure Environment + +Edit the `.env` file (created from template) to configure your LLM provider: + +```bash +# For Ollama (local models) +LLM_PROVIDER=ollama +LLM_BACKEND_URL=http://localhost:11434/v1 +DEEP_THINK_LLM=llama3.2 +QUICK_THINK_LLM=llama3.2 + +# For OpenAI +LLM_PROVIDER=openai +OPENAI_API_KEY=your_api_key_here +DEEP_THINK_LLM=gpt-4 +QUICK_THINK_LLM=gpt-3.5-turbo + +# For Anthropic +LLM_PROVIDER=anthropic +ANTHROPIC_API_KEY=your_api_key_here +DEEP_THINK_LLM=claude-3-sonnet-20240229 +QUICK_THINK_LLM=claude-3-haiku-20240307 +``` + +### 3. Submit Jobs + +```bash +# Single stock analysis +./slurm_manager.sh submit-single AAPL + +# Batch analysis (multiple stocks) +./slurm_manager.sh submit-batch + +# GPU-accelerated analysis +./slurm_manager.sh submit-gpu TSLA +``` + +### 4. Monitor Jobs + +```bash +# Check all recent jobs +./slurm_manager.sh status + +# Check specific job +./slurm_manager.sh status 12345 + +# View job output +./slurm_manager.sh output 12345 + +# View job errors +./slurm_manager.sh output 12345 err +``` + +### 5. Collect Results + +```bash +# View results for all symbols +./slurm_manager.sh results + +# View results for specific symbol +./slurm_manager.sh results AAPL + +# View results for specific date +./slurm_manager.sh results AAPL 2024-01-15 +``` + +## Job Types + +### 1. Single Analysis (`slurm_single_analysis.sh`) + +- **Purpose**: Analyze a single stock symbol +- **Resources**: 8 CPUs, 16GB RAM, 4 hours +- **Usage**: Best for focused analysis or testing + +```bash +sbatch slurm_single_analysis.sh SYMBOL DATE +# or +./slurm_manager.sh submit-single SYMBOL DATE +``` + +### 2. Batch Analysis (`slurm_batch_analysis.sh`) + +- **Purpose**: Analyze multiple stocks in parallel +- **Resources**: Array job with up to 5 concurrent tasks +- **Default symbols**: SPY, QQQ, AAPL, MSFT, GOOGL, AMZN, TSLA, NVDA, META, NFLX +- **Usage**: Efficient for portfolio-wide analysis + +```bash +sbatch slurm_batch_analysis.sh +# or +./slurm_manager.sh submit-batch +``` + +### 3. 
GPU Analysis (`slurm_gpu_analysis.sh`) + +- **Purpose**: GPU-accelerated analysis with local models +- **Resources**: 1 GPU, 8 CPUs, 32GB RAM, 8 hours +- **Usage**: Best for Ollama or other local LLM providers + +```bash +sbatch slurm_gpu_analysis.sh SYMBOL DATE +# or +./slurm_manager.sh submit-gpu SYMBOL DATE +``` + +## Resource Requirements + +### Minimum Requirements + +- **CPU Jobs**: 4-8 cores, 8-16GB RAM +- **GPU Jobs**: 1 GPU, 8 cores, 32GB RAM +- **Storage**: ~1GB for dependencies, variable for results/cache + +### Recommended Partitions + +- **CPU Partition**: For most analysis jobs +- **GPU Partition**: For local LLM acceleration +- **High-Memory Partition**: For large-scale batch processing + +## LLM Provider Configuration + +### Ollama (Recommended for Clusters) + +- Runs locally on compute nodes +- No external API dependencies +- GPU acceleration support +- Models: llama3.2, mistral, etc. + +### OpenAI/OpenRouter + +- Requires API key and internet access +- Fast inference +- Usage costs apply +- Models: gpt-4, gpt-3.5-turbo, etc. + +### Anthropic + +- Requires API key and internet access +- High-quality reasoning +- Usage costs apply +- Models: claude-3-sonnet, claude-3-haiku + +## File Structure + +``` +TradingAgents/ +├── slurm_*.sh # SLURM job scripts +├── slurm_manager.sh # Job management utility +├── .env # Environment configuration +├── logs/ # Job output and error logs +├── results/ # Analysis results by symbol/date +├── venv/ # Python virtual environment +└── data_cache/ # Cached market data +``` + +## Error Handling and Exit Behavior + +### **Automatic Script Exit** + +✅ **Yes, scripts will exit automatically on failures** with the following behavior: + +#### **1. Bash Script Level** + +- **`set -euo pipefail`**: Scripts exit immediately on any command failure +- **`-e`**: Exit on any non-zero exit status +- **`-u`**: Exit on undefined variables +- **`-o pipefail`**: Exit if any command in a pipeline fails + +#### **2. Python Script Level** + +- **Exception handling**: All Python errors are caught and logged +- **Explicit exit**: `sys.exit(1)` on any analysis failure +- **Error logging**: Failures are saved to JSON files for debugging + +#### **3. SLURM Level** + +- **Job cancellation**: Failed jobs are marked as FAILED in SLURM +- **Resource cleanup**: Allocated resources are automatically released +- **Log preservation**: Output and error logs are saved for investigation + +### **What Happens on Failure** + +1. **Immediate termination** of the failing script +2. **Error information saved** to `results/[SYMBOL]/[DATE]/error_[JOB_ID].json` +3. **SLURM job status** set to FAILED +4. **Exit code 1** returned to SLURM scheduler +5. **Resources released** back to the cluster + +## Troubleshooting + +### Common Issues + +1. **Job Fails to Start** + + - Check SLURM partition availability: `sinfo` + - Verify resource requirements match cluster limits + - Ensure environment setup job completed successfully + +2. **Python Dependencies Missing** + + - Run setup job: `./slurm_manager.sh submit-setup` + - Check setup job output: `./slurm_manager.sh output SETUP_JOB_ID` + +3. **LLM Connection Issues** + + - Verify API keys in `.env` file + - Check network connectivity for external providers + - For Ollama, ensure GPU resources are available + +4. **Out of Memory Errors** + + - Increase memory allocation in job scripts + - Reduce `max_debate_rounds` in configuration + - Use GPU partition for memory-intensive models + +5. 
**Script Exit Issues** + - Check exit codes: `sacct -j JOB_ID --format=JobID,State,ExitCode` + - Review error logs: `./slurm_manager.sh output JOB_ID err` + - Verify all prerequisites are met before job submission + +### Debugging + +```bash +# Check job status and exit codes +squeue -u $USER +sacct -j JOB_ID --format=JobID,State,ExitCode,Reason + +# View detailed job information +scontrol show job JOB_ID + +# Check node resources +sinfo -N -l + +# View job output in real-time +tail -f logs/trading_JOB_ID.out + +# Check for error files +find results -name "error_*.json" -exec echo "Found error in: {}" \; -exec cat {} \; +``` + +## Customization + +### Modify Stock Lists + +Edit the `SYMBOLS` array in `slurm_batch_analysis.sh`: + +```bash +SYMBOLS=("AAPL" "MSFT" "GOOGL" "AMZN" "TSLA") +``` + +### Adjust Resources + +Modify SLURM directives in job scripts: + +```bash +#SBATCH --cpus-per-task=16 # More CPUs +#SBATCH --mem=64G # More memory +#SBATCH --time=12:00:00 # Longer runtime +``` + +### Configure Analysis Parameters + +Edit the config in Python scripts: + +```python +config["max_debate_rounds"] = 3 # More thorough analysis +config["max_risk_discuss_rounds"] = 3 # More risk assessment +config["online_tools"] = True # Enable web scraping +``` + +## Best Practices + +1. **Start Small**: Test with single analysis before batch jobs +2. **Monitor Resources**: Check CPU/memory usage during jobs +3. **Batch Wisely**: Use array jobs for multiple symbols +4. **Cache Data**: Leverage data caching to reduce API calls +5. **Log Everything**: Review job logs for optimization opportunities +6. **Backup Results**: Copy important results to permanent storage + +## Performance Tips + +1. **Use Local Models**: Ollama reduces API latency and costs +2. **Parallel Processing**: Leverage array jobs for batch analysis +3. **Resource Matching**: Match job resources to actual needs +4. **Data Locality**: Store frequently accessed data on fast storage +5. **Network Optimization**: Use cluster-internal services when possible + +## Security Considerations + +1. **API Keys**: Store sensitive keys in `.env` file, not in scripts +2. **File Permissions**: Ensure job scripts and data have appropriate permissions +3. **Network Access**: Some clusters restrict external API access +4. **Data Privacy**: Be aware of data residency requirements for financial data + +## Support + +For issues specific to: + +- **SLURM**: Consult your cluster documentation or administrator +- **TradingAgents**: Check the main repository issues and documentation +- **LLM Providers**: Refer to respective provider documentation diff --git a/slurm_batch_analysis.sh b/slurm_batch_analysis.sh new file mode 100755 index 00000000..ee5f12d5 --- /dev/null +++ b/slurm_batch_analysis.sh @@ -0,0 +1,140 @@ +#!/bin/bash +#SBATCH --job-name=trading-agents-batch +#SBATCH --output=logs/batch_%A_%a.out +#SBATCH --error=logs/batch_%A_%a.err +#SBATCH --time=06:00:00 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=16G +#SBATCH --partition=cpu +#SBATCH --array=1-10%5 + +# Exit on any error, undefined variable, or pipe failure +set -euo pipefail + +# Batch analysis for multiple stocks +# This script runs trading analysis for multiple symbols in parallel +# The %5 limits to 5 concurrent jobs + +echo "Starting TradingAgents batch analysis..." 
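+# SLURM_ARRAY_JOB_ID is the parent array job's ID (shared by all tasks);
+# SLURM_ARRAY_TASK_ID is this task's index (1-10 here) and selects the symbol below.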
+echo "Job ID: ${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}" +echo "Node: $SLURM_NODELIST" +echo "Started at: $(date)" + +# Load necessary modules +module load python/3.10 + +# Set up environment +WORK_DIR=${SLURM_SUBMIT_DIR} +cd $WORK_DIR + +# Activate virtual environment +source venv/bin/activate + +# Set environment variables +export PYTHONPATH="${WORK_DIR}:${PYTHONPATH}" +export TRADINGAGENTS_RESULTS_DIR="${WORK_DIR}/results" + +# Define array of stocks to analyze +SYMBOLS=("SPY" "QQQ" "AAPL" "MSFT" "GOOGL" "AMZN" "TSLA" "NVDA" "META" "NFLX") + +# Get the symbol for this array task +SYMBOL=${SYMBOLS[$((SLURM_ARRAY_TASK_ID-1))]} +DATE=$(date +%Y-%m-%d) + +echo "Processing symbol: $SYMBOL (Task ${SLURM_ARRAY_TASK_ID})" + +# Create results directory for this symbol +RESULTS_DIR="${WORK_DIR}/results/${SYMBOL}/${DATE}" +mkdir -p "$RESULTS_DIR" + +# Create a custom Python script for this analysis +cat > "batch_analysis_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.py" << EOF +import os +import sys +import json +from datetime import datetime +from tradingagents.graph.trading_graph import TradingAgentsGraph +from tradingagents.default_config import DEFAULT_CONFIG +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +def main(): + symbol = "$SYMBOL" + date = "$DATE" + task_id = "$SLURM_ARRAY_TASK_ID" + + print(f"Batch analysis - Task {task_id}: {symbol} on {date}") + + # Create a custom config for SLURM environment + config = DEFAULT_CONFIG.copy() + + # Adjust for cluster environment + config["results_dir"] = os.getenv("TRADINGAGENTS_RESULTS_DIR", "./results") + config["max_debate_rounds"] = 2 + config["max_risk_discuss_rounds"] = 2 + config["online_tools"] = True + + # Use environment variables for LLM configuration + config["llm_provider"] = os.getenv("LLM_PROVIDER", "ollama") + config["backend_url"] = os.getenv("LLM_BACKEND_URL", "http://localhost:11434/v1") + config["deep_think_llm"] = os.getenv("DEEP_THINK_LLM", "llama3.2") + config["quick_think_llm"] = os.getenv("QUICK_THINK_LLM", "llama3.2") + + try: + # Initialize trading agents + ta = TradingAgentsGraph(debug=True, config=config) + + # Run analysis + print(f"Running trading analysis for {symbol}...") + state, decision = ta.propagate(symbol, date) + + # Save results + results = { + "symbol": symbol, + "date": date, + "decision": decision, + "array_job_id": os.getenv("SLURM_ARRAY_JOB_ID"), + "task_id": task_id, + "node": os.getenv("SLURM_NODELIST"), + "completed_at": datetime.now().isoformat() + } + + output_file = f"$RESULTS_DIR/batch_results_task_{task_id}.json" + with open(output_file, 'w') as f: + json.dump(results, f, indent=2) + + print(f"Analysis completed for {symbol}. Results saved to: {output_file}") + print(f"Decision: {decision}") + + except Exception as e: + print(f"Error during analysis of {symbol}: {str(e)}") + # Save error information + error_info = { + "symbol": symbol, + "date": date, + "error": str(e), + "array_job_id": os.getenv("SLURM_ARRAY_JOB_ID"), + "task_id": task_id, + "failed_at": datetime.now().isoformat() + } + + error_file = f"$RESULTS_DIR/error_task_{task_id}.json" + with open(error_file, 'w') as f: + json.dump(error_info, f, indent=2) + + sys.exit(1) + +if __name__ == "__main__": + main() +EOF + +# Run the analysis +echo "Running Python analysis script for $SYMBOL..." 
+python "batch_analysis_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.py" + +# Clean up temporary script +rm "batch_analysis_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.py" + +echo "Task ${SLURM_ARRAY_TASK_ID} for $SYMBOL completed at: $(date)" diff --git a/slurm_gpu_analysis.sh b/slurm_gpu_analysis.sh new file mode 100755 index 00000000..f7ec315c --- /dev/null +++ b/slurm_gpu_analysis.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --job-name=trading-agents-gpu +#SBATCH --output=logs/gpu_trading_%j.out +#SBATCH --error=logs/gpu_trading_%j.err +#SBATCH --time=08:00:00 +#SBATCH --gpus=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=32G +#SBATCH --partition=gpu + +# Exit on any error, undefined variable, or pipe failure +set -euo pipefail + +# GPU-accelerated analysis using local LLM models +# This script is useful when running with Ollama or other local models that can benefit from GPU acceleration + +echo "Starting TradingAgents GPU analysis..." +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "GPU: $CUDA_VISIBLE_DEVICES" +echo "Started at: $(date)" + +# Parse command line arguments +SYMBOL=${1:-"SPY"} +DATE=${2:-$(date +%Y-%m-%d)} + +echo "Analyzing symbol: $SYMBOL for date: $DATE" + +# Load necessary modules +module load python/3.10 +module load cuda/11.8 + +# Set up environment +WORK_DIR=${SLURM_SUBMIT_DIR} +cd $WORK_DIR + +# Activate virtual environment +source venv/bin/activate + +# Set environment variables +export PYTHONPATH="${WORK_DIR}:${PYTHONPATH}" +export TRADINGAGENTS_RESULTS_DIR="${WORK_DIR}/results" +export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES + +# Set up Ollama if using local models +if [ "$LLM_PROVIDER" == "ollama" ]; then + # Start Ollama server on this node + export OLLAMA_HOST=0.0.0.0:11434 + export OLLAMA_GPU_LAYERS=999 # Use all GPU layers + + # Start Ollama in background + ollama serve & + OLLAMA_PID=$! 
+ + # Wait for Ollama to start + sleep 10 + + # Pull required models if they don't exist + ollama pull llama3.2 || echo "Model llama3.2 already exists or failed to pull" +fi + +# Create results directory for this job +RESULTS_DIR="${WORK_DIR}/results/${SYMBOL}/${DATE}" +mkdir -p "$RESULTS_DIR" + +# Create a custom Python script for GPU analysis +cat > "gpu_analysis_${SLURM_JOB_ID}.py" << EOF +import os +import sys +import json +import torch +from datetime import datetime +from tradingagents.graph.trading_graph import TradingAgentsGraph +from tradingagents.default_config import DEFAULT_CONFIG +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +def main(): + symbol = "$SYMBOL" + date = "$DATE" + + print(f"Starting GPU-accelerated analysis for {symbol} on {date}") + print(f"CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"GPU device: {torch.cuda.get_device_name()}") + print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + + # Create a custom config for GPU SLURM environment + config = DEFAULT_CONFIG.copy() + + # Adjust for GPU cluster environment + config["results_dir"] = os.getenv("TRADINGAGENTS_RESULTS_DIR", "./results") + config["max_debate_rounds"] = 3 # More rounds for thorough analysis + config["max_risk_discuss_rounds"] = 3 + config["online_tools"] = True + + # Configure for GPU-accelerated LLM + config["llm_provider"] = os.getenv("LLM_PROVIDER", "ollama") + config["backend_url"] = os.getenv("LLM_BACKEND_URL", "http://localhost:11434/v1") + config["deep_think_llm"] = os.getenv("DEEP_THINK_LLM", "llama3.2") + config["quick_think_llm"] = os.getenv("QUICK_THINK_LLM", "llama3.2") + + try: + # Initialize trading agents + ta = TradingAgentsGraph(debug=True, config=config) + + # Run analysis + print("Running GPU-accelerated trading analysis...") + state, decision = ta.propagate(symbol, date) + + # Save results + results = { + "symbol": symbol, + "date": date, + "decision": decision, + "job_id": os.getenv("SLURM_JOB_ID"), + "node": os.getenv("SLURM_NODELIST"), + "gpu_used": torch.cuda.is_available(), + "completed_at": datetime.now().isoformat() + } + + output_file = f"$RESULTS_DIR/gpu_analysis_results_{os.getenv('SLURM_JOB_ID')}.json" + with open(output_file, 'w') as f: + json.dump(results, f, indent=2) + + print(f"GPU analysis completed. Results saved to: {output_file}") + print(f"Decision: {decision}") + + except Exception as e: + print(f"Error during GPU analysis: {str(e)}") + # Save error information + error_info = { + "symbol": symbol, + "date": date, + "error": str(e), + "job_id": os.getenv("SLURM_JOB_ID"), + "gpu_available": torch.cuda.is_available(), + "failed_at": datetime.now().isoformat() + } + + error_file = f"$RESULTS_DIR/gpu_error_{os.getenv('SLURM_JOB_ID')}.json" + with open(error_file, 'w') as f: + json.dump(error_info, f, indent=2) + + sys.exit(1) + +if __name__ == "__main__": + main() +EOF + +# Run the analysis +echo "Running GPU Python analysis script..." +python "gpu_analysis_${SLURM_JOB_ID}.py" + +# Clean up +rm "gpu_analysis_${SLURM_JOB_ID}.py" + +# Stop Ollama if we started it +if [ ! 
-z "$OLLAMA_PID" ]; then + kill $OLLAMA_PID +fi + +echo "GPU job completed at: $(date)" diff --git a/slurm_manager.sh b/slurm_manager.sh new file mode 100755 index 00000000..5607ddd8 --- /dev/null +++ b/slurm_manager.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# TradingAgents SLURM Job Management Script +# This script provides convenience functions for managing TradingAgents jobs on SLURM + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Print colored output +print_status() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_header() { + echo -e "${BLUE}=== $1 ===${NC}" +} + +# Check if required files exist +check_prerequisites() { + print_header "Checking Prerequisites" + + local missing_files=() + + if [ ! -f "requirements.txt" ]; then + missing_files+=("requirements.txt") + fi + + if [ ! -f "slurm_setup.sh" ]; then + missing_files+=("slurm_setup.sh") + fi + + if [ ! -f ".env.slurm.template" ]; then + missing_files+=(".env.slurm.template") + fi + + if [ ${#missing_files[@]} -ne 0 ]; then + print_error "Missing required files: ${missing_files[*]}" + return 1 + fi + + print_status "All required files found" + return 0 +} + +# Setup environment +setup_environment() { + print_header "Setting Up Environment" + + # Create necessary directories + mkdir -p logs results data_cache + + # Copy environment template if .env doesn't exist + if [ ! -f ".env" ]; then + cp .env.slurm.template .env + print_status "Created .env file from template. Please customize it for your environment." + fi + + # Make scripts executable + chmod +x slurm_*.sh + + print_status "Environment setup completed" +} + +# Submit setup job +submit_setup() { + print_header "Submitting Setup Job" + + if [ ! -f "slurm_setup.sh" ]; then + print_error "slurm_setup.sh not found" + return 1 + fi + + local job_id=$(sbatch slurm_setup.sh | grep -o '[0-9]*') + print_status "Setup job submitted with ID: $job_id" + echo "$job_id" +} + +# Submit single analysis job +submit_single_analysis() { + local symbol=${1:-"SPY"} + local date=${2:-$(date +%Y-%m-%d)} + + print_header "Submitting Single Analysis Job" + print_status "Symbol: $symbol, Date: $date" + + if [ ! -f "slurm_single_analysis.sh" ]; then + print_error "slurm_single_analysis.sh not found" + return 1 + fi + + local job_id=$(sbatch slurm_single_analysis.sh "$symbol" "$date" | grep -o '[0-9]*') + print_status "Single analysis job submitted with ID: $job_id" + echo "$job_id" +} + +# Submit batch analysis job +submit_batch_analysis() { + print_header "Submitting Batch Analysis Job" + + if [ ! -f "slurm_batch_analysis.sh" ]; then + print_error "slurm_batch_analysis.sh not found" + return 1 + fi + + local job_id=$(sbatch slurm_batch_analysis.sh | grep -o '[0-9]*') + print_status "Batch analysis job submitted with ID: $job_id" + echo "$job_id" +} + +# Submit GPU analysis job +submit_gpu_analysis() { + local symbol=${1:-"SPY"} + local date=${2:-$(date +%Y-%m-%d)} + + print_header "Submitting GPU Analysis Job" + print_status "Symbol: $symbol, Date: $date" + + if [ ! 
-f "slurm_gpu_analysis.sh" ]; then + print_error "slurm_gpu_analysis.sh not found" + return 1 + fi + + local job_id=$(sbatch slurm_gpu_analysis.sh "$symbol" "$date" | grep -o '[0-9]*') + print_status "GPU analysis job submitted with ID: $job_id" + echo "$job_id" +} + +# Check job status +check_job_status() { + local job_id=$1 + + if [ -z "$job_id" ]; then + print_error "Job ID required" + return 1 + fi + + print_header "Job Status for ID: $job_id" + squeue -j "$job_id" --format="%.18i %.9P %.20j %.8u %.8T %.10M %.6D %R" +} + +# Show recent jobs +show_recent_jobs() { + print_header "Recent TradingAgents Jobs" + squeue -u $USER --name=trading-agents* --format="%.18i %.9P %.20j %.8u %.8T %.10M %.6D %R" +} + +# Cancel job +cancel_job() { + local job_id=$1 + + if [ -z "$job_id" ]; then + print_error "Job ID required" + return 1 + fi + + print_header "Cancelling Job: $job_id" + scancel "$job_id" + print_status "Job $job_id cancelled" +} + +# View job output +view_job_output() { + local job_id=$1 + local output_type=${2:-"out"} # "out" or "err" + + if [ -z "$job_id" ]; then + print_error "Job ID required" + return 1 + fi + + local output_file + if [ "$output_type" == "err" ]; then + output_file="logs/trading_${job_id}.err" + else + output_file="logs/trading_${job_id}.out" + fi + + if [ -f "$output_file" ]; then + print_header "Job $job_id Output ($output_type)" + tail -f "$output_file" + else + print_error "Output file not found: $output_file" + fi +} + +# Check for failed jobs and show errors +check_failed_jobs() { + print_header "Checking for Failed Jobs" + + # Get failed jobs from sacct + local failed_jobs=$(sacct -u $USER --name=trading-agents* --state=FAILED --format=JobID,State,ExitCode --noheader --parsable2 | cut -d'|' -f1) + + if [ -z "$failed_jobs" ]; then + print_status "No failed jobs found" + return 0 + fi + + echo "$failed_jobs" | while read -r job_id; do + if [ -n "$job_id" ]; then + print_warning "Failed job: $job_id" + + # Look for error files + local error_files=$(find results -name "error_${job_id}.json" 2>/dev/null) + if [ -n "$error_files" ]; then + echo "$error_files" | while read -r error_file; do + echo "Error details from: $error_file" + if command -v jq >/dev/null 2>&1; then + jq '.' 
"$error_file" 2>/dev/null || cat "$error_file" + else + cat "$error_file" + fi + done + else + echo "No error details found for job $job_id" + fi + echo "" + fi + done +} + local symbol=${1:-"*"} + local date=${2:-$(date +%Y-%m-%d)} + + print_header "Collecting Results" + print_status "Symbol: $symbol, Date: $date" + + find results -name "*.json" -path "*/$symbol/$date/*" | while read -r file; do + echo "Found result: $file" + if command -v jq >/dev/null 2>&1; then + jq '.decision' "$file" 2>/dev/null || echo " (Could not parse decision)" + fi + done +} + +# Main function +main() { + case "$1" in + "setup") + check_prerequisites && setup_environment + ;; + "submit-setup") + submit_setup + ;; + "submit-single") + submit_single_analysis "$2" "$3" + ;; + "submit-batch") + submit_batch_analysis + ;; + "submit-gpu") + submit_gpu_analysis "$2" "$3" + ;; + "status") + if [ -n "$2" ]; then + check_job_status "$2" + else + show_recent_jobs + fi + ;; + "cancel") + cancel_job "$2" + ;; + "output") + view_job_output "$2" "$3" + ;; + "results") + collect_results "$2" "$3" + ;; + "check-failed") + check_failed_jobs + ;; + "help"|"--help"|"-h"|"") + cat << EOF +TradingAgents SLURM Job Manager + +Usage: $0 [arguments] + +Commands: + setup - Setup environment and create necessary directories + submit-setup - Submit environment setup job + submit-single [SYM] [DATE] - Submit single analysis job (default: SPY, today) + submit-batch - Submit batch analysis job for multiple symbols + submit-gpu [SYM] [DATE] - Submit GPU-accelerated analysis job + status [JOB_ID] - Show job status (specific job or all recent jobs) + cancel - Cancel a specific job + output [err] - View job output (stdout or stderr) + results [SYM] [DATE] - Collect and display results + check-failed - Check for failed jobs and show error details + help - Show this help message + +Examples: + $0 setup # Initial setup + $0 submit-single AAPL # Analyze AAPL for today + $0 submit-batch # Analyze multiple stocks + $0 status 12345 # Check status of job 12345 + $0 output 12345 # View output of job 12345 + $0 results AAPL # Show results for AAPL + +EOF + ;; + *) + print_error "Unknown command: $1" + print_status "Use '$0 help' for usage information" + exit 1 + ;; + esac +} + +# Run main function with all arguments +main "$@" diff --git a/slurm_setup.sh b/slurm_setup.sh new file mode 100755 index 00000000..add09fe2 --- /dev/null +++ b/slurm_setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#SBATCH --job-name=trading-agents-setup +#SBATCH --output=setup_%j.out +#SBATCH --error=setup_%j.err +#SBATCH --time=02:00:00 +#SBATCH --cpus-per-task=4 +#SBATCH --mem=8G +#SBATCH --partition=cpu + +# Exit on any error, undefined variable, or pipe failure +set -euo pipefail + +# TradingAgents SLURM Setup Script +# This script sets up the environment for running TradingAgents on a SLURM cluster + +echo "Setting up TradingAgents environment on SLURM cluster..." +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Started at: $(date)" + +# Load necessary modules (adjust based on your cluster's available modules) +module load python/3.10 +module load git + +# Set up working directory +WORK_DIR=${SLURM_SUBMIT_DIR} +cd $WORK_DIR + +# Create Python virtual environment if it doesn't exist +if [ ! -d "venv" ]; then + echo "Creating Python virtual environment..." + python -m venv venv +fi + +# Activate virtual environment +source venv/bin/activate + +# Upgrade pip +pip install --upgrade pip + +# Install dependencies +echo "Installing Python dependencies..." 
+pip install -r requirements.txt
+
+# Create necessary directories
+mkdir -p results
+mkdir -p logs
+mkdir -p data_cache
+
+# Set environment variables (PYTHONPATH may be unset; guarded for set -u)
+export PYTHONPATH="${WORK_DIR}:${PYTHONPATH:-}"
+
+echo "Environment setup completed at: $(date)"
diff --git a/slurm_single_analysis.sh b/slurm_single_analysis.sh
new file mode 100755
index 00000000..8d72aa60
--- /dev/null
+++ b/slurm_single_analysis.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#SBATCH --job-name=trading-agents-single
+#SBATCH --output=logs/trading_%j.out
+#SBATCH --error=logs/trading_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=16G
+#SBATCH --partition=cpu
+
+# Exit on any error, undefined variable, or pipe failure
+set -euo pipefail
+
+# Single stock analysis job
+# Usage: sbatch slurm_single_analysis.sh SYMBOL DATE
+
+echo "Starting TradingAgents single analysis..."
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "Started at: $(date)"
+
+# Parse command line arguments
+SYMBOL=${1:-"SPY"}
+DATE=${2:-$(date +%Y-%m-%d)}
+
+echo "Analyzing symbol: $SYMBOL for date: $DATE"
+
+# Load necessary modules
+if ! module load python/3.10; then
+    echo "ERROR: Failed to load Python module"
+    exit 1
+fi
+
+# Set up environment
+WORK_DIR=${SLURM_SUBMIT_DIR}
+cd "$WORK_DIR" || { echo "ERROR: Cannot access work directory $WORK_DIR"; exit 1; }
+
+# Activate virtual environment
+if [ ! -f "venv/bin/activate" ]; then
+    echo "ERROR: Virtual environment not found. Run setup first."
+    exit 1
+fi
+
+if ! source venv/bin/activate; then
+    echo "ERROR: Failed to activate virtual environment"
+    exit 1
+fi
+
+# Set environment variables (PYTHONPATH may be unset; guarded for set -u)
+export PYTHONPATH="${WORK_DIR}:${PYTHONPATH:-}"
+export TRADINGAGENTS_RESULTS_DIR="${WORK_DIR}/results"
+
+# Set SLURM-specific configurations
+export SLURM_JOB_MODE=true
+export SLURM_CPUS_AVAILABLE=$SLURM_CPUS_PER_TASK
+
+# Create results directory for this job
+RESULTS_DIR="${WORK_DIR}/results/${SYMBOL}/${DATE}"
+if ! 
mkdir -p "$RESULTS_DIR"; then + echo "ERROR: Failed to create results directory: $RESULTS_DIR" + exit 1 +fi + +# Create a custom Python script for this analysis +cat > "slurm_analysis_${SLURM_JOB_ID}.py" << EOF +import os +import sys +import json +from datetime import datetime +from tradingagents.graph.trading_graph import TradingAgentsGraph +from tradingagents.default_config import DEFAULT_CONFIG +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +def main(): + symbol = "$SYMBOL" + date = "$DATE" + + print(f"Starting analysis for {symbol} on {date}") + + # Create a custom config for SLURM environment + config = DEFAULT_CONFIG.copy() + + # Adjust for cluster environment + config["results_dir"] = os.getenv("TRADINGAGENTS_RESULTS_DIR", "./results") + config["max_debate_rounds"] = 2 # Increase for more thorough analysis + config["max_risk_discuss_rounds"] = 2 + config["online_tools"] = True + + # Use environment variables for LLM configuration + config["llm_provider"] = os.getenv("LLM_PROVIDER", "ollama") + config["backend_url"] = os.getenv("LLM_BACKEND_URL", "http://localhost:11434/v1") + config["deep_think_llm"] = os.getenv("DEEP_THINK_LLM", "llama3.2") + config["quick_think_llm"] = os.getenv("QUICK_THINK_LLM", "llama3.2") + + try: + # Initialize trading agents + ta = TradingAgentsGraph(debug=True, config=config) + + # Run analysis + print("Running trading analysis...") + state, decision = ta.propagate(symbol, date) + + # Save results + results = { + "symbol": symbol, + "date": date, + "decision": decision, + "job_id": os.getenv("SLURM_JOB_ID"), + "node": os.getenv("SLURM_NODELIST"), + "completed_at": datetime.now().isoformat() + } + + output_file = f"$RESULTS_DIR/analysis_results_{os.getenv('SLURM_JOB_ID')}.json" + with open(output_file, 'w') as f: + json.dump(results, f, indent=2) + + print(f"Analysis completed. Results saved to: {output_file}") + print(f"Decision: {decision}") + + except Exception as e: + print(f"Error during analysis: {str(e)}") + # Save error information + error_info = { + "symbol": symbol, + "date": date, + "error": str(e), + "job_id": os.getenv("SLURM_JOB_ID"), + "failed_at": datetime.now().isoformat() + } + + error_file = f"$RESULTS_DIR/error_{os.getenv('SLURM_JOB_ID')}.json" + with open(error_file, 'w') as f: + json.dump(error_info, f, indent=2) + + sys.exit(1) + +if __name__ == "__main__": + main() +EOF + +# Run the analysis +echo "Running Python analysis script..." +if ! python "slurm_analysis_${SLURM_JOB_ID}.py"; then + echo "ERROR: Python analysis script failed" + rm -f "slurm_analysis_${SLURM_JOB_ID}.py" + exit 1 +fi + +# Clean up temporary script +rm "slurm_analysis_${SLURM_JOB_ID}.py" + +echo "Job completed successfully at: $(date)"