Modified evaluation scripts

This commit is contained in:
Quanliang Liu 2025-10-31 10:50:11 -05:00
parent 13b826a31d
commit b512027574
8 changed files with 1215 additions and 0 deletions

1
.gitignore vendored
View File

@ -9,3 +9,4 @@ eval_results/
eval_data/
*.egg-info/
.env
.history/

66
evaluation/__init__.py Normal file
View File

@ -0,0 +1,66 @@
from .baseline_strategies import (
BuyAndHoldStrategy,
MACDStrategy,
KDJRSIStrategy,
ZMRStrategy,
SMAStrategy,
get_all_baseline_strategies
)
from .metrics import (
calculate_cumulative_return,
calculate_annualized_return,
calculate_sharpe_ratio,
calculate_maximum_drawdown,
calculate_all_metrics,
create_comparison_table
)
from .backtest import (
BacktestEngine,
TradingAgentsBacktester,
load_stock_data
)
from .visualize import (
plot_cumulative_returns,
plot_transaction_history,
plot_metrics_comparison,
plot_drawdown,
create_summary_report
)
from .run_evaluation import run_evaluation
# Public API of the package: every name listed here is imported above.
# NOTE(review): `standardize_single_ticker` is imported from
# evaluation.backtest by run_evaluation but is not re-exported here —
# confirm whether that omission is intentional.
__all__ = [
# Strategies
'BuyAndHoldStrategy',
'MACDStrategy',
'KDJRSIStrategy',
'ZMRStrategy',
'SMAStrategy',
'get_all_baseline_strategies',
# Metrics
'calculate_cumulative_return',
'calculate_annualized_return',
'calculate_sharpe_ratio',
'calculate_maximum_drawdown',
'calculate_all_metrics',
'create_comparison_table',
# Backtesting
'BacktestEngine',
'TradingAgentsBacktester',
'load_stock_data',
# Visualization
'plot_cumulative_returns',
'plot_transaction_history',
'plot_metrics_comparison',
'plot_drawdown',
'create_summary_report',
# Main evaluation
'run_evaluation',
]

221
evaluation/backtest.py Normal file
View File

@ -0,0 +1,221 @@
"""
Backtesting engine for TradingAgents and baseline strategies.
"""
import pandas as pd
from typing import Dict, List
from pathlib import Path
import json
# Column labels treated as OHLCV fields; standardize_single_ticker uses this
# set to decide which MultiIndex column level holds the field names.
STD_FIELDS = {"Open", "High", "Low", "Close", "Adj Close", "Volume"}
class TradingAgentsBacktester:
    """Backtest engine for TradingAgents framework.

    Asks the agent graph for one decision per row of the price data and
    simulates a long/flat, all-in portfolio from those decisions.
    """

    def __init__(self, trading_agents_graph, initial_capital=100000):
        """Store the decision source and the starting cash.

        Args:
            trading_agents_graph: Object exposing ``propagate(ticker, date_str)``
                that returns a ``(final_state, decision)`` pair.
            initial_capital: Starting cash; coerced to ``float``.
        """
        self.graph = trading_agents_graph
        self.initial_capital = float(initial_capital)
        self.name = "TradingAgents"

    def backtest(self, ticker: str, start_date: str, end_date: str, data: pd.DataFrame) -> pd.DataFrame:
        """Simulate the strategy over ``data.loc[start_date:end_date]``.

        Each row's decision is executed at that row's ``Close``: BUY opens a
        long with all cash when not already long, SELL closes the long back
        to cash, anything else keeps the current stance. A decision that
        raises is logged as ``"ERROR"`` and treated as HOLD.

        Args:
            ticker: Symbol passed to the graph and used in the log path.
            start_date: First label of the window (inclusive label slice).
            end_date: Last label of the window (inclusive label slice).
            data: Price frame with a ``Close`` column and an index whose
                labels support ``strftime``; ``Volume`` is copied if present.

        Returns:
            DataFrame indexed like the window with columns ``close``,
            optional ``Volume``, ``signal``, ``position``, ``cash``,
            ``shares``, ``portfolio_value``, ``market_return``,
            ``portfolio_return``, ``strategy_return`` and
            ``cumulative_return``.
        """
        # Restrict to window
        df = data.loc[start_date:end_date].copy()
        n_days = len(df)

        decisions: List[Dict] = []
        signals: List[int] = []
        positions: List[float] = []
        cash_track: List[float] = []
        share_track: List[float] = []
        value_track: List[float] = []

        print(f"\nRunning TradingAgents backtest on {ticker} from {start_date} to {end_date}")
        print(f"Total trading days: {n_days}")
        print("-" * 80)

        # Running state; carried forward instead of re-read from the frame.
        cash, shares, position = self.initial_capital, 0.0, 0.0
        for i, (date, row) in enumerate(df.iterrows()):
            date_str = date.strftime("%Y-%m-%d")
            price = float(row["Close"])
            # Get decision
            try:
                print(f"\n[{i+1}/{n_days}] {date_str} ... ", end="")
                _final_state, decision = self.graph.propagate(ticker, date_str)
                print(f"Decision: {decision}")
                signal = self._parse_decision(decision)
                decisions.append({"date": date_str, "decision": decision, "signal": signal, "price": price})
            except Exception as e:
                print(f"Error: {e}")
                signal = 0
                decisions.append({"date": date_str, "decision": "ERROR", "signal": 0, "price": price, "error": str(e)})

            # A trade needs a usable quote. Previously a BUY at a
            # non-positive or NaN price zeroed both cash and shares,
            # destroying the whole portfolio; now the trade is skipped.
            tradable = price > 0
            if signal == 1 and position <= 0:
                if tradable:
                    # Go long full notional
                    shares = cash / price
                    cash = 0.0
                    position = 1.0
            elif signal == -1 and position > 0:
                if tradable:
                    # Exit long to cash (no shorting here)
                    cash = shares * price
                    shares = 0.0
                    position = 0.0
            # Otherwise: hold current stance.

            signals.append(signal)
            positions.append(position)
            cash_track.append(cash)
            share_track.append(shares)
            value_track.append(cash + shares * price)

        portfolio = pd.DataFrame(index=df.index)
        portfolio["close"] = df["Close"]
        if "Volume" in df.columns:
            portfolio["Volume"] = df["Volume"]
        # Assign positionally so duplicate index labels cannot smear one
        # day's state over several rows.
        portfolio["signal"] = pd.Series(signals, index=df.index, dtype="int64")
        portfolio["position"] = pd.Series(positions, index=df.index, dtype="float64")
        portfolio["cash"] = pd.Series(cash_track, index=df.index, dtype="float64")
        portfolio["shares"] = pd.Series(share_track, index=df.index, dtype="float64")
        portfolio["portfolio_value"] = pd.Series(value_track, index=df.index, dtype="float64")

        # Returns
        portfolio["market_return"] = portfolio["close"].pct_change().fillna(0.0)
        portfolio["portfolio_return"] = portfolio["portfolio_value"].pct_change().fillna(0.0)
        portfolio["strategy_return"] = portfolio["portfolio_return"]
        portfolio["cumulative_return"] = (1.0 + portfolio["strategy_return"]).cumprod()

        self._save_decisions_log(ticker, decisions, start_date, end_date)
        return portfolio

    def _parse_decision(self, decision: str) -> int:
        """Map a free-text decision to a trading signal.

        Matching is by case-insensitive substring, checked in this order:
        - contains 'BUY' or 'LONG'            -> 1
        - contains 'SELL', 'EXIT' or 'CLOSE'  -> -1 ('close to cash' here)
        - otherwise (HOLD)                    -> 0

        NOTE(review): substring matching also fires on words such as
        'PROLONGED' or 'DISCLOSE', and BUY wins when both kinds appear.
        """
        text = str(decision).upper()
        if "BUY" in text or "LONG" in text:
            return 1
        if "SELL" in text or "EXIT" in text or "CLOSE" in text:
            return -1
        return 0

    def _save_decisions_log(self, ticker: str, decisions: List[Dict], start_date: str, end_date: str):
        """Write the per-day decision records as JSON under ``eval_results/``."""
        out = Path(f"eval_results/{ticker}/TradingAgents_backtest")
        out.mkdir(parents=True, exist_ok=True)
        fp = out / f"decisions_{start_date}_to_{end_date}.json"
        with open(fp, "w", encoding="utf-8") as f:
            # default=str is deliberate: this is a diagnostic log, and a
            # non-JSON decision object must not abort a finished backtest.
            json.dump(decisions, f, indent=2, default=str)
        print(f"\nDecisions log saved to: {fp}")
class BacktestEngine:
    """Engine to run and compare multiple strategies on one price history."""

    def __init__(self, data: pd.DataFrame, initial_capital: float = 100000):
        """Keep the shared price data and an empty result registry."""
        self.data = data
        self.initial_capital = float(initial_capital)
        self.results: Dict[str, pd.DataFrame] = {}

    def run_strategy(self, strategy, start_date: str = None, end_date: str = None, label=None) -> pd.DataFrame:
        """Backtest one strategy and record its portfolio.

        Args:
            strategy: Object with a ``name`` attribute and a
                ``backtest(data)`` method.
            start_date: Optional lower bound of the label slice.
            end_date: Optional upper bound of the label slice.
            label: Key to store the result under; defaults to
                ``strategy.name``.

        Returns:
            The portfolio returned by ``strategy.backtest``.
        """
        # Falsy bounds (None or "") mean "open-ended". Previously a window
        # with only one bound set was silently ignored altogether.
        start, end = start_date or None, end_date or None
        if start is None and end is None:
            data_filtered = self.data
        else:
            data_filtered = self.data.loc[start:end]
        print(f"\nRunning {strategy.name}...")
        portfolio = strategy.backtest(data_filtered)
        self.results[label or strategy.name] = portfolio
        return portfolio

    def run_all_strategies(self, strategies: Dict, start_date: str = None, end_date: str = None):
        """Run every strategy in the mapping, reporting failures without raising.

        Results are stored under each strategy's ``name`` attribute, not
        under the mapping key; the key is only used in progress messages.
        """
        for name, strategy in strategies.items():
            try:
                self.run_strategy(strategy, start_date, end_date)
                print(f"{name} completed")
            except Exception as e:
                # Best effort: one failing baseline must not stop the rest.
                print(f"{name} failed: {e}")

    def get_results(self) -> Dict[str, pd.DataFrame]:
        """Return the mapping of result label to portfolio DataFrame."""
        return self.results
def load_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Download price history for one ticker through yfinance.

    A list/tuple made only of single-character strings is joined back into
    one symbol before the download. Any failure is printed and re-raised.

    Raises:
        ValueError: If the ticker is not a string after normalisation, or
            the download returns an empty frame.
    """
    try:
        import yfinance as yf

        # Normalize accidental ('A','A','P','L') / ['A','A','P','L']
        exploded = isinstance(ticker, (list, tuple)) and all(
            isinstance(ch, str) and len(ch) == 1 for ch in ticker
        )
        symbol = "".join(ticker) if exploded else ticker
        if not isinstance(symbol, str):
            raise ValueError("Pass a single ticker symbol as a string, e.g., 'AAPL'.")

        frame = yf.download(symbol, start=start_date, end=end_date, progress=False)
        if frame.empty:
            raise ValueError(f"No data found for {symbol}")
        return frame
    except Exception as exc:
        print(f"Error loading data: {exc}")
        raise
def standardize_single_ticker(df: pd.DataFrame, ticker: str | None = None) -> pd.DataFrame:
    """Return a single-ticker OHLCV DataFrame with simple columns.

    Works with yfinance single or multi-ticker outputs.

    Args:
        df: Price frame whose columns are either flat field names or a
            two-level MultiIndex of (field, ticker) in either order.
        ticker: Ticker to select from MultiIndex columns. May be omitted
            when exactly one ticker is present. Matching is exact first,
            then case-insensitive.

    Returns:
        A copy of the data for one ticker that contains at least ``Open``,
        ``High``, ``Low`` and ``Close``.

    Raises:
        ValueError: If the field level cannot be detected, the ticker is
            missing or ambiguous, or required columns are absent.
    """
    df = df.copy()
    # If columns are MultiIndex (common with multi-ticker yfinance)
    if isinstance(df.columns, pd.MultiIndex):
        # Figure out which level is the field (Open/High/...) and which is ticker
        lvl0 = set(map(str, df.columns.get_level_values(0)))
        lvl1 = set(map(str, df.columns.get_level_values(1)))
        if STD_FIELDS & lvl0:
            ticker_level = 1
        elif STD_FIELDS & lvl1:
            ticker_level = 0
        else:
            raise ValueError("Cannot detect OHLCV field level in MultiIndex columns.")
        available = list(pd.Index(df.columns.get_level_values(ticker_level)).unique())

        # Normalize weird ticker inputs like ('A','A','P','L') -> 'AAPL'
        if isinstance(ticker, (list, tuple)) and all(isinstance(c, str) and len(c) == 1 for c in ticker):
            ticker = "".join(ticker)
        if ticker is None:
            if len(available) != 1:
                raise ValueError(f"Multi-ticker DataFrame. Pick one with ticker=..., available={available}")
            ticker = available[0]

        # Resolve the request to the actual column label. An exact string
        # match wins; otherwise accept one unambiguous case-insensitive
        # match, so 'aapl' finds an 'AAPL' column (presumably the form the
        # downloader emits — verify against the data source).
        wanted = str(ticker)
        matches = [lab for lab in available if str(lab) == wanted]
        if not matches:
            folded = wanted.casefold()
            loose = [lab for lab in available if str(lab).casefold() == folded]
            if len(loose) == 1:
                matches = loose
        if not matches:
            raise ValueError(f"Ticker {ticker!r} not in columns. Available: {available}")

        # Slice with the real label (not the raw request) and drop the ticker level
        df = df.xs(matches[0], axis=1, level=ticker_level)

    # Map Adj Close -> Close if Close missing
    if "Close" not in df.columns and "Adj Close" in df.columns:
        df = df.rename(columns={"Adj Close": "Close"})

    # Final sanity
    req = ["Open", "High", "Low", "Close"]
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"Data missing columns: {missing}")

    # Ensure 'Close' is a Series (not 1-col DataFrame)
    close = df["Close"]
    if isinstance(close, pd.DataFrame) and close.shape[1] == 1:
        df["Close"] = close.iloc[:, 0]
    return df

View File

@ -0,0 +1,198 @@
"""
Baseline trading strategies for comparison.
Implements: Buy&Hold, MACD, KDJ+RSI, ZMR, SMA
"""
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
class BaseStrategy(ABC):
    """Common backtest machinery shared by all baseline strategies.

    Subclasses only implement ``generate_signals``; ``backtest`` converts
    the signals into positions, returns and a portfolio value series.
    """

    def __init__(self, initial_capital=100000):
        self.initial_capital = float(initial_capital)
        self.name = type(self).__name__

    def _close_series(self, data: pd.DataFrame) -> pd.Series:
        """Return ``Close`` as a numeric Series, unwrapping a 1-column frame."""
        column = data["Close"]
        if isinstance(column, pd.DataFrame):
            if column.shape[1] != 1:
                raise ValueError("Multiple 'Close' columns detected. Pass single-ticker data.")
            column = column.iloc[:, 0]
        return pd.to_numeric(column, errors="coerce")

    @abstractmethod
    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Return one signal per date: 1 long, -1 short, 0 no new signal."""

    def _prep_ohlcv(self, data: pd.DataFrame) -> pd.DataFrame:
        """Validate that OHLC columns exist and return a copy of the data."""
        absent = next(
            (name for name in ("Open", "High", "Low", "Close") if name not in data.columns),
            None,
        )
        if absent is not None:
            raise ValueError(f"Data missing column '{absent}'")
        return data.copy()

    def backtest(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the strategy over ``data`` and return its portfolio frame.

        Signals are clipped to [-1, 1]; a 0 signal keeps the previous
        non-zero position. The position held on one bar earns the next
        bar's close-to-close return.
        """
        frame = self._prep_ohlcv(data)

        raw = self.generate_signals(frame).astype(float)
        signals = raw.clip(lower=-1, upper=1).reindex(frame.index).fillna(0)
        # ONE place for hold semantics (Option A: 0 = no new signal)
        held = signals.replace(0, np.nan).ffill().fillna(0)

        prices = self._close_series(frame)
        market = prices.pct_change().fillna(0.0)
        # Yesterday's position is what is exposed to today's move.
        realised = (held.shift(1).fillna(0.0) * market).astype(float)
        growth = (1.0 + realised).cumprod()

        portfolio = pd.DataFrame(index=frame.index)
        portfolio["signal"] = signals
        portfolio["position"] = held
        portfolio["close"] = prices
        if "Volume" in frame.columns:
            volume = frame["Volume"]
            if isinstance(volume, pd.DataFrame) and volume.shape[1] == 1:
                volume = volume.iloc[:, 0]
            if isinstance(volume, pd.Series):
                portfolio["Volume"] = volume
        portfolio["market_return"] = market
        portfolio["strategy_return"] = realised
        portfolio["cumulative_return"] = growth
        portfolio["portfolio_value"] = self.initial_capital * growth
        portfolio["trade"] = portfolio["position"].diff().fillna(0.0)
        return portfolio
class BuyAndHoldStrategy(BaseStrategy):
    """Stay long for the entire period; never short, never exit."""

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Emit a constant long signal (1.0) for every date in ``data``."""
        return pd.Series(1.0, index=data.index)
class MACDStrategy(BaseStrategy):
    """
    MACD Strategy.
    Long when MACD > signal, Short when MACD < signal.
    """

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Return +1 / -1 / 0 from the sign of (MACD - signal line).

        Uses existing ``macd`` / ``macds`` columns when both are present,
        otherwise computes them from ``Close``.
        """
        df = data.copy()
        if "macd" not in df.columns or "macds" not in df.columns:
            df = self._calculate_macd(df)
        macd_diff = (df["macd"] - df["macds"]).fillna(0.0)
        sig = pd.Series(0.0, index=df.index)
        sig[macd_diff > 0] = 1.0
        sig[macd_diff < 0] = -1.0
        return sig

    def _calculate_macd(self, data: pd.DataFrame, fast=12, slow=26, signal=9):
        """Add ``macd``, ``macds`` and ``macdh`` columns to ``data`` in place.

        Args:
            data: Frame with a ``Close`` column; mutated and returned.
            fast: Span of the fast EMA.
            slow: Span of the slow EMA.
            signal: Span of the EMA applied to the MACD line.
        """
        # Go through the shared helper, as the other strategies do, so a
        # 1-column 'Close' frame or non-numeric values are handled the
        # same way everywhere.
        close = self._close_series(data)
        exp1 = close.ewm(span=fast, adjust=False).mean()
        exp2 = close.ewm(span=slow, adjust=False).mean()
        macd = exp1 - exp2
        macds = macd.ewm(span=signal, adjust=False).mean()
        data["macd"] = macd
        data["macds"] = macds
        data["macdh"] = macd - macds
        return data
class KDJRSIStrategy(BaseStrategy):
    """
    KDJ & RSI Strategy (classic oversold/overbought gating).
    Long when RSI<30 & K<20; Short when RSI>70 & K>80.
    """

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Return +1 when both indicators are oversold, -1 when overbought.

        Uses existing ``rsi`` / ``kdj_k`` columns when present, otherwise
        computes them from the OHLC data.
        """
        df = data.copy()
        if "rsi" not in df.columns:
            df = self._calculate_rsi(df)
        if "kdj_k" not in df.columns:
            df = self._calculate_kdj(df)
        sig = pd.Series(0.0, index=df.index)
        sig[(df["rsi"] < 30) & (df["kdj_k"] < 20)] = 1.0
        sig[(df["rsi"] > 70) & (df["kdj_k"] > 80)] = -1.0
        return sig

    def _calculate_rsi(self, data: pd.DataFrame, period=14):
        """Add an ``rsi`` column to ``data`` in place and return it.

        Gains and losses are smoothed with an EMA of ``alpha = 1/period``.
        """
        delta = data["Close"].diff()
        up = delta.clip(lower=0)
        down = -delta.clip(upper=0)
        roll_up = up.ewm(alpha=1/period, adjust=False).mean()
        roll_down = down.ewm(alpha=1/period, adjust=False).mean()
        # Avoid dividing by a zero average loss...
        rs = roll_up / roll_down.replace(0, np.nan)
        rsi = 100 - (100 / (1 + rs))
        # ...but a zero average loss with positive gains is the fully
        # overbought case: RSI is 100 there, not missing. Leaving it NaN
        # made the short condition unreachable during a pure uptrend.
        # When gains and losses are both zero the value stays NaN.
        data["rsi"] = rsi.mask((roll_down == 0) & (roll_up > 0), 100.0)
        return data

    def _calculate_kdj(self, data: pd.DataFrame, period=9):
        """Add ``kdj_k``, ``kdj_d`` and ``kdj_j`` columns to ``data`` in place."""
        low_min = data["Low"].rolling(window=period, min_periods=period).min()
        high_max = data["High"].rolling(window=period, min_periods=period).max()
        # A flat high/low range would divide by zero; treat it as missing.
        den = (high_max - low_min).replace(0, np.nan)
        rsv = 100 * (data["Close"] - low_min) / den
        k = rsv.ewm(com=2, adjust=False, min_periods=1).mean()
        d = k.ewm(com=2, adjust=False, min_periods=1).mean()
        j = 3 * k - 2 * d
        data["kdj_k"], data["kdj_d"], data["kdj_j"] = k, d, j
        return data
class ZMRStrategy(BaseStrategy):
    """
    Zero-mean reversion on z-score of Close vs rolling mean.
    """

    def __init__(self, initial_capital=100000, lookback=20, threshold=1.0):
        """Configure the rolling window length and the z-score trigger.

        Args:
            initial_capital: Starting cash passed to the base class.
            lookback: Rolling window length in rows.
            threshold: Absolute z-score beyond which a signal is emitted.
        """
        super().__init__(initial_capital)
        self.lookback = int(lookback)
        self.threshold = float(threshold)

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Return +1 when z < -threshold, -1 when z > threshold, else 0."""
        close = self._close_series(data)
        rm = close.rolling(window=self.lookback, min_periods=self.lookback).mean()
        rs = close.rolling(window=self.lookback, min_periods=self.lookback).std()
        # Use np.nan (as the other strategies do) rather than pd.NA:
        # putting pd.NA into a float series can turn it into object dtype,
        # after which the threshold comparisons below may raise. With NaN
        # a flat window simply yields no signal.
        z = (close - rm) / rs.replace(0, np.nan)
        sig = pd.Series(0.0, index=data.index)
        sig[z < -self.threshold] = 1.0
        sig[z > self.threshold] = -1.0
        return sig
class SMAStrategy(BaseStrategy):
    """Moving-average crossover; windows default to 50 and 200 rows."""

    def __init__(self, initial_capital=100000, short_window=50, long_window=200):
        super().__init__(initial_capital)
        self.short_window = int(short_window)
        self.long_window = int(long_window)

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """Return +1 where the short SMA is above the long SMA, -1 where below.

        Rows where either average is not yet available stay at 0.
        """
        prices = self._close_series(data)

        def _sma(window: int) -> pd.Series:
            # Require a full window so early rows are NaN, not partial means.
            return prices.rolling(window=window, min_periods=window).mean()

        fast = _sma(self.short_window)
        slow = _sma(self.long_window)

        target = pd.Series(0.0, index=data.index)
        target[fast > slow] = 1.0
        target[fast < slow] = -1.0
        return target
def get_all_baseline_strategies(initial_capital=100000):
    """Build one instance of every baseline, keyed by its display label.

    Each strategy is constructed with ``initial_capital`` and otherwise
    default parameters.
    """
    catalogue = (
        ("BuyAndHold", BuyAndHoldStrategy),
        ("MACD", MACDStrategy),
        ("KDJ&RSI", KDJRSIStrategy),
        ("ZMR", ZMRStrategy),
        ("SMA", SMAStrategy),
    )
    return {label: strategy_cls(initial_capital) for label, strategy_cls in catalogue}

116
evaluation/metrics.py Normal file
View File

@ -0,0 +1,116 @@
"""
Evaluation metrics for trading strategies.
Implements: Cumulative Return, Annualized Return, Sharpe Ratio, Maximum Drawdown
"""
import pandas as pd
import numpy as np
from typing import Dict
def _require_cols(df: pd.DataFrame, cols):
missing = [c for c in cols if c not in df.columns]
if missing:
raise ValueError(f"Portfolio missing columns: {missing}")
def calculate_cumulative_return(portfolio: pd.DataFrame) -> float:
    """CR% = (V_end / V_start - 1) * 100

    Args:
        portfolio: Frame with a ``portfolio_value`` column.

    Returns:
        Percentage change from the first to the last value. Returns 0.0
        for an empty portfolio or a non-positive starting value.

    Raises:
        ValueError: If ``portfolio_value`` is missing.
    """
    _require_cols(portfolio, ["portfolio_value"])
    values = portfolio["portfolio_value"]
    # An empty window used to raise IndexError on iloc[0]; report a
    # neutral 0.0 like the other degenerate cases in this module.
    if len(values) == 0:
        return 0.0
    v_start = float(values.iloc[0])
    v_end = float(values.iloc[-1])
    if v_start <= 0:
        return 0.0
    return (v_end / v_start - 1.0) * 100.0
def calculate_annualized_return(portfolio: pd.DataFrame, trading_days: int | None = None) -> float:
    """AR% = ((V_end / V_start) ** (1/years) - 1) * 100 with 252 trading days/year.

    Args:
        portfolio: Frame with a ``portfolio_value`` column.
        trading_days: Number of trading days the portfolio spans; defaults
            to the number of rows.

    Returns:
        Annualized percentage return. Returns 0.0 for an empty portfolio,
        a non-positive start or end value, or a non-positive day count.

    Raises:
        ValueError: If ``portfolio_value`` is missing.
    """
    _require_cols(portfolio, ["portfolio_value"])
    values = portfolio["portfolio_value"]
    # An empty window used to raise IndexError on iloc[0].
    if len(values) == 0:
        return 0.0
    v_start = float(values.iloc[0])
    v_end = float(values.iloc[-1])
    if v_start <= 0 or v_end <= 0:
        return 0.0
    if trading_days is None:
        trading_days = len(portfolio)
    years = trading_days / 252.0
    if years <= 0:
        return 0.0
    return ((v_end / v_start) ** (1.0 / years) - 1.0) * 100.0
def calculate_sharpe_ratio(portfolio: pd.DataFrame, risk_free_rate: float = 0.02) -> float:
    """Annualized Sharpe ratio of the daily ``strategy_return`` column.

    The daily mean is scaled by 252 and the sample standard deviation by
    sqrt(252); ``risk_free_rate`` is subtracted from the annualized mean.
    Returns 0.0 when there are fewer than two returns or no variation.
    """
    _require_cols(portfolio, ["strategy_return"])
    daily = portfolio["strategy_return"].dropna().astype(float)

    too_short = len(daily) < 2
    if too_short or daily.std() == 0:
        return 0.0

    annual_vol = daily.std(ddof=1) * np.sqrt(252.0)
    if annual_vol == 0:
        return 0.0

    annual_mean = daily.mean() * 252.0
    return (annual_mean - risk_free_rate) / annual_vol
def calculate_maximum_drawdown(portfolio: pd.DataFrame) -> float:
    """MDD% = max drawdown on portfolio_value (peak->trough) * 100

    Args:
        portfolio: Frame with a ``portfolio_value`` column.

    Returns:
        Largest peak-to-trough decline as a positive percentage. Returns
        0.0 when no drawdown can be computed (empty input, or no positive
        running peak).

    Raises:
        ValueError: If ``portfolio_value`` is missing.
    """
    _require_cols(portfolio, ["portfolio_value"])
    values = portfolio["portfolio_value"].astype(float)
    if values.empty:
        return 0.0
    running_max = values.cummax()
    # A non-positive peak has no meaningful relative drawdown; masking it
    # avoids the inf/NaN that dividing by zero produced before.
    drawdown = (values - running_max) / running_max.where(running_max > 0)
    worst = drawdown.min()
    if pd.isna(worst):
        return 0.0
    return float(worst * -100.0)
def calculate_win_rate(portfolio: pd.DataFrame) -> float:
    """Percentage of non-missing daily ``strategy_return`` values above zero.

    Returns 0.0 when there are no returns.
    """
    _require_cols(portfolio, ["strategy_return"])
    daily = portfolio["strategy_return"].dropna()
    total = len(daily)
    if total == 0:
        return 0.0
    winners = (daily > 0).sum()
    return 100.0 * winners / total
def calculate_profit_factor(portfolio: pd.DataFrame) -> float:
    """Sum of positive daily returns divided by the sum of daily losses.

    With no losses the result is ``inf`` if there were gains, else 0.0.
    """
    _require_cols(portfolio, ["strategy_return"])
    daily = portfolio["strategy_return"].dropna()
    gains = daily[daily > 0].sum()
    losses = -daily[daily < 0].sum()
    if losses != 0:
        return float(gains / losses)
    return float("inf") if gains > 0 else 0.0
def calculate_all_metrics(portfolio: pd.DataFrame, risk_free_rate: float = 0.02) -> Dict[str, float]:
    """Compute every metric in this module for one portfolio.

    Returns a dict keyed by display label; ``risk_free_rate`` is only used
    by the Sharpe ratio.
    """
    headline = {
        "Cumulative Return (%)": calculate_cumulative_return(portfolio),
        "Annualized Return (%)": calculate_annualized_return(portfolio),
        "Sharpe Ratio": calculate_sharpe_ratio(portfolio, risk_free_rate),
        "Maximum Drawdown (%)": calculate_maximum_drawdown(portfolio),
    }
    # Extras (not in table but handy)
    extras = {
        "Win Rate (%)": calculate_win_rate(portfolio),
        "Profit Factor": calculate_profit_factor(portfolio),
    }
    return {**headline, **extras}
def print_metrics(metrics: Dict[str, float], strategy_name: str = "Strategy"):
    """Print a framed metric report for one strategy to stdout.

    Metrics whose label contains "Ratio" or "Factor" are printed as plain
    numbers; every other metric gets a trailing percent sign.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"{strategy_name} Performance Metrics")
    print(rule)
    for label, value in metrics.items():
        line = f"{label:30s}: {value:8.2f}"
        unitless = "Ratio" in label or "Factor" in label
        if not unitless:
            line += "%"
        print(line)
    print(f"{rule}\n")
def create_comparison_table(all_metrics: Dict[str, Dict[str, float]]) -> pd.DataFrame:
    """Build a strategies-by-metrics table rounded to two decimals.

    Rows are sorted by "Sharpe Ratio" in descending order when that column
    exists; otherwise the original row order is kept.
    """
    table = pd.DataFrame(all_metrics).T.round(2)
    if "Sharpe Ratio" not in table.columns:
        return table
    return table.sort_values("Sharpe Ratio", ascending=False)

View File

@ -0,0 +1,209 @@
"""
Main evaluation script to run backtesting and generate results.
Evaluates TradingAgents against baseline strategies for a single ticker.
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluation.baseline_strategies import get_all_baseline_strategies
from evaluation.backtest import BacktestEngine, TradingAgentsBacktester, load_stock_data, standardize_single_ticker
from evaluation.metrics import calculate_all_metrics, create_comparison_table, print_metrics
from evaluation.visualize import create_summary_report
from tradingagents.graph.trading_graph import TradingAgentsGraph
from tradingagents.default_config import DEFAULT_CONFIG
def is_debugging() -> bool:
    """Report whether a debugpy client is attached to this process.

    Any failure, including debugpy not being importable, counts as
    "not debugging".
    """
    try:
        import debugpy
    except Exception:
        return False
    try:
        return debugpy.is_client_connected()
    except Exception:
        return False
def run_evaluation(
    ticker: str,
    start_date: str,
    end_date: str,
    initial_capital: float = 100000,
    include_tradingagents: bool = True,
    output_dir: str = None,
    config: dict = None
):
    """
    Run complete evaluation: baselines + TradingAgents for a single ticker.

    Args:
        ticker: Symbol to download and evaluate.
        start_date: Start of the evaluation window (YYYY-MM-DD).
        end_date: End of the evaluation window (YYYY-MM-DD).
        initial_capital: Starting cash for every strategy.
        include_tradingagents: Also run the TradingAgents backtest.
        output_dir: Where to write results; defaults to a timestamped
            folder under ``eval_results/<ticker>/``.
        config: TradingAgents configuration; ``DEFAULT_CONFIG`` is used
            when omitted. Neither mapping is modified.

    Returns:
        Tuple ``(results, comparison_df)``: the engine's mapping of
        strategy label to portfolio frame, and the metrics table.
    """
    print(f"\n{'='*80}")
    print(f"EVALUATION: {ticker} from {start_date} to {end_date}")
    print(f"Initial Capital: ${initial_capital:,.2f}")
    print(f"{'='*80}\n")

    # Output dir
    if output_dir is None:
        output_dir = f"eval_results/{ticker}/{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Load data
    print("\n" + "="*80)
    print("STEP 1: Loading Stock Data")
    print("="*80)
    data = load_stock_data(ticker, start_date, end_date)
    data = standardize_single_ticker(data, ticker)

    # Backtest engine
    engine = BacktestEngine(data, initial_capital)

    # Baselines
    print("\n" + "="*80)
    print("STEP 2: Running Baseline Strategies")
    print("="*80)
    baselines = get_all_baseline_strategies(initial_capital)
    for name, strategy in baselines.items():
        try:
            print(f"\nRunning {name}...", end=" ")
            # Results are stored under the strategy's own name attribute.
            engine.run_strategy(strategy, start_date, end_date)
            print("✓ Complete")
        except Exception as e:
            # Best effort: one failing baseline must not stop the run.
            print(f"✗ Failed: {e}")

    # TradingAgents
    if include_tradingagents:
        print("\n" + "="*80)
        print("STEP 3: Running TradingAgents")
        print("="*80)
        try:
            cfg = (config or DEFAULT_CONFIG).copy()
            # Fast eval defaults (you can override from CLI)
            cfg.setdefault("deep_think_llm", "gpt-4o-mini")
            cfg.setdefault("quick_think_llm", "gpt-4o-mini")
            cfg.setdefault("max_debate_rounds", 1)
            cfg.setdefault("max_risk_discuss_rounds", 1)
            # Deterministic-ish decoding for reproducibility.
            # The copy above is shallow, so build a fresh nested dict
            # instead of updating the one shared with the caller's config
            # or DEFAULT_CONFIG.
            cfg["llm_params"] = {
                **(cfg.get("llm_params") or {}),
                "temperature": 0,
                "top_p": 1.0,
                "seed": 42,
            }

            print(f"\nInitializing TradingAgents...")
            print(f" Deep Thinking LLM: {cfg['deep_think_llm']}")
            print(f" Quick Thinking LLM: {cfg['quick_think_llm']}")
            print(f" Debate Rounds: {cfg['max_debate_rounds']}")

            graph = TradingAgentsGraph(
                selected_analysts=["market", "social", "news", "fundamentals"],
                debug=False,
                config=cfg
            )
            ta_backtester = TradingAgentsBacktester(graph, initial_capital)
            ta_portfolio = ta_backtester.backtest(ticker, start_date, end_date, data)
            engine.results["TradingAgents"] = ta_portfolio
            print("\n✓ TradingAgents backtest complete")
        except Exception as e:
            # Keep going so the baseline comparison is still produced.
            print(f"\n✗ TradingAgents failed: {e}")
            import traceback
            traceback.print_exc()

    # Metrics
    print("\n" + "="*80)
    print("STEP 4: Calculating Performance Metrics")
    print("="*80)
    all_metrics = {}
    for name, portfolio in engine.results.items():
        metrics = calculate_all_metrics(portfolio)
        all_metrics[name] = metrics
        print_metrics(metrics, name)

    comparison_df = create_comparison_table(all_metrics)
    print("\n" + "="*80)
    print("PERFORMANCE COMPARISON TABLE")
    print("="*80)
    print(comparison_df.to_string())
    print("\n")
    comparison_df.to_csv(out / f"{ticker}_comparison.csv")
    print(f"Comparison table saved to: {out / f'{ticker}_comparison.csv'}")

    # Visuals
    print("\n" + "="*80)
    print("STEP 5: Generating Visualizations")
    print("="*80)
    create_summary_report(ticker, engine.results, comparison_df, output_dir)

    print("\n" + "="*80)
    print("EVALUATION COMPLETE")
    print("="*80)
    print(f"\nResults saved to: {out}")
    print(f" - Comparison table: {ticker}_comparison.csv")
    print(f" - Cumulative returns plot: {ticker}_cumulative_returns.png")
    print(f" - Metrics comparison: {ticker}_metrics_comparison.png")
    if include_tradingagents and "TradingAgents" in engine.results:
        print(f" - Transaction history: {ticker}_TradingAgents_transactions.png")
    print(f" - Drawdown analysis: {ticker}_drawdown.png")
    return engine.results, comparison_df
def main():
    """Command-line entry point.

    When a debugger client is attached, runs a fixed AAPL baseline-only
    evaluation without parsing arguments. Otherwise parses the CLI,
    builds a config from ``DEFAULT_CONFIG`` and calls ``run_evaluation``.
    """
    parser = argparse.ArgumentParser(description="Run TradingAgents evaluation with baseline comparisons")
    parser.add_argument("ticker", type=str, help="Stock ticker symbol (e.g., AAPL)")
    parser.add_argument("--start-date", type=str, required=True, help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, required=True, help="End date (YYYY-MM-DD)")
    parser.add_argument("--capital", type=float, default=100000, help="Initial capital (default: 100000)")
    parser.add_argument("--no-tradingagents", action="store_true", help="Skip TradingAgents")
    parser.add_argument("--output-dir", type=str, default=None, help="Output directory for results")
    parser.add_argument("--deep-llm", type=str, default="gpt-4o-mini", help="Deep thinking LLM model")
    parser.add_argument("--quick-llm", type=str, default="gpt-4o-mini", help="Quick thinking LLM model")
    parser.add_argument("--debate-rounds", type=int, default=1, help="Number of debate rounds (default: 1)")

    # Used for debugging
    if is_debugging():
        config = DEFAULT_CONFIG.copy()
        config.update({
            "deep_think_llm": "gpt-4o-mini",
            "quick_think_llm": "gpt-4o-mini",
            "max_debate_rounds": 1,
            "max_risk_discuss_rounds": 1,
            "llm_params": {"temperature": 0, "top_p": 1.0, "seed": 42},
        })
        run_evaluation(
            ticker="AAPL",
            start_date="2024-01-01",
            end_date="2024-03-30",
            initial_capital=1000,
            include_tradingagents=False,
            output_dir="./evaluation/results",
            config=config
        )
        return

    # Build config
    args = parser.parse_args()
    config = DEFAULT_CONFIG.copy()
    config["deep_think_llm"] = args.deep_llm
    config["quick_think_llm"] = args.quick_llm
    config["max_debate_rounds"] = args.debate_rounds
    config["max_risk_discuss_rounds"] = args.debate_rounds
    # The copy above is shallow: build a new nested dict rather than
    # updating (and thereby mutating) DEFAULT_CONFIG["llm_params"].
    config["llm_params"] = {
        **(config.get("llm_params") or {}),
        "temperature": 0,
        "top_p": 1.0,
        "seed": 42,
    }
    run_evaluation(
        ticker=args.ticker,
        start_date=args.start_date,
        end_date=args.end_date,
        initial_capital=args.capital,
        include_tradingagents=not args.no_tradingagents,
        output_dir=args.output_dir,
        config=config
    )
if __name__ == "__main__":
main()

401
evaluation/visualize.py Normal file
View File

@ -0,0 +1,401 @@
"""
Visualization tools for trading strategy evaluation.
Generates plots and reports for comparing TradingAgents with baseline strategies.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict
import warnings
# NOTE(review): with no category or module filter this silences every
# warning for the whole process as soon as this module is imported.
warnings.filterwarnings('ignore')
# Try to import seaborn for better styling (optional)
# Only ImportError is handled here, so a failure inside plt.style.use or
# sns.set_palette would propagate out of the module import.
try:
import seaborn as sns
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
HAS_SEABORN = True
except ImportError:
HAS_SEABORN = False
# Use default matplotlib styling
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.grid'] = True
def plot_cumulative_returns(
    results: Dict[str, pd.DataFrame],
    ticker: str,
    output_path: str = None,
    figsize: tuple = (14, 8)
) -> plt.Figure:
    """
    Draw one cumulative-return curve per strategy on a shared axis.

    Portfolios without a "cumulative_return" column are skipped. The
    column is treated as a growth factor and shown as a percentage.

    Args:
        results: Dictionary mapping strategy name to portfolio DataFrame
        ticker: Stock ticker symbol (used in the title)
        output_path: Path to save the figure (optional)
        figsize: Figure size (width, height)
    Returns:
        matplotlib Figure object
    """
    fig, ax = plt.subplots(figsize=figsize)

    for strategy_name, frame in results.items():
        if "cumulative_return" not in frame.columns:
            continue
        # Growth factor -> percentage gain
        pct_gain = (frame["cumulative_return"] - 1) * 100
        ax.plot(frame.index, pct_gain, label=strategy_name, linewidth=2, alpha=0.8)

    ax.set_title(f'{ticker} - Cumulative Returns Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12, fontweight='bold')
    ax.set_ylabel('Cumulative Return (%)', fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)
    # Break-even reference line
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
    # Format y-axis as percentage
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.1f}%'))

    plt.tight_layout()
    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Saved cumulative returns plot to: {output_path}")
    return fig
def plot_transaction_history(
    portfolio: pd.DataFrame,
    ticker: str,
    strategy_name: str = "TradingAgents",
    output_path: str = None,
    figsize: tuple = (14, 10)
) -> plt.Figure:
    """
    Show the close price with buy/sell markers above the portfolio value.

    A marker is drawn on the first row of each run of identical signals:
    signal 1 marks a buy, signal -1 marks a sell.

    Args:
        portfolio: Portfolio DataFrame with 'signal', 'close' and
            'portfolio_value' columns
        ticker: Stock ticker symbol
        strategy_name: Name of the strategy
        output_path: Path to save the figure (optional)
        figsize: Figure size (width, height)
    Returns:
        matplotlib Figure object
    """
    fig, (price_ax, value_ax) = plt.subplots(2, 1, figsize=figsize, height_ratios=[2, 1])
    dates = portfolio.index

    # --- Upper panel: price with trade markers ---
    price_ax.plot(dates, portfolio["close"], label='Close Price',
                  color='blue', linewidth=1.5, alpha=0.7)

    signal = portfolio["signal"].copy()
    previous = signal.shift(1)
    markers = (
        ((signal == 1) & (previous != 1), '^', 'green', 'Buy'),
        ((signal == -1) & (previous != -1), 'v', 'red', 'Sell'),
    )
    for mask, shape, colour, legend_label in markers:
        if not mask.any():
            continue
        price_ax.scatter(dates[mask],
                         portfolio.loc[mask, "close"],
                         marker=shape, color=colour, s=100, label=legend_label,
                         zorder=5, alpha=0.8)

    price_ax.set_ylabel('Price ($)', fontsize=12, fontweight='bold')
    price_ax.set_title(f'{ticker} - {strategy_name} Transaction History',
                       fontsize=14, fontweight='bold')
    price_ax.legend(loc='best', fontsize=10)
    price_ax.grid(True, alpha=0.3)

    # --- Lower panel: portfolio value ---
    equity = portfolio["portfolio_value"]
    value_ax.plot(dates, equity, label='Portfolio Value', color='purple', linewidth=2)
    value_ax.fill_between(dates, equity, alpha=0.3, color='purple')
    value_ax.set_xlabel('Date', fontsize=12, fontweight='bold')
    value_ax.set_ylabel('Portfolio Value ($)', fontsize=12, fontweight='bold')
    value_ax.legend(loc='best', fontsize=10)
    value_ax.grid(True, alpha=0.3)
    # Format y-axis as currency
    value_ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'${y:,.0f}'))

    plt.tight_layout()
    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Saved transaction history plot to: {output_path}")
    return fig
def plot_metrics_comparison(
    comparison_df: pd.DataFrame,
    ticker: str,
    output_path: str = None,
    figsize: tuple = (16, 10)
) -> plt.Figure:
    """
    Draw a 2x2 grid of horizontal bar charts, one per headline metric.

    Only metrics present in ``comparison_df`` are drawn; unused panels are
    hidden. The 'TradingAgents' row is highlighted in a different colour.

    Args:
        comparison_df: DataFrame with strategies as rows and metrics as columns
        ticker: Stock ticker symbol
        output_path: Path to save the figure (optional)
        figsize: Figure size (width, height)
    Returns:
        matplotlib Figure object
    Raises:
        ValueError: If none of the headline metrics is a column.
    """
    # Select key metrics (matching paper's Table 1)
    headline = (
        "Cumulative Return (%)",
        "Annualized Return (%)",
        "Sharpe Ratio",
        "Maximum Drawdown (%)",
    )
    shown = [metric for metric in headline if metric in comparison_df.columns]
    if not shown:
        raise ValueError("No matching metrics found in comparison DataFrame")

    fig, grid = plt.subplots(2, 2, figsize=figsize)
    panels = grid.flatten()

    for ax, metric in zip(panels, shown):
        ranked = comparison_df[metric].sort_values(ascending=False)
        slots = range(len(ranked))
        # Color code: TradingAgents in different color
        palette = ['#FF6B6B' if strategy == 'TradingAgents' else '#4ECDC4'
                   for strategy in ranked.index]
        bars = ax.barh(slots, ranked.values, color=palette, alpha=0.8)
        ax.set_yticks(slots)
        ax.set_yticklabels(ranked.index, fontsize=10)
        ax.set_xlabel(metric, fontsize=11, fontweight='bold')
        ax.set_title(metric, fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='x')

        # Add value labels on bars
        is_ratio = "Ratio" in metric
        for bar, value in zip(bars, ranked.values):
            label = f'{value:.2f}' if is_ratio else f'{value:.1f}%'
            ax.text(value, bar.get_y() + bar.get_height()/2,
                    f' {label}', va='center', fontsize=9)

    # Hide unused subplots
    for spare in panels[len(shown):]:
        spare.axis('off')

    fig.suptitle(f'{ticker} - Performance Metrics Comparison',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Saved metrics comparison plot to: {output_path}")
    return fig
def plot_drawdown(
    results: Dict[str, pd.DataFrame],
    ticker: str,
    output_path: str = None,
    figsize: tuple = (14, 8)
) -> plt.Figure:
    """
    Plot drawdown analysis for all strategies.

    For every strategy the drawdown is the percentage distance of
    ``portfolio_value`` from its running maximum. Each strategy is drawn
    as a line with a lightly shaded area between the line and zero.
    Strategies whose DataFrame has no ``portfolio_value`` column are skipped.

    Args:
        results: Dictionary mapping strategy name to portfolio DataFrame
        ticker: Stock ticker symbol (used in the plot title)
        output_path: Path to save the figure (optional)
        figsize: Figure size (width, height)

    Returns:
        matplotlib Figure object
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Single pass: compute each drawdown once and draw both the curve and
    # its shaded area from the same series.
    for name, portfolio in results.items():
        if "portfolio_value" not in portfolio.columns:
            continue

        values = portfolio["portfolio_value"]
        running_max = values.cummax()
        drawdown = (values - running_max) / running_max * 100

        line = ax.plot(
            portfolio.index, drawdown, label=name, linewidth=2, alpha=0.7
        )[0]
        # Shade the area with the curve's own colour so the two always match.
        ax.fill_between(
            portfolio.index, drawdown, 0, color=line.get_color(), alpha=0.1
        )

    ax.set_xlabel('Date', fontsize=12, fontweight='bold')
    ax.set_ylabel('Drawdown (%)', fontsize=12, fontweight='bold')
    ax.set_title(f'{ticker} - Drawdown Analysis', fontsize=14, fontweight='bold')

    # Only build a legend when at least one strategy was drawn; calling
    # legend() on an axes without labelled artists makes matplotlib warn.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(loc='best', fontsize=10, framealpha=0.9)

    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)

    # Format y-axis as percentage
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.1f}%'))

    plt.tight_layout()

    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Saved drawdown plot to: {output_path}")

    return fig
def plot_returns_distribution(
    results: Dict[str, pd.DataFrame],
    ticker: str,
    output_path: str = None,
    figsize: tuple = (14, 8)
) -> plt.Figure:
    """
    Draw overlaid histograms of each strategy's daily returns.

    Every DataFrame that has a ``strategy_return`` column contributes one
    semi-transparent, density-normalised histogram with 50 bins. Values are
    multiplied by 100 before plotting (presumably fractional returns being
    shown as percentages). DataFrames without that column are ignored.

    Args:
        results: Dictionary mapping strategy name to portfolio DataFrame
        ticker: Stock ticker symbol (used in the plot title)
        output_path: Path to save the figure (optional)
        figsize: Figure size (width, height)

    Returns:
        matplotlib Figure object
    """
    figure, axis = plt.subplots(figsize=figsize)

    for strategy, frame in results.items():
        if "strategy_return" not in frame.columns:
            continue
        # Drop missing values first, then scale to percentage points.
        pct_returns = frame["strategy_return"].dropna() * 100
        axis.hist(pct_returns, bins=50, alpha=0.5, label=strategy, density=True)

    axis_label_style = {"fontsize": 12, "fontweight": 'bold'}
    axis.set_title(f'{ticker} - Returns Distribution', fontsize=14, fontweight='bold')
    axis.set_ylabel('Density', **axis_label_style)
    axis.set_xlabel('Daily Return (%)', **axis_label_style)
    axis.legend(loc='best', fontsize=10)

    # Dashed vertical reference line at a zero return.
    axis.axvline(x=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
    axis.grid(True, alpha=0.3)

    plt.tight_layout()

    if not output_path:
        return figure

    figure.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved returns distribution plot to: {output_path}")
    return figure
def create_summary_report(
    ticker: str,
    results: Dict[str, pd.DataFrame],
    comparison_df: pd.DataFrame,
    output_dir: str
) -> None:
    """
    Generate comprehensive visual summary report.

    Creates all standard plots and saves them as PNG files named
    ``{ticker}_<plot>.png`` inside ``output_dir`` (created if missing).
    Each plot is generated independently: a failure is reported on stdout
    and the remaining plots are still attempted. Figures opened while
    generating a plot are closed once that plot is done, so repeated calls
    do not accumulate open pyplot figures.

    Args:
        ticker: Stock ticker symbol
        results: Dictionary mapping strategy name to portfolio DataFrame
        comparison_df: DataFrame with performance metrics comparison
        output_dir: Directory to save output files
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print("\nGenerating visualizations...")

    def _target(suffix: str) -> str:
        # Full path of one output image inside the report directory.
        return str(output_path / f"{ticker}_{suffix}.png")

    # (description used in the failure message, zero-argument plot call).
    # The lambdas defer name lookup and execution until inside the try block.
    plot_jobs = [
        (
            "cumulative returns",
            lambda: plot_cumulative_returns(
                results, ticker, output_path=_target("cumulative_returns")
            ),
        ),
        (
            "metrics comparison",
            lambda: plot_metrics_comparison(
                comparison_df, ticker, output_path=_target("metrics_comparison")
            ),
        ),
        (
            "drawdown",
            lambda: plot_drawdown(
                results, ticker, output_path=_target("drawdown")
            ),
        ),
    ]

    # Transaction history (only if TradingAgents results are available)
    if "TradingAgents" in results:
        plot_jobs.append(
            (
                "transaction history",
                lambda: plot_transaction_history(
                    results["TradingAgents"],
                    ticker,
                    strategy_name="TradingAgents",
                    output_path=_target("TradingAgents_transactions"),
                ),
            )
        )

    plot_jobs.append(
        (
            "returns distribution",
            lambda: plot_returns_distribution(
                results, ticker, output_path=_target("returns_distribution")
            ),
        )
    )

    failures = 0
    for description, make_plot in plot_jobs:
        open_before = set(plt.get_fignums())
        try:
            make_plot()
        except Exception as e:
            # Best-effort report: one broken plot must not block the others.
            failures += 1
            print(f"✗ Failed to generate {description} plot: {e}")
        finally:
            # Close only the figures this job opened (even if it failed
            # half-way); figures the caller already had open are untouched.
            for fig_num in set(plt.get_fignums()) - open_before:
                plt.close(fig_num)

    if failures:
        print(
            f"\n✗ {failures} of {len(plot_jobs)} visualizations failed; "
            f"the others were saved to: {output_dir}"
        )
    else:
        print(f"\n✓ All visualizations saved to: {output_dir}")
if __name__ == "__main__":
    # Running the module directly only announces that it imported cleanly
    # and lists the plotting helpers it provides.
    print("Visualization module loaded successfully!")
    print("\nAvailable functions:")
    for function_name in (
        "plot_cumulative_returns",
        "plot_transaction_history",
        "plot_metrics_comparison",
        "plot_drawdown",
        "plot_returns_distribution",
        "create_summary_report",
    ):
        print(f" - {function_name}")

View File

@ -24,3 +24,6 @@ rich
questionary
langchain_anthropic
langchain-google-genai
matplotlib
seaborn
numpy