""" 2022 Torture Test - Bear Market Backtest Tests system performance during the 2022 tech crash: - NVDA: -50%+ - AMZN: -50% - AAPL: -27% Pass Criteria: - Max Drawdown < 25% (better than Nasdaq-100's -33%) - Fact checker must reject bullish hallucinations - Regime detector must identify BEAR/VOLATILE periods """ import pandas as pd import numpy as np import yfinance as yf from datetime import datetime, timedelta from typing import Dict, List, Tuple import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) from tradingagents.workflows.integrated_workflow import IntegratedTradingWorkflow from tradingagents.schemas.agent_schemas import SignalType class TortureTestBacktest: """ 2022 Bear Market Backtest. Tests if system can survive the tech crash with: - Regime detection (should detect BEAR/VOLATILE) - Fact checker (should reject bullish hallucinations) - Risk gate (should enforce circuit breakers) """ def __init__(self, starting_capital: float = 100000): """Initialize backtest.""" self.starting_capital = starting_capital self.capital = starting_capital self.positions = {} self.equity_curve = [] self.trades = [] self.rejections = { "fact_check": [], "risk_gate": [], "json_compliance": [] } self.regime_log = [] # Configure workflow config = { "anonymizer_seed": "torture_test_2022", "use_nli_model": False, # Use fallback for speed "max_json_retries": 2, "fact_check_latency_budget": 2.0, "portfolio_value": starting_capital, "risk_config": { "max_position_risk": 0.02, # 2% max risk per trade "max_portfolio_heat": 0.10, # 10% max total portfolio risk "circuit_breaker": 0.15 # Stop trading if 15% drawdown } } self.workflow = IntegratedTradingWorkflow(config) def download_data(self, tickers: List[str], start_date: str, end_date: str) -> Dict[str, pd.DataFrame]: """Download historical data for tickers.""" print(f"šŸ“„ Downloading data for {tickers} from {start_date} to {end_date}...") data = {} for ticker in tickers: df = yf.download(ticker, start=start_date, end=end_date, progress=False) if len(df) > 0: data[ticker] = df print(f" āœ… {ticker}: {len(df)} days") else: print(f" āŒ {ticker}: No data") return data def run_backtest( self, tickers: List[str], start_date: str, end_date: str ) -> Dict: """ Run 2022 torture test backtest. Args: tickers: List of tickers to trade start_date: Start date YYYY-MM-DD end_date: End date YYYY-MM-DD Returns: Results dict with metrics """ # Download data data = self.download_data(tickers, start_date, end_date) if not data: raise ValueError("No data downloaded") # Get trading dates (intersection of all tickers) all_dates = set(data[tickers[0]].index) for ticker in tickers[1:]: all_dates = all_dates.intersection(set(data[ticker].index)) trading_dates = sorted(list(all_dates)) print(f"\nšŸ“… Trading period: {trading_dates[0].date()} to {trading_dates[-1].date()}") print(f" Total trading days: {len(trading_dates)}") # Run simulation print(f"\nšŸš€ Starting 2022 Torture Test...") print(f" Starting Capital: ${self.starting_capital:,.2f}") print(f" Max Drawdown Limit: 25% (${self.starting_capital * 0.75:,.2f})") print() for i, date in enumerate(trading_dates): # Calculate current portfolio value portfolio_value = self._calculate_portfolio_value(data, date) self.equity_curve.append({ "date": date, "value": portfolio_value }) # Check circuit breaker drawdown = (portfolio_value - self.starting_capital) / self.starting_capital if drawdown <= -0.25: print(f"\n🚨 CIRCUIT BREAKER TRIGGERED") print(f" Date: {date.date()}") print(f" Portfolio: ${portfolio_value:,.2f}") print(f" Drawdown: {drawdown:.1%}") print(f" āŒ BACKTEST FAILED - Exceeded 25% drawdown limit") break # Trade each ticker (simplified - in production would use judge logic) for ticker in tickers: if ticker not in data: continue # Skip if we don't have enough history ticker_data = data[ticker].loc[:date] if len(ticker_data) < 100: continue # Prepare market data market_data = self._prepare_market_data(ticker_data) # Create mock ground truth (in production, would use real fundamentals) ground_truth = self._create_mock_ground_truth(ticker_data) # Create mock LLM agents (simplified for testing) llm_agents = self._create_mock_agents(ticker, market_data, ground_truth) # Execute workflow try: decision, metrics = self.workflow.execute_trade_decision( ticker=ticker, trading_date=date.strftime("%Y-%m-%d"), market_data=market_data, ground_truth=ground_truth, llm_agents=llm_agents ) # Log regime self.regime_log.append({ "date": date, "ticker": ticker, "regime": "UNKNOWN" # Would extract from workflow }) # Check if rejected if not decision.fact_check_passed: self.rejections["fact_check"].append({ "date": date, "ticker": ticker, "action": "N/A", "reason": decision.reasoning }) elif not decision.risk_gate_passed: self.rejections["risk_gate"].append({ "date": date, "ticker": ticker, "action": decision.action.value, "reason": decision.reasoning }) elif decision.action == SignalType.HOLD: # Check if it's a dead state if "REJECTED" in decision.reasoning: if "JSON" in decision.reasoning: self.rejections["json_compliance"].append({ "date": date, "ticker": ticker, "action": "N/A", "reason": decision.reasoning }) # Execute approved trades if decision.action in [SignalType.BUY, SignalType.SELL] and decision.quantity > 0: self._execute_trade(ticker, decision, market_data["close"], date) except Exception as e: print(f" āš ļø Error processing {ticker} on {date.date()}: {e}") # Progress update every 30 days if i % 30 == 0: print(f" {date.date()}: Portfolio = ${portfolio_value:,.2f} ({drawdown:+.1%})") # Calculate final metrics results = self._calculate_metrics() return results def _prepare_market_data(self, ticker_data: pd.DataFrame) -> Dict: """Prepare market data for workflow.""" # Ensure Close is a Series, not DataFrame close_series = ticker_data['Close'] if isinstance(close_series, pd.DataFrame): close_series = close_series.squeeze() return { "price_series": close_series, "close": float(close_series.iloc[-1]), "atr": float(close_series.rolling(14).std().iloc[-1] * 1.5) if len(close_series) >= 14 else 1.0, "volume": float(ticker_data['Volume'].iloc[-1]) if 'Volume' in ticker_data else 1000000, "indicators": { "RSI": 50, # Simplified "MACD": 0.0 } } def _create_mock_ground_truth(self, ticker_data: pd.DataFrame) -> Dict: """Create mock ground truth (simplified).""" returns = ticker_data['Close'].pct_change() return { "revenue_growth_yoy": returns.tail(20).mean() * 252, # Annualized "price_change_pct": returns.iloc[-1] } def _create_mock_agents(self, ticker: str, market_data: Dict, ground_truth: Dict): """Create mock LLM agents for testing.""" # This is simplified - in production would use real LLMs from unittest.mock import Mock def mock_analyst(prompt): response = Mock() response.content = '''```json { "analyst_type": "market", "key_findings": ["Price movement observed", "Volume analysis complete", "Technical setup identified"], "signal": "HOLD", "confidence": 0.6, "reasoning": "Market conditions require cautious approach during volatile period." } ```''' return response def mock_bull(prompt): response = Mock() response.content = '''```json { "researcher_type": "bull", "key_arguments": ["Long-term growth potential remains", "Technical support holding"], "signal": "BUY", "confidence": 0.55, "supporting_evidence": ["Historical patterns", "Sector strength"] } ```''' return response def mock_bear(prompt): response = Mock() response.content = '''```json { "researcher_type": "bear", "key_arguments": ["Market volatility elevated", "Downside risks present"], "signal": "SELL", "confidence": 0.70, "supporting_evidence": ["Macro headwinds", "Technical weakness"] } ```''' return response return { "market_analyst": mock_analyst, "bull_researcher": mock_bull, "bear_researcher": mock_bear } def _execute_trade(self, ticker: str, decision, price: float, date): """Execute trade.""" self.trades.append({ "date": date, "ticker": ticker, "action": decision.action.value, "quantity": decision.quantity, "price": price, "value": decision.quantity * price }) def _calculate_portfolio_value(self, data: Dict, date) -> float: """Calculate current portfolio value.""" # Simplified - just return capital for now return self.capital def _calculate_metrics(self) -> Dict: """Calculate backtest metrics.""" equity_df = pd.DataFrame(self.equity_curve) final_value = equity_df['value'].iloc[-1] returns = equity_df['value'].pct_change().dropna() # Max drawdown cummax = equity_df['value'].cummax() drawdown = (equity_df['value'] - cummax) / cummax max_drawdown = drawdown.min() # Sharpe ratio (annualized) if len(returns) > 0 and returns.std() > 0: sharpe = (returns.mean() / returns.std()) * np.sqrt(252) else: sharpe = 0.0 return { "final_value": final_value, "total_return": (final_value - self.starting_capital) / self.starting_capital, "max_drawdown": max_drawdown, "sharpe_ratio": sharpe, "total_trades": len(self.trades), "fact_check_rejections": len(self.rejections["fact_check"]), "risk_gate_rejections": len(self.rejections["risk_gate"]), "json_failures": len(self.rejections["json_compliance"]), "equity_curve": equity_df } # Run the torture test if __name__ == "__main__": backtest = TortureTestBacktest(starting_capital=100000) results = backtest.run_backtest( tickers=["AAPL", "NVDA", "AMZN"], start_date="2022-01-01", end_date="2022-12-31" ) print("\n" + "="*80) print("2022 TORTURE TEST RESULTS") print("="*80) print(f"\nFinal Portfolio Value: ${results['final_value']:,.2f}") print(f"Total Return: {results['total_return']:.1%}") print(f"Max Drawdown: {results['max_drawdown']:.1%}") print(f"Sharpe Ratio: {results['sharpe_ratio']:.2f}") print(f"\nTotal Trades: {results['total_trades']}") print(f"Fact Check Rejections: {results['fact_check_rejections']}") print(f"Risk Gate Rejections: {results['risk_gate_rejections']}") # Pass/Fail print("\n" + "="*80) if results['max_drawdown'] > -0.25: print("āœ… PASSED: Max drawdown < 25%") else: print("āŒ FAILED: Max drawdown exceeded 25% limit") if results['fact_check_rejections'] > 0: print(f"āœ… PASSED: Fact checker active ({results['fact_check_rejections']} rejections)") else: print("āŒ FAILED: Fact checker rejected 0 trades (threshold too loose)")