# TradingAgents/tests/torture_test_2022.py

"""
2022 Torture Test - Bear Market Backtest

Tests system performance during the 2022 tech crash:
- NVDA: -50%+
- AMZN: -50%
- AAPL: -27%

Pass Criteria:
- Max Drawdown < 25% (better than Nasdaq-100's -33%)
- Fact checker must reject bullish hallucinations
- Regime detector must identify BEAR/VOLATILE periods
"""

import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from tradingagents.workflows.integrated_workflow import IntegratedTradingWorkflow
from tradingagents.schemas.agent_schemas import SignalType


class TortureTestBacktest:
    """
    2022 Bear Market Backtest.

    Tests if system can survive the tech crash with:
    - Regime detection (should detect BEAR/VOLATILE)
    - Fact checker (should reject bullish hallucinations)
    - Risk gate (should enforce circuit breakers)
    """

    def __init__(self, starting_capital: float = 100000):
        """
        Set up bookkeeping state and build the workflow under test.

        Args:
            starting_capital: Initial cash balance; also passed to the
                workflow config as "portfolio_value".
        """
        # Risk limits passed to the workflow under the "risk_config" key
        risk_limits = {
            "max_position_risk": 0.02,  # 2% max risk per trade
            "max_portfolio_heat": 0.10,  # 10% max total portfolio risk
            "circuit_breaker": 0.15  # Stop trading if 15% drawdown
        }

        workflow_config = {
            "anonymizer_seed": "torture_test_2022",
            "use_nli_model": False,  # Use fallback for speed
            "max_json_retries": 2,
            "fact_check_latency_budget": 2.0,
            "portfolio_value": starting_capital,
            "risk_config": risk_limits
        }

        # Capital tracking
        self.starting_capital = starting_capital
        self.capital = starting_capital

        # Per-run logs filled in while the backtest runs
        self.positions = {}
        self.trades = []
        self.equity_curve = []
        self.regime_log = []
        self.rejections = {
            gate: [] for gate in ("fact_check", "risk_gate", "json_compliance")
        }

        self.workflow = IntegratedTradingWorkflow(workflow_config)

    def download_data(self, tickers: List[str], start_date: str, end_date: str) -> Dict[str, pd.DataFrame]:
        """
        Fetch price history for each ticker via yfinance.

        Args:
            tickers: Symbols to download
            start_date: Start date passed to yf.download
            end_date: End date passed to yf.download

        Returns:
            Mapping of ticker -> DataFrame. Tickers whose download came back
            with zero rows are reported and left out of the mapping.
        """
        print(f"📥 Downloading data for {tickers} from {start_date} to {end_date}...")

        frames = {}
        for symbol in tickers:
            history = yf.download(symbol, start=start_date, end=end_date, progress=False)
            row_count = len(history)

            # Nothing came back for this symbol: report it and move on
            if row_count == 0:
                print(f"   ❌ {symbol}: No data")
                continue

            frames[symbol] = history
            print(f"   ✅ {symbol}: {row_count} days")

        return frames

    def run_backtest(
        self,
        tickers: List[str],
        start_date: str,
        end_date: str,
        max_drawdown_limit: float = 0.25
    ) -> Dict:
        """
        Run 2022 torture test backtest.

        Args:
            tickers: List of tickers to trade
            start_date: Start date YYYY-MM-DD
            end_date: End date YYYY-MM-DD
            max_drawdown_limit: Loss, as a fraction of starting capital, at
                which the run is aborted (0.25 = 25%)

        Returns:
            Results dict with metrics (output of _calculate_metrics)

        Raises:
            ValueError: If no requested ticker returned data, or the
                downloaded tickers share no trading dates
        """
        # Download data
        data = self.download_data(tickers, start_date, end_date)

        # download_data leaves out tickers that returned no rows, so only the
        # tickers actually present in `data` can be traded or intersected.
        active_tickers = [ticker for ticker in tickers if ticker in data]
        if not active_tickers:
            raise ValueError("No data downloaded")

        # Get trading dates (intersection of all downloaded tickers)
        trading_dates = self._common_trading_dates(data, active_tickers)
        if not trading_dates:
            raise ValueError("No common trading dates across downloaded tickers")

        print(f"\n📅 Trading period: {trading_dates[0].date()} to {trading_dates[-1].date()}")
        print(f"   Total trading days: {len(trading_dates)}")

        # Run simulation
        print(f"\n🚀 Starting 2022 Torture Test...")
        print(f"   Starting Capital: ${self.starting_capital:,.2f}")
        print(
            f"   Max Drawdown Limit: {max_drawdown_limit:.0%} "
            f"(${self.starting_capital * (1 - max_drawdown_limit):,.2f})"
        )
        print()

        for i, date in enumerate(trading_dates):
            # Calculate current portfolio value
            portfolio_value = self._calculate_portfolio_value(data, date)
            self.equity_curve.append({
                "date": date,
                "value": portfolio_value
            })

            # Check circuit breaker (loss measured against starting capital)
            drawdown = (portfolio_value - self.starting_capital) / self.starting_capital

            if drawdown <= -max_drawdown_limit:
                print(f"\n🚨 CIRCUIT BREAKER TRIGGERED")
                print(f"   Date: {date.date()}")
                print(f"   Portfolio: ${portfolio_value:,.2f}")
                print(f"   Drawdown: {drawdown:.1%}")
                print(f"   ❌ BACKTEST FAILED - Exceeded {max_drawdown_limit:.0%} drawdown limit")
                break

            # Trade each ticker (simplified - in production would use judge logic)
            for ticker in active_tickers:
                # Skip if we don't have enough history
                ticker_data = data[ticker].loc[:date]
                if len(ticker_data) < 100:
                    continue

                # Prepare market data
                market_data = self._prepare_market_data(ticker_data)

                # Create mock ground truth (in production, would use real fundamentals)
                ground_truth = self._create_mock_ground_truth(ticker_data)

                # Create mock LLM agents (simplified for testing)
                llm_agents = self._create_mock_agents(ticker, market_data, ground_truth)

                # Execute workflow. Errors are reported and the ticker is
                # skipped for the day so one bad decision cannot abort the run.
                try:
                    decision, metrics = self.workflow.execute_trade_decision(
                        ticker=ticker,
                        trading_date=date.strftime("%Y-%m-%d"),
                        market_data=market_data,
                        ground_truth=ground_truth,
                        llm_agents=llm_agents
                    )

                    # Log regime
                    self.regime_log.append({
                        "date": date,
                        "ticker": ticker,
                        "regime": "UNKNOWN"  # Would extract from workflow
                    })

                    # Record the decision under whichever gate rejected it
                    self._record_rejection(decision, ticker, date)

                    # Execute approved trades only: a decision that failed the
                    # fact check or the risk gate was just logged as a
                    # rejection, so it must not also be executed.
                    approved = decision.fact_check_passed and decision.risk_gate_passed
                    if (
                        approved
                        and decision.action in (SignalType.BUY, SignalType.SELL)
                        and decision.quantity > 0
                    ):
                        self._execute_trade(ticker, decision, market_data["close"], date)

                except Exception as e:
                    print(f"   ⚠️  Error processing {ticker} on {date.date()}: {e}")

            # Progress update every 30 days
            if i % 30 == 0:
                print(f"   {date.date()}: Portfolio = ${portfolio_value:,.2f} ({drawdown:+.1%})")

        # Calculate final metrics
        return self._calculate_metrics()

    @staticmethod
    def _common_trading_dates(data: Dict, tickers: List[str]) -> list:
        """Return, in ascending order, the index values shared by every listed ticker's frame."""
        date_sets = [set(data[ticker].index) for ticker in tickers]
        return sorted(set.intersection(*date_sets))

    def _record_rejection(self, decision, ticker: str, date) -> None:
        """Append the decision to the matching self.rejections bucket, if it was rejected."""
        if not decision.fact_check_passed:
            bucket, action = "fact_check", "N/A"
        elif not decision.risk_gate_passed:
            bucket, action = "risk_gate", decision.action.value
        elif (
            decision.action == SignalType.HOLD
            and "REJECTED" in decision.reasoning
            and "JSON" in decision.reasoning
        ):
            # HOLD whose reasoning mentions a JSON rejection (dead state)
            bucket, action = "json_compliance", "N/A"
        else:
            return

        self.rejections[bucket].append({
            "date": date,
            "ticker": ticker,
            "action": action,
            "reason": decision.reasoning
        })

    def _prepare_market_data(self, ticker_data: pd.DataFrame) -> Dict:
        """Prepare market data for workflow."""
        # Ensure Close is a Series, not DataFrame
        close_series = ticker_data['Close']
        if isinstance(close_series, pd.DataFrame):
            close_series = close_series.squeeze()

        return {
            "price_series": close_series,
            "close": float(close_series.iloc[-1]),
            "atr": float(close_series.rolling(14).std().iloc[-1] * 1.5) if len(close_series) >= 14 else 1.0,
            "volume": float(ticker_data['Volume'].iloc[-1]) if 'Volume' in ticker_data else 1000000,
            "indicators": {
                "RSI": 50,  # Simplified
                "MACD": 0.0
            }
        }

    def _create_mock_ground_truth(self, ticker_data: pd.DataFrame) -> Dict:
        """Create mock ground truth (simplified)."""
        returns = ticker_data['Close'].pct_change()

        return {
            "revenue_growth_yoy": returns.tail(20).mean() * 252,  # Annualized
            "price_change_pct": returns.iloc[-1]
        }

    def _create_mock_agents(self, ticker: str, market_data: Dict, ground_truth: Dict):
        """
        Build canned stand-ins for the LLM agents.

        The arguments are accepted but not used: every agent ignores its
        prompt and always replies with the same fenced JSON payload.

        Returns:
            Dict mapping "market_analyst", "bull_researcher" and
            "bear_researcher" to callables. Each call returns a fresh Mock
            whose `.content` holds that agent's payload.
        """
        # This is simplified - in production would use real LLMs
        from unittest.mock import Mock

        def canned_agent(payload):
            """Return a callable that answers any prompt with `payload`."""
            def agent(prompt):
                reply = Mock()
                reply.content = payload
                return reply
            return agent

        analyst_payload = '''```json
            {
                "analyst_type": "market",
                "key_findings": ["Price movement observed", "Volume analysis complete", "Technical setup identified"],
                "signal": "HOLD",
                "confidence": 0.6,
                "reasoning": "Market conditions require cautious approach during volatile period."
            }
            ```'''

        bull_payload = '''```json
            {
                "researcher_type": "bull",
                "key_arguments": ["Long-term growth potential remains", "Technical support holding"],
                "signal": "BUY",
                "confidence": 0.55,
                "supporting_evidence": ["Historical patterns", "Sector strength"]
            }
            ```'''

        bear_payload = '''```json
            {
                "researcher_type": "bear",
                "key_arguments": ["Market volatility elevated", "Downside risks present"],
                "signal": "SELL",
                "confidence": 0.70,
                "supporting_evidence": ["Macro headwinds", "Technical weakness"]
            }
            ```'''

        return {
            "market_analyst": canned_agent(analyst_payload),
            "bull_researcher": canned_agent(bull_payload),
            "bear_researcher": canned_agent(bear_payload)
        }

    def _execute_trade(self, ticker: str, decision, price: float, date):
        """Execute trade."""
        self.trades.append({
            "date": date,
            "ticker": ticker,
            "action": decision.action.value,
            "quantity": decision.quantity,
            "price": price,
            "value": decision.quantity * price
        })

    def _calculate_portfolio_value(self, data: Dict, date) -> float:
        """
        Return the portfolio value used for the equity curve.

        Placeholder valuation: `data` and `date` are ignored and the cash
        balance is reported as-is.
        """
        # Simplified - just return capital for now
        cash_balance = self.capital
        return cash_balance

    def _calculate_metrics(self) -> Dict:
        """
        Calculate backtest metrics from the recorded equity curve and logs.

        Returns:
            Dict with:
            - "final_value": last equity value (starting capital if the curve
              is empty)
            - "total_return": change relative to starting capital
            - "max_drawdown": most negative peak-to-trough drop (<= 0)
            - "sharpe_ratio": mean/std of per-step returns x sqrt(252), or
              0.0 when there are no returns or no variance
            - "total_trades", "fact_check_rejections",
              "risk_gate_rejections", "json_failures": log lengths
            - "equity_curve": the equity curve as a DataFrame
        """
        counts = {
            "total_trades": len(self.trades),
            "fact_check_rejections": len(self.rejections["fact_check"]),
            "risk_gate_rejections": len(self.rejections["risk_gate"]),
            "json_failures": len(self.rejections["json_compliance"]),
        }

        # With no recorded equity points there is no 'value' column to read;
        # report a flat, untraded portfolio instead of raising KeyError.
        if not self.equity_curve:
            return {
                "final_value": float(self.starting_capital),
                "total_return": 0.0,
                "max_drawdown": 0.0,
                "sharpe_ratio": 0.0,
                **counts,
                "equity_curve": pd.DataFrame(columns=["date", "value"]),
            }

        equity_df = pd.DataFrame(self.equity_curve)
        values = equity_df['value']

        final_value = float(values.iloc[-1])
        returns = values.pct_change().dropna()

        # Max drawdown: worst drop relative to the running peak
        running_peak = values.cummax()
        max_drawdown = float(((values - running_peak) / running_peak).min())

        # Sharpe ratio (annualized); std is NaN for a single return, which
        # fails the > 0 check and falls through to 0.0
        volatility = returns.std()
        if len(returns) > 0 and volatility > 0:
            sharpe = float((returns.mean() / volatility) * np.sqrt(252))
        else:
            sharpe = 0.0

        return {
            "final_value": final_value,
            "total_return": (final_value - self.starting_capital) / self.starting_capital,
            "max_drawdown": max_drawdown,
            "sharpe_ratio": sharpe,
            **counts,
            "equity_curve": equity_df,
        }


# Script entry point: run the 2022 torture test and print a pass/fail report
if __name__ == "__main__":
    divider = "=" * 80

    runner = TortureTestBacktest(starting_capital=100000)
    summary = runner.run_backtest(
        tickers=["AAPL", "NVDA", "AMZN"],
        start_date="2022-01-01",
        end_date="2022-12-31"
    )

    # Headline numbers
    report_lines = [
        "\n" + divider,
        "2022 TORTURE TEST RESULTS",
        divider,
        f"\nFinal Portfolio Value: ${summary['final_value']:,.2f}",
        f"Total Return: {summary['total_return']:.1%}",
        f"Max Drawdown: {summary['max_drawdown']:.1%}",
        f"Sharpe Ratio: {summary['sharpe_ratio']:.2f}",
        f"\nTotal Trades: {summary['total_trades']}",
        f"Fact Check Rejections: {summary['fact_check_rejections']}",
        f"Risk Gate Rejections: {summary['risk_gate_rejections']}",
    ]
    for line in report_lines:
        print(line)

    # Pass/Fail
    print("\n" + divider)

    # Criterion 1: drawdown stayed inside the 25% limit
    drawdown_ok = summary['max_drawdown'] > -0.25
    print(
        "✅ PASSED: Max drawdown < 25%"
        if drawdown_ok
        else "❌ FAILED: Max drawdown exceeded 25% limit"
    )

    # Criterion 2: the fact checker rejected at least one decision
    fact_rejections = summary['fact_check_rejections']
    if fact_rejections <= 0:
        print("❌ FAILED: Fact checker rejected 0 trades (threshold too loose)")
    else:
        print(f"✅ PASSED: Fact checker active ({fact_rejections} rejections)")