TradingAgents/tests/test_data_leakage.py

162 lines
6.4 KiB
Python

"""
Tests to verify no future data leakage in backtesting.
These tests ensure that when running backtests for a historical date,
the system only uses data that would have been available at that time.
"""
import pytest
from datetime import datetime, timedelta
import pandas as pd
class TestStockstatsDataLeakage:
"""Test that stockstats_utils.py doesn't leak future data."""
def test_stockstats_uses_curr_date_not_today(self):
"""Verify stockstats filters data up to curr_date, not today."""
from tradingagents.dataflows.stockstats_utils import StockstatsUtils
# Test with a historical date
historical_date = "2024-06-15"
# This should only use data up to 2024-06-15
# If the function works correctly, we shouldn't get data beyond this date
try:
result = StockstatsUtils.get_stock_stats("AAPL", "close", historical_date)
# Result should be a value, not an error
assert result is not None
except Exception as e:
# If local data not available, that's expected
if "Yahoo Finance data not fetched yet" not in str(e):
raise
def test_end_date_equals_curr_date(self):
"""Verify that yfinance download uses curr_date as end_date."""
# This is a structural check - the code should use curr_date_dt as end_date_dt
from tradingagents.dataflows.stockstats_utils import StockstatsUtils
import inspect
source = inspect.getsource(StockstatsUtils.get_stock_stats)
# Check that the code uses curr_date for end_date calculation
assert "end_date_dt = curr_date_dt" in source or "end=end_date" in source, \
"stockstats should use curr_date as end_date to prevent future data leakage"
class TestAlphaVantageDataLeakage:
"""Test that alpha_vantage_stock.py doesn't leak future data."""
def test_outputsize_uses_end_date_not_now(self):
"""Verify outputsize calculation uses end_date, not datetime.now()."""
from tradingagents.dataflows.alpha_vantage_stock import get_stock
import inspect
source = inspect.getsource(get_stock)
# Should NOT contain datetime.now() for outputsize calculation
assert "datetime.now()" not in source, \
"alpha_vantage_stock should not use datetime.now() - use end_date instead"
# Should use end_date for the calculation
assert "end_dt" in source or "end_date" in source, \
"Should use end_date for outputsize calculation"
class TestFundamentalsDataLeakage:
"""Test that fundamentals data respects publication delays."""
def test_publication_delay_is_conservative(self):
"""Verify publication delay is at least 60 days (conservative estimate)."""
from tradingagents.dataflows.y_finance import _filter_fundamentals_by_date
import inspect
source = inspect.getsource(_filter_fundamentals_by_date)
# Check that publication delay is at least 60 days
assert "publication_delay_days = 60" in source or "publication_delay_days = 90" in source, \
"Publication delay should be at least 60 days for conservative backtesting"
def test_fundamentals_filtered_by_publish_date(self):
"""Verify fundamentals are filtered by estimated publish date, not report date."""
from tradingagents.dataflows.y_finance import _filter_fundamentals_by_date
import pandas as pd
# Create mock fundamentals data with future dates
curr_date = "2024-06-15"
# Report from Q1 2024 (dated 2024-03-31) should be visible
# Report from Q2 2024 (dated 2024-06-30) should NOT be visible
mock_data = pd.DataFrame({
"2024-03-31": [100, 200],
"2024-06-30": [150, 250], # This shouldn't be visible in June
"2024-09-30": [175, 275], # This definitely shouldn't be visible
}, index=["Revenue", "Profit"])
filtered = _filter_fundamentals_by_date(mock_data, curr_date)
# With 60-day delay, only 2024-03-31 should be visible on 2024-06-15
# because 2024-03-31 + 60 days = 2024-05-30, which is before 2024-06-15
assert "2024-03-31" in filtered.columns, \
"Q1 2024 report (dated 2024-03-31) should be visible on 2024-06-15"
assert "2024-06-30" not in filtered.columns, \
"Q2 2024 report (dated 2024-06-30) should NOT be visible on 2024-06-15"
assert "2024-09-30" not in filtered.columns, \
"Q3 2024 report should definitely NOT be visible on 2024-06-15"
class TestLocalDataLeakage:
"""Test that local data files are properly filtered."""
def test_local_data_has_date_filtering(self):
"""Verify local data reading includes date filtering."""
from tradingagents.dataflows.local import get_stock_data
import inspect
source = inspect.getsource(get_stock_data)
# Should filter data by date range
assert "start_date" in source and "end_date" in source, \
"Local data should be filtered by date range"
class TestBacktestIntegrity:
"""Test overall backtest integrity."""
def test_no_future_imports_in_dataflows(self):
"""Verify dataflows don't use any patterns that could leak future data."""
import os
import re
dataflows_dir = "tradingagents/dataflows"
dangerous_patterns = [
# Using today's date when historical date should be used
r"pd\.Timestamp\.today\(\)",
r"datetime\.today\(\)",
r"date\.today\(\)",
# Unfiltered data access (should always filter by date)
r"\.history\([^)]*\)(?![^)]*end=)", # history() without end parameter
]
issues = []
for root, dirs, files in os.walk(dataflows_dir):
for file in files:
if file.endswith(".py"):
filepath = os.path.join(root, file)
with open(filepath, "r") as f:
content = f.read()
for pattern in dangerous_patterns:
matches = re.findall(pattern, content)
if matches:
issues.append(f"{filepath}: Found dangerous pattern {pattern}")
assert len(issues) == 0, \
f"Found potential data leakage patterns:\n" + "\n".join(issues)
if __name__ == "__main__":
pytest.main([__file__, "-v"])