From e1113880a1da00c80258612657fd4f8e68a79ef2 Mon Sep 17 00:00:00 2001 From: Yijia-Xiao Date: Sun, 29 Mar 2026 17:34:35 +0000 Subject: [PATCH] fix: prevent look-ahead bias in backtesting data fetchers (#475) --- .../dataflows/alpha_vantage_fundamentals.py | 80 ++++++---------- tradingagents/dataflows/stockstats_utils.py | 94 ++++++++++++------- tradingagents/dataflows/y_finance.py | 71 +++----------- tradingagents/dataflows/yfinance_news.py | 5 + 4 files changed, 108 insertions(+), 142 deletions(-) diff --git a/tradingagents/dataflows/alpha_vantage_fundamentals.py b/tradingagents/dataflows/alpha_vantage_fundamentals.py index 8b92faa6..a4ef24c0 100644 --- a/tradingagents/dataflows/alpha_vantage_fundamentals.py +++ b/tradingagents/dataflows/alpha_vantage_fundamentals.py @@ -1,6 +1,23 @@ from .alpha_vantage_common import _make_api_request +def _filter_reports_by_date(result, curr_date: str): + """Filter annualReports/quarterlyReports to exclude entries after curr_date. + + Prevents look-ahead bias by removing fiscal periods that end after + the simulation's current date. + """ + if not curr_date or not isinstance(result, dict): + return result + for key in ("annualReports", "quarterlyReports"): + if key in result: + result[key] = [ + r for r in result[key] + if r.get("fiscalDateEnding", "") <= curr_date + ] + return result + + def get_fundamentals(ticker: str, curr_date: str = None) -> str: """ Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage. @@ -19,59 +36,20 @@ def get_fundamentals(ticker: str, curr_date: str = None) -> str: return _make_api_request("OVERVIEW", params) -def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: - """ - Retrieve balance sheet data for a given ticker symbol using Alpha Vantage. - - Args: - ticker (str): Ticker symbol of the company - freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage - curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage) - - Returns: - str: Balance sheet data with normalized fields - """ - params = { - "symbol": ticker, - } - - return _make_api_request("BALANCE_SHEET", params) +def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None): + """Retrieve balance sheet data for a given ticker symbol using Alpha Vantage.""" + result = _make_api_request("BALANCE_SHEET", {"symbol": ticker}) + return _filter_reports_by_date(result, curr_date) -def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: - """ - Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage. - - Args: - ticker (str): Ticker symbol of the company - freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage - curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage) - - Returns: - str: Cash flow statement data with normalized fields - """ - params = { - "symbol": ticker, - } - - return _make_api_request("CASH_FLOW", params) +def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None): + """Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage.""" + result = _make_api_request("CASH_FLOW", {"symbol": ticker}) + return _filter_reports_by_date(result, curr_date) -def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: - """ - Retrieve income statement data for a given ticker symbol using Alpha Vantage. - - Args: - ticker (str): Ticker symbol of the company - freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage - curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage) - - Returns: - str: Income statement data with normalized fields - """ - params = { - "symbol": ticker, - } - - return _make_api_request("INCOME_STATEMENT", params) +def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None): + """Retrieve income statement data for a given ticker symbol using Alpha Vantage.""" + result = _make_api_request("INCOME_STATEMENT", {"symbol": ticker}) + return _filter_reports_by_date(result, curr_date) diff --git a/tradingagents/dataflows/stockstats_utils.py b/tradingagents/dataflows/stockstats_utils.py index 47d5460a..50747883 100644 --- a/tradingagents/dataflows/stockstats_utils.py +++ b/tradingagents/dataflows/stockstats_utils.py @@ -44,6 +44,64 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame: return data +def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame: + """Fetch OHLCV data with caching, filtered to prevent look-ahead bias. + + Downloads 15 years of data up to today and caches per symbol. On + subsequent calls the cache is reused. Rows after curr_date are + filtered out so backtests never see future prices. + """ + config = get_config() + curr_date_dt = pd.to_datetime(curr_date) + + # Cache uses a fixed window (15y to today) so one file per symbol + today_date = pd.Timestamp.today() + start_date = today_date - pd.DateOffset(years=5) + start_str = start_date.strftime("%Y-%m-%d") + end_str = today_date.strftime("%Y-%m-%d") + + os.makedirs(config["data_cache_dir"], exist_ok=True) + data_file = os.path.join( + config["data_cache_dir"], + f"{symbol}-YFin-data-{start_str}-{end_str}.csv", + ) + + if os.path.exists(data_file): + data = pd.read_csv(data_file, on_bad_lines="skip") + else: + data = yf_retry(lambda: yf.download( + symbol, + start=start_str, + end=end_str, + multi_level_index=False, + progress=False, + auto_adjust=True, + )) + data = data.reset_index() + data.to_csv(data_file, index=False) + + data = _clean_dataframe(data) + + # Filter to curr_date to prevent look-ahead bias in backtesting + data = data[data["Date"] <= curr_date_dt] + + return data + + +def filter_financials_by_date(data: pd.DataFrame, curr_date: str) -> pd.DataFrame: + """Drop financial statement columns (fiscal period timestamps) after curr_date. + + yfinance financial statements use fiscal period end dates as columns. + Columns after curr_date represent future data and are removed to + prevent look-ahead bias. + """ + if not curr_date or data.empty: + return data + cutoff = pd.Timestamp(curr_date) + mask = pd.to_datetime(data.columns, errors="coerce") <= cutoff + return data.loc[:, mask] + + class StockstatsUtils: @staticmethod def get_stock_stats( @@ -55,42 +113,10 @@ class StockstatsUtils: str, "curr date for retrieving stock price data, YYYY-mm-dd" ], ): - config = get_config() - - today_date = pd.Timestamp.today() - curr_date_dt = pd.to_datetime(curr_date) - - end_date = today_date - start_date = today_date - pd.DateOffset(years=15) - start_date_str = start_date.strftime("%Y-%m-%d") - end_date_str = end_date.strftime("%Y-%m-%d") - - # Ensure cache directory exists - os.makedirs(config["data_cache_dir"], exist_ok=True) - - data_file = os.path.join( - config["data_cache_dir"], - f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv", - ) - - if os.path.exists(data_file): - data = pd.read_csv(data_file, on_bad_lines="skip") - else: - data = yf_retry(lambda: yf.download( - symbol, - start=start_date_str, - end=end_date_str, - multi_level_index=False, - progress=False, - auto_adjust=True, - )) - data = data.reset_index() - data.to_csv(data_file, index=False) - - data = _clean_dataframe(data) + data = load_ohlcv(symbol, curr_date) df = wrap(data) df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") - curr_date_str = curr_date_dt.strftime("%Y-%m-%d") + curr_date_str = pd.to_datetime(curr_date).strftime("%Y-%m-%d") df[indicator] # trigger stockstats to calculate the indicator matching_rows = df[df["Date"].str.startswith(curr_date_str)] diff --git a/tradingagents/dataflows/y_finance.py b/tradingagents/dataflows/y_finance.py index 3682a01d..8b4b93f5 100644 --- a/tradingagents/dataflows/y_finance.py +++ b/tradingagents/dataflows/y_finance.py @@ -3,7 +3,7 @@ from datetime import datetime from dateutil.relativedelta import relativedelta import yfinance as yf import os -from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry +from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date def get_YFin_data_online( symbol: Annotated[str, "ticker symbol of the company"], @@ -194,58 +194,9 @@ def _get_stock_stats_bulk( Fetches data once and calculates indicator for all available dates. Returns dict mapping date strings to indicator values. """ - from .config import get_config - import pandas as pd from stockstats import wrap - import os - - config = get_config() - online = config["data_vendors"]["technical_indicators"] != "local" - - if not online: - # Local data path - try: - data = pd.read_csv( - os.path.join( - config.get("data_cache_dir", "data"), - f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv", - ), - on_bad_lines="skip", - ) - except FileNotFoundError: - raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!") - else: - # Online data fetching with caching - today_date = pd.Timestamp.today() - curr_date_dt = pd.to_datetime(curr_date) - end_date = today_date - start_date = today_date - pd.DateOffset(years=15) - start_date_str = start_date.strftime("%Y-%m-%d") - end_date_str = end_date.strftime("%Y-%m-%d") - - os.makedirs(config["data_cache_dir"], exist_ok=True) - - data_file = os.path.join( - config["data_cache_dir"], - f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv", - ) - - if os.path.exists(data_file): - data = pd.read_csv(data_file, on_bad_lines="skip") - else: - data = yf_retry(lambda: yf.download( - symbol, - start=start_date_str, - end=end_date_str, - multi_level_index=False, - progress=False, - auto_adjust=True, - )) - data = data.reset_index() - data.to_csv(data_file, index=False) - - data = _clean_dataframe(data) + data = load_ohlcv(symbol, curr_date) df = wrap(data) df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") @@ -353,7 +304,7 @@ def get_fundamentals( def get_balance_sheet( ticker: Annotated[str, "ticker symbol of the company"], freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", - curr_date: Annotated[str, "current date (not used for yfinance)"] = None + curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None ): """Get balance sheet data from yfinance.""" try: @@ -363,7 +314,9 @@ def get_balance_sheet( data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet) else: data = yf_retry(lambda: ticker_obj.balance_sheet) - + + data = filter_financials_by_date(data, curr_date) + if data.empty: return f"No balance sheet data found for symbol '{ticker}'" @@ -383,7 +336,7 @@ def get_balance_sheet( def get_cashflow( ticker: Annotated[str, "ticker symbol of the company"], freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", - curr_date: Annotated[str, "current date (not used for yfinance)"] = None + curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None ): """Get cash flow data from yfinance.""" try: @@ -393,7 +346,9 @@ def get_cashflow( data = yf_retry(lambda: ticker_obj.quarterly_cashflow) else: data = yf_retry(lambda: ticker_obj.cashflow) - + + data = filter_financials_by_date(data, curr_date) + if data.empty: return f"No cash flow data found for symbol '{ticker}'" @@ -413,7 +368,7 @@ def get_cashflow( def get_income_statement( ticker: Annotated[str, "ticker symbol of the company"], freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", - curr_date: Annotated[str, "current date (not used for yfinance)"] = None + curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None ): """Get income statement data from yfinance.""" try: @@ -423,7 +378,9 @@ def get_income_statement( data = yf_retry(lambda: ticker_obj.quarterly_income_stmt) else: data = yf_retry(lambda: ticker_obj.income_stmt) - + + data = filter_financials_by_date(data, curr_date) + if data.empty: return f"No income statement data found for symbol '{ticker}'" diff --git a/tradingagents/dataflows/yfinance_news.py b/tradingagents/dataflows/yfinance_news.py index 20e9120d..7254ebc3 100644 --- a/tradingagents/dataflows/yfinance_news.py +++ b/tradingagents/dataflows/yfinance_news.py @@ -167,6 +167,11 @@ def get_global_news_yfinance( # Handle both flat and nested structures if "content" in article: data = _extract_article_data(article) + # Skip articles published after curr_date (look-ahead guard) + if data.get("pub_date"): + pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"] + if pub_naive > curr_dt + relativedelta(days=1): + continue title = data["title"] publisher = data["publisher"] link = data["link"]