fix: prevent look-ahead bias in backtesting data fetchers (#475)

This commit is contained in:
Yijia-Xiao 2026-03-29 17:34:35 +00:00
parent 589b351f2a
commit e1113880a1
No known key found for this signature in database
4 changed files with 108 additions and 142 deletions

View File

@ -1,6 +1,23 @@
from .alpha_vantage_common import _make_api_request from .alpha_vantage_common import _make_api_request
def _filter_reports_by_date(result, curr_date: str):
    """Drop annual/quarterly reports whose fiscal period ends after ``curr_date``.

    Prevents look-ahead bias by removing fiscal periods that end after the
    simulation's current date.

    Args:
        result: Either a parsed response (``dict``) or its JSON text (``str``).
            Any other value is returned unchanged.
        curr_date: Simulation date, ``yyyy-mm-dd``. A falsy value disables
            filtering.

    Returns:
        For a ``dict``: a shallow copy whose ``annualReports`` and
        ``quarterlyReports`` lists contain only entries with
        ``fiscalDateEnding <= curr_date``; the input is not modified.
        For a ``str``: the original text when it is not a JSON object or when
        nothing had to be removed, otherwise the filtered object re-serialised
        with ``json.dumps``.
    """
    if not curr_date:
        return result

    if isinstance(result, str):
        # NOTE(review): the request helper appears to return response text in
        # at least some configurations; without this branch the filter would
        # silently do nothing for such callers. Confirm against the helper.
        import json

        try:
            parsed = json.loads(result)
        except ValueError:
            return result  # not JSON (e.g. CSV or an error message)
        if not isinstance(parsed, dict):
            return result
        filtered = _filter_reports_by_date(parsed, curr_date)
        # Keep the upstream formatting byte-for-byte when nothing was dropped.
        return result if filtered == parsed else json.dumps(filtered)

    if not isinstance(result, dict):
        return result

    # ISO dates order correctly under plain string comparison.
    cutoff = str(curr_date)

    def _is_visible(report) -> bool:
        ending = report.get("fiscalDateEnding") if isinstance(report, dict) else None
        # Entries without a usable date are kept, matching the previous
        # behaviour for a missing key; a None value no longer raises.
        return not ending or str(ending) <= cutoff

    filtered = dict(result)  # shallow copy so the caller's object is untouched
    for key in ("annualReports", "quarterlyReports"):
        reports = result.get(key)
        if isinstance(reports, (list, tuple)):
            filtered[key] = [r for r in reports if _is_visible(r)]
    return filtered
def get_fundamentals(ticker: str, curr_date: str = None) -> str: def get_fundamentals(ticker: str, curr_date: str = None) -> str:
""" """
Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage. Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage.
@ -19,59 +36,20 @@ def get_fundamentals(ticker: str, curr_date: str = None) -> str:
return _make_api_request("OVERVIEW", params) return _make_api_request("OVERVIEW", params)
def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None):
""" """Retrieve balance sheet data for a given ticker symbol using Alpha Vantage."""
Retrieve balance sheet data for a given ticker symbol using Alpha Vantage. result = _make_api_request("BALANCE_SHEET", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Balance sheet data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("BALANCE_SHEET", params)
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None):
""" """Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage."""
Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage. result = _make_api_request("CASH_FLOW", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Cash flow statement data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("CASH_FLOW", params)
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str: def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None):
""" """Retrieve income statement data for a given ticker symbol using Alpha Vantage."""
Retrieve income statement data for a given ticker symbol using Alpha Vantage. result = _make_api_request("INCOME_STATEMENT", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Income statement data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("INCOME_STATEMENT", params)

View File

@ -44,6 +44,64 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
return data return data
def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
    """Fetch OHLCV data with caching, filtered to prevent look-ahead bias.

    Downloads 15 years of data ending today and stores it as a CSV under
    ``config["data_cache_dir"]``. The cache file name embeds the start and
    end dates, so a cached file is reused by later calls made on the same
    calendar day. Rows dated after ``curr_date`` are removed so backtests
    never see future prices.

    Args:
        symbol: Ticker symbol passed to ``yf.download``.
        curr_date: Simulation date; parsed with ``pd.to_datetime``.

    Returns:
        The cleaned frame restricted to rows with ``Date <= curr_date``.
    """
    config = get_config()
    curr_date_dt = pd.to_datetime(curr_date)

    # Fixed 15-year window ending today. This matches the documented contract
    # and the window used by the code this helper replaces; a shorter window
    # would starve long-lookback indicators of history.
    today_date = pd.Timestamp.today()
    start_date = today_date - pd.DateOffset(years=15)
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = today_date.strftime("%Y-%m-%d")

    os.makedirs(config["data_cache_dir"], exist_ok=True)
    data_file = os.path.join(
        config["data_cache_dir"],
        f"{symbol}-YFin-data-{start_str}-{end_str}.csv",
    )

    if os.path.exists(data_file):
        data = pd.read_csv(data_file, on_bad_lines="skip")
    else:
        data = yf_retry(lambda: yf.download(
            symbol,
            start=start_str,
            end=end_str,
            multi_level_index=False,
            progress=False,
            auto_adjust=True,
        ))
        data = data.reset_index()
        # Do not persist an empty result (e.g. a failed download); otherwise
        # every later call today would reuse the empty cache file.
        if not data.empty:
            data.to_csv(data_file, index=False)

    data = _clean_dataframe(data)

    # Filter to curr_date to prevent look-ahead bias in backtesting.
    data = data[data["Date"] <= curr_date_dt]

    return data
def filter_financials_by_date(data: pd.DataFrame, curr_date: str) -> pd.DataFrame:
    """Keep only the statement columns dated on or before ``curr_date``.

    The column labels are parsed as timestamps (fiscal period end dates in
    yfinance financial statements). Columns later than ``curr_date`` would
    leak future information into a backtest and are dropped. Labels that
    cannot be parsed as dates become ``NaT`` and are dropped as well.

    Args:
        data: Financial statement frame with one column per period.
        curr_date: Cut-off date; a falsy value disables filtering.

    Returns:
        ``data`` itself when ``curr_date`` is falsy or the frame is empty,
        otherwise the column subset up to and including the cut-off.
    """
    if curr_date and not data.empty:
        latest_allowed = pd.Timestamp(curr_date)
        column_dates = pd.to_datetime(data.columns, errors="coerce")
        return data.loc[:, column_dates <= latest_allowed]
    return data
class StockstatsUtils: class StockstatsUtils:
@staticmethod @staticmethod
def get_stock_stats( def get_stock_stats(
@ -55,42 +113,10 @@ class StockstatsUtils:
str, "curr date for retrieving stock price data, YYYY-mm-dd" str, "curr date for retrieving stock price data, YYYY-mm-dd"
], ],
): ):
config = get_config() data = load_ohlcv(symbol, curr_date)
today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date
start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
# Ensure cache directory exists
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_date_str,
end=end_date_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
df = wrap(data) df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
curr_date_str = curr_date_dt.strftime("%Y-%m-%d") curr_date_str = pd.to_datetime(curr_date).strftime("%Y-%m-%d")
df[indicator] # trigger stockstats to calculate the indicator df[indicator] # trigger stockstats to calculate the indicator
matching_rows = df[df["Date"].str.startswith(curr_date_str)] matching_rows = df[df["Date"].str.startswith(curr_date_str)]

View File

@ -3,7 +3,7 @@ from datetime import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
import yfinance as yf import yfinance as yf
import os import os
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
def get_YFin_data_online( def get_YFin_data_online(
symbol: Annotated[str, "ticker symbol of the company"], symbol: Annotated[str, "ticker symbol of the company"],
@ -194,58 +194,9 @@ def _get_stock_stats_bulk(
Fetches data once and calculates indicator for all available dates. Fetches data once and calculates indicator for all available dates.
Returns dict mapping date strings to indicator values. Returns dict mapping date strings to indicator values.
""" """
from .config import get_config
import pandas as pd
from stockstats import wrap from stockstats import wrap
import os
config = get_config()
online = config["data_vendors"]["technical_indicators"] != "local"
if not online:
# Local data path
try:
data = pd.read_csv(
os.path.join(
config.get("data_cache_dir", "data"),
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
),
on_bad_lines="skip",
)
except FileNotFoundError:
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
else:
# Online data fetching with caching
today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date data = load_ohlcv(symbol, curr_date)
start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_date_str,
end=end_date_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
df = wrap(data) df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
@ -353,7 +304,7 @@ def get_fundamentals(
def get_balance_sheet( def get_balance_sheet(
ticker: Annotated[str, "ticker symbol of the company"], ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
): ):
"""Get balance sheet data from yfinance.""" """Get balance sheet data from yfinance."""
try: try:
@ -363,7 +314,9 @@ def get_balance_sheet(
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet) data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
else: else:
data = yf_retry(lambda: ticker_obj.balance_sheet) data = yf_retry(lambda: ticker_obj.balance_sheet)
data = filter_financials_by_date(data, curr_date)
if data.empty: if data.empty:
return f"No balance sheet data found for symbol '{ticker}'" return f"No balance sheet data found for symbol '{ticker}'"
@ -383,7 +336,7 @@ def get_balance_sheet(
def get_cashflow( def get_cashflow(
ticker: Annotated[str, "ticker symbol of the company"], ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
): ):
"""Get cash flow data from yfinance.""" """Get cash flow data from yfinance."""
try: try:
@ -393,7 +346,9 @@ def get_cashflow(
data = yf_retry(lambda: ticker_obj.quarterly_cashflow) data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
else: else:
data = yf_retry(lambda: ticker_obj.cashflow) data = yf_retry(lambda: ticker_obj.cashflow)
data = filter_financials_by_date(data, curr_date)
if data.empty: if data.empty:
return f"No cash flow data found for symbol '{ticker}'" return f"No cash flow data found for symbol '{ticker}'"
@ -413,7 +368,7 @@ def get_cashflow(
def get_income_statement( def get_income_statement(
ticker: Annotated[str, "ticker symbol of the company"], ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly", freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
): ):
"""Get income statement data from yfinance.""" """Get income statement data from yfinance."""
try: try:
@ -423,7 +378,9 @@ def get_income_statement(
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt) data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
else: else:
data = yf_retry(lambda: ticker_obj.income_stmt) data = yf_retry(lambda: ticker_obj.income_stmt)
data = filter_financials_by_date(data, curr_date)
if data.empty: if data.empty:
return f"No income statement data found for symbol '{ticker}'" return f"No income statement data found for symbol '{ticker}'"

View File

@ -167,6 +167,11 @@ def get_global_news_yfinance(
# Handle both flat and nested structures # Handle both flat and nested structures
if "content" in article: if "content" in article:
data = _extract_article_data(article) data = _extract_article_data(article)
# Skip articles published after curr_date (look-ahead guard)
if data.get("pub_date"):
pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"]
if pub_naive > curr_dt + relativedelta(days=1):
continue
title = data["title"] title = data["title"]
publisher = data["publisher"] publisher = data["publisher"]
link = data["link"] link = data["link"]