fix: prevent look-ahead bias in backtesting data fetchers (#475)

This commit is contained in:
Yijia-Xiao 2026-03-29 17:34:35 +00:00
parent 589b351f2a
commit e1113880a1
No known key found for this signature in database
4 changed files with 108 additions and 142 deletions

View File

@ -1,6 +1,23 @@
from .alpha_vantage_common import _make_api_request
def _filter_reports_by_date(result, curr_date: str):
"""Filter annualReports/quarterlyReports to exclude entries after curr_date.
Prevents look-ahead bias by removing fiscal periods that end after
the simulation's current date.
"""
if not curr_date or not isinstance(result, dict):
return result
for key in ("annualReports", "quarterlyReports"):
if key in result:
result[key] = [
r for r in result[key]
if r.get("fiscalDateEnding", "") <= curr_date
]
return result
def get_fundamentals(ticker: str, curr_date: str = None) -> str:
"""
Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage.
@ -19,59 +36,20 @@ def get_fundamentals(ticker: str, curr_date: str = None) -> str:
return _make_api_request("OVERVIEW", params)
def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
"""
Retrieve balance sheet data for a given ticker symbol using Alpha Vantage.
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Balance sheet data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("BALANCE_SHEET", params)
def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None):
"""Retrieve balance sheet data for a given ticker symbol using Alpha Vantage."""
result = _make_api_request("BALANCE_SHEET", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
"""
Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage.
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Cash flow statement data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("CASH_FLOW", params)
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None):
"""Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage."""
result = _make_api_request("CASH_FLOW", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
"""
Retrieve income statement data for a given ticker symbol using Alpha Vantage.
Args:
ticker (str): Ticker symbol of the company
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
Returns:
str: Income statement data with normalized fields
"""
params = {
"symbol": ticker,
}
return _make_api_request("INCOME_STATEMENT", params)
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None):
"""Retrieve income statement data for a given ticker symbol using Alpha Vantage."""
result = _make_api_request("INCOME_STATEMENT", {"symbol": ticker})
return _filter_reports_by_date(result, curr_date)

View File

@ -44,6 +44,64 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
return data
def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
"""Fetch OHLCV data with caching, filtered to prevent look-ahead bias.
Downloads 15 years of data up to today and caches per symbol. On
subsequent calls the cache is reused. Rows after curr_date are
filtered out so backtests never see future prices.
"""
config = get_config()
curr_date_dt = pd.to_datetime(curr_date)
# Cache uses a fixed window (15y to today) so one file per symbol
today_date = pd.Timestamp.today()
start_date = today_date - pd.DateOffset(years=5)
start_str = start_date.strftime("%Y-%m-%d")
end_str = today_date.strftime("%Y-%m-%d")
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_str}-{end_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_str,
end=end_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
# Filter to curr_date to prevent look-ahead bias in backtesting
data = data[data["Date"] <= curr_date_dt]
return data
def filter_financials_by_date(data: pd.DataFrame, curr_date: str) -> pd.DataFrame:
"""Drop financial statement columns (fiscal period timestamps) after curr_date.
yfinance financial statements use fiscal period end dates as columns.
Columns after curr_date represent future data and are removed to
prevent look-ahead bias.
"""
if not curr_date or data.empty:
return data
cutoff = pd.Timestamp(curr_date)
mask = pd.to_datetime(data.columns, errors="coerce") <= cutoff
return data.loc[:, mask]
class StockstatsUtils:
@staticmethod
def get_stock_stats(
@ -55,42 +113,10 @@ class StockstatsUtils:
str, "curr date for retrieving stock price data, YYYY-mm-dd"
],
):
config = get_config()
today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date
start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
# Ensure cache directory exists
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_date_str,
end=end_date_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
data = load_ohlcv(symbol, curr_date)
df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
curr_date_str = curr_date_dt.strftime("%Y-%m-%d")
curr_date_str = pd.to_datetime(curr_date).strftime("%Y-%m-%d")
df[indicator] # trigger stockstats to calculate the indicator
matching_rows = df[df["Date"].str.startswith(curr_date_str)]

View File

@ -3,7 +3,7 @@ from datetime import datetime
from dateutil.relativedelta import relativedelta
import yfinance as yf
import os
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
def get_YFin_data_online(
symbol: Annotated[str, "ticker symbol of the company"],
@ -194,58 +194,9 @@ def _get_stock_stats_bulk(
Fetches data once and calculates indicator for all available dates.
Returns dict mapping date strings to indicator values.
"""
from .config import get_config
import pandas as pd
from stockstats import wrap
import os
config = get_config()
online = config["data_vendors"]["technical_indicators"] != "local"
if not online:
# Local data path
try:
data = pd.read_csv(
os.path.join(
config.get("data_cache_dir", "data"),
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
),
on_bad_lines="skip",
)
except FileNotFoundError:
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
else:
# Online data fetching with caching
today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date
start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join(
config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
)
if os.path.exists(data_file):
data = pd.read_csv(data_file, on_bad_lines="skip")
else:
data = yf_retry(lambda: yf.download(
symbol,
start=start_date_str,
end=end_date_str,
multi_level_index=False,
progress=False,
auto_adjust=True,
))
data = data.reset_index()
data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
data = load_ohlcv(symbol, curr_date)
df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
@ -353,7 +304,7 @@ def get_fundamentals(
def get_balance_sheet(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get balance sheet data from yfinance."""
try:
@ -363,7 +314,9 @@ def get_balance_sheet(
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
else:
data = yf_retry(lambda: ticker_obj.balance_sheet)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No balance sheet data found for symbol '{ticker}'"
@ -383,7 +336,7 @@ def get_balance_sheet(
def get_cashflow(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get cash flow data from yfinance."""
try:
@ -393,7 +346,9 @@ def get_cashflow(
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
else:
data = yf_retry(lambda: ticker_obj.cashflow)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No cash flow data found for symbol '{ticker}'"
@ -413,7 +368,7 @@ def get_cashflow(
def get_income_statement(
ticker: Annotated[str, "ticker symbol of the company"],
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
):
"""Get income statement data from yfinance."""
try:
@ -423,7 +378,9 @@ def get_income_statement(
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
else:
data = yf_retry(lambda: ticker_obj.income_stmt)
data = filter_financials_by_date(data, curr_date)
if data.empty:
return f"No income statement data found for symbol '{ticker}'"

View File

@ -167,6 +167,11 @@ def get_global_news_yfinance(
# Handle both flat and nested structures
if "content" in article:
data = _extract_article_data(article)
# Skip articles published after curr_date (look-ahead guard)
if data.get("pub_date"):
pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"]
if pub_naive > curr_dt + relativedelta(days=1):
continue
title = data["title"]
publisher = data["publisher"]
link = data["link"]