fix: prevent look-ahead bias in backtesting data fetchers (#475)
This commit is contained in:
parent
589b351f2a
commit
e1113880a1
|
|
@ -1,6 +1,23 @@
|
||||||
from .alpha_vantage_common import _make_api_request
|
from .alpha_vantage_common import _make_api_request
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_reports_by_date(result, curr_date: str):
|
||||||
|
"""Filter annualReports/quarterlyReports to exclude entries after curr_date.
|
||||||
|
|
||||||
|
Prevents look-ahead bias by removing fiscal periods that end after
|
||||||
|
the simulation's current date.
|
||||||
|
"""
|
||||||
|
if not curr_date or not isinstance(result, dict):
|
||||||
|
return result
|
||||||
|
for key in ("annualReports", "quarterlyReports"):
|
||||||
|
if key in result:
|
||||||
|
result[key] = [
|
||||||
|
r for r in result[key]
|
||||||
|
if r.get("fiscalDateEnding", "") <= curr_date
|
||||||
|
]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_fundamentals(ticker: str, curr_date: str = None) -> str:
|
def get_fundamentals(ticker: str, curr_date: str = None) -> str:
|
||||||
"""
|
"""
|
||||||
Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage.
|
Retrieve comprehensive fundamental data for a given ticker symbol using Alpha Vantage.
|
||||||
|
|
@ -19,59 +36,20 @@ def get_fundamentals(ticker: str, curr_date: str = None) -> str:
|
||||||
return _make_api_request("OVERVIEW", params)
|
return _make_api_request("OVERVIEW", params)
|
||||||
|
|
||||||
|
|
||||||
def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = None):
|
||||||
"""
|
"""Retrieve balance sheet data for a given ticker symbol using Alpha Vantage."""
|
||||||
Retrieve balance sheet data for a given ticker symbol using Alpha Vantage.
|
result = _make_api_request("BALANCE_SHEET", {"symbol": ticker})
|
||||||
|
return _filter_reports_by_date(result, curr_date)
|
||||||
Args:
|
|
||||||
ticker (str): Ticker symbol of the company
|
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: Balance sheet data with normalized fields
|
|
||||||
"""
|
|
||||||
params = {
|
|
||||||
"symbol": ticker,
|
|
||||||
}
|
|
||||||
|
|
||||||
return _make_api_request("BALANCE_SHEET", params)
|
|
||||||
|
|
||||||
|
|
||||||
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None):
|
||||||
"""
|
"""Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage."""
|
||||||
Retrieve cash flow statement data for a given ticker symbol using Alpha Vantage.
|
result = _make_api_request("CASH_FLOW", {"symbol": ticker})
|
||||||
|
return _filter_reports_by_date(result, curr_date)
|
||||||
Args:
|
|
||||||
ticker (str): Ticker symbol of the company
|
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: Cash flow statement data with normalized fields
|
|
||||||
"""
|
|
||||||
params = {
|
|
||||||
"symbol": ticker,
|
|
||||||
}
|
|
||||||
|
|
||||||
return _make_api_request("CASH_FLOW", params)
|
|
||||||
|
|
||||||
|
|
||||||
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None):
|
||||||
"""
|
"""Retrieve income statement data for a given ticker symbol using Alpha Vantage."""
|
||||||
Retrieve income statement data for a given ticker symbol using Alpha Vantage.
|
result = _make_api_request("INCOME_STATEMENT", {"symbol": ticker})
|
||||||
|
return _filter_reports_by_date(result, curr_date)
|
||||||
Args:
|
|
||||||
ticker (str): Ticker symbol of the company
|
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: Income statement data with normalized fields
|
|
||||||
"""
|
|
||||||
params = {
|
|
||||||
"symbol": ticker,
|
|
||||||
}
|
|
||||||
|
|
||||||
return _make_api_request("INCOME_STATEMENT", params)
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,64 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
    """Fetch OHLCV data with caching, filtered to prevent look-ahead bias.

    Downloads 15 years of data ending today and caches it as a CSV under
    ``config["data_cache_dir"]``. The cache file name embeds the window's
    start and end dates, so the file is reused by later calls made on the
    same calendar day. Rows after ``curr_date`` are filtered out so backtests
    never see future prices.

    Args:
        symbol: Ticker symbol passed to ``yf.download``.
        curr_date: Simulation date, in any form ``pd.to_datetime`` accepts.

    Returns:
        The cleaned price frame restricted to rows with ``Date <= curr_date``.
    """
    config = get_config()
    curr_date_dt = pd.to_datetime(curr_date)

    # Fixed 15-year window ending today, as the docstring promises.
    today_date = pd.Timestamp.today()
    start_date = today_date - pd.DateOffset(years=15)
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = today_date.strftime("%Y-%m-%d")

    os.makedirs(config["data_cache_dir"], exist_ok=True)
    data_file = os.path.join(
        config["data_cache_dir"],
        f"{symbol}-YFin-data-{start_str}-{end_str}.csv",
    )

    if os.path.exists(data_file):
        data = pd.read_csv(data_file, on_bad_lines="skip")
    else:
        data = yf_retry(lambda: yf.download(
            symbol,
            start=start_str,
            end=end_str,
            multi_level_index=False,
            progress=False,
            auto_adjust=True,
        ))
        data = data.reset_index()
        # Do not cache an empty download: the file name is date-stamped, so
        # an empty result would be replayed from disk for the rest of the day.
        if not data.empty:
            data.to_csv(data_file, index=False)

    data = _clean_dataframe(data)

    # Filter to curr_date to prevent look-ahead bias in backtesting.
    # NOTE(review): assumes _clean_dataframe leaves "Date" as a datetime
    # column so this comparison is chronological - confirm.
    data = data[data["Date"] <= curr_date_dt]

    return data
|
||||||
|
|
||||||
|
|
||||||
|
def filter_financials_by_date(data: pd.DataFrame, curr_date: str) -> pd.DataFrame:
    """Drop financial statement columns (fiscal period timestamps) after curr_date.

    yfinance financial statements use fiscal period end dates as columns.
    Columns after ``curr_date`` represent future data and are removed to
    prevent look-ahead bias.

    Args:
        data: Statement frame whose column labels are fiscal period end dates.
        curr_date: Cutoff date, in any form ``pd.Timestamp`` accepts. Falsy
            values disable filtering.

    Returns:
        ``data`` itself when ``curr_date`` is falsy or the frame is empty,
        otherwise a frame holding only the columns dated on or before the
        cutoff. Column labels that cannot be parsed as dates are dropped.

    Note:
        The filter works on period *end* dates. A statement is normally
        published some time after its period ends, so this narrows but may
        not fully remove look-ahead - verify against filing dates if exact
        point-in-time data matters.
    """
    if not curr_date or data.empty:
        return data

    cutoff = pd.Timestamp(curr_date)
    if cutoff.tzinfo is not None:
        # Compare everything as naive UTC so aware and naive values can mix.
        cutoff = cutoff.tz_convert("UTC").tz_localize(None)

    # utc=True accepts both naive and timezone-aware labels; stripping the
    # zone afterwards avoids "cannot compare tz-naive and tz-aware" errors.
    period_ends = pd.to_datetime(
        data.columns, errors="coerce", utc=True
    ).tz_localize(None)

    # NaT (unparseable label) compares False and is therefore dropped.
    keep = period_ends <= cutoff
    return data.loc[:, keep]
|
||||||
|
|
||||||
|
|
||||||
class StockstatsUtils:
|
class StockstatsUtils:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_stock_stats(
|
def get_stock_stats(
|
||||||
|
|
@ -55,42 +113,10 @@ class StockstatsUtils:
|
||||||
str, "curr date for retrieving stock price data, YYYY-mm-dd"
|
str, "curr date for retrieving stock price data, YYYY-mm-dd"
|
||||||
],
|
],
|
||||||
):
|
):
|
||||||
config = get_config()
|
data = load_ohlcv(symbol, curr_date)
|
||||||
|
|
||||||
today_date = pd.Timestamp.today()
|
|
||||||
curr_date_dt = pd.to_datetime(curr_date)
|
|
||||||
|
|
||||||
end_date = today_date
|
|
||||||
start_date = today_date - pd.DateOffset(years=15)
|
|
||||||
start_date_str = start_date.strftime("%Y-%m-%d")
|
|
||||||
end_date_str = end_date.strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
# Ensure cache directory exists
|
|
||||||
os.makedirs(config["data_cache_dir"], exist_ok=True)
|
|
||||||
|
|
||||||
data_file = os.path.join(
|
|
||||||
config["data_cache_dir"],
|
|
||||||
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
|
|
||||||
)
|
|
||||||
|
|
||||||
if os.path.exists(data_file):
|
|
||||||
data = pd.read_csv(data_file, on_bad_lines="skip")
|
|
||||||
else:
|
|
||||||
data = yf_retry(lambda: yf.download(
|
|
||||||
symbol,
|
|
||||||
start=start_date_str,
|
|
||||||
end=end_date_str,
|
|
||||||
multi_level_index=False,
|
|
||||||
progress=False,
|
|
||||||
auto_adjust=True,
|
|
||||||
))
|
|
||||||
data = data.reset_index()
|
|
||||||
data.to_csv(data_file, index=False)
|
|
||||||
|
|
||||||
data = _clean_dataframe(data)
|
|
||||||
df = wrap(data)
|
df = wrap(data)
|
||||||
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
||||||
curr_date_str = curr_date_dt.strftime("%Y-%m-%d")
|
curr_date_str = pd.to_datetime(curr_date).strftime("%Y-%m-%d")
|
||||||
|
|
||||||
df[indicator] # trigger stockstats to calculate the indicator
|
df[indicator] # trigger stockstats to calculate the indicator
|
||||||
matching_rows = df[df["Date"].str.startswith(curr_date_str)]
|
matching_rows = df[df["Date"].str.startswith(curr_date_str)]
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from datetime import datetime
|
||||||
from dateutil.relativedelta import relativedelta
|
from dateutil.relativedelta import relativedelta
|
||||||
import yfinance as yf
|
import yfinance as yf
|
||||||
import os
|
import os
|
||||||
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry
|
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
|
||||||
|
|
||||||
def get_YFin_data_online(
|
def get_YFin_data_online(
|
||||||
symbol: Annotated[str, "ticker symbol of the company"],
|
symbol: Annotated[str, "ticker symbol of the company"],
|
||||||
|
|
@ -194,58 +194,9 @@ def _get_stock_stats_bulk(
|
||||||
Fetches data once and calculates indicator for all available dates.
|
Fetches data once and calculates indicator for all available dates.
|
||||||
Returns dict mapping date strings to indicator values.
|
Returns dict mapping date strings to indicator values.
|
||||||
"""
|
"""
|
||||||
from .config import get_config
|
|
||||||
import pandas as pd
|
|
||||||
from stockstats import wrap
|
from stockstats import wrap
|
||||||
import os
|
|
||||||
|
|
||||||
config = get_config()
|
|
||||||
online = config["data_vendors"]["technical_indicators"] != "local"
|
|
||||||
|
|
||||||
if not online:
|
|
||||||
# Local data path
|
|
||||||
try:
|
|
||||||
data = pd.read_csv(
|
|
||||||
os.path.join(
|
|
||||||
config.get("data_cache_dir", "data"),
|
|
||||||
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
|
|
||||||
),
|
|
||||||
on_bad_lines="skip",
|
|
||||||
)
|
|
||||||
except FileNotFoundError:
|
|
||||||
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
|
|
||||||
else:
|
|
||||||
# Online data fetching with caching
|
|
||||||
today_date = pd.Timestamp.today()
|
|
||||||
curr_date_dt = pd.to_datetime(curr_date)
|
|
||||||
|
|
||||||
end_date = today_date
|
data = load_ohlcv(symbol, curr_date)
|
||||||
start_date = today_date - pd.DateOffset(years=15)
|
|
||||||
start_date_str = start_date.strftime("%Y-%m-%d")
|
|
||||||
end_date_str = end_date.strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
os.makedirs(config["data_cache_dir"], exist_ok=True)
|
|
||||||
|
|
||||||
data_file = os.path.join(
|
|
||||||
config["data_cache_dir"],
|
|
||||||
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
|
|
||||||
)
|
|
||||||
|
|
||||||
if os.path.exists(data_file):
|
|
||||||
data = pd.read_csv(data_file, on_bad_lines="skip")
|
|
||||||
else:
|
|
||||||
data = yf_retry(lambda: yf.download(
|
|
||||||
symbol,
|
|
||||||
start=start_date_str,
|
|
||||||
end=end_date_str,
|
|
||||||
multi_level_index=False,
|
|
||||||
progress=False,
|
|
||||||
auto_adjust=True,
|
|
||||||
))
|
|
||||||
data = data.reset_index()
|
|
||||||
data.to_csv(data_file, index=False)
|
|
||||||
|
|
||||||
data = _clean_dataframe(data)
|
|
||||||
df = wrap(data)
|
df = wrap(data)
|
||||||
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
@ -353,7 +304,7 @@ def get_fundamentals(
|
||||||
def get_balance_sheet(
|
def get_balance_sheet(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
|
||||||
):
|
):
|
||||||
"""Get balance sheet data from yfinance."""
|
"""Get balance sheet data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -363,7 +314,9 @@ def get_balance_sheet(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
|
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.balance_sheet)
|
data = yf_retry(lambda: ticker_obj.balance_sheet)
|
||||||
|
|
||||||
|
data = filter_financials_by_date(data, curr_date)
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No balance sheet data found for symbol '{ticker}'"
|
return f"No balance sheet data found for symbol '{ticker}'"
|
||||||
|
|
||||||
|
|
@ -383,7 +336,7 @@ def get_balance_sheet(
|
||||||
def get_cashflow(
|
def get_cashflow(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
|
||||||
):
|
):
|
||||||
"""Get cash flow data from yfinance."""
|
"""Get cash flow data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -393,7 +346,9 @@ def get_cashflow(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
|
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.cashflow)
|
data = yf_retry(lambda: ticker_obj.cashflow)
|
||||||
|
|
||||||
|
data = filter_financials_by_date(data, curr_date)
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No cash flow data found for symbol '{ticker}'"
|
return f"No cash flow data found for symbol '{ticker}'"
|
||||||
|
|
||||||
|
|
@ -413,7 +368,7 @@ def get_cashflow(
|
||||||
def get_income_statement(
|
def get_income_statement(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format"] = None
|
||||||
):
|
):
|
||||||
"""Get income statement data from yfinance."""
|
"""Get income statement data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -423,7 +378,9 @@ def get_income_statement(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
|
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.income_stmt)
|
data = yf_retry(lambda: ticker_obj.income_stmt)
|
||||||
|
|
||||||
|
data = filter_financials_by_date(data, curr_date)
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No income statement data found for symbol '{ticker}'"
|
return f"No income statement data found for symbol '{ticker}'"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -167,6 +167,11 @@ def get_global_news_yfinance(
|
||||||
# Handle both flat and nested structures
|
# Handle both flat and nested structures
|
||||||
if "content" in article:
|
if "content" in article:
|
||||||
data = _extract_article_data(article)
|
data = _extract_article_data(article)
|
||||||
|
# Skip articles published after curr_date (look-ahead guard)
|
||||||
|
if data.get("pub_date"):
|
||||||
|
pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"]
|
||||||
|
if pub_naive > curr_dt + relativedelta(days=1):
|
||||||
|
continue
|
||||||
title = data["title"]
|
title = data["title"]
|
||||||
publisher = data["publisher"]
|
publisher = data["publisher"]
|
||||||
link = data["link"]
|
link = data["link"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue