fix: prevent look-ahead bias in backtesting data fetchers
Closes #203

Data fetch functions were using today's date as the end bound instead of the simulation's curr_date, allowing future data to leak into backtests.

Changes:
- stockstats_utils.py: replace pd.Timestamp.today() with curr_date + 1 day as the end bound (the 15-year start bound is now derived from curr_date as well)
- y_finance.py _get_stock_stats_bulk: same fix for the yfinance OHLCV download
- y_finance.py get_balance_sheet/get_cashflow/get_income_statement: filter out fiscal periods whose column timestamp exceeds curr_date
- alpha_vantage_fundamentals.py get_balance_sheet/get_cashflow/get_income_statement: filter annualReports and quarterlyReports by fiscalDateEnding <= curr_date
- yfinance_news.py get_global_news_yfinance: skip articles whose pub_date is later than curr_date + 1 day

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
589b351f2a
commit
abd13c0153
|
|
@ -26,7 +26,7 @@ def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = Non
|
||||||
Args:
|
Args:
|
||||||
ticker (str): Ticker symbol of the company
|
ticker (str): Ticker symbol of the company
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
curr_date (str): Current date you are trading at, yyyy-mm-dd
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Balance sheet data with normalized fields
|
str: Balance sheet data with normalized fields
|
||||||
|
|
@ -35,7 +35,14 @@ def get_balance_sheet(ticker: str, freq: str = "quarterly", curr_date: str = Non
|
||||||
"symbol": ticker,
|
"symbol": ticker,
|
||||||
}
|
}
|
||||||
|
|
||||||
return _make_api_request("BALANCE_SHEET", params)
|
result = _make_api_request("BALANCE_SHEET", params)
|
||||||
|
# Filter out reports whose fiscalDateEnding is after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and isinstance(result, dict):
|
||||||
|
for key in ("annualReports", "quarterlyReports"):
|
||||||
|
if key in result:
|
||||||
|
result[key] = [r for r in result[key]
|
||||||
|
if r.get("fiscalDateEnding", "") <= curr_date]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
||||||
|
|
@ -45,7 +52,7 @@ def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) ->
|
||||||
Args:
|
Args:
|
||||||
ticker (str): Ticker symbol of the company
|
ticker (str): Ticker symbol of the company
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
curr_date (str): Current date you are trading at, yyyy-mm-dd
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Cash flow statement data with normalized fields
|
str: Cash flow statement data with normalized fields
|
||||||
|
|
@ -54,7 +61,14 @@ def get_cashflow(ticker: str, freq: str = "quarterly", curr_date: str = None) ->
|
||||||
"symbol": ticker,
|
"symbol": ticker,
|
||||||
}
|
}
|
||||||
|
|
||||||
return _make_api_request("CASH_FLOW", params)
|
result = _make_api_request("CASH_FLOW", params)
|
||||||
|
# Filter out reports whose fiscalDateEnding is after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and isinstance(result, dict):
|
||||||
|
for key in ("annualReports", "quarterlyReports"):
|
||||||
|
if key in result:
|
||||||
|
result[key] = [r for r in result[key]
|
||||||
|
if r.get("fiscalDateEnding", "") <= curr_date]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str = None) -> str:
|
||||||
|
|
@ -64,7 +78,7 @@ def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str =
|
||||||
Args:
|
Args:
|
||||||
ticker (str): Ticker symbol of the company
|
ticker (str): Ticker symbol of the company
|
||||||
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
freq (str): Reporting frequency: annual/quarterly (default quarterly) - not used for Alpha Vantage
|
||||||
curr_date (str): Current date you are trading at, yyyy-mm-dd (not used for Alpha Vantage)
|
curr_date (str): Current date you are trading at, yyyy-mm-dd
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Income statement data with normalized fields
|
str: Income statement data with normalized fields
|
||||||
|
|
@ -73,5 +87,12 @@ def get_income_statement(ticker: str, freq: str = "quarterly", curr_date: str =
|
||||||
"symbol": ticker,
|
"symbol": ticker,
|
||||||
}
|
}
|
||||||
|
|
||||||
return _make_api_request("INCOME_STATEMENT", params)
|
result = _make_api_request("INCOME_STATEMENT", params)
|
||||||
|
# Filter out reports whose fiscalDateEnding is after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and isinstance(result, dict):
|
||||||
|
for key in ("annualReports", "quarterlyReports"):
|
||||||
|
if key in result:
|
||||||
|
result[key] = [r for r in result[key]
|
||||||
|
if r.get("fiscalDateEnding", "") <= curr_date]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -57,11 +57,12 @@ class StockstatsUtils:
|
||||||
):
|
):
|
||||||
config = get_config()
|
config = get_config()
|
||||||
|
|
||||||
today_date = pd.Timestamp.today()
|
|
||||||
curr_date_dt = pd.to_datetime(curr_date)
|
curr_date_dt = pd.to_datetime(curr_date)
|
||||||
|
|
||||||
end_date = today_date
|
# Cap end_date to curr_date to prevent look-ahead bias in backtesting.
|
||||||
start_date = today_date - pd.DateOffset(years=15)
|
# Using curr_date + 1 day so yfinance includes the simulation date itself.
|
||||||
|
end_date = curr_date_dt + pd.DateOffset(days=1)
|
||||||
|
start_date = curr_date_dt - pd.DateOffset(years=15)
|
||||||
start_date_str = start_date.strftime("%Y-%m-%d")
|
start_date_str = start_date.strftime("%Y-%m-%d")
|
||||||
end_date_str = end_date.strftime("%Y-%m-%d")
|
end_date_str = end_date.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -216,11 +216,11 @@ def _get_stock_stats_bulk(
|
||||||
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
|
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
|
||||||
else:
|
else:
|
||||||
# Online data fetching with caching
|
# Online data fetching with caching
|
||||||
today_date = pd.Timestamp.today()
|
|
||||||
curr_date_dt = pd.to_datetime(curr_date)
|
curr_date_dt = pd.to_datetime(curr_date)
|
||||||
|
|
||||||
end_date = today_date
|
# Cap end_date at curr_date + 1 day (so yfinance includes the simulation date itself) to prevent look-ahead bias in backtesting.
|
||||||
start_date = today_date - pd.DateOffset(years=15)
|
end_date = curr_date_dt + pd.DateOffset(days=1)
|
||||||
|
start_date = curr_date_dt - pd.DateOffset(years=15)
|
||||||
start_date_str = start_date.strftime("%Y-%m-%d")
|
start_date_str = start_date.strftime("%Y-%m-%d")
|
||||||
end_date_str = end_date.strftime("%Y-%m-%d")
|
end_date_str = end_date.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
@ -353,7 +353,7 @@ def get_fundamentals(
|
||||||
def get_balance_sheet(
|
def get_balance_sheet(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format for look-ahead bias prevention"] = None
|
||||||
):
|
):
|
||||||
"""Get balance sheet data from yfinance."""
|
"""Get balance sheet data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -363,9 +363,16 @@ def get_balance_sheet(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
|
data = yf_retry(lambda: ticker_obj.quarterly_balance_sheet)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.balance_sheet)
|
data = yf_retry(lambda: ticker_obj.balance_sheet)
|
||||||
|
|
||||||
|
# Filter out fiscal periods after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and not data.empty:
|
||||||
|
cutoff = pd.Timestamp(curr_date)
|
||||||
|
data = data.loc[:, [c for c in data.columns if pd.Timestamp(c) <= cutoff]]
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No balance sheet data found for symbol '{ticker}'"
|
return f"No balance sheet data found for symbol '{ticker}'" + (
|
||||||
|
f" on or before {curr_date}" if curr_date else ""
|
||||||
|
)
|
||||||
|
|
||||||
# Convert to CSV string for consistency with other functions
|
# Convert to CSV string for consistency with other functions
|
||||||
csv_string = data.to_csv()
|
csv_string = data.to_csv()
|
||||||
|
|
@ -383,7 +390,7 @@ def get_balance_sheet(
|
||||||
def get_cashflow(
|
def get_cashflow(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format for look-ahead bias prevention"] = None
|
||||||
):
|
):
|
||||||
"""Get cash flow data from yfinance."""
|
"""Get cash flow data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -393,9 +400,16 @@ def get_cashflow(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
|
data = yf_retry(lambda: ticker_obj.quarterly_cashflow)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.cashflow)
|
data = yf_retry(lambda: ticker_obj.cashflow)
|
||||||
|
|
||||||
|
# Filter out fiscal periods after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and not data.empty:
|
||||||
|
cutoff = pd.Timestamp(curr_date)
|
||||||
|
data = data.loc[:, [c for c in data.columns if pd.Timestamp(c) <= cutoff]]
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No cash flow data found for symbol '{ticker}'"
|
return f"No cash flow data found for symbol '{ticker}'" + (
|
||||||
|
f" on or before {curr_date}" if curr_date else ""
|
||||||
|
)
|
||||||
|
|
||||||
# Convert to CSV string for consistency with other functions
|
# Convert to CSV string for consistency with other functions
|
||||||
csv_string = data.to_csv()
|
csv_string = data.to_csv()
|
||||||
|
|
@ -413,7 +427,7 @@ def get_cashflow(
|
||||||
def get_income_statement(
|
def get_income_statement(
|
||||||
ticker: Annotated[str, "ticker symbol of the company"],
|
ticker: Annotated[str, "ticker symbol of the company"],
|
||||||
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
freq: Annotated[str, "frequency of data: 'annual' or 'quarterly'"] = "quarterly",
|
||||||
curr_date: Annotated[str, "current date (not used for yfinance)"] = None
|
curr_date: Annotated[str, "current date in YYYY-MM-DD format for look-ahead bias prevention"] = None
|
||||||
):
|
):
|
||||||
"""Get income statement data from yfinance."""
|
"""Get income statement data from yfinance."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -423,9 +437,16 @@ def get_income_statement(
|
||||||
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
|
data = yf_retry(lambda: ticker_obj.quarterly_income_stmt)
|
||||||
else:
|
else:
|
||||||
data = yf_retry(lambda: ticker_obj.income_stmt)
|
data = yf_retry(lambda: ticker_obj.income_stmt)
|
||||||
|
|
||||||
|
# Filter out fiscal periods after curr_date to prevent look-ahead bias.
|
||||||
|
if curr_date and not data.empty:
|
||||||
|
cutoff = pd.Timestamp(curr_date)
|
||||||
|
data = data.loc[:, [c for c in data.columns if pd.Timestamp(c) <= cutoff]]
|
||||||
|
|
||||||
if data.empty:
|
if data.empty:
|
||||||
return f"No income statement data found for symbol '{ticker}'"
|
return f"No income statement data found for symbol '{ticker}'" + (
|
||||||
|
f" on or before {curr_date}" if curr_date else ""
|
||||||
|
)
|
||||||
|
|
||||||
# Convert to CSV string for consistency with other functions
|
# Convert to CSV string for consistency with other functions
|
||||||
csv_string = data.to_csv()
|
csv_string = data.to_csv()
|
||||||
|
|
|
||||||
|
|
@ -167,6 +167,11 @@ def get_global_news_yfinance(
|
||||||
# Handle both flat and nested structures
|
# Handle both flat and nested structures
|
||||||
if "content" in article:
|
if "content" in article:
|
||||||
data = _extract_article_data(article)
|
data = _extract_article_data(article)
|
||||||
|
# Skip articles published after curr_date (look-ahead guard)
|
||||||
|
if data["pub_date"]:
|
||||||
|
pub_naive = data["pub_date"].replace(tzinfo=None)
|
||||||
|
if pub_naive > curr_dt + relativedelta(days=1):
|
||||||
|
continue
|
||||||
title = data["title"]
|
title = data["title"]
|
||||||
publisher = data["publisher"]
|
publisher = data["publisher"]
|
||||||
link = data["link"]
|
link = data["link"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue