fix: harden stock data parsing against malformed CSV and NaN values

Add _clean_dataframe() to normalize stock DataFrames before stockstats:
coerce invalid dates/prices, drop rows missing Close, fill price gaps.
Also add on_bad_lines="skip" to all cached CSV reads.
This commit is contained in:
Yijia-Xiao 2026-03-15 18:29:43 +00:00
parent 9cc283ac22
commit b0f9d180f9
2 changed files with 27 additions and 14 deletions

View File

@ -6,6 +6,19 @@ import os
from .config import get_config from .config import get_config
def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
    """Normalize a stock DataFrame for stockstats without mutating the input.

    Steps, in order:
      1. Parse ``Date`` to datetimes; unparseable values become NaT and
         those rows are dropped.
      2. Coerce whichever of Open/High/Low/Close/Volume are present to
         numeric; unparseable values become NaN.
      3. Drop rows whose ``Close`` is NaN.
      4. Fill the remaining gaps in the price/volume columns with a forward
         fill followed by a backward fill.

    Args:
        data: Frame with at least ``Date`` and ``Close`` columns.

    Returns:
        A new, cleaned DataFrame. The original row index labels of the
        surviving rows are kept (the index is not reset).

    Raises:
        KeyError: If the ``Date`` or ``Close`` column is missing.
    """
    # Work on a copy so the caller's frame is never modified in place.
    cleaned = data.copy()

    cleaned["Date"] = pd.to_datetime(cleaned["Date"], errors="coerce")
    # Explicit .copy() after each row filter: later column assignment then
    # targets an independent frame rather than a possible view of the parent.
    cleaned = cleaned.dropna(subset=["Date"]).copy()

    price_cols = [
        col
        for col in ("Open", "High", "Low", "Close", "Volume")
        if col in cleaned.columns
    ]
    cleaned[price_cols] = cleaned[price_cols].apply(pd.to_numeric, errors="coerce")

    # Close is the one column that must be real data; never fabricate it.
    cleaned = cleaned.dropna(subset=["Close"]).copy()

    # ffill covers interior/trailing gaps, bfill covers leading gaps.
    cleaned[price_cols] = cleaned[price_cols].ffill().bfill()
    return cleaned
class StockstatsUtils: class StockstatsUtils:
@staticmethod @staticmethod
def get_stock_stats( def get_stock_stats(
@ -36,8 +49,7 @@ class StockstatsUtils:
) )
if os.path.exists(data_file): if os.path.exists(data_file):
data = pd.read_csv(data_file) data = pd.read_csv(data_file, on_bad_lines="skip")
data["Date"] = pd.to_datetime(data["Date"])
else: else:
data = yf.download( data = yf.download(
symbol, symbol,
@ -50,6 +62,7 @@ class StockstatsUtils:
data = data.reset_index() data = data.reset_index()
data.to_csv(data_file, index=False) data.to_csv(data_file, index=False)
data = _clean_dataframe(data)
df = wrap(data) df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
curr_date_str = curr_date_dt.strftime("%Y-%m-%d") curr_date_str = curr_date_dt.strftime("%Y-%m-%d")

View File

@ -3,7 +3,7 @@ from datetime import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
import yfinance as yf import yfinance as yf
import os import os
from .stockstats_utils import StockstatsUtils from .stockstats_utils import StockstatsUtils, _clean_dataframe
def get_YFin_data_online( def get_YFin_data_online(
symbol: Annotated[str, "ticker symbol of the company"], symbol: Annotated[str, "ticker symbol of the company"],
@ -209,31 +209,30 @@ def _get_stock_stats_bulk(
os.path.join( os.path.join(
config.get("data_cache_dir", "data"), config.get("data_cache_dir", "data"),
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv", f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
) ),
on_bad_lines="skip",
) )
df = wrap(data)
except FileNotFoundError: except FileNotFoundError:
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!") raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
else: else:
# Online data fetching with caching # Online data fetching with caching
today_date = pd.Timestamp.today() today_date = pd.Timestamp.today()
curr_date_dt = pd.to_datetime(curr_date) curr_date_dt = pd.to_datetime(curr_date)
end_date = today_date end_date = today_date
start_date = today_date - pd.DateOffset(years=15) start_date = today_date - pd.DateOffset(years=15)
start_date_str = start_date.strftime("%Y-%m-%d") start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d") end_date_str = end_date.strftime("%Y-%m-%d")
os.makedirs(config["data_cache_dir"], exist_ok=True) os.makedirs(config["data_cache_dir"], exist_ok=True)
data_file = os.path.join( data_file = os.path.join(
config["data_cache_dir"], config["data_cache_dir"],
f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv", f"{symbol}-YFin-data-{start_date_str}-{end_date_str}.csv",
) )
if os.path.exists(data_file): if os.path.exists(data_file):
data = pd.read_csv(data_file) data = pd.read_csv(data_file, on_bad_lines="skip")
data["Date"] = pd.to_datetime(data["Date"])
else: else:
data = yf.download( data = yf.download(
symbol, symbol,
@ -245,9 +244,10 @@ def _get_stock_stats_bulk(
) )
data = data.reset_index() data = data.reset_index()
data.to_csv(data_file, index=False) data.to_csv(data_file, index=False)
df = wrap(data) data = _clean_dataframe(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
# Calculate the indicator for all rows at once # Calculate the indicator for all rows at once
df[indicator] # This triggers stockstats to calculate the indicator df[indicator] # This triggers stockstats to calculate the indicator