fix: harden stock data parsing against malformed CSV and NaN values
Add _clean_dataframe() to normalize stock DataFrames before stockstats: coerce invalid dates/prices, drop rows missing Close, fill price gaps. Also add on_bad_lines="skip" to all cached CSV reads.
This commit is contained in:
parent
9cc283ac22
commit
b0f9d180f9
|
|
@ -6,6 +6,19 @@ import os
|
||||||
from .config import get_config
|
from .config import get_config
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
"""Normalize a stock DataFrame for stockstats: parse dates, drop invalid rows, fill price gaps."""
|
||||||
|
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
|
||||||
|
data = data.dropna(subset=["Date"])
|
||||||
|
|
||||||
|
price_cols = [c for c in ["Open", "High", "Low", "Close", "Volume"] if c in data.columns]
|
||||||
|
data[price_cols] = data[price_cols].apply(pd.to_numeric, errors="coerce")
|
||||||
|
data = data.dropna(subset=["Close"])
|
||||||
|
data[price_cols] = data[price_cols].ffill().bfill()
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
class StockstatsUtils:
|
class StockstatsUtils:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_stock_stats(
|
def get_stock_stats(
|
||||||
|
|
@ -36,8 +49,7 @@ class StockstatsUtils:
|
||||||
)
|
)
|
||||||
|
|
||||||
if os.path.exists(data_file):
|
if os.path.exists(data_file):
|
||||||
data = pd.read_csv(data_file)
|
data = pd.read_csv(data_file, on_bad_lines="skip")
|
||||||
data["Date"] = pd.to_datetime(data["Date"])
|
|
||||||
else:
|
else:
|
||||||
data = yf.download(
|
data = yf.download(
|
||||||
symbol,
|
symbol,
|
||||||
|
|
@ -50,6 +62,7 @@ class StockstatsUtils:
|
||||||
data = data.reset_index()
|
data = data.reset_index()
|
||||||
data.to_csv(data_file, index=False)
|
data.to_csv(data_file, index=False)
|
||||||
|
|
||||||
|
data = _clean_dataframe(data)
|
||||||
df = wrap(data)
|
df = wrap(data)
|
||||||
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
||||||
curr_date_str = curr_date_dt.strftime("%Y-%m-%d")
|
curr_date_str = curr_date_dt.strftime("%Y-%m-%d")
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from datetime import datetime
|
||||||
from dateutil.relativedelta import relativedelta
|
from dateutil.relativedelta import relativedelta
|
||||||
import yfinance as yf
|
import yfinance as yf
|
||||||
import os
|
import os
|
||||||
from .stockstats_utils import StockstatsUtils
|
from .stockstats_utils import StockstatsUtils, _clean_dataframe
|
||||||
|
|
||||||
def get_YFin_data_online(
|
def get_YFin_data_online(
|
||||||
symbol: Annotated[str, "ticker symbol of the company"],
|
symbol: Annotated[str, "ticker symbol of the company"],
|
||||||
|
|
@ -209,9 +209,9 @@ def _get_stock_stats_bulk(
|
||||||
os.path.join(
|
os.path.join(
|
||||||
config.get("data_cache_dir", "data"),
|
config.get("data_cache_dir", "data"),
|
||||||
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
|
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
|
||||||
|
),
|
||||||
|
on_bad_lines="skip",
|
||||||
)
|
)
|
||||||
)
|
|
||||||
df = wrap(data)
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
|
raise Exception("Stockstats fail: Yahoo Finance data not fetched yet!")
|
||||||
else:
|
else:
|
||||||
|
|
@ -232,8 +232,7 @@ def _get_stock_stats_bulk(
|
||||||
)
|
)
|
||||||
|
|
||||||
if os.path.exists(data_file):
|
if os.path.exists(data_file):
|
||||||
data = pd.read_csv(data_file)
|
data = pd.read_csv(data_file, on_bad_lines="skip")
|
||||||
data["Date"] = pd.to_datetime(data["Date"])
|
|
||||||
else:
|
else:
|
||||||
data = yf.download(
|
data = yf.download(
|
||||||
symbol,
|
symbol,
|
||||||
|
|
@ -246,6 +245,7 @@ def _get_stock_stats_bulk(
|
||||||
data = data.reset_index()
|
data = data.reset_index()
|
||||||
data.to_csv(data_file, index=False)
|
data.to_csv(data_file, index=False)
|
||||||
|
|
||||||
|
data = _clean_dataframe(data)
|
||||||
df = wrap(data)
|
df = wrap(data)
|
||||||
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue