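Fix the bad data issue: skip malformed CSV rows, coerce and drop unparseable dates, and drop NaN close prices before computing indicators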

This commit is contained in:
samchenku 2025-09-05 16:43:42 -05:00
parent a438acdbbd
commit 204772b736
2 changed files with 30 additions and 7 deletions

View File

@@ -517,7 +517,9 @@ def get_stock_stats_indicators_window(
os.path.join(
DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
)
),
on_bad_lines='skip',
engine='python'
)
data["Date"] = pd.to_datetime(data["Date"], utc=True)
dates_in_df = data["Date"].astype(str).str[:10]
@@ -599,7 +601,9 @@ def get_YFin_data_window(
os.path.join(
DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
)
),
on_bad_lines='skip',
engine='python'
)
# Extract just the date part for comparison
@@ -677,7 +681,9 @@ def get_YFin_data(
os.path.join(
DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
)
),
on_bad_lines='skip',
engine='python'
)
if end_date > "2025-03-25":

View File

@@ -34,7 +34,9 @@ class StockstatsUtils:
os.path.join(
data_dir,
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
)
),
on_bad_lines='skip',
engine='python'
)
df = wrap(data)
except FileNotFoundError:
@@ -59,8 +61,11 @@ class StockstatsUtils:
)
if os.path.exists(data_file):
data = pd.read_csv(data_file)
data["Date"] = pd.to_datetime(data["Date"])
data = pd.read_csv(data_file, on_bad_lines='skip', engine='python')
# Handle date parsing with error handling for corrupted dates
data["Date"] = pd.to_datetime(data["Date"], errors='coerce', format='mixed')
# Remove rows with invalid dates
data = data.dropna(subset=['Date'])
else:
data = yf.download(
symbol,
@@ -77,7 +82,19 @@ class StockstatsUtils:
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
curr_date = curr_date.strftime("%Y-%m-%d")
df[indicator] # trigger stockstats to calculate the indicator
# Clean data before calculating indicator to avoid NaN masking errors
df = df.dropna(subset=['close']) # Remove rows with NaN close prices
try:
df[indicator] # trigger stockstats to calculate the indicator
except Exception as e:
if "Cannot mask with non-boolean array containing NA / NaN values" in str(e):
# Additional cleanup for stubborn NaN values
df = df.fillna(method='ffill').fillna(method='bfill')
df[indicator] # retry calculation
else:
raise e
matching_rows = df[df["Date"].str.startswith(curr_date)]
if not matching_rows.empty: