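Fix the bad data issue: skip malformed CSV rows, coerce and drop unparseable dates, and drop rows with NaN close prices before computing indicators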

This commit is contained in:
samchenku 2025-09-05 16:43:42 -05:00
parent a438acdbbd
commit 204772b736
2 changed files with 30 additions and 7 deletions

View File

@@ -517,7 +517,9 @@ def get_stock_stats_indicators_window(
os.path.join( os.path.join(
DATA_DIR, DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv", f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
) ),
on_bad_lines='skip',
engine='python'
) )
data["Date"] = pd.to_datetime(data["Date"], utc=True) data["Date"] = pd.to_datetime(data["Date"], utc=True)
dates_in_df = data["Date"].astype(str).str[:10] dates_in_df = data["Date"].astype(str).str[:10]
@@ -599,7 +601,9 @@ def get_YFin_data_window(
os.path.join( os.path.join(
DATA_DIR, DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv", f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
) ),
on_bad_lines='skip',
engine='python'
) )
# Extract just the date part for comparison # Extract just the date part for comparison
@@ -677,7 +681,9 @@ def get_YFin_data(
os.path.join( os.path.join(
DATA_DIR, DATA_DIR,
f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv", f"market_data/price_data/{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
) ),
on_bad_lines='skip',
engine='python'
) )
if end_date > "2025-03-25": if end_date > "2025-03-25":

View File

@@ -34,7 +34,9 @@ class StockstatsUtils:
os.path.join( os.path.join(
data_dir, data_dir,
f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv", f"{symbol}-YFin-data-2015-01-01-2025-03-25.csv",
) ),
on_bad_lines='skip',
engine='python'
) )
df = wrap(data) df = wrap(data)
except FileNotFoundError: except FileNotFoundError:
@@ -59,8 +61,11 @@ class StockstatsUtils:
) )
if os.path.exists(data_file): if os.path.exists(data_file):
data = pd.read_csv(data_file) data = pd.read_csv(data_file, on_bad_lines='skip', engine='python')
data["Date"] = pd.to_datetime(data["Date"]) # Handle date parsing with error handling for corrupted dates
data["Date"] = pd.to_datetime(data["Date"], errors='coerce', format='mixed')
# Remove rows with invalid dates
data = data.dropna(subset=['Date'])
else: else:
data = yf.download( data = yf.download(
symbol, symbol,
@@ -77,7 +82,19 @@ class StockstatsUtils:
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
curr_date = curr_date.strftime("%Y-%m-%d") curr_date = curr_date.strftime("%Y-%m-%d")
df[indicator] # trigger stockstats to calculate the indicator # Clean data before calculating indicator to avoid NaN masking errors
df = df.dropna(subset=['close']) # Remove rows with NaN close prices
try:
df[indicator] # trigger stockstats to calculate the indicator
except Exception as e:
if "Cannot mask with non-boolean array containing NA / NaN values" in str(e):
# Additional cleanup for stubborn NaN values
df = df.fillna(method='ffill').fillna(method='bfill')
df[indicator] # retry calculation
else:
raise e
matching_rows = df[df["Date"].str.startswith(curr_date)] matching_rows = df[df["Date"].str.startswith(curr_date)]
if not matching_rows.empty: if not matching_rows.empty: