fix(cache): track ohlcv_cache.py and data_cache files in git

These were untracked so CI couldn't import tradingagents.dataflows.data_cache.ohlcv_cache.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-14 16:40:40 -07:00
parent 893eb05349
commit 8050d6764f
2 changed files with 239736 additions and 0 deletions

View File

@ -0,0 +1,238 @@
"""OHLCV disk cache with incremental daily updates.
On the first call for a given ticker universe, downloads the full requested
period and saves it as a parquet file. On subsequent calls during the same
day the cache is served from disk (zero network). On the next calendar day
only the missing trading days are fetched and appended, keeping total download
volume proportional to *time elapsed* rather than universe size.
Cache files live in data/ohlcv_cache/ (configurable):
{md5(sorted_tickers)[:12]}.parquet long-format OHLCV rows
{md5(sorted_tickers)[:12]}.meta.json {last_updated, tickers, period}
"""
import hashlib
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
import yfinance as yf
from tradingagents.utils.logger import get_logger
logger = get_logger(__name__)
DEFAULT_CACHE_DIR = "data/ohlcv_cache"
# Keep at most this many rows per ticker to prevent unbounded growth (~2 years)
MAX_ROWS_PER_TICKER = 504
# Approximate trading days for each yfinance period string (used to slice
# a larger cached history down to the requested period on return).
PERIOD_TO_DAYS: Dict[str, Optional[int]] = {
"1d": 1,
"5d": 5,
"1mo": 21,
"3mo": 63,
"6mo": 130,
"1y": 252,
"2y": 504,
"5y": 1260,
"max": None, # return everything
}
# ──────────────────────────────────────────────────────────────────────────────
# Internal helpers
# ──────────────────────────────────────────────────────────────────────────────
def _cache_key(tickers: List[str]) -> str:
"""Stable 12-char hash of the (sorted, uppercase) ticker set."""
canonical = ",".join(sorted(set(t.upper() for t in tickers)))
return hashlib.md5(canonical.encode()).hexdigest()[:12]
def _paths(cache_dir: str, key: str):
base = Path(cache_dir)
return base / f"{key}.parquet", base / f"{key}.meta.json"
def _read_meta(meta_path: Path) -> Optional[dict]:
try:
with open(meta_path) as f:
return json.load(f)
except Exception:
return None
def _write_meta(meta_path: Path, data: dict) -> None:
meta_path.parent.mkdir(parents=True, exist_ok=True)
with open(meta_path, "w") as f:
json.dump(data, f)
def _batch_download(tickers: List[str], **kwargs) -> pd.DataFrame:
"""Download via yf.download and return in long format.
Long format columns: Date, Ticker, Open, High, Low, Close, Volume
Date is stored as tz-naive datetime (normalised to midnight).
"""
try:
raw = yf.download(tickers, auto_adjust=True, progress=False, **kwargs)
except Exception as e:
logger.warning(f"yfinance download failed: {e}")
return pd.DataFrame()
if raw is None or raw.empty:
return pd.DataFrame()
rows = []
if isinstance(raw.columns, pd.MultiIndex):
available = raw.columns.get_level_values(1).unique()
for ticker in tickers:
t = ticker.upper()
if t not in available:
continue
try:
df = raw.xs(t, axis=1, level=1).dropna(how="all").reset_index()
df["Ticker"] = t
rows.append(df)
except Exception:
continue
else:
# Single-ticker fallback
df = raw.dropna(how="all").reset_index()
df["Ticker"] = tickers[0].upper() if tickers else "UNKNOWN"
rows.append(df)
if not rows:
return pd.DataFrame()
result = pd.concat(rows, ignore_index=True)
result["Date"] = pd.to_datetime(result["Date"]).dt.tz_localize(None).dt.normalize()
return result
def _trim(df: pd.DataFrame, max_rows: int) -> pd.DataFrame:
"""Keep the most recent max_rows rows per ticker."""
return (
df.sort_values("Date")
.groupby("Ticker", group_keys=False)
.tail(max_rows)
.reset_index(drop=True)
)
def _filter_by_period(df: pd.DataFrame, period: str) -> pd.DataFrame:
"""Return only rows within the requested period window."""
days = PERIOD_TO_DAYS.get(period)
if days is None:
return df
cutoff = pd.Timestamp.now(tz=None).normalize() - pd.Timedelta(days=days)
date_col = pd.to_datetime(df["Date"]).dt.tz_localize(None)
return df[date_col >= cutoff].reset_index(drop=True)
def _split_by_ticker(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
"""Split long-format DataFrame into {ticker: per-ticker DataFrame}."""
result = {}
for ticker, grp in df.groupby("Ticker"):
result[str(ticker)] = grp.drop(columns=["Ticker"]).reset_index(drop=True)
return result
# ──────────────────────────────────────────────────────────────────────────────
# Public API
# ──────────────────────────────────────────────────────────────────────────────
def download_ohlcv_cached(
tickers: List[str],
period: str = "1y",
cache_dir: str = DEFAULT_CACHE_DIR,
**kwargs,
) -> Dict[str, pd.DataFrame]:
"""Download OHLCV data with incremental disk caching.
Behaviour:
- **First call** for a ticker universe: downloads the full `period` of
history and saves it to ``{cache_dir}/{key}.parquet``.
- **Same-day call**: served from disk no network traffic.
- **Next-day call**: fetches only the new trading days (delta), appends
them to the cached parquet, deduplicates, and saves.
The `period` parameter controls the *initial* download size and the window
returned to the caller. If the cache already contains more history (e.g.
a 1y cache serves a 6mo caller by slicing), no extra download occurs.
Args:
tickers: List of ticker symbols (case-insensitive).
period: yfinance period string ("1y", "6mo", "3mo", ).
cache_dir: Directory for parquet + meta files.
**kwargs: Extra kwargs forwarded to ``yf.download`` (e.g. ``interval``).
Returns:
Dict mapping ticker DataFrame with columns
[Date, Open, High, Low, Close, Volume].
"""
Path(cache_dir).mkdir(parents=True, exist_ok=True)
tickers_upper = [t.upper() for t in tickers]
key = _cache_key(tickers_upper)
parquet_path, meta_path = _paths(cache_dir, key)
today = datetime.now().strftime("%Y-%m-%d")
meta = _read_meta(meta_path)
# ── Case 1: Fresh cache (updated today) ───────────────────────────────
if parquet_path.exists() and meta and meta.get("last_updated") == today:
logger.info(f"OHLCV cache hit ({len(tickers_upper)} tickers, updated today)")
df = pd.read_parquet(parquet_path)
return _split_by_ticker(_filter_by_period(df, period))
# ── Case 2: Stale cache — incremental update ──────────────────────────
if parquet_path.exists() and meta and meta.get("last_updated"):
last_updated = meta["last_updated"]
start_dt = datetime.strptime(last_updated, "%Y-%m-%d") + timedelta(days=1)
start_str = start_dt.strftime("%Y-%m-%d")
logger.info(
f"OHLCV cache stale (last: {last_updated}) — "
f"fetching {start_str} → today for {len(tickers_upper)} tickers..."
)
new_data = _batch_download(tickers_upper, start=start_str, **kwargs)
existing = pd.read_parquet(parquet_path)
if not new_data.empty:
combined = pd.concat([existing, new_data], ignore_index=True)
# Dedup on (Date, Ticker) — keep latest version of each row
combined = combined.drop_duplicates(subset=["Date", "Ticker"], keep="last")
combined = _trim(combined, MAX_ROWS_PER_TICKER)
combined.to_parquet(parquet_path, index=False)
logger.info(
f"Incremental update: {len(new_data)} new rows appended "
f"({len(existing)}{len(combined)} total rows)"
)
else:
logger.info("No new rows in incremental fetch — cache unchanged")
combined = existing
_write_meta(meta_path, {"last_updated": today, "tickers": tickers_upper, "period": period})
return _split_by_ticker(_filter_by_period(combined, period))
# ── Case 3: No cache — full download ──────────────────────────────────
logger.info(f"OHLCV cache miss — downloading {len(tickers_upper)} tickers ({period})...")
df = _batch_download(tickers_upper, period=period, **kwargs)
if df.empty:
logger.warning("Full download returned empty data — cache not written")
return {}
df = _trim(df, MAX_ROWS_PER_TICKER)
parquet_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(parquet_path, index=False)
_write_meta(meta_path, {"last_updated": today, "tickers": tickers_upper, "period": period})
logger.info(f"Cache written: {len(df)} rows for {df['Ticker'].nunique()} tickers")
return _split_by_ticker(_filter_by_period(df, period))

File diff suppressed because it is too large Load Diff