fix(cache): track ohlcv_cache.py and data_cache files in git

These were untracked so CI couldn't import tradingagents.dataflows.data_cache.ohlcv_cache. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 16:40:40 -07:00 · 2026-04-14 16:40:40 -07:00 · 8050d6764f
parent 893eb05349
commit 8050d6764f
2 changed files with 239736 additions and 0 deletions
--- a/tradingagents/dataflows/data_cache/ohlcv_cache.py
+++ b/tradingagents/dataflows/data_cache/ohlcv_cache.py
@ -0,0 +1,238 @@
+"""OHLCV disk cache with incremental daily updates.
+
+On the first call for a given ticker universe, downloads the full requested
+period and saves it as a parquet file.  On subsequent calls during the same
+day the cache is served from disk (zero network).  On the next calendar day
+only the missing trading days are fetched and appended, keeping total download
+volume proportional to *time elapsed* rather than universe size.
+
+Cache files live in data/ohlcv_cache/ (configurable):
+  {md5(sorted_tickers)[:12]}.parquet   — long-format OHLCV rows
+  {md5(sorted_tickers)[:12]}.meta.json — {last_updated, tickers, period}
+"""
+
+import hashlib
+import json
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import pandas as pd
+import yfinance as yf
+
+from tradingagents.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+DEFAULT_CACHE_DIR = "data/ohlcv_cache"
+
+# Keep at most this many rows per ticker to prevent unbounded growth (~2 years)
+MAX_ROWS_PER_TICKER = 504
+
+# Approximate trading days for each yfinance period string (used to slice
+# a larger cached history down to the requested period on return).
+PERIOD_TO_DAYS: Dict[str, Optional[int]] = {
+    "1d": 1,
+    "5d": 5,
+    "1mo": 21,
+    "3mo": 63,
+    "6mo": 130,
+    "1y": 252,
+    "2y": 504,
+    "5y": 1260,
+    "max": None,  # return everything
+}
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Internal helpers
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+def _cache_key(tickers: List[str]) -> str:
+    """Stable 12-char hash of the (sorted, uppercase) ticker set."""
+    canonical = ",".join(sorted(set(t.upper() for t in tickers)))
+    return hashlib.md5(canonical.encode()).hexdigest()[:12]
+
+
+def _paths(cache_dir: str, key: str):
+    base = Path(cache_dir)
+    return base / f"{key}.parquet", base / f"{key}.meta.json"
+
+
+def _read_meta(meta_path: Path) -> Optional[dict]:
+    try:
+        with open(meta_path) as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def _write_meta(meta_path: Path, data: dict) -> None:
+    meta_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(meta_path, "w") as f:
+        json.dump(data, f)
+
+
+def _batch_download(tickers: List[str], **kwargs) -> pd.DataFrame:
+    """Download via yf.download and return in long format.
+
+    Long format columns: Date, Ticker, Open, High, Low, Close, Volume
+    Date is stored as tz-naive datetime (normalised to midnight).
+    """
+    try:
+        raw = yf.download(tickers, auto_adjust=True, progress=False, **kwargs)
+    except Exception as e:
+        logger.warning(f"yfinance download failed: {e}")
+        return pd.DataFrame()
+
+    if raw is None or raw.empty:
+        return pd.DataFrame()
+
+    rows = []
+    if isinstance(raw.columns, pd.MultiIndex):
+        available = raw.columns.get_level_values(1).unique()
+        for ticker in tickers:
+            t = ticker.upper()
+            if t not in available:
+                continue
+            try:
+                df = raw.xs(t, axis=1, level=1).dropna(how="all").reset_index()
+                df["Ticker"] = t
+                rows.append(df)
+            except Exception:
+                continue
+    else:
+        # Single-ticker fallback
+        df = raw.dropna(how="all").reset_index()
+        df["Ticker"] = tickers[0].upper() if tickers else "UNKNOWN"
+        rows.append(df)
+
+    if not rows:
+        return pd.DataFrame()
+
+    result = pd.concat(rows, ignore_index=True)
+    result["Date"] = pd.to_datetime(result["Date"]).dt.tz_localize(None).dt.normalize()
+    return result
+
+
+def _trim(df: pd.DataFrame, max_rows: int) -> pd.DataFrame:
+    """Keep the most recent max_rows rows per ticker."""
+    return (
+        df.sort_values("Date")
+        .groupby("Ticker", group_keys=False)
+        .tail(max_rows)
+        .reset_index(drop=True)
+    )
+
+
+def _filter_by_period(df: pd.DataFrame, period: str) -> pd.DataFrame:
+    """Return only rows within the requested period window."""
+    days = PERIOD_TO_DAYS.get(period)
+    if days is None:
+        return df
+    cutoff = pd.Timestamp.now(tz=None).normalize() - pd.Timedelta(days=days)
+    date_col = pd.to_datetime(df["Date"]).dt.tz_localize(None)
+    return df[date_col >= cutoff].reset_index(drop=True)
+
+
+def _split_by_ticker(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
+    """Split long-format DataFrame into {ticker: per-ticker DataFrame}."""
+    result = {}
+    for ticker, grp in df.groupby("Ticker"):
+        result[str(ticker)] = grp.drop(columns=["Ticker"]).reset_index(drop=True)
+    return result
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Public API
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+def download_ohlcv_cached(
+    tickers: List[str],
+    period: str = "1y",
+    cache_dir: str = DEFAULT_CACHE_DIR,
+    **kwargs,
+) -> Dict[str, pd.DataFrame]:
+    """Download OHLCV data with incremental disk caching.
+
+    Behaviour:
+    - **First call** for a ticker universe: downloads the full `period` of
+      history and saves it to ``{cache_dir}/{key}.parquet``.
+    - **Same-day call**: served from disk — no network traffic.
+    - **Next-day call**: fetches only the new trading days (delta), appends
+      them to the cached parquet, deduplicates, and saves.
+
+    The `period` parameter controls the *initial* download size and the window
+    returned to the caller.  If the cache already contains more history (e.g.
+    a 1y cache serves a 6mo caller by slicing), no extra download occurs.
+
+    Args:
+        tickers:   List of ticker symbols (case-insensitive).
+        period:    yfinance period string ("1y", "6mo", "3mo", …).
+        cache_dir: Directory for parquet + meta files.
+        **kwargs:  Extra kwargs forwarded to ``yf.download`` (e.g. ``interval``).
+
+    Returns:
+        Dict mapping ticker → DataFrame with columns
+        [Date, Open, High, Low, Close, Volume].
+    """
+    Path(cache_dir).mkdir(parents=True, exist_ok=True)
+    tickers_upper = [t.upper() for t in tickers]
+    key = _cache_key(tickers_upper)
+    parquet_path, meta_path = _paths(cache_dir, key)
+    today = datetime.now().strftime("%Y-%m-%d")
+    meta = _read_meta(meta_path)
+
+    # ── Case 1: Fresh cache (updated today) ───────────────────────────────
+    if parquet_path.exists() and meta and meta.get("last_updated") == today:
+        logger.info(f"OHLCV cache hit ({len(tickers_upper)} tickers, updated today)")
+        df = pd.read_parquet(parquet_path)
+        return _split_by_ticker(_filter_by_period(df, period))
+
+    # ── Case 2: Stale cache — incremental update ──────────────────────────
+    if parquet_path.exists() and meta and meta.get("last_updated"):
+        last_updated = meta["last_updated"]
+        start_dt = datetime.strptime(last_updated, "%Y-%m-%d") + timedelta(days=1)
+        start_str = start_dt.strftime("%Y-%m-%d")
+
+        logger.info(
+            f"OHLCV cache stale (last: {last_updated}) — "
+            f"fetching {start_str} → today for {len(tickers_upper)} tickers..."
+        )
+
+        new_data = _batch_download(tickers_upper, start=start_str, **kwargs)
+        existing = pd.read_parquet(parquet_path)
+
+        if not new_data.empty:
+            combined = pd.concat([existing, new_data], ignore_index=True)
+            # Dedup on (Date, Ticker) — keep latest version of each row
+            combined = combined.drop_duplicates(subset=["Date", "Ticker"], keep="last")
+            combined = _trim(combined, MAX_ROWS_PER_TICKER)
+            combined.to_parquet(parquet_path, index=False)
+            logger.info(
+                f"Incremental update: {len(new_data)} new rows appended "
+                f"({len(existing)} → {len(combined)} total rows)"
+            )
+        else:
+            logger.info("No new rows in incremental fetch — cache unchanged")
+            combined = existing
+
+        _write_meta(meta_path, {"last_updated": today, "tickers": tickers_upper, "period": period})
+        return _split_by_ticker(_filter_by_period(combined, period))
+
+    # ── Case 3: No cache — full download ──────────────────────────────────
+    logger.info(f"OHLCV cache miss — downloading {len(tickers_upper)} tickers ({period})...")
+    df = _batch_download(tickers_upper, period=period, **kwargs)
+
+    if df.empty:
+        logger.warning("Full download returned empty data — cache not written")
+        return {}
+
+    df = _trim(df, MAX_ROWS_PER_TICKER)
+    parquet_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(parquet_path, index=False)
+    _write_meta(meta_path, {"last_updated": today, "tickers": tickers_upper, "period": period})
+    logger.info(f"Cache written: {len(df)} rows for {df['Ticker'].nunique()} tickers")
+    return _split_by_ticker(_filter_by_period(df, period))
--- a/tradingagents/dataflows/data_cache/unusual_volume_raw_default.json
+++ b/tradingagents/dataflows/data_cache/unusual_volume_raw_default.json