TradingAgents/scripts/prefetch_ohlcv.py

78 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""Nightly OHLCV prefetch — populates the shared cache for all scanners.
Run nightly at 01:00 UTC (before discovery at 12:30 UTC) so scanners read
from disk instead of hitting yfinance at run time.
First run: downloads 1y of history for the full ticker universe (~592 tickers).
Subsequent runs: appends only the new trading day's bars (incremental update).
Usage:
python scripts/prefetch_ohlcv.py
python scripts/prefetch_ohlcv.py --period 6mo # shorter initial window
"""
import argparse
import os
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from tradingagents.dataflows.data_cache.ohlcv_cache import download_ohlcv_cached
from tradingagents.dataflows.universe import load_universe
from tradingagents.default_config import DEFAULT_CONFIG
def main():
parser = argparse.ArgumentParser(description="Prefetch OHLCV data for the ticker universe")
parser.add_argument(
"--period",
default="1y",
help="History window for initial download (default: 1y). Incremental runs ignore this.",
)
parser.add_argument(
"--cache-dir",
default=str(ROOT / "data" / "ohlcv_cache"),
help="Directory to store parquet cache files",
)
args = parser.parse_args()
tickers = load_universe(DEFAULT_CONFIG)
if not tickers:
print("ERROR: No tickers loaded — check data/tickers.txt", flush=True)
sys.exit(1)
print(f"Prefetching OHLCV for {len(tickers)} tickers (period={args.period})...", flush=True)
print(f"Cache dir: {args.cache_dir}", flush=True)
start = time.time()
data = download_ohlcv_cached(
tickers=tickers,
period=args.period,
cache_dir=args.cache_dir,
)
elapsed = time.time() - start
# Summary
n_tickers = len(data)
total_rows = sum(len(df) for df in data.values())
cache_size_mb = sum(
p.stat().st_size for p in Path(args.cache_dir).glob("*.parquet")
) / 1024 / 1024
print(f"\nDone in {elapsed:.1f}s", flush=True)
print(f" Tickers cached : {n_tickers}/{len(tickers)}", flush=True)
print(f" Total rows : {total_rows:,}", flush=True)
print(f" Cache size : {cache_size_mb:.1f} MB", flush=True)
missing = set(tickers) - set(data.keys())
if missing:
print(f" Missing tickers: {len(missing)} (delisted or no data)", flush=True)
if __name__ == "__main__":
main()