78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Nightly OHLCV prefetch — populates the shared cache for all scanners.
|
|
|
|
Run nightly at 01:00 UTC (before discovery at 12:30 UTC) so scanners read
|
|
from disk instead of hitting yfinance at run time.
|
|
|
|
First run: downloads 1y of history for the full ticker universe (~592 tickers).
|
|
Subsequent runs: appends only the new trading day's bars (incremental update).
|
|
|
|
Usage:
|
|
python scripts/prefetch_ohlcv.py
|
|
python scripts/prefetch_ohlcv.py --period 6mo # shorter initial window
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from tradingagents.dataflows.data_cache.ohlcv_cache import download_ohlcv_cached
|
|
from tradingagents.dataflows.universe import load_universe
|
|
from tradingagents.default_config import DEFAULT_CONFIG
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Prefetch OHLCV data for the ticker universe")
|
|
parser.add_argument(
|
|
"--period",
|
|
default="1y",
|
|
help="History window for initial download (default: 1y). Incremental runs ignore this.",
|
|
)
|
|
parser.add_argument(
|
|
"--cache-dir",
|
|
default=str(ROOT / "data" / "ohlcv_cache"),
|
|
help="Directory to store parquet cache files",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
tickers = load_universe(DEFAULT_CONFIG)
|
|
if not tickers:
|
|
print("ERROR: No tickers loaded — check data/tickers.txt", flush=True)
|
|
sys.exit(1)
|
|
|
|
print(f"Prefetching OHLCV for {len(tickers)} tickers (period={args.period})...", flush=True)
|
|
print(f"Cache dir: {args.cache_dir}", flush=True)
|
|
|
|
start = time.time()
|
|
data = download_ohlcv_cached(
|
|
tickers=tickers,
|
|
period=args.period,
|
|
cache_dir=args.cache_dir,
|
|
)
|
|
elapsed = time.time() - start
|
|
|
|
# Summary
|
|
n_tickers = len(data)
|
|
total_rows = sum(len(df) for df in data.values())
|
|
cache_size_mb = sum(
|
|
p.stat().st_size for p in Path(args.cache_dir).glob("*.parquet")
|
|
) / 1024 / 1024
|
|
|
|
print(f"\nDone in {elapsed:.1f}s", flush=True)
|
|
print(f" Tickers cached : {n_tickers}/{len(tickers)}", flush=True)
|
|
print(f" Total rows : {total_rows:,}", flush=True)
|
|
print(f" Cache size : {cache_size_mb:.1f} MB", flush=True)
|
|
|
|
missing = set(tickers) - set(data.keys())
|
|
if missing:
|
|
print(f" Missing tickers: {len(missing)} (delisted or no data)", flush=True)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|