fix(universe): add missing __init__.py for data_cache; switch to Russell 1000 via iShares

- tradingagents/dataflows/data_cache/__init__.py: fixes ModuleNotFoundError in CI
- universe.py: fetches Russell 1000 from iShares IWB CSV with weekly disk cache + fallback
- default_config.py: universe_source = 'russell1000'
- data/universe_cache.json: initial cache (weekly TTL, auto-refreshed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-14 16:36:17 -07:00
parent f87197ef41
commit 65cd0bb094
4 changed files with 101 additions and 5 deletions

1
data/universe_cache.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -4,11 +4,14 @@ All scanners that need a list of tickers should call load_universe(config).
Do NOT hardcode "data/tickers.txt" in scanner files import this module instead. Do NOT hardcode "data/tickers.txt" in scanner files import this module instead.
Priority order: Priority order:
1. config["discovery"]["universe"] explicit list (tests / overrides) 1. config["discovery"]["universe"] explicit list (tests / overrides)
2. config["tickers_file"] path from top-level config 2. config["discovery"]["universe_source"] dynamic index ("russell1000")
3. Default: data/tickers.txt resolved relative to repo root 3. config["tickers_file"] path from top-level config
4. Default: data/tickers.txt resolved relative to repo root
""" """
import json
import time
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@ -19,6 +22,8 @@ logger = get_logger(__name__)
# Resolved once at import time — works regardless of cwd # Resolved once at import time — works regardless of cwd
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
DEFAULT_TICKERS_FILE = str(_REPO_ROOT / "data" / "tickers.txt") DEFAULT_TICKERS_FILE = str(_REPO_ROOT / "data" / "tickers.txt")
_UNIVERSE_CACHE_FILE = _REPO_ROOT / "data" / "universe_cache.json"
_CACHE_TTL_SECONDS = 7 * 24 * 3600 # refresh weekly
def load_universe(config: Optional[Dict[str, Any]] = None) -> List[str]: def load_universe(config: Optional[Dict[str, Any]] = None) -> List[str]:
@ -28,7 +33,7 @@ def load_universe(config: Optional[Dict[str, Any]] = None) -> List[str]:
config: Top-level app config dict. If None, falls back to default file. config: Top-level app config dict. If None, falls back to default file.
Returns: Returns:
Deduplicated list of ticker symbols in the order they appear in the file. Deduplicated list of ticker symbols in the order they appear in the source.
""" """
cfg = config or {} cfg = config or {}
@ -39,11 +44,100 @@ def load_universe(config: Optional[Dict[str, Any]] = None) -> List[str]:
logger.info(f"Universe: {len(tickers)} tickers from config override") logger.info(f"Universe: {len(tickers)} tickers from config override")
return tickers return tickers
# 2. Config-specified file path, falling back to repo-relative default # 2. Dynamic index source
source = cfg.get("discovery", {}).get("universe_source", "")
if source == "russell1000":
tickers = _load_russell1000()
if tickers:
return tickers
logger.warning("Russell 1000 fetch failed — falling back to tickers.txt")
# 3. Config-specified file path, falling back to repo-relative default
file_path = cfg.get("tickers_file", DEFAULT_TICKERS_FILE) file_path = cfg.get("tickers_file", DEFAULT_TICKERS_FILE)
return _load_from_file(file_path) return _load_from_file(file_path)
def _parse_ishares_holdings(content: str) -> List[str]:
    """Extract a deduplicated ticker list from the text of an iShares holdings CSV.

    Args:
        content: Full decoded text of the downloaded CSV, including any
            preamble lines that precede the column-header row.

    Returns:
        Ticker symbols in file order with duplicates removed, or an empty
        list when no header row starting with a ``Ticker`` column is found.
    """
    import io
    import re

    import pandas as pd

    # Locate the header row by content instead of assuming a fixed number of
    # preamble lines, so a change in preamble length does not silently
    # misalign the parse.
    lines = content.lstrip("\ufeff").splitlines(keepends=True)
    header_idx = next(
        (
            i
            for i, line in enumerate(lines)
            if line.split(",", 1)[0].strip().strip('"') == "Ticker"
        ),
        None,
    )
    if header_idx is None:
        logger.warning("Could not find Ticker column in iShares IWB CSV")
        return []

    # Read every cell as a plain string: pandas' default NA detection would
    # otherwise turn ticker-like strings such as "NA" into NaN and drop them.
    df = pd.read_csv(
        io.StringIO("".join(lines[header_idx:])),
        dtype=str,
        keep_default_na=False,
    )
    if "Ticker" not in df.columns:
        logger.warning("Could not find Ticker column in iShares IWB CSV")
        return []

    # NOTE(review): assumes the holdings file labels stock rows "Equity" in an
    # "Asset Class" column — confirm against a live download. The filter is
    # skipped when the column is absent or when it would remove every row.
    if "Asset Class" in df.columns:
        equity_rows = df[df["Asset Class"].str.strip().str.casefold() == "equity"]
        if not equity_rows.empty:
            df = equity_rows

    # Valid tickers: 1-5 letters, optionally a hyphen plus a class letter (e.g. BRK-B)
    valid_ticker = re.compile(r"[A-Z]{1,5}(?:-[A-Z])?")
    symbols = (str(raw).strip().upper().replace(".", "-") for raw in df["Ticker"])
    # dict.fromkeys deduplicates while preserving first-seen (file) order
    return list(dict.fromkeys(s for s in symbols if valid_ticker.fullmatch(s)))


def _load_russell1000() -> List[str]:
    """Fetch Russell 1000 constituents from iShares IWB ETF holdings, with weekly disk cache.

    A fresh disk-cache entry is returned without touching the network.
    Otherwise the holdings CSV is downloaded, parsed and written back to the
    cache.

    Returns:
        Deduplicated ticker symbols in the order they appear in the holdings
        file, or an empty list if the download or parse fails (the caller is
        expected to fall back to another source).
    """
    # Return cached copy if fresh
    cached = _read_universe_cache("russell1000")
    if cached:
        return cached

    logger.info("Fetching Russell 1000 constituents from iShares IWB holdings...")
    try:
        import urllib.request

        url = (
            "https://www.ishares.com/us/products/239707/ISHARES-RUSSELL-1000-ETF"
            "/1467271812596.ajax?fileType=csv&fileName=IWB_holdings&dataType=fund"
        )
        # A browser-like User-Agent is sent with the request
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as r:
            content = r.read().decode("utf-8", errors="ignore")

        tickers = _parse_ishares_holdings(content)
        if not tickers:
            logger.warning("No tickers parsed from iShares IWB CSV")
            return []

        _write_universe_cache("russell1000", tickers)
        logger.info(f"Universe: {len(tickers)} Russell 1000 tickers (cached)")
        return tickers
    except Exception as e:
        # Best-effort source: any network/parse failure degrades to "no data"
        logger.warning(f"Failed to fetch Russell 1000 from iShares: {e}")
        return []
def _read_universe_cache(key: str) -> List[str]:
    """Return cached ticker list if it exists and is within TTL.

    Args:
        key: Name of the cache entry to look up (e.g. ``"russell1000"``).

    Returns:
        The cached ticker list, or an empty list when the cache file is
        missing, unreadable, malformed, empty for this key, or older than
        ``_CACHE_TTL_SECONDS``.
    """
    try:
        raw = _UNIVERSE_CACHE_FILE.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []
    except (OSError, ValueError) as e:
        # ValueError covers undecodable bytes (UnicodeDecodeError)
        logger.debug(f"Failed to read universe cache: {e}")
        return []

    try:
        data = json.loads(raw)
    except ValueError as e:
        logger.debug(f"Universe cache is not valid JSON: {e}")
        return []

    entry = data.get(key) if isinstance(data, dict) else None
    if not isinstance(entry, dict):
        return []

    ts = entry.get("ts", 0)
    tickers = entry.get("tickers", [])
    if not isinstance(ts, (int, float)) or not isinstance(tickers, list) or not tickers:
        # Malformed or empty entries are treated as a miss, not reported as a hit
        return []

    # A negative age means the timestamp lies in the future (clock skew or a
    # hand-edited file); treat it as stale so the entry cannot live forever.
    age = time.time() - ts
    if not 0 <= age < _CACHE_TTL_SECONDS:
        return []

    logger.info(f"Universe: {len(tickers)} {key} tickers (from disk cache)")
    return tickers
def _write_universe_cache(key: str, tickers: List[str]) -> None:
    """Persist ticker list to disk cache.

    Entries stored under other keys are preserved. The write is best-effort:
    any failure is logged at debug level and otherwise ignored.

    Args:
        key: Name of the cache entry to create or overwrite.
        tickers: Ticker symbols to store under ``key``.
    """
    try:
        data: dict = {}
        try:
            loaded = json.loads(_UNIVERSE_CACHE_FILE.read_text(encoding="utf-8"))
            if isinstance(loaded, dict):
                data = loaded
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache: start from an empty dict so
            # a damaged file is overwritten instead of blocking every write.
            pass

        data[key] = {"ts": time.time(), "tickers": tickers}

        _UNIVERSE_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename it over the target so a
        # crash mid-write cannot leave a truncated cache file behind.
        tmp_file = _UNIVERSE_CACHE_FILE.with_name(_UNIVERSE_CACHE_FILE.name + ".tmp")
        tmp_file.write_text(json.dumps(data), encoding="utf-8")
        tmp_file.replace(_UNIVERSE_CACHE_FILE)
    except (OSError, TypeError, ValueError) as e:
        logger.debug(f"Failed to write universe cache: {e}")
def _load_from_file(path: str) -> List[str]: def _load_from_file(path: str) -> List[str]:
"""Load tickers from a text file (one per line, # comments ignored).""" """Load tickers from a text file (one per line, # comments ignored)."""
try: try:

View File

@ -28,6 +28,7 @@ DEFAULT_CONFIG = {
"final_recommendations": 15, # Number of final opportunities to recommend "final_recommendations": 15, # Number of final opportunities to recommend
"deep_dive_max_workers": 1, # Parallel workers for deep-dive analysis (1 = sequential) "deep_dive_max_workers": 1, # Parallel workers for deep-dive analysis (1 = sequential)
"discovery_mode": "hybrid", # "traditional", "semantic", or "hybrid" "discovery_mode": "hybrid", # "traditional", "semantic", or "hybrid"
"universe_source": "russell1000", # "russell1000" or "" (uses tickers_file)
# Ranking context truncation # Ranking context truncation
"truncate_ranking_context": False, # True = truncate to save tokens, False = full context "truncate_ranking_context": False, # True = truncate to save tokens, False = full context
"max_news_chars": 500, # Only used if truncate_ranking_context=True "max_news_chars": 500, # Only used if truncate_ranking_context=True