"""Ticker universe — single source of truth. All scanners that need a list of tickers should call load_universe(config). Do NOT hardcode "data/tickers.txt" in scanner files — import this module instead. Priority order: 1. config["discovery"]["universe"] — explicit list (tests / overrides) 2. config["discovery"]["universe_source"] — dynamic index ("russell1000") 3. config["tickers_file"] — path from top-level config 4. Default: data/tickers.txt resolved relative to repo root """ import json import time from pathlib import Path from typing import Any, Dict, List, Optional from tradingagents.utils.logger import get_logger logger = get_logger(__name__) # Resolved once at import time — works regardless of cwd _REPO_ROOT = Path(__file__).resolve().parent.parent.parent DEFAULT_TICKERS_FILE = str(_REPO_ROOT / "data" / "tickers.txt") _UNIVERSE_CACHE_FILE = _REPO_ROOT / "data" / "universe_cache.json" _CACHE_TTL_SECONDS = 7 * 24 * 3600 # refresh weekly def load_universe(config: Optional[Dict[str, Any]] = None) -> List[str]: """Return the full ticker universe as a list of uppercase strings. Args: config: Top-level app config dict. If None, falls back to default file. Returns: Deduplicated list of ticker symbols in the order they appear in the source. """ cfg = config or {} # 1. Explicit list in config (useful for tests or targeted overrides) explicit = cfg.get("discovery", {}).get("universe") if explicit: tickers = [t.strip().upper() for t in explicit if t.strip()] logger.info(f"Universe: {len(tickers)} tickers from config override") return tickers # 2. Dynamic index source source = cfg.get("discovery", {}).get("universe_source", "") if source == "russell1000": tickers = _load_russell1000() if tickers: return tickers logger.warning("Russell 1000 fetch failed — falling back to tickers.txt") # 3. Config-specified file path, falling back to repo-relative default file_path = cfg.get("tickers_file", DEFAULT_TICKERS_FILE) return _load_from_file(file_path) def _load_russell1000() -> List[str]: """Fetch Russell 1000 constituents from iShares IWB ETF holdings, with weekly disk cache.""" # Return cached copy if fresh cached = _read_universe_cache("russell1000") if cached: return cached logger.info("Fetching Russell 1000 constituents from iShares IWB holdings...") try: import io import urllib.request import pandas as pd url = ( "https://www.ishares.com/us/products/239707/ISHARES-RUSSELL-1000-ETF" "/1467271812596.ajax?fileType=csv&fileName=IWB_holdings&dataType=fund" ) req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=30) as r: content = r.read().decode("utf-8", errors="ignore") # iShares CSV has a few header rows before the actual data df = pd.read_csv(io.StringIO(content), skiprows=9) if "Ticker" not in df.columns: logger.warning("Could not find Ticker column in iShares IWB CSV") return [] # Only take equity rows — excludes cash collateral, money market, etc. if "Asset Class" in df.columns: df = df[df["Asset Class"].astype(str).str.strip() == "Equity"] # iShares uses compact tickers for some dual-class shares (no hyphen). # Map the compact form → canonical yfinance symbol. _ISHARES_REMAP = { "BRKB": "BRK-B", "BFA": "BF-A", "BFB": "BF-B", "HEIA": "HEI-A", "LENB": "LEN-B", "UHALB": "UHAL-B", "CWENA": "CWEN-A", } tickers = [] for t in df["Ticker"].dropna(): s = str(t).strip().upper().replace(".", "-") # Valid tickers: 1-6 alpha chars only if not (s and len(s) <= 7 and s.replace("-", "").isalpha()): continue s = _ISHARES_REMAP.get(s, s) tickers.append(s) # Deduplicate while preserving order (by weight — iShares sorts by weight desc) seen: set = set() tickers = [t for t in tickers if not (t in seen or seen.add(t))] if not tickers: logger.warning("No tickers parsed from iShares IWB CSV") return [] _write_universe_cache("russell1000", tickers) logger.info(f"Universe: {len(tickers)} Russell 1000 tickers (cached)") return tickers except Exception as e: logger.warning(f"Failed to fetch Russell 1000 from iShares: {e}") return [] def _read_universe_cache(key: str) -> List[str]: """Return cached ticker list if it exists and is within TTL.""" try: if not _UNIVERSE_CACHE_FILE.exists(): return [] data = json.loads(_UNIVERSE_CACHE_FILE.read_text()) entry = data.get(key, {}) if time.time() - entry.get("ts", 0) < _CACHE_TTL_SECONDS: tickers = entry.get("tickers", []) logger.info(f"Universe: {len(tickers)} {key} tickers (from disk cache)") return tickers except Exception: pass return [] def _write_universe_cache(key: str, tickers: List[str]) -> None: """Persist ticker list to disk cache.""" try: data: dict = {} if _UNIVERSE_CACHE_FILE.exists(): data = json.loads(_UNIVERSE_CACHE_FILE.read_text()) data[key] = {"ts": time.time(), "tickers": tickers} _UNIVERSE_CACHE_FILE.write_text(json.dumps(data)) except Exception as e: logger.debug(f"Failed to write universe cache: {e}") def _load_from_file(path: str) -> List[str]: """Load tickers from a text file (one per line, # comments ignored).""" try: with open(path) as f: tickers = [ line.strip().upper() for line in f if line.strip() and not line.strip().startswith("#") ] # Deduplicate while preserving order seen: set = set() unique = [t for t in tickers if not (t in seen or seen.add(t))] logger.info(f"Universe: loaded {len(unique)} tickers from {path}") return unique except FileNotFoundError: logger.warning(f"Ticker file not found: {path} — universe will be empty") return [] except Exception as e: logger.warning(f"Failed to load ticker file {path}: {e}") return []