# TradingAgents/tradingagents/dataflows/discovery/ticker_matcher.py
#
# 233 lines
# 6.2 KiB
# Python

"""
Ticker Matching Utility
Maps company names to ticker symbols using fuzzy string matching
with the ticker universe CSV.
Usage:
from tradingagents.dataflows.discovery.ticker_matcher import match_company_to_ticker
ticker = match_company_to_ticker("Apple Inc")
# Returns: "AAPL"
"""
import csv
import re
from pathlib import Path
from typing import Dict, Optional
from rapidfuzz import fuzz, process
from tradingagents.utils.logger import get_logger
logger = get_logger(__name__)
# Module-level caches.
# _TICKER_UNIVERSE and _NAME_TO_TICKER are populated together by
# load_ticker_universe() and remain None until it has run successfully.
# _MATCH_CACHE is filled lazily by match_company_to_ticker() and reset by
# clear_cache().

# Ticker symbol -> company name as loaded from the ticker universe CSV.
_TICKER_UNIVERSE: Optional[Dict[str, str]] = None
# Normalized company name (output of _normalize_company_name) -> ticker symbol.
_NAME_TO_TICKER: Optional[Dict[str, str]] = None
# Memoized lookups: lookup key string -> matched ticker, or None when the
# lookup found no acceptable match.
_MATCH_CACHE: Dict[str, Optional[str]] = {}
# Legal-form / share-class designators removed during normalization, applied
# in this order.  Every designator ends with ``(?!\w)`` so that only whole
# words are stripped: without it " INC" would be cut out of " INCORPORATED",
# " CORP" out of " CORPORATION" and " CO" out of " COMPANY" / " COMMON STOCK",
# leaving fragments such as "APPLEORPORATED" behind.
_SUFFIX_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r"\s+INC\.?(?!\w)",
        r"\s+INCORPORATED(?!\w)",
        r"\s+CORP\.?(?!\w)",
        r"\s+CORPORATION(?!\w)",
        r"\s+LTD\.?(?!\w)",
        r"\s+LIMITED(?!\w)",
        r"\s+LLC(?!\w)",
        r"\s+L\.?L\.?C\.?(?!\w)",
        r"\s+PLC(?!\w)",
        r"\s+CO\.?(?!\w)",
        r"\s+COMPANY(?!\w)",
        r"\s+CLASS [A-Z](?!\w)",
        r"\s+COMMON STOCK(?!\w)",
        r"\s+ORDINARY SHARES?(?!\w)",
        r"\s+-\s+.*$",  # Remove everything after a space-delimited dash
        r"\s+\(.*?\)",  # Remove parenthetical
    )
)


def _normalize_company_name(name: str) -> str:
    """
    Normalize a company name for matching.

    The name is upper-cased, common legal-form and share-class designators
    (INC, CORP, LTD, LLC, PLC, CO, COMPANY, CLASS X, COMMON STOCK,
    ORDINARY SHARE(S)) are removed wherever they appear as whole words after
    whitespace, trailing " - ..." text and parentheticals are dropped,
    remaining punctuation is deleted and runs of whitespace are collapsed.

    Args:
        name: Raw company name; may be empty or None.

    Returns:
        The normalized, upper-case name, or "" when ``name`` is falsy.
    """
    if not name:
        return ""

    normalized = name.upper()

    # Patterns are applied sequentially because removing one designator can
    # expose the next one (e.g. "X INC COMMON STOCK").
    for pattern in _SUFFIX_PATTERNS:
        normalized = pattern.sub("", normalized)

    # Remove punctuation except spaces
    normalized = re.sub(r"[^\w\s]", "", normalized)

    # Collapse whitespace; split/join also trims both ends.
    return " ".join(normalized.split())
def load_ticker_universe(force_reload: bool = False) -> Dict[str, str]:
    """
    Load the ticker universe from CSV and build the lookup tables.

    Reads ``data/ticker_universe.csv`` located four directory levels above
    this file, using its ``ticker`` and ``name`` columns.  Populates the
    module-level ``_TICKER_UNIVERSE`` (ticker -> name) and ``_NAME_TO_TICKER``
    (normalized name -> ticker) tables and empties ``_MATCH_CACHE`` so that
    previously memoized matches cannot outlive the data they were derived
    from.

    Args:
        force_reload: Force reload even if already loaded

    Returns:
        Dict mapping ticker -> company name

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV has no ``ticker`` or ``name`` column.
    """
    global _TICKER_UNIVERSE, _NAME_TO_TICKER

    if _TICKER_UNIVERSE is not None and not force_reload:
        return _TICKER_UNIVERSE

    # Find CSV file
    project_root = Path(__file__).parent.parent.parent.parent
    csv_path = project_root / "data" / "ticker_universe.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Ticker universe not found: {csv_path}")

    ticker_universe: Dict[str, str] = {}
    name_to_ticker: Dict[str, str] = {}

    # newline="" lets the csv module handle line endings itself, so quoted
    # fields containing embedded newlines are parsed correctly.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills missing cells of short rows with None;
            # coerce to "" so the string operations below cannot fail.
            ticker = (row["ticker"] or "").strip()
            name = (row["name"] or "").strip()
            if not ticker:
                continue

            # Store ticker -> name mapping
            ticker_universe[ticker] = name

            # Build reverse index (normalized name -> ticker)
            normalized = _normalize_company_name(name)
            if not normalized:
                continue

            current = name_to_ticker.get(normalized)
            if current is None:
                name_to_ticker[normalized] = ticker
            elif (
                "COMMON" in name.upper()
                and "COMMON" not in ticker_universe.get(current, "").upper()
            ):
                # Several tickers share this normalized name: prefer the one
                # whose listing name mentions common stock.
                name_to_ticker[normalized] = ticker

    _TICKER_UNIVERSE = ticker_universe
    _NAME_TO_TICKER = name_to_ticker
    # Memoized matches refer to the previous universe; drop them.
    _MATCH_CACHE.clear()

    logger.info(f"Loaded {len(ticker_universe)} tickers from universe")
    return ticker_universe
def match_company_to_ticker(
    company_name: str,
    min_confidence: float = 80.0,
    use_cache: bool = True,
) -> Optional[str]:
    """
    Match a company name to a ticker symbol using fuzzy matching.

    The name is normalized and looked up exactly in the normalized-name
    index first; if that fails, the best ``fuzz.token_sort_ratio`` match
    scoring at least ``min_confidence`` is used.  Results (including
    "no match") are memoized per ``(min_confidence, company_name)`` pair,
    because the answer depends on the threshold.

    Args:
        company_name: Company name from 13F filing
        min_confidence: Minimum fuzzy match score (0-100)
        use_cache: If True, return a previously memoized result when one
            exists.  Results are stored in the cache either way.

    Returns:
        Ticker symbol or None if no good match found

    Examples (actual results depend on the loaded ticker universe):
        >>> match_company_to_ticker("Apple Inc")
        'AAPL'
        >>> match_company_to_ticker("MICROSOFT CORP")
        'MSFT'
        >>> match_company_to_ticker("Berkshire Hathaway Inc")
        'BRK.B'
    """
    if not company_name:
        return None

    # The threshold is part of the key: a "no match" found at a strict
    # threshold must not be replayed for a lenient one, and vice versa.
    cache_key = f"{min_confidence!r}|{company_name}"
    if use_cache and cache_key in _MATCH_CACHE:
        return _MATCH_CACHE[cache_key]

    # Ensure universe is loaded
    if _TICKER_UNIVERSE is None or _NAME_TO_TICKER is None:
        load_ticker_universe()

    # Normalize input
    normalized_input = _normalize_company_name(company_name)
    if not normalized_input:
        return None

    # Try exact match first
    if normalized_input in _NAME_TO_TICKER:
        result = _NAME_TO_TICKER[normalized_input]
        _MATCH_CACHE[cache_key] = result
        return result

    # Fuzzy match against all normalized names.  A list (not the dict) is
    # passed so that rapidfuzz scores the names themselves.
    choices = list(_NAME_TO_TICKER)

    # token_sort_ratio ignores word order, which suits company names
    match_result = process.extractOne(
        normalized_input, choices, scorer=fuzz.token_sort_ratio, score_cutoff=min_confidence
    )

    if match_result:
        matched_name, score, _ = match_result
        ticker = _NAME_TO_TICKER[matched_name]

        # Log non-obvious matches for debugging
        if score < 95:
            logger.info(f"Fuzzy match: '{company_name}' -> {ticker} (score: {score:.1f})")

        _MATCH_CACHE[cache_key] = ticker
        return ticker

    # No match found
    logger.info(f"No ticker match for: '{company_name}'")
    _MATCH_CACHE[cache_key] = None
    return None
def get_match_confidence(company_name: str, ticker: str) -> float:
    """
    Score how well a company name agrees with a ticker's listed name.

    Loads the ticker universe on first use, normalizes both the supplied
    name and the name stored for ``ticker``, and compares them with
    ``fuzz.token_sort_ratio``.

    Args:
        company_name: Company name
        ticker: Ticker symbol

    Returns:
        Confidence score (0-100); 0.0 when ``ticker`` is not in the universe
    """
    if _TICKER_UNIVERSE is None:
        load_ticker_universe()

    try:
        listed_name = _TICKER_UNIVERSE[ticker]
    except KeyError:
        # Unknown ticker: nothing to compare against.
        return 0.0

    candidate = _normalize_company_name(company_name)
    reference = _normalize_company_name(listed_name)
    return fuzz.token_sort_ratio(candidate, reference)
def clear_cache():
    """Discard all memoized match results by rebinding the cache to a new empty dict."""
    global _MATCH_CACHE
    _MATCH_CACHE = dict()