TradingAgents/tradingagents/dataflows/discovery/ticker_matcher.py

228 lines
6.3 KiB
Python

"""
Ticker Matching Utility
Maps company names to ticker symbols using fuzzy string matching
with the ticker universe CSV.
Usage:
from tradingagents.dataflows.discovery.ticker_matcher import match_company_to_ticker
ticker = match_company_to_ticker("Apple Inc")
# Returns: "AAPL"
"""
import csv
import re
from pathlib import Path
from typing import Dict, Optional, Tuple
from rapidfuzz import fuzz, process
# Module-level caches. The two universe maps start as None and are filled by
# load_ticker_universe(); the match cache is written by
# match_company_to_ticker() and reset by clear_cache().
# Raw ticker symbol -> company name, exactly as read from the CSV.
_TICKER_UNIVERSE: Optional[Dict[str, str]] = None # ticker -> name
# Reverse index keyed by the output of _normalize_company_name().
_NAME_TO_TICKER: Optional[Dict[str, str]] = None # normalized_name -> ticker
# Memoized lookup results; a None value records that no match was found.
_MATCH_CACHE: Dict[str, Optional[str]] = {} # company_name -> ticker
def _normalize_company_name(name: str) -> str:
"""
Normalize company name for matching.
Removes common suffixes, punctuation, and standardizes format.
"""
if not name:
return ""
# Convert to uppercase
name = name.upper()
# Remove common suffixes
suffixes = [
r'\s+INC\.?',
r'\s+INCORPORATED',
r'\s+CORP\.?',
r'\s+CORPORATION',
r'\s+LTD\.?',
r'\s+LIMITED',
r'\s+LLC',
r'\s+L\.?L\.?C\.?',
r'\s+PLC',
r'\s+CO\.?',
r'\s+COMPANY',
r'\s+CLASS [A-Z]',
r'\s+COMMON STOCK',
r'\s+ORDINARY SHARES?',
r'\s+-\s+.*$', # Remove everything after dash
r'\s+\(.*?\)', # Remove parenthetical
]
for suffix in suffixes:
name = re.sub(suffix, '', name, flags=re.IGNORECASE)
# Remove punctuation except spaces
name = re.sub(r'[^\w\s]', '', name)
# Normalize whitespace
name = ' '.join(name.split())
return name.strip()
def load_ticker_universe(force_reload: bool = False) -> Dict[str, str]:
    """
    Load the ticker universe CSV and build the name lookup index.

    The file is read from ``<project root>/data/ticker_universe.csv``, where
    the project root is four directory levels above this module. It must
    have ``ticker`` and ``name`` columns. Both module-level maps
    (ticker -> name and normalized name -> ticker) are replaced, and the
    match cache is emptied because its entries were computed against the
    previous universe.

    Args:
        force_reload: Re-read the CSV even if a universe is already loaded.

    Returns:
        Dict mapping ticker -> company name.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV header lacks a ``ticker`` or ``name`` column.
    """
    global _TICKER_UNIVERSE, _NAME_TO_TICKER

    if _TICKER_UNIVERSE is not None and not force_reload:
        return _TICKER_UNIVERSE

    # Find CSV file
    project_root = Path(__file__).parent.parent.parent.parent
    csv_path = project_root / "data" / "ticker_universe.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Ticker universe not found: {csv_path}")

    ticker_universe: Dict[str, str] = {}
    name_to_ticker: Dict[str, str] = {}

    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills missing cells of short rows with None.
            ticker = (row['ticker'] or '').strip()
            name = (row['name'] or '').strip()
            if not ticker:
                # A row without a symbol cannot be looked up or returned.
                continue

            # Store ticker -> name mapping
            ticker_universe[ticker] = name

            # Build reverse index (normalized name -> ticker)
            normalized = _normalize_company_name(name)
            if not normalized:
                continue

            current = name_to_ticker.get(normalized)
            if current is None:
                name_to_ticker[normalized] = ticker
            elif (
                "COMMON" in name.upper()
                and "COMMON" not in ticker_universe.get(current, "").upper()
            ):
                # Several securities share this normalized name: prefer the
                # one whose listing name mentions common stock.
                name_to_ticker[normalized] = ticker

    _TICKER_UNIVERSE = ticker_universe
    _NAME_TO_TICKER = name_to_ticker
    # Results memoized against the previous universe may now be wrong.
    _MATCH_CACHE.clear()

    print(f" Loaded {len(ticker_universe)} tickers from universe")
    return ticker_universe
def match_company_to_ticker(
    company_name: str,
    min_confidence: float = 80.0,
    use_cache: bool = True,
) -> Optional[str]:
    """
    Match a company name to a ticker symbol using fuzzy matching.

    The name is normalized and looked up exactly first; only if that fails
    is it fuzzy-matched (token sort ratio) against every normalized name in
    the universe. The universe is loaded on first use.

    Args:
        company_name: Company name from 13F filing
        min_confidence: Minimum fuzzy match score (0-100). Exact matches on
            the normalized name are returned regardless of this value.
        use_cache: Return a previously computed result when one exists for
            the same name and threshold. Fresh results are stored in the
            cache either way.

    Returns:
        Ticker symbol or None if no good match found

    Examples (actual results depend on the loaded ticker universe):
        >>> match_company_to_ticker("Apple Inc")
        'AAPL'
        >>> match_company_to_ticker("MICROSOFT CORP")
        'MSFT'
        >>> match_company_to_ticker("Berkshire Hathaway Inc")
        'BRK.B'
    """
    if not company_name:
        return None

    # The outcome of a fuzzy match depends on the threshold, so the threshold
    # is part of the cache key. Keying on the name alone would hand a result
    # computed under one threshold to callers asking for another.
    cache_key = (company_name, min_confidence)
    if use_cache and cache_key in _MATCH_CACHE:
        return _MATCH_CACHE[cache_key]

    # Ensure universe is loaded
    if _TICKER_UNIVERSE is None or _NAME_TO_TICKER is None:
        load_ticker_universe()

    normalized_input = _normalize_company_name(company_name)
    if not normalized_input:
        # Nothing left after stripping suffixes/punctuation; not cached.
        return None

    # Try exact match first
    if normalized_input in _NAME_TO_TICKER:
        result = _NAME_TO_TICKER[normalized_input]
        _MATCH_CACHE[cache_key] = result
        return result

    # Fuzzy match against all normalized names. token_sort_ratio ignores
    # word order, which suits company names; score_cutoff makes extractOne
    # return None when the best candidate scores below the threshold.
    match_result = process.extractOne(
        normalized_input,
        list(_NAME_TO_TICKER),
        scorer=fuzz.token_sort_ratio,
        score_cutoff=min_confidence,
    )

    if match_result:
        matched_name, score, _ = match_result
        ticker = _NAME_TO_TICKER[matched_name]
        # Log match for debugging (near-perfect matches are not reported)
        if score < 95:
            print(f" Fuzzy match: '{company_name}' -> {ticker} (score: {score:.1f})")
        _MATCH_CACHE[cache_key] = ticker
        return ticker

    # No match found; remember the miss so the scan is not repeated.
    print(f" No ticker match for: '{company_name}'")
    _MATCH_CACHE[cache_key] = None
    return None
def get_match_confidence(company_name: str, ticker: str) -> float:
    """
    Score how closely a company name resembles the name listed for a ticker.

    The ticker universe is loaded first if it has not been loaded yet. Both
    names are normalized before being compared with the token sort ratio.

    Args:
        company_name: Company name to evaluate
        ticker: Ticker symbol whose listed name is the comparison target

    Returns:
        Similarity score (0-100); 0.0 when the ticker is not in the universe
    """
    universe = (
        _TICKER_UNIVERSE if _TICKER_UNIVERSE is not None else load_ticker_universe()
    )

    try:
        listed_name = universe[ticker]
    except KeyError:
        # Unknown symbol: there is nothing to compare against.
        return 0.0

    return fuzz.token_sort_ratio(
        _normalize_company_name(company_name),
        _normalize_company_name(listed_name),
    )
def clear_cache() -> None:
    """Discard all memoized match results."""
    global _MATCH_CACHE
    # Rebinds the module global to a fresh dict; the previous dict object
    # itself is left unmodified.
    _MATCH_CACHE = dict()