TradingAgents/tradingagents/dataflows/discovery/common_utils.py

180 lines
4.3 KiB
Python

"""Common utilities for discovery scanners."""
import re
from typing import List, Optional, Set
from tradingagents.utils.logger import get_logger
logger = get_logger(__name__)
def get_common_stopwords() -> Set[str]:
    """Build the default stopword set used during ticker extraction.

    Every entry is an uppercase token that is shaped like a ticker symbol
    but is really an everyday word, a piece of finance jargon, or a
    calendar abbreviation.

    Returns:
        A newly created set of uppercase words to drop from ticker
        extraction results. Each call returns an independent set, so
        callers may mutate it freely.
    """
    everyday_words = (
        "THE", "AND", "FOR", "ARE", "BUT", "NOT", "YOU", "ALL", "CAN",
        "HER", "WAS", "ONE", "OUR", "OUT", "DAY", "WHO", "HAS", "HAD",
        "NEW", "NOW", "GET", "GOT", "PUT", "SET", "RUN", "TOP", "BIG",
    )
    financial_terms = (
        "CEO", "CFO", "CTO", "COO", "USD", "USA", "SEC", "IPO", "ETF",
        "NYSE", "NASDAQ", "WSB", "DD", "YOLO", "FD", "ATH", "ATL", "GDP",
        "STOCK", "STOCKS", "MARKET", "NEWS", "PRICE", "TRADE", "SALES",
    )
    month_abbreviations = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    )
    weekday_abbreviations = ("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")

    # Merge all categories into one fresh, mutable set.
    return set().union(
        everyday_words,
        financial_terms,
        month_abbreviations,
        weekday_abbreviations,
    )
# 2-5 uppercase letters delimited by word boundaries on BOTH sides.  "$" is a
# non-word character, so "$AAPL" already satisfies the leading boundary; the
# trailing boundary keeps over-long tokens such as "$ABCDEFG" from being cut
# down to a bogus five-letter symbol.
_TICKER_PATTERN = re.compile(r"\b([A-Z]{2,5})\b")


def extract_tickers_from_text(
    text: str, stop_words: Optional[Set[str]] = None, max_text_length: int = 100_000
) -> List[str]:
    """Extract candidate ticker symbols from text.

    Finds tokens of 2-5 uppercase letters (optionally written as $TICKER),
    drops stopwords, and returns each remaining symbol once, in order of
    first appearance.

    Args:
        text: Text to extract tickers from. Empty or None yields an empty list.
        stop_words: Words to exclude from the result. The defaults from
            get_common_stopwords() are used only when this is None; an
            explicitly passed empty set disables filtering.
        max_text_length: Maximum number of characters to scan; longer text
            is truncated (with a warning) to bound the regex work.

    Returns:
        List of unique ticker symbols, ordered by first occurrence in text.

    Example:
        >>> extract_tickers_from_text("I like $AAPL and MSFT stocks")
        ['AAPL', 'MSFT']
    """
    if not text:
        return []

    # Truncate oversized text so the scan cost stays bounded.
    if len(text) > max_text_length:
        logger.warning(f"Truncating oversized text from {len(text)} to {max_text_length} chars")
        text = text[:max_text_length]

    # Only fall back to the defaults when the caller supplied nothing at all.
    if stop_words is None:
        stop_words = get_common_stopwords()

    # Order-preserving de-duplication keeps the output deterministic.
    seen = set()
    tickers = []
    for symbol in _TICKER_PATTERN.findall(text):
        if symbol in seen or symbol in stop_words:
            continue
        seen.add(symbol)
        tickers.append(symbol)
    return tickers
def validate_ticker_format(ticker: str) -> bool:
    """Validate ticker symbol format.

    The check is lenient about presentation: surrounding whitespace is
    ignored and lowercase ASCII letters are accepted, so "aapl" and " AAPL "
    both validate. The value itself is not modified; callers that need a
    canonical symbol must strip/uppercase it themselves.

    Args:
        ticker: Ticker symbol to validate. Non-string values are rejected.

    Returns:
        True if ticker consists of 1-5 ASCII letters after stripping
        whitespace, False otherwise.
        Single-letter tickers (C, A, F, T, X, M, etc.) are valid NYSE symbols.
    """
    # Type check first: truthiness of arbitrary objects (e.g. array-likes)
    # can be ambiguous or raise, and non-strings are never valid anyway.
    if not isinstance(ticker, str):
        return False
    # Match ASCII letters directly instead of upper-casing first: str.upper()
    # expands some non-ASCII characters into ASCII ("ß" -> "SS"), which would
    # let them slip through an [A-Z] check.
    return re.fullmatch(r"[A-Za-z]{1,5}", ticker.strip()) is not None
def validate_candidate_structure(candidate: dict) -> bool:
    """Check that a discovery candidate is a well-formed dictionary.

    A candidate passes when it is a dict containing the keys "ticker",
    "source", "context" and "priority", its ticker passes
    validate_ticker_format(), and its priority is a string. "source" and
    "context" are only checked for presence. Each failed check other than
    the dict type check logs a warning.

    Args:
        candidate: Candidate dictionary to validate

    Returns:
        True if every check passes, False otherwise
    """
    # Anything that is not a dict is rejected silently.
    if not isinstance(candidate, dict):
        return False

    # Presence check for the required keys.
    missing = {"ticker", "source", "context", "priority"} - set(candidate.keys())
    if missing:
        logger.warning(f"Candidate missing required keys: {missing}")
        return False

    # The ticker must have a valid symbol format.
    ticker_ok = validate_ticker_format(candidate.get("ticker", ""))
    if not ticker_ok:
        logger.warning(f"Invalid ticker format: {candidate.get('ticker')}")
        return False

    # The priority must be a string.
    if isinstance(candidate.get("priority"), str):
        return True
    logger.warning(f"Invalid priority type: {type(candidate.get('priority'))}")
    return False