180 lines
4.3 KiB
Python
180 lines
4.3 KiB
Python
"""Common utilities for discovery scanners."""
|
|
|
|
import re
|
|
from typing import List, Optional, Set
|
|
|
|
from tradingagents.utils.logger import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
def get_common_stopwords() -> Set[str]:
|
|
"""Get common words that look like tickers but aren't.
|
|
|
|
Returns:
|
|
Set of uppercase words to filter out from ticker extraction
|
|
"""
|
|
return {
|
|
# Common words
|
|
"THE",
|
|
"AND",
|
|
"FOR",
|
|
"ARE",
|
|
"BUT",
|
|
"NOT",
|
|
"YOU",
|
|
"ALL",
|
|
"CAN",
|
|
"HER",
|
|
"WAS",
|
|
"ONE",
|
|
"OUR",
|
|
"OUT",
|
|
"DAY",
|
|
"WHO",
|
|
"HAS",
|
|
"HAD",
|
|
"NEW",
|
|
"NOW",
|
|
"GET",
|
|
"GOT",
|
|
"PUT",
|
|
"SET",
|
|
"RUN",
|
|
"TOP",
|
|
"BIG",
|
|
# Financial terms
|
|
"CEO",
|
|
"CFO",
|
|
"CTO",
|
|
"COO",
|
|
"USD",
|
|
"USA",
|
|
"SEC",
|
|
"IPO",
|
|
"ETF",
|
|
"NYSE",
|
|
"NASDAQ",
|
|
"WSB",
|
|
"DD",
|
|
"YOLO",
|
|
"FD",
|
|
"ATH",
|
|
"ATL",
|
|
"GDP",
|
|
"STOCK",
|
|
"STOCKS",
|
|
"MARKET",
|
|
"NEWS",
|
|
"PRICE",
|
|
"TRADE",
|
|
"SALES",
|
|
# Time
|
|
"JAN",
|
|
"FEB",
|
|
"MAR",
|
|
"APR",
|
|
"MAY",
|
|
"JUN",
|
|
"JUL",
|
|
"AUG",
|
|
"SEP",
|
|
"OCT",
|
|
"NOV",
|
|
"DEC",
|
|
"MON",
|
|
"TUE",
|
|
"WED",
|
|
"THU",
|
|
"FRI",
|
|
"SAT",
|
|
"SUN",
|
|
}
|
|
|
|
|
|
def extract_tickers_from_text(
|
|
text: str, stop_words: Optional[Set[str]] = None, max_text_length: int = 100_000
|
|
) -> List[str]:
|
|
"""Extract valid ticker symbols from text.
|
|
|
|
Uses regex patterns to find potential tickers ($TICKER or standalone TICKER),
|
|
filters out common stopwords, and returns deduplicated list.
|
|
|
|
Args:
|
|
text: Text to extract tickers from
|
|
stop_words: Custom stopwords to filter (uses defaults if None)
|
|
max_text_length: Maximum text length to process (prevents ReDoS)
|
|
|
|
Returns:
|
|
List of unique ticker symbols found in text
|
|
|
|
Example:
|
|
>>> extract_tickers_from_text("I like $AAPL and MSFT stocks")
|
|
['AAPL', 'MSFT']
|
|
"""
|
|
# Truncate oversized text to prevent ReDoS
|
|
if len(text) > max_text_length:
|
|
logger.warning(f"Truncating oversized text from {len(text)} to {max_text_length} chars")
|
|
text = text[:max_text_length]
|
|
|
|
# Match: $TICKER or standalone TICKER (2-5 uppercase letters)
|
|
ticker_pattern = r"\b([A-Z]{2,5})\b|\$([A-Z]{2,5})"
|
|
matches = re.findall(ticker_pattern, text)
|
|
|
|
# Flatten tuples and deduplicate
|
|
tickers = list(set([t[0] or t[1] for t in matches if t[0] or t[1]]))
|
|
|
|
# Filter stopwords
|
|
stop_words = stop_words or get_common_stopwords()
|
|
filtered_tickers = [t for t in tickers if t not in stop_words]
|
|
|
|
return filtered_tickers
|
|
|
|
|
|
def validate_ticker_format(ticker: str) -> bool:
|
|
"""Validate ticker symbol format.
|
|
|
|
Args:
|
|
ticker: Ticker symbol to validate
|
|
|
|
Returns:
|
|
True if ticker matches expected format (1-5 uppercase letters).
|
|
Single-letter tickers (C, A, F, T, X, M, etc.) are valid NYSE symbols.
|
|
"""
|
|
if not ticker or not isinstance(ticker, str):
|
|
return False
|
|
|
|
return bool(re.match(r"^[A-Z]{1,5}$", ticker.strip().upper()))
|
|
|
|
|
|
def validate_candidate_structure(candidate: dict) -> bool:
|
|
"""Validate candidate dictionary has required keys.
|
|
|
|
Args:
|
|
candidate: Candidate dictionary to validate
|
|
|
|
Returns:
|
|
True if candidate has all required keys with valid types
|
|
"""
|
|
required_keys = {"ticker", "source", "context", "priority"}
|
|
|
|
if not isinstance(candidate, dict):
|
|
return False
|
|
|
|
if not required_keys.issubset(candidate.keys()):
|
|
missing = required_keys - set(candidate.keys())
|
|
logger.warning(f"Candidate missing required keys: {missing}")
|
|
return False
|
|
|
|
# Validate ticker format
|
|
if not validate_ticker_format(candidate.get("ticker", "")):
|
|
logger.warning(f"Invalid ticker format: {candidate.get('ticker')}")
|
|
return False
|
|
|
|
# Validate priority is string
|
|
if not isinstance(candidate.get("priority"), str):
|
|
logger.warning(f"Invalid priority type: {type(candidate.get('priority'))}")
|
|
return False
|
|
|
|
return True
|