""" Semantic Discovery System ------------------------ Combines news scanning with ticker semantic matching to discover investment opportunities based on breaking news before they show up in social media or price action. Flow: 1. Scan news from multiple sources 2. Generate embeddings for each news item 3. Match news against ticker descriptions semantically 4. Filter and rank opportunities 5. Return actionable ticker candidates """ import re from datetime import datetime from typing import Any, Dict, List from dotenv import load_dotenv from tradingagents.dataflows.news_semantic_scanner import NewsSemanticScanner from tradingagents.dataflows.ticker_semantic_db import TickerSemanticDB from tradingagents.utils.logger import get_logger load_dotenv() logger = get_logger(__name__) class SemanticDiscovery: """Discovers investment opportunities through news-ticker semantic matching.""" def __init__(self, config: Dict[str, Any]): """ Initialize semantic discovery system. Args: config: Configuration dict with settings for both ticker DB and news scanner """ self.config = config # Initialize ticker database self.ticker_db = TickerSemanticDB(config) # Initialize news scanner self.news_scanner = NewsSemanticScanner(config) # Discovery settings self.min_similarity_threshold = config.get("min_similarity_threshold", 0.3) self.min_news_importance = config.get("min_news_importance", 5) self.max_tickers_per_news = config.get("max_tickers_per_news", 5) self.max_total_candidates = config.get("max_total_candidates", 20) self.news_sentiment_filter = config.get("news_sentiment_filter", "positive") self.group_by_news = config.get("group_by_news", False) def _extract_tickers(self, mentions: List[str]) -> List[str]: from tradingagents.dataflows.discovery.utils import is_valid_ticker tickers = set() for mention in mentions or []: for match in re.findall(r"\b[A-Z]{1,5}\b", str(mention)): # APPLY VALIDATION IMMEDIATELY if is_valid_ticker(match): tickers.add(match) return sorted(tickers) def get_directly_mentioned_tickers(self) -> List[Dict[str, Any]]: """ Get tickers that are directly mentioned in news (highest signal). This extracts tickers from the 'companies_mentioned' field of news items, which represents explicit company references rather than semantic matches. Returns: List of ticker info dicts with news context """ # Scan news if not already done news_items = self.news_scanner.scan_news() # Filter by importance important_news = [ item for item in news_items if item.get("importance", 0) >= self.min_news_importance ] # Extract directly mentioned tickers mentioned_tickers = {} # ticker -> list of news items # Common words to exclude (not tickers) exclude_words = { "A", "I", "AN", "AI", "CEO", "CFO", "CTO", "FDA", "SEC", "IPO", "ETF", "GDP", "CPI", "FED", "NYSE", "Q1", "Q2", "Q3", "Q4", "US", "UK", "EU", "AT", "BE", "BY", "DO", "GO", "IF", "IN", "IS", "IT", "ME", "MY", "NO", "OF", "ON", "OR", "SO", "TO", "UP", "WE", "ALL", "ARE", "FOR", "HAS", "NEW", "NOW", "OLD", "OUR", "OUT", "THE", "TOP", "TWO", "WAS", "WHO", "WHY", "WIN", "BUY", "COO", "EPS", "P/E", "ROE", "ROI", # Common business abbreviations that aren't tickers "INC", "CO", "LLC", "LTD", "CORP", "PLC", "AG", "SA", "SE", "NV", "GAS", "OIL", "MGE", "LG", # Common words/abbreviations from logs # Single/two-letter words often false positives "AM", "AS", } for news_item in important_news: companies = news_item.get("companies_mentioned", []) extracted = self._extract_tickers(companies) for ticker in extracted: if ticker in exclude_words: continue if len(ticker) < 2: continue if ticker not in mentioned_tickers: mentioned_tickers[ticker] = [] mentioned_tickers[ticker].append( { "news_title": news_item.get("title", ""), "news_summary": news_item.get("summary", ""), "sentiment": news_item.get("sentiment", "neutral"), "importance": news_item.get("importance", 5), "themes": news_item.get("themes", []), "source": news_item.get("source", "unknown"), } ) # Convert to list format, prioritizing by news importance result = [] for ticker, news_list in mentioned_tickers.items(): # Use the most important news item as primary best_news = max(news_list, key=lambda x: x["importance"]) result.append( { "ticker": ticker, "news_title": best_news["news_title"], "news_summary": best_news["news_summary"], "sentiment": best_news["sentiment"], "importance": best_news["importance"], "themes": best_news["themes"], "source": best_news["source"], "mention_count": len(news_list), } ) # Sort by importance and mention count result.sort(key=lambda x: (x["importance"], x["mention_count"]), reverse=True) logger.info(f"šŸ“Œ Found {len(result)} directly mentioned tickers in news") return result[: self.max_total_candidates] def discover(self) -> List[Dict[str, Any]]: """ Run semantic discovery to find ticker opportunities. Returns: List of ticker candidates with news context and relevance scores """ logger.info("=" * 60) logger.info("šŸš€ SEMANTIC DISCOVERY") logger.info("=" * 60) # Step 1: Scan news news_items = self.news_scanner.scan_news() if not news_items: logger.info("No news items found.") return [] # Filter news by importance threshold important_news = [ item for item in news_items if item.get("importance", 0) >= self.min_news_importance ] logger.info(f"šŸ“° Processing {len(important_news)} high-importance news items...") logger.info(f"(Filtered from {len(news_items)} total items)") if self.news_sentiment_filter: before_count = len(important_news) important_news = [ item for item in important_news if item.get("sentiment", "").lower() == self.news_sentiment_filter ] logger.info( f"Sentiment filter: {self.news_sentiment_filter} " f"({len(important_news)}/{before_count} kept)" ) # Step 2: For each news item, find matching tickers all_candidates = [] news_ticker_map = {} # Track which news items match which tickers news_groups = {} # Track which tickers match each news item for i, news_item in enumerate(important_news, 1): title = news_item.get("title", "Untitled") logger.info(f"{i}. {title}") logger.debug(f"Importance: {news_item.get('importance', 0)}/10") mentioned_tickers = self._extract_tickers(news_item.get("companies_mentioned", [])) # Generate search query from news search_text = self.news_scanner.generate_news_summary(news_item) # Search ticker database matches = self.ticker_db.search_by_text( query_text=search_text, top_k=self.max_tickers_per_news ) # Filter by similarity threshold relevant_matches = [ match for match in matches if match["similarity_score"] >= self.min_similarity_threshold ] if relevant_matches: logger.info(f"Found {len(relevant_matches)} relevant tickers:") news_key = ( f"{title}|{news_item.get('source', '')}|" f"{news_item.get('published_at') or news_item.get('timestamp', '')}" ) if news_key not in news_groups: news_groups[news_key] = { "news_title": title, "news_summary": news_item.get("summary", ""), "news_importance": news_item.get("importance", 0), "news_themes": news_item.get("themes", []), "news_sentiment": news_item.get("sentiment"), "news_source": news_item.get("source"), "published_at": news_item.get("published_at"), "timestamp": news_item.get("timestamp"), "mentioned_tickers": mentioned_tickers, "tickers": [], } for match in relevant_matches: symbol = match["symbol"] score = match["similarity_score"] logger.debug(f"{symbol} (similarity: {score:.3f})") # Track news-ticker mapping if symbol not in news_ticker_map: news_ticker_map[symbol] = [] news_ticker_map[symbol].append( { "news_title": title, "news_summary": news_item.get("summary", ""), "news_importance": news_item.get("importance", 0), "news_themes": news_item.get("themes", []), "news_sentiment": news_item.get("sentiment"), "news_tickers_mentioned": mentioned_tickers, "similarity_score": score, "timestamp": news_item.get("timestamp"), "source": news_item.get("source"), } ) if symbol not in {t["ticker"] for t in news_groups[news_key]["tickers"]}: news_groups[news_key]["tickers"].append( { "ticker": symbol, "similarity_score": score, "ticker_name": match["metadata"]["name"], "ticker_sector": match["metadata"]["sector"], "ticker_industry": match["metadata"]["industry"], } ) # Add to candidates all_candidates.append( { "ticker": symbol, "ticker_name": match["metadata"]["name"], "ticker_sector": match["metadata"]["sector"], "ticker_industry": match["metadata"]["industry"], "news_title": title, "news_summary": news_item.get("summary", ""), "news_importance": news_item.get("importance", 0), "news_themes": news_item.get("themes", []), "news_sentiment": news_item.get("sentiment"), "news_tickers_mentioned": mentioned_tickers, "similarity_score": score, "news_source": news_item.get("source"), "discovery_timestamp": datetime.now().isoformat(), } ) else: logger.debug("No relevant tickers found (below threshold)") if self.group_by_news: grouped_candidates = [] for news_entry in news_groups.values(): tickers = news_entry["tickers"] if not tickers: continue avg_similarity = sum(t["similarity_score"] for t in tickers) / len(tickers) aggregate_score = ( (news_entry["news_importance"] * 1.5) + (avg_similarity * 3.0) + (len(tickers) * 0.5) ) grouped_candidates.append( { **news_entry, "num_tickers": len(tickers), "avg_similarity": round(avg_similarity, 3), "aggregate_score": round(aggregate_score, 2), } ) grouped_candidates.sort(key=lambda x: x["aggregate_score"], reverse=True) grouped_candidates = grouped_candidates[: self.max_total_candidates] logger.info("šŸ“Š Aggregating and ranking news items...") logger.info(f"Identified {len(grouped_candidates)} news items with tickers") return grouped_candidates # Step 3: Aggregate and rank candidates logger.info("šŸ“Š Aggregating and ranking candidates...") # Group by ticker and calculate aggregate scores ticker_aggregates = {} for ticker, news_matches in news_ticker_map.items(): # Calculate aggregate score # Factors: number of news matches, importance, similarity num_matches = len(news_matches) avg_importance = sum(n["news_importance"] for n in news_matches) / num_matches avg_similarity = sum(n["similarity_score"] for n in news_matches) / num_matches max_importance = max(n["news_importance"] for n in news_matches) # Weighted score aggregate_score = ( (num_matches * 2.0) # More news = higher score + (avg_importance * 1.5) # Average importance + (avg_similarity * 3.0) # Similarity strength + (max_importance * 1.0) # Bonus for having one very important match ) ticker_aggregates[ticker] = { "ticker": ticker, "num_news_matches": num_matches, "avg_importance": round(avg_importance, 2), "avg_similarity": round(avg_similarity, 3), "max_importance": max_importance, "aggregate_score": round(aggregate_score, 2), "news_matches": news_matches, } # Sort by aggregate score ranked_candidates = sorted( ticker_aggregates.values(), key=lambda x: x["aggregate_score"], reverse=True ) # Limit to max candidates ranked_candidates = ranked_candidates[: self.max_total_candidates] logger.info(f"Identified {len(ranked_candidates)} unique ticker candidates") return ranked_candidates def format_discovery_report(self, candidates: List[Dict[str, Any]]) -> str: """ Format discovery results as a readable report. Args: candidates: List of ranked candidates Returns: Formatted text report """ if not candidates: return "No opportunities discovered." if "tickers" in candidates[0]: report = "\n" + "=" * 60 report += "\nšŸ“° NEWS-DRIVEN RESULTS" report += "\n" + "=" * 60 + "\n" for i, news in enumerate(candidates, 1): title = news["news_title"] score = news["aggregate_score"] num_tickers = news["num_tickers"] importance = news["news_importance"] report += f"\n{i}. {title}" report += f"\n Score: {score:.2f} | Tickers: {num_tickers} | Importance: {importance}/10" report += f"\n Source: {news.get('news_source', 'unknown')}" if news.get("news_themes"): report += f"\n Themes: {', '.join(news['news_themes'])}" if news.get("news_summary"): report += f"\n Summary: {news['news_summary']}" if news.get("mentioned_tickers"): report += f"\n Mentioned Tickers: {', '.join(news['mentioned_tickers'])}" tickers = sorted(news["tickers"], key=lambda x: x["similarity_score"], reverse=True) report += "\n Related Tickers:" for j, ticker_info in enumerate(tickers[:5], 1): report += ( f"\n {j}. {ticker_info['ticker']} " f"(similarity: {ticker_info['similarity_score']:.3f})" ) if len(tickers) > 5: report += f"\n ... and {len(tickers) - 5} more" report += "\n" return report report = "\n" + "=" * 60 report += "\nšŸŽÆ SEMANTIC DISCOVERY RESULTS" report += "\n" + "=" * 60 + "\n" for i, candidate in enumerate(candidates, 1): ticker = candidate["ticker"] score = candidate["aggregate_score"] num_matches = candidate["num_news_matches"] avg_importance = candidate["avg_importance"] report += f"\n{i}. {ticker}" report += f"\n Score: {score:.2f} | Matches: {num_matches} | Avg Importance: {avg_importance}/10" report += "\n Related News:" for j, news in enumerate(candidate["news_matches"][:3], 1): # Show top 3 news report += f"\n {j}. {news['news_title']}" report += f"\n Similarity: {news['similarity_score']:.3f} | Importance: {news['news_importance']}/10" if news.get("news_themes"): report += f"\n Themes: {', '.join(news['news_themes'])}" if len(candidate["news_matches"]) > 3: report += f"\n ... and {len(candidate['news_matches']) - 3} more" report += "\n" return report def main(): """CLI for running semantic discovery.""" import argparse import json parser = argparse.ArgumentParser(description="Run semantic discovery") parser.add_argument( "--news-sources", nargs="+", default=["openai"], choices=["openai", "google_news", "sec_filings", "alpha_vantage", "gemini_search"], help="News sources to use", ) parser.add_argument( "--min-importance", type=int, default=5, help="Minimum news importance (1-10)" ) parser.add_argument( "--min-similarity", type=float, default=0.2, help="Minimum similarity threshold (0-1)" ) parser.add_argument( "--max-candidates", type=int, default=15, help="Maximum ticker candidates to return" ) parser.add_argument( "--lookback-hours", type=int, default=24, help="How far back to look for news (in hours). Examples: 1, 6, 24, 168", ) parser.add_argument("--output", type=str, help="Output file for results JSON") parser.add_argument( "--group-by-news", action="store_true", help="Group results by news item instead of ticker" ) args = parser.parse_args() # Load project config from tradingagents.default_config import DEFAULT_CONFIG config = { "project_dir": DEFAULT_CONFIG["project_dir"], "use_openai_embeddings": True, "news_sources": args.news_sources, "news_lookback_hours": args.lookback_hours, "min_news_importance": args.min_importance, "min_similarity_threshold": args.min_similarity, "max_tickers_per_news": 5, "max_total_candidates": args.max_candidates, "news_sentiment_filter": "positive", "group_by_news": args.group_by_news, } # Run discovery discovery = SemanticDiscovery(config) candidates = discovery.discover() # Display report report = discovery.format_discovery_report(candidates) logger.info(report) # Save to file if specified if args.output: with open(args.output, "w") as f: json.dump(candidates, f, indent=2) logger.info(f"āœ… Saved {len(candidates)} candidates to {args.output}") if __name__ == "__main__": main()