# TradingAgents/tradingagents/dataflows/news_semantic_scanner.py
"""
News Semantic Scanner
---------------------
Scans news from multiple sources, summarizes key themes, and enables semantic
matching against ticker descriptions to find relevant investment opportunities.
Sources:
- OpenAI web search (real-time market news)
- Google Gemini grounded web search (real-time market news)
- SEC EDGAR filings (regulatory news)
- Google News
- Alpha Vantage news
"""
import json
import os
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
import requests
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from openai import OpenAI
from tradingagents.dataflows.discovery.utils import build_llm_log_entry
from tradingagents.schemas import FilingsList, NewsList
from tradingagents.utils.logger import get_logger
load_dotenv()
logger = get_logger(__name__)
class NewsSemanticScanner:
"""Scans and processes news for semantic ticker matching."""
def __init__(self, config: Dict[str, Any]):
"""
Initialize news scanner.
        Args:
            config: Configuration dict with:
                - news_sources: List of sources to use
                - max_news_items: Maximum news items to process
                - news_lookback_hours: How far back to look for news (default: 24 hours)
                - log_callback: Optional callable that receives LLM log entries
        Note:
            The OpenAI API key is read from the OPENAI_API_KEY environment
            variable, not from the config dict.
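        Example (illustrative values):
            scanner = NewsSemanticScanner({
                "news_sources": ["openai", "google_news"],
                "max_news_items": 20,
                "news_lookback_hours": 24,
            })
            news = scanner.scan_news()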
"""
self.config = config
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
raise ValueError("OPENAI_API_KEY not found in environment")
self.openai_client = OpenAI(api_key=openai_api_key)
self.news_sources = config.get("news_sources", ["openai", "google_news"])
self.max_news_items = config.get("max_news_items", 20)
self.news_lookback_hours = config.get("news_lookback_hours", 24)
self.log_callback = config.get("log_callback")
# Calculate time window
self.cutoff_time = datetime.now() - timedelta(hours=self.news_lookback_hours)
def _emit_log(self, entry: Dict[str, Any]) -> None:
if self.log_callback:
try:
self.log_callback(entry)
except Exception:
pass
def _log_llm(
self,
step: str,
model: str,
prompt: Any,
output: Any,
error: str = "",
) -> None:
entry = build_llm_log_entry(
node="semantic_news",
step=step,
model=model,
prompt=prompt,
output=output,
error=error,
)
self._emit_log(entry)
def _get_time_phrase(self) -> str:
"""Generate human-readable time phrase for queries."""
if self.news_lookback_hours <= 1:
return "from the last hour"
elif self.news_lookback_hours <= 6:
return f"from the last {self.news_lookback_hours} hours"
elif self.news_lookback_hours <= 24:
return "from today"
elif self.news_lookback_hours <= 48:
return "from the last 2 days"
else:
days = int(self.news_lookback_hours / 24)
return f"from the last {days} days"
def _deduplicate_news(
self, news_items: List[Dict[str, Any]], similarity_threshold: float = 0.85
) -> List[Dict[str, Any]]:
"""
Deduplicate news items using semantic similarity (embeddings + cosine similarity).
Two-pass approach:
1. Fast hash-based pass for exact/near-exact duplicates
2. Embedding-based cosine similarity for semantically similar stories
Args:
news_items: List of news items from various sources
similarity_threshold: Cosine similarity threshold (0.85 = very similar)
Returns:
Deduplicated list, keeping highest importance version of each story
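        Example: with threshold 0.85, "Apple beats Q3 estimates" and "Apple tops
        third-quarter expectations" would typically collapse into one item, while
        distinct stories about the same company remain separate.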
"""
import hashlib
import re
import numpy as np
if not news_items:
return []
def normalize_text(text: str) -> str:
"""Normalize text for comparison."""
if not text:
return ""
text = text.lower()
text = re.sub(r"[^\w\s]", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def get_content_hash(item: Dict[str, Any]) -> str:
"""Generate hash from normalized title + summary."""
title = normalize_text(item.get("title", ""))
summary = normalize_text(item.get("summary", ""))[:100]
content = title + " " + summary
return hashlib.md5(content.encode()).hexdigest()
def get_news_text(item: Dict[str, Any]) -> str:
"""Get combined text for embedding."""
title = item.get("title", "")
summary = item.get("summary", "")
return f"{title}. {summary}"[:500] # Limit length for efficiency
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Compute cosine similarity between two vectors."""
norm_a = np.linalg.norm(a)
norm_b = np.linalg.norm(b)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a, b) / (norm_a * norm_b))
# === PASS 1: Hash-based deduplication (fast, exact matches) ===
seen_hashes: Dict[str, Dict[str, Any]] = {}
hash_duplicates = 0
for item in news_items:
content_hash = get_content_hash(item)
if content_hash not in seen_hashes:
seen_hashes[content_hash] = item
else:
existing = seen_hashes[content_hash]
if (item.get("importance", 0) or 0) > (existing.get("importance", 0) or 0):
seen_hashes[content_hash] = item
hash_duplicates += 1
after_hash = list(seen_hashes.values())
logger.info(
f"Hash dedup: {len(news_items)}{len(after_hash)} ({hash_duplicates} exact duplicates)"
)
# === PASS 2: Embedding-based semantic similarity ===
# Only run if we have enough items to justify the cost
if len(after_hash) <= 3:
return after_hash
try:
# Generate embeddings for all remaining items
texts = [get_news_text(item) for item in after_hash]
# Use OpenAI embeddings (same as ticker_semantic_db)
response = self.openai_client.embeddings.create(
model="text-embedding-3-small",
input=texts,
)
embeddings = np.array([e.embedding for e in response.data])
# Find semantic duplicates using cosine similarity
unique_indices = []
semantic_duplicates = 0
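            # Greedy O(n^2) pass over items in input order: each new item is compared
            # against every kept item; on a match, the higher-importance version wins.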
for i in range(len(after_hash)):
is_duplicate = False
for j in unique_indices:
sim = cosine_similarity(embeddings[i], embeddings[j])
if sim >= similarity_threshold:
# This is a semantic duplicate
is_duplicate = True
semantic_duplicates += 1
# Keep higher importance version
existing_item = after_hash[j]
new_item = after_hash[i]
if (new_item.get("importance", 0) or 0) > (
existing_item.get("importance", 0) or 0
):
# Replace with higher importance
unique_indices.remove(j)
unique_indices.append(i)
logger.debug(
f"Semantic duplicate (sim={sim:.2f}): "
f"'{new_item.get('title', '')[:40]}' vs "
f"'{existing_item.get('title', '')[:40]}'"
)
break
if not is_duplicate:
unique_indices.append(i)
final_items = [after_hash[i] for i in unique_indices]
logger.info(
f"Semantic dedup: {len(after_hash)}{len(final_items)} "
f"({semantic_duplicates} similar stories merged)"
)
return final_items
except Exception as e:
logger.warning(f"Embedding-based dedup failed, using hash-only results: {e}")
return after_hash
def _filter_by_time(self, news_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Filter news items by timestamp to respect lookback window.
Args:
news_items: List of news items with 'published_at' or 'timestamp' field
Returns:
Filtered list of news items within time window
"""
filtered = []
filtered_out_count = 0
for item in news_items:
timestamp_str = item.get("published_at") or item.get("timestamp")
title_preview = item.get("title", "")[:60]
if not timestamp_str:
# No timestamp, keep it (assume recent)
logger.debug(f"No timestamp for '{title_preview}', keeping")
filtered.append(item)
continue
item_time = self._parse_timestamp(timestamp_str, date_only_end=True)
if not item_time:
# If parsing fails, keep it
logger.debug(f"Parse failed for '{timestamp_str}' on '{title_preview}', keeping")
filtered.append(item)
continue
if item_time >= self.cutoff_time:
filtered.append(item)
else:
filtered_out_count += 1
logger.debug(
f"FILTERED OUT: '{title_preview}' | "
f"published_at='{item.get('published_at')}' | "
f"parsed={item_time.strftime('%Y-%m-%d %H:%M')} | "
f"cutoff={self.cutoff_time.strftime('%Y-%m-%d %H:%M')}"
)
if filtered_out_count > 0:
logger.info(
f"Time filter removed {filtered_out_count} items with timestamps before cutoff"
)
return filtered
def _parse_timestamp(self, timestamp_str: str, date_only_end: bool) -> Optional[datetime]:
"""Parse a timestamp string into a naive datetime, or return None if invalid."""
try:
# Handle date-only strings
if len(timestamp_str) == 10 and timestamp_str[4] == "-" and timestamp_str[7] == "-":
base_time = datetime.fromisoformat(timestamp_str)
if date_only_end:
return base_time.replace(hour=23, minute=59, second=59)
return base_time
# Parse ISO timestamp
parsed_time = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
if parsed_time.tzinfo:
parsed_time = parsed_time.astimezone().replace(tzinfo=None)
return parsed_time
except Exception:
return None
def _publish_date_range(
self, news_items: List[Dict[str, Any]]
) -> Tuple[Optional[datetime], Optional[datetime]]:
"""Get the earliest and latest publish timestamps from a list of news items."""
min_time = None
max_time = None
for item in news_items:
timestamp_str = item.get("published_at") or item.get("timestamp")
if not timestamp_str:
continue
item_time = self._parse_timestamp(timestamp_str, date_only_end=False)
if not item_time:
continue
if min_time is None or item_time < min_time:
min_time = item_time
if max_time is None or item_time > max_time:
max_time = item_time
return min_time, max_time
def _build_time_constraint(self) -> str:
"""Build the shared time constraint block used by all news prompts."""
current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
cutoff_datetime = self.cutoff_time.strftime("%Y-%m-%dT%H:%M:%S")
return (
f"CRITICAL TIME CONSTRAINT:\n"
f"- Current time: {current_datetime}\n"
f"- Only include items published AFTER: {cutoff_datetime}\n"
f"- Skip anything older than {self.news_lookback_hours} hours"
)
def _build_extraction_fields(self, detail_level: str = "full") -> str:
"""Build the shared extraction fields block.
Args:
detail_level: "full" for primary searches, "brief" for parsing raw feeds.
"""
current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
base = "For each item, extract:\n" "- title: Headline\n"
if detail_level == "full":
base += "- summary: 2-3 sentence summary of key points\n"
else:
base += "- summary: Brief summary of key points\n"
base += (
f"- published_at: ISO-8601 timestamp (REQUIRED — convert relative times like '2 hours ago' to full timestamp using current time {current_datetime})\n"
"- companies_mentioned: List of stock ticker symbols (prefer tickers over company names, e.g. 'AAPL' not 'Apple Inc.')\n"
"- themes: Key themes (e.g., 'earnings beat', 'FDA approval', 'merger', 'insider buying')\n"
"- sentiment: one of positive, negative, neutral\n"
"- importance: 1-10 score (10 = highly market-moving, company-specific catalysts score higher than broad market news)"
)
return base
_COMPANY_SPECIFIC_INSTRUCTION = (
"Prefer company-specific or single-catalyst stories that impact one company or a small "
"group of companies. Avoid broad market, index, or macroeconomic headlines unless they "
"have a clear company-specific catalyst. If a story is sector-wide without a specific "
"company catalyst, skip it."
)
def _build_web_search_prompt(self, query: str = "breaking stock market news today") -> str:
"""
Build unified web search prompt for both OpenAI and Gemini.
Args:
query: Search query for news
Returns:
Formatted search prompt string
"""
time_phrase = self._get_time_phrase()
time_query = f"{query} {time_phrase}"
return f"""Search the web for: {time_query}
{self._build_time_constraint()}
Find the top {self.max_news_items} most important market-moving news stories from the last {self.news_lookback_hours} hours.
{self._COMPANY_SPECIFIC_INSTRUCTION}
Focus on:
- Earnings reports and guidance
- FDA approvals / regulatory decisions
- Mergers, acquisitions, partnerships
- Product launches
- Executive changes
- Legal/regulatory actions
- Analyst upgrades/downgrades
{self._build_extraction_fields("full")}
"""
def _build_openai_input(self, system_text: str, user_text: str) -> str:
"""Build Responses API input as a single prompt string."""
if system_text:
return f"{system_text}\n\n{user_text}"
return user_text
def _fetch_openai_news(
self, query: str = "breaking stock market news today"
) -> List[Dict[str, Any]]:
"""
Fetch news using OpenAI's web search capability.
Args:
query: Search query for news
Returns:
List of news items with title, summary, published_at, timestamp
"""
try:
# Build search prompt
search_prompt = self._build_web_search_prompt(query)
# Use OpenAI web search tool for real-time news
response = self.openai_client.responses.parse(
model="gpt-4o",
tools=[{"type": "web_search"}],
input=self._build_openai_input(
"You are a financial news analyst. Search the web for the latest market news "
"and return structured summaries.",
search_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="OpenAI web search",
model="gpt-4o",
prompt=search_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "openai_search"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="OpenAI web search",
model="gpt-4o",
prompt=search_prompt if "search_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching OpenAI news: {e}")
return []
def _fetch_google_news(self, query: str = "stock market") -> List[Dict[str, Any]]:
"""
Fetch news from Google News RSS.
Args:
query: Search query
Returns:
List of news items
"""
try:
# Use Google News helper
from tradingagents.dataflows.google import get_google_news
# Convert hours to days (round up)
lookback_days = max(1, int((self.news_lookback_hours + 23) / 24))
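            # e.g. news_lookback_hours=30 -> lookback_days=2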
news_report = get_google_news(
query=query,
curr_date=datetime.now().strftime("%Y-%m-%d"),
look_back_days=lookback_days,
)
# Parse the report using LLM to extract structured data
parse_prompt = f"""Parse this news report and extract individual news items.
{self._build_time_constraint()}
{self._COMPANY_SPECIFIC_INSTRUCTION}
{news_report}
{self._build_extraction_fields("brief")}
Return as JSON array with key "news"."""
response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract news items from this report into structured JSON format.",
parse_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="Parse Google News",
model="gpt-4o-mini",
prompt=parse_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "google_news"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse Google News",
model="gpt-4o-mini",
prompt=parse_prompt if "parse_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Google News: {e}")
return []
def _fetch_sec_filings(self) -> List[Dict[str, Any]]:
"""
Fetch recent SEC filings (8-K, 13D, 13G - market-moving events).
Returns:
List of filing summaries
"""
try:
# SEC EDGAR API endpoint
# Get recent 8-K filings (material events)
url = "https://www.sec.gov/cgi-bin/browse-edgar"
params = {"action": "getcurrent", "type": "8-K", "output": "atom", "count": 20}
headers = {"User-Agent": "TradingAgents/1.0 (contact@example.com)"}
response = requests.get(url, params=params, headers=headers, timeout=10)
if response.status_code != 200:
return []
# Parse SEC filings using LLM
# (SEC returns XML/Atom feed, we'll parse with LLM for simplicity)
filings_prompt = f"""Parse these SEC 8-K filings and extract the most important material events.
{self._build_time_constraint()}
Prefer company-specific filings and material events; skip broad market commentary or routine filings.
{response.text}
{self._build_extraction_fields("brief")}
Return as JSON array with key "filings"."""
llm_response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract important SEC 8-K filings from this data and summarize the market-moving events.",
filings_prompt,
),
text_format=FilingsList,
)
filings_list = llm_response.output_parsed
filings = [item.model_dump() for item in filings_list.filings]
self._log_llm(
step="Parse SEC filings",
model="gpt-4o-mini",
prompt=filings_prompt,
output=filings,
)
# Add metadata
for filing in filings:
filing["source"] = "sec_edgar"
filing["timestamp"] = datetime.now().isoformat()
return filings[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse SEC filings",
model="gpt-4o-mini",
prompt=filings_prompt if "filings_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching SEC filings: {e}")
return []
def _fetch_alpha_vantage_news(
self, topics: str = "earnings,technology"
) -> List[Dict[str, Any]]:
"""
Fetch news from Alpha Vantage.
Args:
topics: News topics to filter
Returns:
List of news items
"""
try:
from tradingagents.dataflows.alpha_vantage_news import get_alpha_vantage_news_feed
# Use cutoff time for Alpha Vantage
time_from = self.cutoff_time.strftime("%Y%m%dT%H%M")
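            # Alpha Vantage's news API expects time_from in YYYYMMDDTHHMM format.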
news_report = get_alpha_vantage_news_feed(topics=topics, time_from=time_from, limit=50)
# Parse with LLM
parse_prompt = f"""Parse this news feed and extract the most important market-moving stories.
{self._build_time_constraint()}
{self._COMPANY_SPECIFIC_INSTRUCTION}
{news_report}
{self._build_extraction_fields("brief")}
Return as JSON array with key "news"."""
response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract and summarize important market news.",
parse_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="Parse Alpha Vantage news",
model="gpt-4o-mini",
prompt=parse_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "alpha_vantage"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse Alpha Vantage news",
model="gpt-4o-mini",
prompt=parse_prompt if "parse_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Alpha Vantage news: {e}")
return []
def _fetch_gemini_search_news(
self, query: str = "breaking stock market news today"
) -> List[Dict[str, Any]]:
"""
Fetch news using Google Gemini's native web search (grounding) capability.
This uses Gemini's built-in web search tool for real-time market news,
which may provide different results than OpenAI's web search.
Args:
query: Search query for news
Returns:
List of news items with title, summary, published_at, timestamp
"""
try:
# Get API key
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
logger.error("GOOGLE_API_KEY not set, skipping Gemini search")
return []
# Build search prompt
search_prompt = self._build_web_search_prompt(query)
# Step 1: Execute web search using Gemini with google_search tool
search_llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash-lite", # Fast model for search
api_key=google_api_key,
temperature=1.0, # Higher temperature for diverse results
).bind_tools([{"google_search": {}}])
# Execute search
raw_response = search_llm.invoke(search_prompt)
self._log_llm(
step="Gemini search",
model="gemini-2.5-flash-lite",
prompt=search_prompt,
output=raw_response.content if hasattr(raw_response, "content") else raw_response,
)
# Step 2: Structure the results using Gemini with JSON schema
structured_llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash-lite", api_key=google_api_key
).with_structured_output(NewsList, method="json_schema")
structure_prompt = f"""Parse the following web search results into structured news items.
{self._build_time_constraint()}
{self._build_extraction_fields("full")}
Web search results:
{raw_response.content}
Return as JSON with "news" array."""
structured_response = structured_llm.invoke(structure_prompt)
self._log_llm(
step="Gemini search structuring",
model="gemini-2.5-flash-lite",
prompt=structure_prompt,
output=structured_response,
)
# Extract news items
news_items = [item.model_dump() for item in structured_response.news]
# Add metadata
for item in news_items:
item["source"] = "gemini_search"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Gemini search",
model="gemini-2.5-flash-lite",
prompt=search_prompt if "search_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Gemini search news: {e}")
return []
def scan_news(self) -> List[Dict[str, Any]]:
"""
Scan news from all enabled sources.
Returns:
Aggregated list of news items sorted by importance
"""
all_news = []
logger.info("Scanning news sources...")
logger.info(f"Time window: {self._get_time_phrase()} (last {self.news_lookback_hours}h)")
logger.info(f"Cutoff: {self.cutoff_time.strftime('%Y-%m-%d %H:%M')}")
        # Fetch from each enabled source, logging item counts and publish-date ranges
        source_fetchers = [
            ("openai", "OpenAI web search", "OpenAI", self._fetch_openai_news),
            ("google_news", "Google News", "Google News", self._fetch_google_news),
            ("sec_filings", "SEC filings", "SEC", self._fetch_sec_filings),
            ("alpha_vantage", "Alpha Vantage news", "Alpha Vantage", self._fetch_alpha_vantage_news),
            ("gemini_search", "Google Gemini search", "Gemini", self._fetch_gemini_search_news),
        ]
        for source_key, fetch_label, short_label, fetch_fn in source_fetchers:
            if source_key not in self.news_sources:
                continue
            logger.info(f"Fetching {fetch_label}...")
            items = fetch_fn()
            all_news.extend(items)
            logger.info(f"Found {len(items)} items from {short_label}")
            min_date, max_date = self._publish_date_range(items)
            for bound, value in (("Min", min_date), ("Max", max_date)):
                formatted = value.strftime("%Y-%m-%d %H:%M") if value else "N/A"
                logger.debug(f"{bound} publish date ({short_label}): {formatted}")
# Apply time filtering
logger.info(f"Collected {len(all_news)} raw news items")
all_news = self._filter_by_time(all_news)
logger.info(f"After time filtering: {len(all_news)} items")
# Deduplicate news from multiple sources (same story = same hash)
all_news = self._deduplicate_news(all_news)
logger.info(f"After deduplication: {len(all_news)} items")
# Sort by importance
        all_news.sort(key=lambda x: x.get("importance", 0) or 0, reverse=True)
logger.info(f"Total news items collected: {len(all_news)}")
return all_news[: self.max_news_items]
def generate_news_summary(self, news_item: Dict[str, Any]) -> str:
"""
Generate a semantic search-optimized summary for a news item.
Args:
news_item: News item dict
Returns:
Optimized summary text for embedding/matching
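        Example (hypothetical item): a title "Acme wins FDA approval" with theme
        "FDA approval" and ticker "ACME" yields:
            Acme wins FDA approval
            <summary text>
            Key themes: FDA approval
            Companies mentioned: ACME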
"""
title = news_item.get("title", "")
summary = news_item.get("summary", "")
themes = news_item.get("themes", [])
companies = news_item.get("companies_mentioned", [])
# Create rich text for semantic matching
search_text = f"""
{title}
{summary}
Key themes: {', '.join(themes) if themes else 'General market news'}
Companies mentioned: {', '.join(companies) if companies else 'Broad market'}
""".strip()
return search_text
def main():
"""CLI for testing news scanner."""
import argparse
parser = argparse.ArgumentParser(description="Scan news for semantic ticker matching")
parser.add_argument(
"--sources",
nargs="+",
default=["openai"],
choices=["openai", "google_news", "sec_filings", "alpha_vantage", "gemini_search"],
help="News sources to use",
)
parser.add_argument("--max-items", type=int, default=10, help="Maximum news items to fetch")
parser.add_argument(
"--lookback-hours",
type=int,
default=24,
help="How far back to look for news (in hours). Examples: 1 (last hour), 6 (last 6 hours), 24 (last day), 168 (last week)",
)
parser.add_argument("--output", type=str, help="Output file for news JSON")
args = parser.parse_args()
config = {
"news_sources": args.sources,
"max_news_items": args.max_items,
"news_lookback_hours": args.lookback_hours,
}
scanner = NewsSemanticScanner(config)
news_items = scanner.scan_news()
# Display results
logger.info("\n" + "=" * 60)
logger.info(f"Top {min(5, len(news_items))} Most Important News Items:")
logger.info("=" * 60 + "\n")
for i, item in enumerate(news_items[:5], 1):
logger.info(f"{i}. {item.get('title', 'Untitled')}")
logger.info(f" Source: {item.get('source', 'unknown')}")
logger.info(f" Importance: {item.get('importance', 'N/A')}/10")
logger.info(f" Summary: {item.get('summary', '')[:150]}...")
logger.info(f" Themes: {', '.join(item.get('themes', []))}")
logger.info("")
# Save to file if specified
if args.output:
with open(args.output, "w") as f:
json.dump(news_items, f, indent=2)
logger.info(f"✅ Saved {len(news_items)} news items to {args.output}")
if __name__ == "__main__":
main()