""" News Semantic Scanner -------------------- Scans news from multiple sources, summarizes key themes, and enables semantic matching against ticker descriptions to find relevant investment opportunities. Sources: - OpenAI web search (real-time market news) - SEC EDGAR filings (regulatory news) - Google News - Alpha Vantage news """ import json import os from datetime import datetime, timedelta from typing import Any, Dict, List, Optional, Tuple import requests from dotenv import load_dotenv from langchain_google_genai import ChatGoogleGenerativeAI from openai import OpenAI from tradingagents.dataflows.discovery.utils import build_llm_log_entry from tradingagents.schemas import FilingsList, NewsList from tradingagents.utils.logger import get_logger load_dotenv() logger = get_logger(__name__) class NewsSemanticScanner: """Scans and processes news for semantic ticker matching.""" def __init__(self, config: Dict[str, Any]): """ Initialize news scanner. Args: config: Configuration dict with: - openai_api_key: OpenAI API key - news_sources: List of sources to use - max_news_items: Maximum news items to process - news_lookback_hours: How far back to look for news (default: 24 hours) """ self.config = config openai_api_key = os.getenv("OPENAI_API_KEY") if not openai_api_key: raise ValueError("OPENAI_API_KEY not found in environment") self.openai_client = OpenAI(api_key=openai_api_key) self.news_sources = config.get("news_sources", ["openai", "google_news"]) self.max_news_items = config.get("max_news_items", 20) self.news_lookback_hours = config.get("news_lookback_hours", 24) self.log_callback = config.get("log_callback") # Calculate time window self.cutoff_time = datetime.now() - timedelta(hours=self.news_lookback_hours) def _emit_log(self, entry: Dict[str, Any]) -> None: if self.log_callback: try: self.log_callback(entry) except Exception: pass def _log_llm( self, step: str, model: str, prompt: Any, output: Any, error: str = "", ) -> None: entry = build_llm_log_entry( node="semantic_news", step=step, model=model, prompt=prompt, output=output, error=error, ) self._emit_log(entry) def _get_time_phrase(self) -> str: """Generate human-readable time phrase for queries.""" if self.news_lookback_hours <= 1: return "from the last hour" elif self.news_lookback_hours <= 6: return f"from the last {self.news_lookback_hours} hours" elif self.news_lookback_hours <= 24: return "from today" elif self.news_lookback_hours <= 48: return "from the last 2 days" else: days = int(self.news_lookback_hours / 24) return f"from the last {days} days" def _deduplicate_news( self, news_items: List[Dict[str, Any]], similarity_threshold: float = 0.85 ) -> List[Dict[str, Any]]: """ Deduplicate news items using semantic similarity (embeddings + cosine similarity). Two-pass approach: 1. Fast hash-based pass for exact/near-exact duplicates 2. Embedding-based cosine similarity for semantically similar stories Args: news_items: List of news items from various sources similarity_threshold: Cosine similarity threshold (0.85 = very similar) Returns: Deduplicated list, keeping highest importance version of each story """ import hashlib import re import numpy as np if not news_items: return [] def normalize_text(text: str) -> str: """Normalize text for comparison.""" if not text: return "" text = text.lower() text = re.sub(r"[^\w\s]", "", text) text = re.sub(r"\s+", " ", text).strip() return text def get_content_hash(item: Dict[str, Any]) -> str: """Generate hash from normalized title + summary.""" title = normalize_text(item.get("title", "")) summary = normalize_text(item.get("summary", ""))[:100] content = title + " " + summary return hashlib.md5(content.encode()).hexdigest() def get_news_text(item: Dict[str, Any]) -> str: """Get combined text for embedding.""" title = item.get("title", "") summary = item.get("summary", "") return f"{title}. {summary}"[:500] # Limit length for efficiency def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: """Compute cosine similarity between two vectors.""" norm_a = np.linalg.norm(a) norm_b = np.linalg.norm(b) if norm_a == 0 or norm_b == 0: return 0.0 return float(np.dot(a, b) / (norm_a * norm_b)) # === PASS 1: Hash-based deduplication (fast, exact matches) === seen_hashes: Dict[str, Dict[str, Any]] = {} hash_duplicates = 0 for item in news_items: content_hash = get_content_hash(item) if content_hash not in seen_hashes: seen_hashes[content_hash] = item else: existing = seen_hashes[content_hash] if (item.get("importance", 0) or 0) > (existing.get("importance", 0) or 0): seen_hashes[content_hash] = item hash_duplicates += 1 after_hash = list(seen_hashes.values()) logger.info( f"Hash dedup: {len(news_items)} → {len(after_hash)} ({hash_duplicates} exact duplicates)" ) # === PASS 2: Embedding-based semantic similarity === # Only run if we have enough items to justify the cost if len(after_hash) <= 3: return after_hash try: # Generate embeddings for all remaining items texts = [get_news_text(item) for item in after_hash] # Use OpenAI embeddings (same as ticker_semantic_db) response = self.openai_client.embeddings.create( model="text-embedding-3-small", input=texts, ) embeddings = np.array([e.embedding for e in response.data]) # Find semantic duplicates using cosine similarity unique_indices = [] semantic_duplicates = 0 for i in range(len(after_hash)): is_duplicate = False for j in unique_indices: sim = cosine_similarity(embeddings[i], embeddings[j]) if sim >= similarity_threshold: # This is a semantic duplicate is_duplicate = True semantic_duplicates += 1 # Keep higher importance version existing_item = after_hash[j] new_item = after_hash[i] if (new_item.get("importance", 0) or 0) > ( existing_item.get("importance", 0) or 0 ): # Replace with higher importance unique_indices.remove(j) unique_indices.append(i) logger.debug( f"Semantic duplicate (sim={sim:.2f}): " f"'{new_item.get('title', '')[:40]}' vs " f"'{existing_item.get('title', '')[:40]}'" ) break if not is_duplicate: unique_indices.append(i) final_items = [after_hash[i] for i in unique_indices] logger.info( f"Semantic dedup: {len(after_hash)} → {len(final_items)} " f"({semantic_duplicates} similar stories merged)" ) return final_items except Exception as e: logger.warning(f"Embedding-based dedup failed, using hash-only results: {e}") return after_hash def _filter_by_time(self, news_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Filter news items by timestamp to respect lookback window. Args: news_items: List of news items with 'published_at' or 'timestamp' field Returns: Filtered list of news items within time window """ filtered = [] filtered_out_count = 0 for item in news_items: timestamp_str = item.get("published_at") or item.get("timestamp") title_preview = item.get("title", "")[:60] if not timestamp_str: # No timestamp, keep it (assume recent) logger.debug(f"No timestamp for '{title_preview}', keeping") filtered.append(item) continue item_time = self._parse_timestamp(timestamp_str, date_only_end=True) if not item_time: # If parsing fails, keep it logger.debug(f"Parse failed for '{timestamp_str}' on '{title_preview}', keeping") filtered.append(item) continue if item_time >= self.cutoff_time: filtered.append(item) else: filtered_out_count += 1 logger.debug( f"FILTERED OUT: '{title_preview}' | " f"published_at='{item.get('published_at')}' | " f"parsed={item_time.strftime('%Y-%m-%d %H:%M')} | " f"cutoff={self.cutoff_time.strftime('%Y-%m-%d %H:%M')}" ) if filtered_out_count > 0: logger.info( f"Time filter removed {filtered_out_count} items with timestamps before cutoff" ) return filtered def _parse_timestamp(self, timestamp_str: str, date_only_end: bool) -> Optional[datetime]: """Parse a timestamp string into a naive datetime, or return None if invalid.""" try: # Handle date-only strings if len(timestamp_str) == 10 and timestamp_str[4] == "-" and timestamp_str[7] == "-": base_time = datetime.fromisoformat(timestamp_str) if date_only_end: return base_time.replace(hour=23, minute=59, second=59) return base_time # Parse ISO timestamp parsed_time = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")) if parsed_time.tzinfo: parsed_time = parsed_time.astimezone().replace(tzinfo=None) return parsed_time except Exception: return None def _publish_date_range( self, news_items: List[Dict[str, Any]] ) -> Tuple[Optional[datetime], Optional[datetime]]: """Get the earliest and latest publish timestamps from a list of news items.""" min_time = None max_time = None for item in news_items: timestamp_str = item.get("published_at") or item.get("timestamp") if not timestamp_str: continue item_time = self._parse_timestamp(timestamp_str, date_only_end=False) if not item_time: continue if min_time is None or item_time < min_time: min_time = item_time if max_time is None or item_time > max_time: max_time = item_time return min_time, max_time def _build_time_constraint(self) -> str: """Build the shared time constraint block used by all news prompts.""" current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") cutoff_datetime = self.cutoff_time.strftime("%Y-%m-%dT%H:%M:%S") return ( f"CRITICAL TIME CONSTRAINT:\n" f"- Current time: {current_datetime}\n" f"- Only include items published AFTER: {cutoff_datetime}\n" f"- Skip anything older than {self.news_lookback_hours} hours" ) def _build_extraction_fields(self, detail_level: str = "full") -> str: """Build the shared extraction fields block. Args: detail_level: "full" for primary searches, "brief" for parsing raw feeds. """ current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") base = "For each item, extract:\n" "- title: Headline\n" if detail_level == "full": base += "- summary: 2-3 sentence summary of key points\n" else: base += "- summary: Brief summary of key points\n" base += ( f"- published_at: ISO-8601 timestamp (REQUIRED — convert relative times like '2 hours ago' to full timestamp using current time {current_datetime})\n" "- companies_mentioned: List of stock ticker symbols (prefer tickers over company names, e.g. 'AAPL' not 'Apple Inc.')\n" "- themes: Key themes (e.g., 'earnings beat', 'FDA approval', 'merger', 'insider buying')\n" "- sentiment: one of positive, negative, neutral\n" "- importance: 1-10 score (10 = highly market-moving, company-specific catalysts score higher than broad market news)" ) return base _COMPANY_SPECIFIC_INSTRUCTION = ( "Prefer company-specific or single-catalyst stories that impact one company or a small " "group of companies. Avoid broad market, index, or macroeconomic headlines unless they " "have a clear company-specific catalyst. If a story is sector-wide without a specific " "company catalyst, skip it." ) def _build_web_search_prompt(self, query: str = "breaking stock market news today") -> str: """ Build unified web search prompt for both OpenAI and Gemini. Args: query: Search query for news Returns: Formatted search prompt string """ time_phrase = self._get_time_phrase() time_query = f"{query} {time_phrase}" return f"""Search the web for: {time_query} {self._build_time_constraint()} Find the top {self.max_news_items} most important market-moving news stories from the last {self.news_lookback_hours} hours. {self._COMPANY_SPECIFIC_INSTRUCTION} Focus on: - Earnings reports and guidance - FDA approvals / regulatory decisions - Mergers, acquisitions, partnerships - Product launches - Executive changes - Legal/regulatory actions - Analyst upgrades/downgrades {self._build_extraction_fields("full")} """ def _build_openai_input(self, system_text: str, user_text: str) -> str: """Build Responses API input as a single prompt string.""" if system_text: return f"{system_text}\n\n{user_text}" return user_text def _fetch_openai_news( self, query: str = "breaking stock market news today" ) -> List[Dict[str, Any]]: """ Fetch news using OpenAI's web search capability. Args: query: Search query for news Returns: List of news items with title, summary, published_at, timestamp """ try: # Build search prompt search_prompt = self._build_web_search_prompt(query) # Use OpenAI web search tool for real-time news response = self.openai_client.responses.parse( model="gpt-4o", tools=[{"type": "web_search"}], input=self._build_openai_input( "You are a financial news analyst. Search the web for the latest market news " "and return structured summaries.", search_prompt, ), text_format=NewsList, ) news_list = response.output_parsed news_items = [item.model_dump() for item in news_list.news] self._log_llm( step="OpenAI web search", model="gpt-4o", prompt=search_prompt, output=news_items, ) # Add metadata for item in news_items: item["source"] = "openai_search" item["timestamp"] = datetime.now().isoformat() return news_items[: self.max_news_items] except Exception as e: self._log_llm( step="OpenAI web search", model="gpt-4o", prompt=search_prompt if "search_prompt" in locals() else "", output="", error=str(e), ) logger.error(f"Error fetching OpenAI news: {e}") return [] def _fetch_google_news(self, query: str = "stock market") -> List[Dict[str, Any]]: """ Fetch news from Google News RSS. Args: query: Search query Returns: List of news items """ try: # Use Google News helper from tradingagents.dataflows.google import get_google_news # Convert hours to days (round up) lookback_days = max(1, int((self.news_lookback_hours + 23) / 24)) news_report = get_google_news( query=query, curr_date=datetime.now().strftime("%Y-%m-%d"), look_back_days=lookback_days, ) # Parse the report using LLM to extract structured data parse_prompt = f"""Parse this news report and extract individual news items. {self._build_time_constraint()} {self._COMPANY_SPECIFIC_INSTRUCTION} {news_report} {self._build_extraction_fields("brief")} Return as JSON array with key "news".""" response = self.openai_client.responses.parse( model="gpt-4o-mini", input=self._build_openai_input( "Extract news items from this report into structured JSON format.", parse_prompt, ), text_format=NewsList, ) news_list = response.output_parsed news_items = [item.model_dump() for item in news_list.news] self._log_llm( step="Parse Google News", model="gpt-4o-mini", prompt=parse_prompt, output=news_items, ) # Add metadata for item in news_items: item["source"] = "google_news" item["timestamp"] = datetime.now().isoformat() return news_items[: self.max_news_items] except Exception as e: self._log_llm( step="Parse Google News", model="gpt-4o-mini", prompt=parse_prompt if "parse_prompt" in locals() else "", output="", error=str(e), ) logger.error(f"Error fetching Google News: {e}") return [] def _fetch_sec_filings(self) -> List[Dict[str, Any]]: """ Fetch recent SEC filings (8-K, 13D, 13G - market-moving events). Returns: List of filing summaries """ try: # SEC EDGAR API endpoint # Get recent 8-K filings (material events) url = "https://www.sec.gov/cgi-bin/browse-edgar" params = {"action": "getcurrent", "type": "8-K", "output": "atom", "count": 20} headers = {"User-Agent": "TradingAgents/1.0 (contact@example.com)"} response = requests.get(url, params=params, headers=headers, timeout=10) if response.status_code != 200: return [] # Parse SEC filings using LLM # (SEC returns XML/Atom feed, we'll parse with LLM for simplicity) filings_prompt = f"""Parse these SEC 8-K filings and extract the most important material events. {self._build_time_constraint()} Prefer company-specific filings and material events; skip broad market commentary or routine filings. {response.text} {self._build_extraction_fields("brief")} Return as JSON array with key "filings".""" llm_response = self.openai_client.responses.parse( model="gpt-4o-mini", input=self._build_openai_input( "Extract important SEC 8-K filings from this data and summarize the market-moving events.", filings_prompt, ), text_format=FilingsList, ) filings_list = llm_response.output_parsed filings = [item.model_dump() for item in filings_list.filings] self._log_llm( step="Parse SEC filings", model="gpt-4o-mini", prompt=filings_prompt, output=filings, ) # Add metadata for filing in filings: filing["source"] = "sec_edgar" filing["timestamp"] = datetime.now().isoformat() return filings[: self.max_news_items] except Exception as e: self._log_llm( step="Parse SEC filings", model="gpt-4o-mini", prompt=filings_prompt if "filings_prompt" in locals() else "", output="", error=str(e), ) logger.error(f"Error fetching SEC filings: {e}") return [] def _fetch_alpha_vantage_news( self, topics: str = "earnings,technology" ) -> List[Dict[str, Any]]: """ Fetch news from Alpha Vantage. Args: topics: News topics to filter Returns: List of news items """ try: from tradingagents.dataflows.alpha_vantage_news import get_alpha_vantage_news_feed # Use cutoff time for Alpha Vantage time_from = self.cutoff_time.strftime("%Y%m%dT%H%M") news_report = get_alpha_vantage_news_feed(topics=topics, time_from=time_from, limit=50) # Parse with LLM parse_prompt = f"""Parse this news feed and extract the most important market-moving stories. {self._build_time_constraint()} {self._COMPANY_SPECIFIC_INSTRUCTION} {news_report} {self._build_extraction_fields("brief")} Return as JSON array with key "news".""" response = self.openai_client.responses.parse( model="gpt-4o-mini", input=self._build_openai_input( "Extract and summarize important market news.", parse_prompt, ), text_format=NewsList, ) news_list = response.output_parsed news_items = [item.model_dump() for item in news_list.news] self._log_llm( step="Parse Alpha Vantage news", model="gpt-4o-mini", prompt=parse_prompt, output=news_items, ) # Add metadata for item in news_items: item["source"] = "alpha_vantage" item["timestamp"] = datetime.now().isoformat() return news_items[: self.max_news_items] except Exception as e: self._log_llm( step="Parse Alpha Vantage news", model="gpt-4o-mini", prompt=parse_prompt if "parse_prompt" in locals() else "", output="", error=str(e), ) logger.error(f"Error fetching Alpha Vantage news: {e}") return [] def _fetch_gemini_search_news( self, query: str = "breaking stock market news today" ) -> List[Dict[str, Any]]: """ Fetch news using Google Gemini's native web search (grounding) capability. This uses Gemini's built-in web search tool for real-time market news, which may provide different results than OpenAI's web search. Args: query: Search query for news Returns: List of news items with title, summary, published_at, timestamp """ try: import os # Get API key google_api_key = os.getenv("GOOGLE_API_KEY") if not google_api_key: logger.error("GOOGLE_API_KEY not set, skipping Gemini search") return [] # Build search prompt search_prompt = self._build_web_search_prompt(query) # Step 1: Execute web search using Gemini with google_search tool search_llm = ChatGoogleGenerativeAI( model="gemini-2.5-flash-lite", # Fast model for search api_key=google_api_key, temperature=1.0, # Higher temperature for diverse results ).bind_tools([{"google_search": {}}]) # Execute search raw_response = search_llm.invoke(search_prompt) self._log_llm( step="Gemini search", model="gemini-2.5-flash-lite", prompt=search_prompt, output=raw_response.content if hasattr(raw_response, "content") else raw_response, ) # Step 2: Structure the results using Gemini with JSON schema structured_llm = ChatGoogleGenerativeAI( model="gemini-2.5-flash-lite", api_key=google_api_key ).with_structured_output(NewsList, method="json_schema") structure_prompt = f"""Parse the following web search results into structured news items. {self._build_time_constraint()} {self._build_extraction_fields("full")} Web search results: {raw_response.content} Return as JSON with "news" array.""" structured_response = structured_llm.invoke(structure_prompt) self._log_llm( step="Gemini search structuring", model="gemini-2.5-flash-lite", prompt=structure_prompt, output=structured_response, ) # Extract news items news_items = [item.model_dump() for item in structured_response.news] # Add metadata for item in news_items: item["source"] = "gemini_search" item["timestamp"] = datetime.now().isoformat() return news_items[: self.max_news_items] except Exception as e: self._log_llm( step="Gemini search", model="gemini-2.5-flash-lite", prompt=search_prompt if "search_prompt" in locals() else "", output="", error=str(e), ) logger.error(f"Error fetching Gemini search news: {e}") return [] def scan_news(self) -> List[Dict[str, Any]]: """ Scan news from all enabled sources. Returns: Aggregated list of news items sorted by importance """ all_news = [] logger.info("Scanning news sources...") logger.info(f"Time window: {self._get_time_phrase()} (last {self.news_lookback_hours}h)") logger.info(f"Cutoff: {self.cutoff_time.strftime('%Y-%m-%d %H:%M')}") # Fetch from each enabled source if "openai" in self.news_sources: logger.info("Fetching OpenAI web search...") openai_news = self._fetch_openai_news() all_news.extend(openai_news) logger.info(f"Found {len(openai_news)} items from OpenAI") min_date, max_date = self._publish_date_range(openai_news) if min_date: logger.debug(f"Min publish date (OpenAI): {min_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Min publish date (OpenAI): N/A") if max_date: logger.debug(f"Max publish date (OpenAI): {max_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Max publish date (OpenAI): N/A") if "google_news" in self.news_sources: logger.info("Fetching Google News...") google_news = self._fetch_google_news() all_news.extend(google_news) logger.info(f"Found {len(google_news)} items from Google News") min_date, max_date = self._publish_date_range(google_news) if min_date: logger.debug( f"Min publish date (Google News): {min_date.strftime('%Y-%m-%d %H:%M')}" ) else: logger.debug("Min publish date (Google News): N/A") if max_date: logger.debug( f"Max publish date (Google News): {max_date.strftime('%Y-%m-%d %H:%M')}" ) else: logger.debug("Max publish date (Google News): N/A") if "sec_filings" in self.news_sources: logger.info("Fetching SEC filings...") sec_filings = self._fetch_sec_filings() all_news.extend(sec_filings) logger.info(f"Found {len(sec_filings)} items from SEC") min_date, max_date = self._publish_date_range(sec_filings) if min_date: logger.debug(f"Min publish date (SEC): {min_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Min publish date (SEC): N/A") if max_date: logger.debug(f"Max publish date (SEC): {max_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Max publish date (SEC): N/A") if "alpha_vantage" in self.news_sources: logger.info("Fetching Alpha Vantage news...") av_news = self._fetch_alpha_vantage_news() all_news.extend(av_news) logger.info(f"Found {len(av_news)} items from Alpha Vantage") min_date, max_date = self._publish_date_range(av_news) if min_date: logger.debug( f"Min publish date (Alpha Vantage): {min_date.strftime('%Y-%m-%d %H:%M')}" ) else: logger.debug("Min publish date (Alpha Vantage): N/A") if max_date: logger.debug( f"Max publish date (Alpha Vantage): {max_date.strftime('%Y-%m-%d %H:%M')}" ) else: logger.debug("Max publish date (Alpha Vantage): N/A") if "gemini_search" in self.news_sources: logger.info("Fetching Google Gemini search...") gemini_news = self._fetch_gemini_search_news() all_news.extend(gemini_news) logger.info(f"Found {len(gemini_news)} items from Gemini search") min_date, max_date = self._publish_date_range(gemini_news) if min_date: logger.debug(f"Min publish date (Gemini): {min_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Min publish date (Gemini): N/A") if max_date: logger.debug(f"Max publish date (Gemini): {max_date.strftime('%Y-%m-%d %H:%M')}") else: logger.debug("Max publish date (Gemini): N/A") # Apply time filtering logger.info(f"Collected {len(all_news)} raw news items") all_news = self._filter_by_time(all_news) logger.info(f"After time filtering: {len(all_news)} items") # Deduplicate news from multiple sources (same story = same hash) all_news = self._deduplicate_news(all_news) logger.info(f"After deduplication: {len(all_news)} items") # Sort by importance all_news.sort(key=lambda x: x.get("importance", 0), reverse=True) logger.info(f"Total news items collected: {len(all_news)}") return all_news[: self.max_news_items] def generate_news_summary(self, news_item: Dict[str, Any]) -> str: """ Generate a semantic search-optimized summary for a news item. Args: news_item: News item dict Returns: Optimized summary text for embedding/matching """ title = news_item.get("title", "") summary = news_item.get("summary", "") themes = news_item.get("themes", []) companies = news_item.get("companies_mentioned", []) # Create rich text for semantic matching search_text = f""" {title} {summary} Key themes: {', '.join(themes) if themes else 'General market news'} Companies mentioned: {', '.join(companies) if companies else 'Broad market'} """.strip() return search_text def main(): """CLI for testing news scanner.""" import argparse parser = argparse.ArgumentParser(description="Scan news for semantic ticker matching") parser.add_argument( "--sources", nargs="+", default=["openai"], choices=["openai", "google_news", "sec_filings", "alpha_vantage", "gemini_search"], help="News sources to use", ) parser.add_argument("--max-items", type=int, default=10, help="Maximum news items to fetch") parser.add_argument( "--lookback-hours", type=int, default=24, help="How far back to look for news (in hours). Examples: 1 (last hour), 6 (last 6 hours), 24 (last day), 168 (last week)", ) parser.add_argument("--output", type=str, help="Output file for news JSON") args = parser.parse_args() config = { "news_sources": args.sources, "max_news_items": args.max_items, "news_lookback_hours": args.lookback_hours, } scanner = NewsSemanticScanner(config) news_items = scanner.scan_news() # Display results logger.info("\n" + "=" * 60) logger.info(f"Top {min(5, len(news_items))} Most Important News Items:") logger.info("=" * 60 + "\n") for i, item in enumerate(news_items[:5], 1): logger.info(f"{i}. {item.get('title', 'Untitled')}") logger.info(f" Source: {item.get('source', 'unknown')}") logger.info(f" Importance: {item.get('importance', 'N/A')}/10") logger.info(f" Summary: {item.get('summary', '')[:150]}...") logger.info(f" Themes: {', '.join(item.get('themes', []))}") logger.info("") # Save to file if specified if args.output: with open(args.output, "w") as f: json.dump(news_items, f, indent=2) logger.info(f"✅ Saved {len(news_items)} news items to {args.output}") if __name__ == "__main__": main()