# TradingAgents/tradingagents/dataflows/news_semantic_scanner.py
"""
News Semantic Scanner
---------------------
Scans news from multiple sources, summarizes key themes, and enables semantic
matching against ticker descriptions to find relevant investment opportunities.
Sources:
- OpenAI web search (real-time market news)
- Google Gemini grounded web search (real-time market news)
- SEC EDGAR filings (regulatory news)
- Google News
- Alpha Vantage news
"""
import json
import os
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
import requests
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from openai import OpenAI
from tradingagents.dataflows.discovery.utils import build_llm_log_entry
from tradingagents.schemas import FilingsList, NewsList
from tradingagents.utils.logger import get_logger
load_dotenv()
logger = get_logger(__name__)
class NewsSemanticScanner:
"""Scans and processes news for semantic ticker matching."""
def __init__(self, config: Dict[str, Any]):
"""
Initialize news scanner.
        Args:
            config: Configuration dict with:
                - news_sources: List of sources to use
                - max_news_items: Maximum news items to process
                - news_lookback_hours: How far back to look for news (default: 24 hours)
                - log_callback: Optional callable that receives LLM log entries
        Note:
            The OpenAI API key is read from the OPENAI_API_KEY environment
            variable, not from the config dict.
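        Example (illustrative values):
            scanner = NewsSemanticScanner({
                "news_sources": ["openai", "google_news"],
                "max_news_items": 20,
                "news_lookback_hours": 24,
            })
            news = scanner.scan_news()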
"""
self.config = config
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
raise ValueError("OPENAI_API_KEY not found in environment")
self.openai_client = OpenAI(api_key=openai_api_key)
self.news_sources = config.get("news_sources", ["openai", "google_news"])
self.max_news_items = config.get("max_news_items", 20)
self.news_lookback_hours = config.get("news_lookback_hours", 24)
self.log_callback = config.get("log_callback")
# Calculate time window
self.cutoff_time = datetime.now() - timedelta(hours=self.news_lookback_hours)
def _emit_log(self, entry: Dict[str, Any]) -> None:
if self.log_callback:
try:
self.log_callback(entry)
except Exception:
pass
def _log_llm(
self,
step: str,
model: str,
prompt: Any,
output: Any,
error: str = "",
) -> None:
entry = build_llm_log_entry(
node="semantic_news",
step=step,
model=model,
prompt=prompt,
output=output,
error=error,
)
self._emit_log(entry)
def _get_time_phrase(self) -> str:
"""Generate human-readable time phrase for queries."""
if self.news_lookback_hours <= 1:
return "from the last hour"
elif self.news_lookback_hours <= 6:
return f"from the last {self.news_lookback_hours} hours"
elif self.news_lookback_hours <= 24:
return "from today"
elif self.news_lookback_hours <= 48:
return "from the last 2 days"
else:
days = int(self.news_lookback_hours / 24)
return f"from the last {days} days"
def _deduplicate_news(
self, news_items: List[Dict[str, Any]], similarity_threshold: float = 0.85
) -> List[Dict[str, Any]]:
"""
Deduplicate news items using semantic similarity (embeddings + cosine similarity).
Two-pass approach:
1. Fast hash-based pass for exact/near-exact duplicates
2. Embedding-based cosine similarity for semantically similar stories
Args:
news_items: List of news items from various sources
similarity_threshold: Cosine similarity threshold (0.85 = very similar)
Returns:
Deduplicated list, keeping highest importance version of each story
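        Example: with threshold 0.85, "Apple beats Q3 estimates" and "Apple tops
        third-quarter expectations" would typically collapse into one item, while
        distinct stories about the same company remain separate.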
"""
import hashlib
import re
import numpy as np
if not news_items:
return []
def normalize_text(text: str) -> str:
"""Normalize text for comparison."""
if not text:
return ""
text = text.lower()
text = re.sub(r"[^\w\s]", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def get_content_hash(item: Dict[str, Any]) -> str:
"""Generate hash from normalized title + summary."""
title = normalize_text(item.get("title", ""))
summary = normalize_text(item.get("summary", ""))[:100]
content = title + " " + summary
return hashlib.md5(content.encode()).hexdigest()
def get_news_text(item: Dict[str, Any]) -> str:
"""Get combined text for embedding."""
title = item.get("title", "")
summary = item.get("summary", "")
return f"{title}. {summary}"[:500] # Limit length for efficiency
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Compute cosine similarity between two vectors."""
norm_a = np.linalg.norm(a)
norm_b = np.linalg.norm(b)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a, b) / (norm_a * norm_b))
# === PASS 1: Hash-based deduplication (fast, exact matches) ===
seen_hashes: Dict[str, Dict[str, Any]] = {}
hash_duplicates = 0
for item in news_items:
content_hash = get_content_hash(item)
if content_hash not in seen_hashes:
seen_hashes[content_hash] = item
else:
existing = seen_hashes[content_hash]
if (item.get("importance", 0) or 0) > (existing.get("importance", 0) or 0):
seen_hashes[content_hash] = item
hash_duplicates += 1
after_hash = list(seen_hashes.values())
logger.info(
f"Hash dedup: {len(news_items)}{len(after_hash)} ({hash_duplicates} exact duplicates)"
)
# === PASS 2: Embedding-based semantic similarity ===
# Only run if we have enough items to justify the cost
if len(after_hash) <= 3:
return after_hash
try:
# Generate embeddings for all remaining items
texts = [get_news_text(item) for item in after_hash]
# Use OpenAI embeddings (same as ticker_semantic_db)
response = self.openai_client.embeddings.create(
model="text-embedding-3-small",
input=texts,
)
embeddings = np.array([e.embedding for e in response.data])
# Find semantic duplicates using cosine similarity
unique_indices = []
semantic_duplicates = 0
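            # Greedy O(n^2) pass over items in input order: each new item is compared
            # against every kept item; on a match, the higher-importance version wins.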
for i in range(len(after_hash)):
is_duplicate = False
for j in unique_indices:
sim = cosine_similarity(embeddings[i], embeddings[j])
if sim >= similarity_threshold:
# This is a semantic duplicate
is_duplicate = True
semantic_duplicates += 1
# Keep higher importance version
existing_item = after_hash[j]
new_item = after_hash[i]
if (new_item.get("importance", 0) or 0) > (
existing_item.get("importance", 0) or 0
):
# Replace with higher importance
unique_indices.remove(j)
unique_indices.append(i)
logger.debug(
f"Semantic duplicate (sim={sim:.2f}): "
f"'{new_item.get('title', '')[:40]}' vs "
f"'{existing_item.get('title', '')[:40]}'"
)
break
if not is_duplicate:
unique_indices.append(i)
final_items = [after_hash[i] for i in unique_indices]
logger.info(
f"Semantic dedup: {len(after_hash)}{len(final_items)} "
f"({semantic_duplicates} similar stories merged)"
)
return final_items
except Exception as e:
logger.warning(f"Embedding-based dedup failed, using hash-only results: {e}")
return after_hash
def _filter_by_time(self, news_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Filter news items by timestamp to respect lookback window.
Args:
news_items: List of news items with 'published_at' or 'timestamp' field
Returns:
Filtered list of news items within time window
"""
filtered = []
filtered_out_count = 0
for item in news_items:
timestamp_str = item.get("published_at") or item.get("timestamp")
title_preview = item.get("title", "")[:60]
if not timestamp_str:
# No timestamp, keep it (assume recent)
logger.debug(f"No timestamp for '{title_preview}', keeping")
filtered.append(item)
continue
item_time = self._parse_timestamp(timestamp_str, date_only_end=True)
if not item_time:
# If parsing fails, keep it
logger.debug(f"Parse failed for '{timestamp_str}' on '{title_preview}', keeping")
filtered.append(item)
continue
if item_time >= self.cutoff_time:
filtered.append(item)
else:
filtered_out_count += 1
logger.debug(
f"FILTERED OUT: '{title_preview}' | "
f"published_at='{item.get('published_at')}' | "
f"parsed={item_time.strftime('%Y-%m-%d %H:%M')} | "
f"cutoff={self.cutoff_time.strftime('%Y-%m-%d %H:%M')}"
)
if filtered_out_count > 0:
logger.info(
f"Time filter removed {filtered_out_count} items with timestamps before cutoff"
)
return filtered
def _parse_timestamp(self, timestamp_str: str, date_only_end: bool) -> Optional[datetime]:
"""Parse a timestamp string into a naive datetime, or return None if invalid."""
try:
# Handle date-only strings
if len(timestamp_str) == 10 and timestamp_str[4] == "-" and timestamp_str[7] == "-":
base_time = datetime.fromisoformat(timestamp_str)
if date_only_end:
return base_time.replace(hour=23, minute=59, second=59)
return base_time
# Parse ISO timestamp
parsed_time = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
if parsed_time.tzinfo:
parsed_time = parsed_time.astimezone().replace(tzinfo=None)
return parsed_time
except Exception:
return None
def _publish_date_range(
self, news_items: List[Dict[str, Any]]
) -> Tuple[Optional[datetime], Optional[datetime]]:
"""Get the earliest and latest publish timestamps from a list of news items."""
min_time = None
max_time = None
for item in news_items:
timestamp_str = item.get("published_at") or item.get("timestamp")
if not timestamp_str:
continue
item_time = self._parse_timestamp(timestamp_str, date_only_end=False)
if not item_time:
continue
if min_time is None or item_time < min_time:
min_time = item_time
if max_time is None or item_time > max_time:
max_time = item_time
return min_time, max_time
def _build_time_constraint(self) -> str:
"""Build the shared time constraint block used by all news prompts."""
current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
cutoff_datetime = self.cutoff_time.strftime("%Y-%m-%dT%H:%M:%S")
return (
f"CRITICAL TIME CONSTRAINT:\n"
f"- Current time: {current_datetime}\n"
f"- Only include items published AFTER: {cutoff_datetime}\n"
f"- Skip anything older than {self.news_lookback_hours} hours"
)
def _build_extraction_fields(self, detail_level: str = "full") -> str:
"""Build the shared extraction fields block.
Args:
detail_level: "full" for primary searches, "brief" for parsing raw feeds.
"""
current_datetime = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
base = "For each item, extract:\n" "- title: Headline\n"
if detail_level == "full":
base += "- summary: 2-3 sentence summary of key points\n"
else:
base += "- summary: Brief summary of key points\n"
base += (
f"- published_at: ISO-8601 timestamp (REQUIRED — convert relative times like '2 hours ago' to full timestamp using current time {current_datetime})\n"
"- companies_mentioned: List of stock ticker symbols (prefer tickers over company names, e.g. 'AAPL' not 'Apple Inc.')\n"
"- themes: Key themes (e.g., 'earnings beat', 'FDA approval', 'merger', 'insider buying')\n"
"- sentiment: one of positive, negative, neutral\n"
"- importance: 1-10 score (10 = highly market-moving, company-specific catalysts score higher than broad market news)"
)
return base
_COMPANY_SPECIFIC_INSTRUCTION = (
"Prefer company-specific or single-catalyst stories that impact one company or a small "
"group of companies. Avoid broad market, index, or macroeconomic headlines unless they "
"have a clear company-specific catalyst. If a story is sector-wide without a specific "
"company catalyst, skip it."
)
def _build_web_search_prompt(self, query: str = "breaking stock market news today") -> str:
"""
Build unified web search prompt for both OpenAI and Gemini.
Args:
query: Search query for news
Returns:
Formatted search prompt string
"""
time_phrase = self._get_time_phrase()
time_query = f"{query} {time_phrase}"
return f"""Search the web for: {time_query}
{self._build_time_constraint()}
Find the top {self.max_news_items} most important market-moving news stories from the last {self.news_lookback_hours} hours.
{self._COMPANY_SPECIFIC_INSTRUCTION}
Focus on:
- Earnings reports and guidance
- FDA approvals / regulatory decisions
- Mergers, acquisitions, partnerships
- Product launches
- Executive changes
- Legal/regulatory actions
- Analyst upgrades/downgrades
{self._build_extraction_fields("full")}
"""
def _build_openai_input(self, system_text: str, user_text: str) -> str:
"""Build Responses API input as a single prompt string."""
if system_text:
return f"{system_text}\n\n{user_text}"
return user_text
def _fetch_openai_news(
self, query: str = "breaking stock market news today"
) -> List[Dict[str, Any]]:
"""
Fetch news using OpenAI's web search capability.
Args:
query: Search query for news
Returns:
List of news items with title, summary, published_at, timestamp
"""
try:
# Build search prompt
search_prompt = self._build_web_search_prompt(query)
# Use OpenAI web search tool for real-time news
response = self.openai_client.responses.parse(
model="gpt-4o",
tools=[{"type": "web_search"}],
input=self._build_openai_input(
"You are a financial news analyst. Search the web for the latest market news "
"and return structured summaries.",
search_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="OpenAI web search",
model="gpt-4o",
prompt=search_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "openai_search"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="OpenAI web search",
model="gpt-4o",
prompt=search_prompt if "search_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching OpenAI news: {e}")
return []
def _fetch_google_news(self, query: str = "stock market") -> List[Dict[str, Any]]:
"""
Fetch news from Google News RSS.
Args:
query: Search query
Returns:
List of news items
"""
try:
# Use Google News helper
from tradingagents.dataflows.google import get_google_news
# Convert hours to days (round up)
lookback_days = max(1, int((self.news_lookback_hours + 23) / 24))
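            # e.g. news_lookback_hours=30 -> lookback_days=2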
news_report = get_google_news(
query=query,
curr_date=datetime.now().strftime("%Y-%m-%d"),
look_back_days=lookback_days,
)
# Parse the report using LLM to extract structured data
parse_prompt = f"""Parse this news report and extract individual news items.
{self._build_time_constraint()}
{self._COMPANY_SPECIFIC_INSTRUCTION}
{news_report}
{self._build_extraction_fields("brief")}
Return as JSON array with key "news"."""
response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract news items from this report into structured JSON format.",
parse_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="Parse Google News",
model="gpt-4o-mini",
prompt=parse_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "google_news"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse Google News",
model="gpt-4o-mini",
prompt=parse_prompt if "parse_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Google News: {e}")
return []
def _fetch_sec_filings(self) -> List[Dict[str, Any]]:
"""
Fetch recent SEC filings (8-K, 13D, 13G - market-moving events).
Returns:
List of filing summaries
"""
try:
# SEC EDGAR API endpoint
# Get recent 8-K filings (material events)
url = "https://www.sec.gov/cgi-bin/browse-edgar"
params = {"action": "getcurrent", "type": "8-K", "output": "atom", "count": 20}
headers = {"User-Agent": "TradingAgents/1.0 (contact@example.com)"}
response = requests.get(url, params=params, headers=headers, timeout=10)
if response.status_code != 200:
return []
# Parse SEC filings using LLM
# (SEC returns XML/Atom feed, we'll parse with LLM for simplicity)
filings_prompt = f"""Parse these SEC 8-K filings and extract the most important material events.
{self._build_time_constraint()}
Prefer company-specific filings and material events; skip broad market commentary or routine filings.
{response.text}
{self._build_extraction_fields("brief")}
Return as JSON array with key "filings"."""
llm_response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract important SEC 8-K filings from this data and summarize the market-moving events.",
filings_prompt,
),
text_format=FilingsList,
)
filings_list = llm_response.output_parsed
filings = [item.model_dump() for item in filings_list.filings]
self._log_llm(
step="Parse SEC filings",
model="gpt-4o-mini",
prompt=filings_prompt,
output=filings,
)
# Add metadata
for filing in filings:
filing["source"] = "sec_edgar"
filing["timestamp"] = datetime.now().isoformat()
return filings[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse SEC filings",
model="gpt-4o-mini",
prompt=filings_prompt if "filings_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching SEC filings: {e}")
return []
def _fetch_alpha_vantage_news(
self, topics: str = "earnings,technology"
) -> List[Dict[str, Any]]:
"""
Fetch news from Alpha Vantage.
Args:
topics: News topics to filter
Returns:
List of news items
"""
try:
from tradingagents.dataflows.alpha_vantage_news import get_alpha_vantage_news_feed
# Use cutoff time for Alpha Vantage
time_from = self.cutoff_time.strftime("%Y%m%dT%H%M")
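            # Alpha Vantage's news API expects time_from in YYYYMMDDTHHMM format.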
news_report = get_alpha_vantage_news_feed(topics=topics, time_from=time_from, limit=50)
# Parse with LLM
parse_prompt = f"""Parse this news feed and extract the most important market-moving stories.
{self._build_time_constraint()}
{self._COMPANY_SPECIFIC_INSTRUCTION}
{news_report}
{self._build_extraction_fields("brief")}
Return as JSON array with key "news"."""
response = self.openai_client.responses.parse(
model="gpt-4o-mini",
input=self._build_openai_input(
"Extract and summarize important market news.",
parse_prompt,
),
text_format=NewsList,
)
news_list = response.output_parsed
news_items = [item.model_dump() for item in news_list.news]
self._log_llm(
step="Parse Alpha Vantage news",
model="gpt-4o-mini",
prompt=parse_prompt,
output=news_items,
)
# Add metadata
for item in news_items:
item["source"] = "alpha_vantage"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Parse Alpha Vantage news",
model="gpt-4o-mini",
prompt=parse_prompt if "parse_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Alpha Vantage news: {e}")
return []
def _fetch_gemini_search_news(
self, query: str = "breaking stock market news today"
) -> List[Dict[str, Any]]:
"""
Fetch news using Google Gemini's native web search (grounding) capability.
This uses Gemini's built-in web search tool for real-time market news,
which may provide different results than OpenAI's web search.
Args:
query: Search query for news
Returns:
List of news items with title, summary, published_at, timestamp
"""
try:
# Get API key
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
logger.error("GOOGLE_API_KEY not set, skipping Gemini search")
return []
# Build search prompt
search_prompt = self._build_web_search_prompt(query)
# Step 1: Execute web search using Gemini with google_search tool
search_llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash-lite", # Fast model for search
api_key=google_api_key,
temperature=1.0, # Higher temperature for diverse results
).bind_tools([{"google_search": {}}])
# Execute search
raw_response = search_llm.invoke(search_prompt)
self._log_llm(
step="Gemini search",
model="gemini-2.5-flash-lite",
prompt=search_prompt,
output=raw_response.content if hasattr(raw_response, "content") else raw_response,
)
# Step 2: Structure the results using Gemini with JSON schema
structured_llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash-lite", api_key=google_api_key
).with_structured_output(NewsList, method="json_schema")
structure_prompt = f"""Parse the following web search results into structured news items.
{self._build_time_constraint()}
{self._build_extraction_fields("full")}
Web search results:
{raw_response.content}
Return as JSON with "news" array."""
structured_response = structured_llm.invoke(structure_prompt)
self._log_llm(
step="Gemini search structuring",
model="gemini-2.5-flash-lite",
prompt=structure_prompt,
output=structured_response,
)
# Extract news items
news_items = [item.model_dump() for item in structured_response.news]
# Add metadata
for item in news_items:
item["source"] = "gemini_search"
item["timestamp"] = datetime.now().isoformat()
return news_items[: self.max_news_items]
except Exception as e:
self._log_llm(
step="Gemini search",
model="gemini-2.5-flash-lite",
prompt=search_prompt if "search_prompt" in locals() else "",
output="",
error=str(e),
)
logger.error(f"Error fetching Gemini search news: {e}")
return []
def scan_news(self) -> List[Dict[str, Any]]:
"""
Scan news from all enabled sources.
Returns:
Aggregated list of news items sorted by importance
"""
all_news = []
logger.info("Scanning news sources...")
logger.info(f"Time window: {self._get_time_phrase()} (last {self.news_lookback_hours}h)")
logger.info(f"Cutoff: {self.cutoff_time.strftime('%Y-%m-%d %H:%M')}")
        # Fetch from each enabled source, logging item counts and publish-date ranges
        source_fetchers = [
            ("openai", "OpenAI web search", "OpenAI", self._fetch_openai_news),
            ("google_news", "Google News", "Google News", self._fetch_google_news),
            ("sec_filings", "SEC filings", "SEC", self._fetch_sec_filings),
            ("alpha_vantage", "Alpha Vantage news", "Alpha Vantage", self._fetch_alpha_vantage_news),
            ("gemini_search", "Google Gemini search", "Gemini", self._fetch_gemini_search_news),
        ]
        for source_key, fetch_label, short_label, fetch_fn in source_fetchers:
            if source_key not in self.news_sources:
                continue
            logger.info(f"Fetching {fetch_label}...")
            items = fetch_fn()
            all_news.extend(items)
            logger.info(f"Found {len(items)} items from {short_label}")
            min_date, max_date = self._publish_date_range(items)
            for bound, value in (("Min", min_date), ("Max", max_date)):
                formatted = value.strftime("%Y-%m-%d %H:%M") if value else "N/A"
                logger.debug(f"{bound} publish date ({short_label}): {formatted}")
# Apply time filtering
logger.info(f"Collected {len(all_news)} raw news items")
all_news = self._filter_by_time(all_news)
logger.info(f"After time filtering: {len(all_news)} items")
# Deduplicate news from multiple sources (same story = same hash)
all_news = self._deduplicate_news(all_news)
logger.info(f"After deduplication: {len(all_news)} items")
# Sort by importance
        all_news.sort(key=lambda x: x.get("importance", 0) or 0, reverse=True)
logger.info(f"Total news items collected: {len(all_news)}")
return all_news[: self.max_news_items]
def generate_news_summary(self, news_item: Dict[str, Any]) -> str:
"""
Generate a semantic search-optimized summary for a news item.
Args:
news_item: News item dict
Returns:
Optimized summary text for embedding/matching
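        Example (hypothetical item): a title "Acme wins FDA approval" with theme
        "FDA approval" and ticker "ACME" yields:
            Acme wins FDA approval
            <summary text>
            Key themes: FDA approval
            Companies mentioned: ACME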
"""
title = news_item.get("title", "")
summary = news_item.get("summary", "")
themes = news_item.get("themes", [])
companies = news_item.get("companies_mentioned", [])
# Create rich text for semantic matching
search_text = f"""
{title}
{summary}
Key themes: {', '.join(themes) if themes else 'General market news'}
Companies mentioned: {', '.join(companies) if companies else 'Broad market'}
""".strip()
return search_text
def main():
"""CLI for testing news scanner."""
import argparse
parser = argparse.ArgumentParser(description="Scan news for semantic ticker matching")
parser.add_argument(
"--sources",
nargs="+",
default=["openai"],
choices=["openai", "google_news", "sec_filings", "alpha_vantage", "gemini_search"],
help="News sources to use",
)
parser.add_argument("--max-items", type=int, default=10, help="Maximum news items to fetch")
parser.add_argument(
"--lookback-hours",
type=int,
default=24,
help="How far back to look for news (in hours). Examples: 1 (last hour), 6 (last 6 hours), 24 (last day), 168 (last week)",
)
parser.add_argument("--output", type=str, help="Output file for news JSON")
args = parser.parse_args()
config = {
"news_sources": args.sources,
"max_news_items": args.max_items,
"news_lookback_hours": args.lookback_hours,
}
scanner = NewsSemanticScanner(config)
news_items = scanner.scan_news()
# Display results
logger.info("\n" + "=" * 60)
logger.info(f"Top {min(5, len(news_items))} Most Important News Items:")
logger.info("=" * 60 + "\n")
for i, item in enumerate(news_items[:5], 1):
logger.info(f"{i}. {item.get('title', 'Untitled')}")
logger.info(f" Source: {item.get('source', 'unknown')}")
logger.info(f" Importance: {item.get('importance', 'N/A')}/10")
logger.info(f" Summary: {item.get('summary', '')[:150]}...")
logger.info(f" Themes: {', '.join(item.get('themes', []))}")
logger.info("")
# Save to file if specified
if args.output:
with open(args.output, "w") as f:
json.dump(news_items, f, indent=2)
logger.info(f"✅ Saved {len(news_items)} news items to {args.output}")
if __name__ == "__main__":
main()