# TradingAgents/tradingagents/dataflows/googlenews_utils.py

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse


def getNewsData(query, start_date, end_date):
    """
    Fetch Google News via RSS feed for a given query and date range.

    Builds a Google News RSS search URL carrying ``after:``/``before:``
    operators, then re-checks each item's ``pubDate`` locally and keeps only
    articles published within [start_date, end_date] (both inclusive).
    At most 20 matching articles are returned.

    query: str - search query (spaces or '+' separated)
    start_date: str | date | datetime - start date; strings may be in
        yyyy-mm-dd or mm/dd/yyyy format
    end_date: str | date | datetime - end date, same formats as start_date

    Returns: list of dicts with keys "link", "title", "snippet", "date" and
        "source". "snippet" falls back to the title when the description is
        empty; "date" is yyyy-mm-dd, the raw pubDate text when it cannot be
        parsed, or "" when the item has no pubDate. An empty list is returned
        (after printing a message) when the request fails or answers with a
        non-200 status.

    Raises: ValueError if a date argument matches neither accepted format.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from email.utils import parsedate_to_datetime

    max_articles = 20

    def _to_datetime(value):
        """Coerce a date string, date or datetime into a datetime."""
        if isinstance(value, datetime):
            return value
        # str() also covers datetime.date objects, which render as yyyy-mm-dd.
        text = str(value).strip()
        fmt = "%m/%d/%Y" if "/" in text else "%Y-%m-%d"
        return datetime.strptime(text, fmt)

    def _text_of(item, tag_name):
        """Return the text of the first child tag named tag_name, or ""."""
        tag = item.find(tag_name)
        return tag.text if tag is not None else ""

    # Normalize dates up front so bad input fails before any network traffic
    start_day = _to_datetime(start_date).date()
    end_day = _to_datetime(end_date).date()

    # Clean up query (replace + with spaces for URL encoding)
    encoded_query = urllib.parse.quote(query.replace("+", " "))

    # Use Google News RSS feed — no JS rendering or CSS selectors involved
    url = (
        "https://news.google.com/rss/search"
        f"?q={encoded_query}"
        f"+after:{start_day.isoformat()}+before:{end_day.isoformat()}"
        "&hl=en-IN&gl=IN&ceid=IN:en"
    )

    news_results = []
    try:
        resp = requests.get(url, timeout=15)
        if resp.status_code != 200:
            print(f"Google News RSS fetch failed: HTTP {resp.status_code}")
            return news_results

        soup = BeautifulSoup(resp.content, "xml")

        for item in soup.find_all("item"):
            # Cap the number of *kept* articles, so items rejected by the
            # date filter do not consume the quota.
            if len(news_results) >= max_articles:
                break
            try:
                title = _text_of(item, "title")
                pub_date_str = _text_of(item, "pubDate")

                # Parse and filter by date. pubDate is an RFC 822 timestamp;
                # the email.utils parser does not depend on the process locale.
                date_display = ""
                if pub_date_str:
                    try:
                        pub_day = parsedate_to_datetime(pub_date_str).date()
                    except (TypeError, ValueError):
                        # Unparseable: keep the article and show the raw text.
                        date_display = pub_date_str
                    else:
                        if pub_day < start_day or pub_day > end_day:
                            continue
                        date_display = pub_day.isoformat()

                # Description often contains an HTML snippet; reduce to text
                snippet = ""
                desc_tag = item.find("description")
                if desc_tag is not None:
                    desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                    snippet = desc_soup.get_text()[:300]

                news_results.append({
                    "link": _text_of(item, "link"),
                    "title": title,
                    "snippet": snippet if snippet else title,
                    "date": date_display,
                    "source": _text_of(item, "source"),
                })
            except Exception as item_err:
                # Best effort: one malformed item must not discard the rest.
                print(f"Google News RSS item skipped: {item_err}")
                continue

    except Exception as e:
        # Boundary handler: callers get whatever was collected so far.
        print(f"Google News RSS fetch failed: {e}")

    return news_results


def getGlobalNewsData(curr_date, look_back_days=7, limit=10):
    """
    Fetch global/macro news via Google News RSS feed.

    Runs a fixed set of broad financial/market queries over the window
    [curr_date - look_back_days, curr_date], de-duplicates articles by title
    across queries, and returns the newest ones first.

    curr_date: str | date | datetime - end of the window; strings may be in
        yyyy-mm-dd or mm/dd/yyyy format, other values are used as-is
    look_back_days: int - size of the window in days
    limit: int - maximum number of articles to return

    Returns: list of dicts with keys "title", "snippet", "date" and "source",
        sorted by date descending. "snippet" falls back to the title when the
        description is empty; "date" is yyyy-mm-dd, the raw pubDate text when
        it cannot be parsed, or "" when missing. Articles without a parseable
        date are ranked last. A query whose request fails is reported with a
        printed message and skipped.

    Raises: ValueError if curr_date is a string in neither accepted format.
    """
    # Function-scope imports so this block does not depend on file-level edits.
    from datetime import timedelta
    from email.utils import parsedate_to_datetime

    def _text_of(item, tag_name):
        """Return the text of the first child tag named tag_name, or ""."""
        tag = item.find(tag_name)
        return tag.text if tag is not None else ""

    if isinstance(curr_date, str):
        date_text = curr_date.strip()
        fmt = "%m/%d/%Y" if "/" in date_text else "%Y-%m-%d"
        end_dt = datetime.strptime(date_text, fmt)
    else:
        end_dt = curr_date

    # Plain day arithmetic: the standard-library timedelta is sufficient.
    start_dt = end_dt - timedelta(days=look_back_days)

    queries = [
        "stock market India NSE Nifty",
        "global economy markets finance",
    ]

    ranked = []  # (sort_key, article) pairs; sort_key never reaches callers
    seen_titles = set()

    for query in queries:
        encoded = urllib.parse.quote(query)
        url = f"https://news.google.com/rss/search?q={encoded}+after:{start_dt.strftime('%Y-%m-%d')}+before:{end_dt.strftime('%Y-%m-%d')}&hl=en-IN&gl=IN&ceid=IN:en"

        try:
            resp = requests.get(url, timeout=15)
            if resp.status_code != 200:
                print(f"Google News RSS fetch failed for '{query}': HTTP {resp.status_code}")
                continue
            items = BeautifulSoup(resp.content, "xml").find_all("item")
        except Exception as e:
            # Boundary handler: one failing query must not abort the others.
            print(f"Google News RSS fetch failed for '{query}': {e}")
            continue

        for item in items:
            try:
                title = _text_of(item, "title")
                if title in seen_titles:
                    continue
                seen_titles.add(title)

                pub_date_str = _text_of(item, "pubDate")

                # Description often contains an HTML snippet; reduce to text
                snippet = ""
                desc_tag = item.find("description")
                if desc_tag is not None:
                    desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                    snippet = desc_soup.get_text()[:300]

                date_display = ""
                sort_key = ""  # "" ranks after every yyyy-mm-dd key
                if pub_date_str:
                    try:
                        # pubDate is an RFC 822 timestamp; this parser does
                        # not depend on the process locale.
                        pub_day = parsedate_to_datetime(pub_date_str).date()
                    except (TypeError, ValueError):
                        # Show the raw text, but keep it out of the sort key:
                        # "Mon, ..." would otherwise outrank every ISO date.
                        date_display = pub_date_str
                    else:
                        date_display = pub_day.isoformat()
                        sort_key = date_display

                ranked.append((sort_key, {
                    "title": title,
                    "snippet": snippet if snippet else title,
                    "date": date_display,
                    "source": _text_of(item, "source"),
                }))
            except Exception as item_err:
                # Best effort: one malformed item must not discard the rest.
                print(f"Google News RSS item skipped: {item_err}")
                continue

    # Sort by date descending (stable, so feed order is kept within a day)
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [article for _, article in ranked[:limit]]