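# TradingAgents/tradingagents/dataflows/googlenews_utils.py
#
# Google News RSS helpers: per-query news search (getNewsData) and
# global/macro market news lookup (getGlobalNewsData).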

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse
def getNewsData(query, start_date, end_date):
    """Fetch Google News RSS results for a query, restricted to a date range.

    The search is sent to the Google News RSS endpoint (English/India edition)
    with ``after:``/``before:`` operators. Every returned item is additionally
    filtered locally so that only articles published between ``start_date``
    and ``end_date`` (both inclusive) are kept. Items whose publication date
    is missing or cannot be parsed are kept unfiltered.

    Args:
        query: Search terms, separated by spaces or ``+``.
        start_date: Range start as a ``yyyy-mm-dd`` or ``mm/dd/yyyy`` string,
            or a ``date``/``datetime`` object.
        end_date: Range end, same accepted forms as ``start_date``.

    Returns:
        A list of at most 20 dicts with the keys ``link``, ``title``,
        ``snippet``, ``date`` and ``source``. ``date`` is ``yyyy-mm-dd`` when
        the feed date could be parsed, otherwise the raw feed string (or ``""``
        when absent). An empty or partial list is returned when the request
        fails or the server does not answer with HTTP 200.

    Raises:
        ValueError: If a date string matches neither accepted format.
    """
    # RFC 822 parser from the stdlib: unlike strptime("%a ... %Z") it does not
    # depend on the process locale and accepts numeric offsets like "+0530".
    from email.utils import parsedate_to_datetime

    def _to_datetime(value):
        """Coerce a date string or date/datetime object to a datetime."""
        if isinstance(value, datetime):
            return value
        text = str(value)
        fmt = "%m/%d/%Y" if "/" in text else "%Y-%m-%d"
        return datetime.strptime(text, fmt)

    def _text_of(node, tag_name):
        """Return the text of the first ``tag_name`` child, or "" if absent."""
        tag = node.find(tag_name)
        return tag.text if tag is not None else ""

    max_articles = 20

    start_dt = _to_datetime(start_date)
    end_dt = _to_datetime(end_date)
    first_day = start_dt.date()
    last_day = end_dt.date()

    # "+" is accepted as a word separator; turn it into a space before quoting.
    encoded_query = urllib.parse.quote(query.replace("+", " "))
    after_str = start_dt.strftime("%Y-%m-%d")
    before_str = end_dt.strftime("%Y-%m-%d")
    url = (
        "https://news.google.com/rss/search"
        f"?q={encoded_query}+after:{after_str}+before:{before_str}"
        "&hl=en-IN&gl=IN&ceid=IN:en"
    )

    news_results = []
    try:
        resp = requests.get(url, timeout=15)
        if resp.status_code != 200:
            return news_results

        soup = BeautifulSoup(resp.content, "xml")
        for item in soup.find_all("item"):
            # Cap the number of *kept* articles, so out-of-range items at the
            # top of the feed do not eat into the limit.
            if len(news_results) >= max_articles:
                break
            try:
                pub_date_str = _text_of(item, "pubDate")
                date_display = pub_date_str
                if pub_date_str:
                    try:
                        pub_dt = parsedate_to_datetime(pub_date_str)
                    except (TypeError, ValueError):
                        # Unparseable date: keep the raw string, skip filtering.
                        pass
                    else:
                        if not first_day <= pub_dt.date() <= last_day:
                            continue
                        date_display = pub_dt.strftime("%Y-%m-%d")

                title = _text_of(item, "title")

                # The description usually carries an HTML fragment; reduce it
                # to plain text and keep a short excerpt.
                snippet = ""
                desc_tag = item.find("description")
                if desc_tag is not None:
                    desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                    snippet = desc_soup.get_text()[:300]

                news_results.append({
                    "link": _text_of(item, "link"),
                    "title": title,
                    "snippet": snippet if snippet else title,
                    "date": date_display,
                    "source": _text_of(item, "source"),
                })
            except Exception:
                # Best effort: one malformed item must not abort the batch.
                continue
    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(f"Google News RSS fetch failed: {e}")
    return news_results
def getGlobalNewsData(curr_date, look_back_days=7, limit=10):
    """Fetch global/macro market news from the Google News RSS feed.

    Runs a fixed set of broad market queries (English/India edition) over the
    window ``[curr_date - look_back_days, curr_date]``, merges the results,
    drops items whose title was already seen, and returns the most recent
    ones first.

    Args:
        curr_date: End of the window, as a ``yyyy-mm-dd`` string or a
            ``date``/``datetime`` object.
        look_back_days: Length of the window in days.
        limit: Maximum number of articles to return.

    Returns:
        A list of at most ``limit`` dicts with the keys ``title``,
        ``snippet``, ``date`` and ``source``, ordered by publication day
        (newest first). ``date`` is ``yyyy-mm-dd`` when the feed date could be
        parsed, otherwise the raw feed string (or ``""`` when absent); such
        items are ordered after all dated ones. Queries that fail or do not
        answer with HTTP 200 are skipped.

    Raises:
        ValueError: If ``curr_date`` is a string not in ``yyyy-mm-dd`` format.
    """
    # Plain day arithmetic only needs the stdlib; the RFC 822 parser is
    # locale-independent and accepts numeric offsets like "+0530".
    from datetime import timedelta
    from email.utils import parsedate_to_datetime

    def _text_of(node, tag_name):
        """Return the text of the first ``tag_name`` child, or "" if absent."""
        tag = node.find(tag_name)
        return tag.text if tag is not None else ""

    if isinstance(curr_date, str):
        end_dt = datetime.strptime(curr_date, "%Y-%m-%d")
    else:
        end_dt = curr_date
    start_dt = end_dt - timedelta(days=look_back_days)
    after_str = start_dt.strftime("%Y-%m-%d")
    before_str = end_dt.strftime("%Y-%m-%d")

    queries = (
        "stock market India NSE Nifty",
        "global economy markets finance",
    )

    # Pairs of (sort_key, article); the key is the ISO day, or "" when the
    # feed date is missing/unparseable so that such items sort last instead
    # of being compared as raw "Mon, 06 Jan ..." strings.
    keyed_results = []
    seen_titles = set()

    for query in queries:
        encoded = urllib.parse.quote(query)
        url = (
            "https://news.google.com/rss/search"
            f"?q={encoded}+after:{after_str}+before:{before_str}"
            "&hl=en-IN&gl=IN&ceid=IN:en"
        )
        try:
            resp = requests.get(url, timeout=15)
            if resp.status_code != 200:
                continue

            soup = BeautifulSoup(resp.content, "xml")
            for item in soup.find_all("item"):
                try:
                    title = _text_of(item, "title")
                    if title in seen_titles:
                        continue
                    seen_titles.add(title)

                    pub_date_str = _text_of(item, "pubDate")
                    date_display = pub_date_str
                    sort_key = ""
                    if pub_date_str:
                        try:
                            pub_dt = parsedate_to_datetime(pub_date_str)
                        except (TypeError, ValueError):
                            # Keep the raw string for display only.
                            pass
                        else:
                            date_display = pub_dt.strftime("%Y-%m-%d")
                            sort_key = date_display

                    # The description usually carries an HTML fragment;
                    # reduce it to plain text and keep a short excerpt.
                    snippet = ""
                    desc_tag = item.find("description")
                    if desc_tag is not None:
                        desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                        snippet = desc_soup.get_text()[:300]

                    keyed_results.append((sort_key, {
                        "title": title,
                        "snippet": snippet if snippet else title,
                        "date": date_display,
                        "source": _text_of(item, "source"),
                    }))
                except Exception:
                    # Best effort: one malformed item must not abort the batch.
                    continue
        except Exception as e:
            # Best effort: report the failure and move on to the next query.
            print(f"Google News RSS fetch failed: {e}")
            continue

    # Newest day first; the sort is stable, so feed order is kept within a day.
    keyed_results.sort(key=lambda pair: pair[0], reverse=True)
    return [article for _, article in keyed_results][:limit]