"""Google News RSS helpers: per-query article search and global/macro news feeds."""
import email.utils
import urllib.parse
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
def _parse_user_date(value):
    """Parse a user-supplied date string in mm/dd/yyyy or yyyy-mm-dd form."""
    fmt = "%m/%d/%Y" if "/" in str(value) else "%Y-%m-%d"
    return datetime.strptime(str(value), fmt)


def _tag_text(item, name):
    """Return the text of child tag *name* under *item*, or "" if absent."""
    tag = item.find(name)
    return tag.text if tag else ""


def getNewsData(query, start_date, end_date):
    """
    Fetch Google News articles via the RSS search feed for a query and date range.

    Uses the Google News RSS endpoint, which is reliable (no JS rendering or
    CSS selectors needed). Results are filtered to articles whose publication
    date falls within [start_date, end_date].

    Args:
        query: str - search query (spaces or '+' separated).
        start_date: str - start date in yyyy-mm-dd or mm/dd/yyyy format.
        end_date: str - end date in yyyy-mm-dd or mm/dd/yyyy format.

    Returns:
        list[dict] - up to 20 articles with keys "link", "title", "snippet",
        "date" (yyyy-mm-dd when parseable, else the raw pubDate string) and
        "source". Returns an empty list on any fetch failure.
    """
    # Normalize user-supplied dates to datetime objects for filtering.
    start_dt = _parse_user_date(start_date)
    end_dt = _parse_user_date(end_date)

    # '+' is a common hand-built separator; normalize to spaces first so
    # urllib.parse.quote percent-encodes them rather than leaving literal '+'.
    encoded_query = urllib.parse.quote(query.replace("+", " "))

    # Google News RSS supports after:/before: date operators in the query.
    url = (
        "https://news.google.com/rss/search?q="
        f"{encoded_query}"
        f"+after:{start_dt.strftime('%Y-%m-%d')}"
        f"+before:{end_dt.strftime('%Y-%m-%d')}"
        "&hl=en-IN&gl=IN&ceid=IN:en"
    )

    news_results = []
    try:
        resp = requests.get(url, timeout=15)
        if resp.status_code != 200:
            return news_results

        soup = BeautifulSoup(resp.content, "xml")

        for item in soup.find_all("item")[:20]:  # Limit to 20 articles
            try:
                title = _tag_text(item, "title")
                pub_date_str = _tag_text(item, "pubDate")
                source = _tag_text(item, "source")
                link = _tag_text(item, "link")

                # Description often contains an HTML snippet; strip tags.
                snippet = ""
                desc_tag = item.find("description")
                if desc_tag:
                    desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                    snippet = desc_soup.get_text()[:300]

                # Parse RFC-2822 pubDate and drop out-of-range articles.
                # email.utils.parsedate_to_datetime is locale-independent,
                # unlike strptime's %a/%b which break on non-English locales.
                date_display = ""
                if pub_date_str:
                    try:
                        pub_dt = email.utils.parsedate_to_datetime(pub_date_str)
                        if pub_dt.date() < start_dt.date() or pub_dt.date() > end_dt.date():
                            continue
                        date_display = pub_dt.strftime("%Y-%m-%d")
                    except (TypeError, ValueError):
                        # Unparseable date: keep the raw string, don't filter.
                        date_display = pub_date_str

                news_results.append({
                    "link": link,
                    "title": title,
                    "snippet": snippet if snippet else title,
                    "date": date_display,
                    "source": source,
                })
            except Exception:
                # Skip a malformed item rather than abort the whole fetch.
                continue

    except Exception as e:
        print(f"Google News RSS fetch failed: {e}")

    return news_results
|
|
|
|
|
|
def getGlobalNewsData(curr_date, look_back_days=7, limit=10):
    """
    Fetch global/macro market news via the Google News RSS feed.

    Runs a couple of broad financial/market queries over a lookback window,
    de-duplicates articles by title, sorts newest-first and caps the result.

    Args:
        curr_date: str ("yyyy-mm-dd") or datetime - end of the lookback window.
        look_back_days: int - how many days back from curr_date to search.
        limit: int - maximum number of articles to return.

    Returns:
        list[dict] - articles with keys "title", "snippet", "date", "source",
        newest first, at most *limit* entries. Empty list if every fetch fails.
    """
    if isinstance(curr_date, str):
        end_dt = datetime.strptime(curr_date, "%Y-%m-%d")
    else:
        end_dt = curr_date

    # A plain stdlib timedelta covers a day-based offset; no need for the
    # third-party dateutil.relativedelta the original imported just for this.
    start_dt = end_dt - timedelta(days=look_back_days)

    queries = [
        "stock market India NSE Nifty",
        "global economy markets finance",
    ]

    all_results = []
    seen_titles = set()  # de-duplicate articles surfaced by both queries

    for query in queries:
        encoded = urllib.parse.quote(query)
        url = (
            "https://news.google.com/rss/search?q="
            f"{encoded}"
            f"+after:{start_dt.strftime('%Y-%m-%d')}"
            f"+before:{end_dt.strftime('%Y-%m-%d')}"
            "&hl=en-IN&gl=IN&ceid=IN:en"
        )

        try:
            resp = requests.get(url, timeout=15)
            if resp.status_code != 200:
                continue

            soup = BeautifulSoup(resp.content, "xml")

            for item in soup.find_all("item"):
                try:
                    title_tag = item.find("title")
                    title = title_tag.text if title_tag else ""
                    if title in seen_titles:
                        continue
                    seen_titles.add(title)

                    pub_tag = item.find("pubDate")
                    pub_date_str = pub_tag.text if pub_tag else ""
                    source_tag = item.find("source")
                    source = source_tag.text if source_tag else ""

                    # Description often contains an HTML snippet; strip tags.
                    snippet = ""
                    desc_tag = item.find("description")
                    if desc_tag:
                        desc_soup = BeautifulSoup(desc_tag.text, "html.parser")
                        snippet = desc_soup.get_text()[:300]

                    # Locale-independent RFC-2822 parsing; strptime's %a/%b
                    # specifiers fail under non-English locales.
                    date_display = ""
                    if pub_date_str:
                        try:
                            pub_dt = email.utils.parsedate_to_datetime(pub_date_str)
                            date_display = pub_dt.strftime("%Y-%m-%d")
                        except (TypeError, ValueError):
                            date_display = pub_date_str

                    all_results.append({
                        "title": title,
                        "snippet": snippet if snippet else title,
                        "date": date_display,
                        "source": source,
                    })
                except Exception:
                    continue  # skip a malformed item

        except Exception:
            continue  # this query's fetch failed; try the next one

    # Newest first (ISO date strings sort lexicographically), capped at limit.
    all_results.sort(key=lambda x: x.get("date", ""), reverse=True)
    return all_results[:limit]