198 lines
6.6 KiB
Python
198 lines
6.6 KiB
Python
"""
|
|
Google News client for live news data via RSS feeds.
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from urllib.parse import quote
|
|
|
|
import feedparser
|
|
import requests
|
|
from dateutil import parser as date_parser
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class GoogleNewsArticle:
|
|
"""Represents a news article from Google News RSS feed."""
|
|
|
|
title: str
|
|
link: str
|
|
published: datetime
|
|
summary: str
|
|
source: str
|
|
guid: str
|
|
|
|
|
|
class GoogleNewsClient:
|
|
"""Client for Google News data via web scraping."""
|
|
|
|
def get_company_news(self, symbol: str) -> list[GoogleNewsArticle]:
|
|
"""
|
|
Get news data specific to a company symbol.
|
|
|
|
Args:
|
|
symbol: Stock ticker symbol
|
|
|
|
Returns:
|
|
list[GoogleNewsArticle]: Company-specific news data
|
|
"""
|
|
return self._get_rss_feed(symbol)
|
|
|
|
def get_global_news(
|
|
self,
|
|
categories: list[str],
|
|
) -> list[GoogleNewsArticle]:
|
|
"""
|
|
Get global/macro news that might affect markets.
|
|
|
|
Args:
|
|
categories: List of news categories to search
|
|
|
|
Returns:
|
|
list[GoogleNewsArticle]: Global news data
|
|
"""
|
|
# Get RSS queries for categories
|
|
all_articles = []
|
|
|
|
for category in categories:
|
|
try:
|
|
articles = self._get_rss_feed(category)
|
|
all_articles.extend(articles)
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Failed to fetch news for category '{category}': {e}")
|
|
continue
|
|
|
|
return all_articles
|
|
|
|
def _get_rss_feed(self, query: str) -> list[GoogleNewsArticle]:
|
|
"""
|
|
Fetch RSS feed from Google News for a given query.
|
|
|
|
Args:
|
|
query: Search query (company symbol or news category)
|
|
|
|
Returns:
|
|
list[GoogleNewsArticle]: Parsed articles from RSS feed
|
|
"""
|
|
try:
|
|
# Construct Google News RSS URL
|
|
encoded_query = quote(query)
|
|
rss_url = f"https://news.google.com/rss/search?q={encoded_query}&hl=en-US&gl=US&ceid=US:en"
|
|
|
|
logger.info(f"Fetching RSS feed for query: {query}")
|
|
|
|
# Use requests with timeout and User-Agent header
|
|
headers = {
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
|
}
|
|
|
|
response = requests.get(rss_url, timeout=10, headers=headers)
|
|
response.raise_for_status()
|
|
|
|
# Use feedparser to parse the fetched content
|
|
feed = feedparser.parse(response.content)
|
|
|
|
# Check if feed was parsed successfully
|
|
if feed.bozo:
|
|
logger.warning(
|
|
f"Feed parsing had issues for query '{query}': {feed.bozo_exception}"
|
|
)
|
|
|
|
articles = []
|
|
for entry in feed.entries:
|
|
try:
|
|
article = self._parse_feed_entry(entry)
|
|
articles.append(article)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to parse article entry: {e}")
|
|
continue
|
|
|
|
logger.info(
|
|
f"Successfully fetched {len(articles)} articles for query: {query}"
|
|
)
|
|
return articles
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f"Network error fetching RSS feed for query '{query}': {e}")
|
|
return []
|
|
except Exception as e:
|
|
logger.error(
|
|
f"Unexpected error fetching RSS feed for query '{query}': {e}",
|
|
exc_info=True,
|
|
)
|
|
return []
|
|
|
|
def _parse_feed_entry(self, entry: feedparser.FeedParserDict) -> GoogleNewsArticle:
|
|
"""
|
|
Parse a feedparser entry directly into a GoogleNewsArticle.
|
|
|
|
Args:
|
|
entry: Raw feedparser entry
|
|
|
|
Returns:
|
|
GoogleNewsArticle: Parsed article object
|
|
"""
|
|
# Get fields with safe defaults
|
|
raw_title = getattr(entry, "title", "Untitled")
|
|
raw_link = getattr(entry, "link", "")
|
|
published_str = getattr(entry, "published", "")
|
|
summary = getattr(entry, "summary", "")
|
|
guid = getattr(entry, "id", getattr(entry, "link", ""))
|
|
|
|
# Decode Google News redirect URL to get actual article URL
|
|
# Only attempt to decode if it's a Google News URL
|
|
if raw_link and "news.google.com" in raw_link:
|
|
try:
|
|
from googlenewsdecoder import gnewsdecoder
|
|
|
|
decoded_result = gnewsdecoder(raw_link, interval=0)
|
|
# gnewsdecoder returns a dict with 'status' and 'decoded_url' keys
|
|
if decoded_result.get("status"):
|
|
link = decoded_result["decoded_url"]
|
|
else:
|
|
# Failed to decode, use original URL
|
|
logger.debug(
|
|
f"Failed to decode Google News URL: {decoded_result.get('message', 'Unknown error')}"
|
|
)
|
|
link = raw_link
|
|
except Exception as e:
|
|
logger.debug(f"Failed to decode Google News URL: {e}")
|
|
link = raw_link
|
|
else:
|
|
# Not a Google News URL, use as-is
|
|
link = raw_link
|
|
|
|
# Parse published date with fallback to current time
|
|
try:
|
|
if published_str:
|
|
parsed_result = date_parser.parse(published_str)
|
|
# dateutil.parser.parse() always returns datetime unless fuzzy_with_tokens=True
|
|
# Since we're not using fuzzy_with_tokens, we know it's a datetime
|
|
if isinstance(parsed_result, datetime):
|
|
published = parsed_result
|
|
else:
|
|
# Fallback for any unexpected types (shouldn't happen)
|
|
published = datetime.now(timezone.utc).replace(tzinfo=None)
|
|
else:
|
|
published = datetime.now(timezone.utc).replace(tzinfo=None)
|
|
except (ValueError, OverflowError, TypeError):
|
|
published = datetime.now(timezone.utc).replace(tzinfo=None)
|
|
|
|
# Extract source from title (Google News format: "Title - Source")
|
|
title_parts = raw_title.split(" - ")
|
|
title = title_parts[0] if title_parts else raw_title
|
|
source = title_parts[-1] if len(title_parts) > 1 else "Unknown"
|
|
|
|
return GoogleNewsArticle(
|
|
title=title,
|
|
link=str(link) if link else "",
|
|
published=published,
|
|
summary=summary,
|
|
source=source,
|
|
guid=guid,
|
|
)
|