211 lines
6.7 KiB
Python
211 lines
6.7 KiB
Python
"""
|
|
Google News client for live news data via web scraping.
|
|
"""
|
|
|
|
import logging
from datetime import datetime, timedelta, timezone
from typing import Any

from tradingagents.dataflows.googlenews_utils import getNewsData

from .base import BaseClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class GoogleNewsClient(BaseClient):
    """Client for Google News data via web scraping."""

    def __init__(self, **kwargs):
        """
        Initialize Google News client.

        Args:
            **kwargs: Configuration options, including:
                max_retries (int): Retry attempts for requests. Defaults to 3.
                delay_between_requests (float): Seconds between requests.
                    Defaults to 1.0.
        """
        super().__init__(**kwargs)
        # NOTE(review): these rate-limit knobs are stored but not consumed
        # anywhere in this class — presumably used by getNewsData or a
        # subclass; confirm before removing.
        self.max_retries = kwargs.get("max_retries", 3)
        self.delay_between_requests = kwargs.get("delay_between_requests", 1.0)

    def test_connection(self) -> bool:
        """Test Google News connection by fetching a simple one-day query.

        Returns:
            bool: True if the scraper returned a list, False on any error.
        """
        try:
            # Capture "now" once so start/end cannot straddle midnight.
            now = datetime.now()
            end_date = now.strftime("%Y-%m-%d")
            start_date = (now - timedelta(days=1)).strftime("%Y-%m-%d")

            test_data = getNewsData("technology", start_date, end_date)
            return isinstance(test_data, list)

        except Exception as e:
            logger.error("Google News connection test failed: %s", e)
            return False

    def get_data(
        self, query: str, start_date: str, end_date: str, **kwargs
    ) -> dict[str, Any]:
        """
        Get news data for a query and date range.

        Args:
            query: Search query
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            **kwargs: Additional parameters

        Returns:
            Dict[str, Any]: News data with metadata. When no articles are
            found, ``articles`` is empty and ``metadata["empty"]`` is True.

        Raises:
            ValueError: If the date range fails validation.
            Exception: Re-raised from the underlying scraper on failure.
        """
        if not self.validate_date_range(start_date, end_date):
            raise ValueError(f"Invalid date range: {start_date} to {end_date}")

        try:
            # Replace spaces with + for URL encoding
            formatted_query = query.replace(" ", "+")

            logger.info(
                "Fetching Google News for query: %s from %s to %s",
                query,
                start_date,
                end_date,
            )

            news_results = getNewsData(formatted_query, start_date, end_date)

            if not news_results:
                logger.warning("No news found for query: %s", query)
                return {
                    "query": query,
                    "period": {"start": start_date, "end": end_date},
                    "articles": [],
                    "metadata": {
                        "source": "google_news",
                        "empty": True,
                        "reason": "no_articles_found",
                    },
                }

            # Process and standardize article data
            processed_articles = [
                {
                    "headline": article.get("title", ""),
                    "summary": article.get("snippet", ""),
                    "url": article.get("link", ""),
                    "source": article.get("source", "Unknown"),
                    # Fallback to end_date if the scraper returned no date
                    "date": article.get("date", end_date),
                    "entities": article.get("entities", []),
                }
                for article in news_results
            ]

            return {
                "query": query,
                "period": {"start": start_date, "end": end_date},
                "articles": processed_articles,
                "metadata": {
                    "source": "google_news",
                    "article_count": len(processed_articles),
                    # Timezone-aware UTC timestamp (utcnow() is deprecated
                    # and naive as of Python 3.12).
                    "retrieved_at": datetime.now(timezone.utc).isoformat(),
                    "search_query": formatted_query,
                },
            }

        except Exception as e:
            logger.error("Error fetching Google News for query '%s': %s", query, e)
            raise

    def get_company_news(
        self, symbol: str, start_date: str, end_date: str, **kwargs
    ) -> dict[str, Any]:
        """
        Get news data specific to a company symbol.

        Args:
            symbol: Stock ticker symbol
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            **kwargs: Additional parameters

        Returns:
            Dict[str, Any]: Company-specific news data with ``symbol`` and
            ``metadata["query_type"] == "company_specific"`` added.
        """
        # Create company-focused search query
        company_query = f"{symbol} stock"

        result = self.get_data(company_query, start_date, end_date, **kwargs)

        result["symbol"] = symbol
        result["metadata"]["query_type"] = "company_specific"

        return result

    def get_global_news(
        self,
        start_date: str,
        end_date: str,
        categories: list[str] | None = None,
        **kwargs,
    ) -> dict[str, Any]:
        """
        Get global/macro news that might affect markets.

        Per-category failures are logged and skipped (best-effort), so the
        result may cover only a subset of the requested categories.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            categories: List of news categories to search. Defaults to
                ["economy", "finance", "markets", "business"].
            **kwargs: Additional parameters

        Returns:
            Dict[str, Any]: Global news data; each article carries a
            ``category`` tag.
        """
        if categories is None:
            categories = ["economy", "finance", "markets", "business"]

        all_articles = []

        for category in categories:
            try:
                category_data = self.get_data(category, start_date, end_date, **kwargs)

                # Bind the list once so the tagged articles and the extended
                # articles are guaranteed to be the same objects.
                articles = category_data.get("articles", [])
                for article in articles:
                    article["category"] = category

                all_articles.extend(articles)

            except Exception as e:
                logger.warning(
                    "Failed to fetch news for category '%s': %s", category, e
                )
                continue

        return {
            "query": "global_news",
            "categories": categories,
            "period": {"start": start_date, "end": end_date},
            "articles": all_articles,
            "metadata": {
                "source": "google_news",
                "article_count": len(all_articles),
                "categories_searched": categories,
                # Timezone-aware UTC timestamp (utcnow() is deprecated).
                "retrieved_at": datetime.now(timezone.utc).isoformat(),
                "query_type": "global_news",
            },
        }

    def get_available_categories(self) -> list[str]:
        """
        Get list of commonly used news categories.

        Returns:
            List[str]: News categories
        """
        return [
            "business",
            "economy",
            "finance",
            "markets",
            "technology",
            "politics",
            "world",
            "healthcare",
            "energy",
            "crypto",
        ]
|