# TradingAgents/backend/tradingagents/dataflows/serpapi_utils.py
# SerpAPI-based Google News fetch helpers.
import logging
import os
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

from serpapi import GoogleSearch
logger = logging.getLogger(__name__)
def getNewsDataSerpAPI(query: str, start_date: str, end_date: str, serpapi_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Get news data using SerpAPI (much faster than web scraping).

    Args:
        query: Search query string.
        start_date: Start date in YYYY-MM-DD format (MM/DD/YYYY passed through).
        end_date: End date in YYYY-MM-DD format (MM/DD/YYYY passed through).
        serpapi_key: SerpAPI key; falls back to the SERPAPI_API_KEY
            environment variable when not provided.

    Returns:
        List of dicts with keys: link, title, snippet, date, source.
        Returns an empty list on any API/network failure (best-effort).

    Raises:
        ValueError: If no SerpAPI key is available.
    """
    if not serpapi_key:
        serpapi_key = os.getenv("SERPAPI_API_KEY")
        if not serpapi_key:
            logger.error("❌ SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")
            raise ValueError("SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")

    # Convert ISO dates (YYYY-MM-DD) to the MM/DD/YYYY form Google's
    # custom-date-range (cdr) filter expects.
    if "-" in start_date:
        start_date = datetime.strptime(start_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if "-" in end_date:
        end_date = datetime.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")

    news_results: List[Dict[str, Any]] = []
    start_time = time.time()
    try:
        # SerpAPI parameters for a Google News search restricted to the date range.
        params = {
            "engine": "google",
            "q": query,
            "tbm": "nws",  # News search
            "tbs": f"cdr:1,cd_min:{start_date},cd_max:{end_date}",  # Date range
            "api_key": serpapi_key,
            "num": 100,  # Get up to 100 results
            "hl": "en",  # Language
            "gl": "us",  # Country
        }
        logger.info("🔍 SerpAPI: Searching for '%s' from %s to %s", query, start_date, end_date)
        results = GoogleSearch(params).get_dict()

        # Surface API-level errors; caught below and converted to the
        # empty-list fallback so callers never see an exception here.
        if "error" in results:
            logger.error("❌ SerpAPI Error: %s", results["error"])
            raise RuntimeError(f"SerpAPI Error: {results['error']}")

        # Normalize each raw item into a flat record, tolerating missing fields.
        for item in results.get("news_results", []):
            try:
                news_results.append({
                    "link": item.get("link", ""),
                    "title": item.get("title", "No title"),
                    "snippet": item.get("snippet", "No snippet"),
                    "date": item.get("date", "No date"),
                    "source": item.get("source", "Unknown source"),
                })
            except Exception as e:
                # Best-effort: skip malformed items instead of failing the batch.
                logger.warning("⚠️ Error processing news item: %s", e)
                continue

        duration = time.time() - start_time
        logger.info("✅ SerpAPI: Retrieved %d news items in %.2fs", len(news_results), duration)
        return news_results
    except Exception as e:
        duration = time.time() - start_time
        logger.error("❌ SerpAPI Error after %.2fs: %s", duration, e)
        # Degrade gracefully: callers get an empty list rather than a crash.
        logger.info("🔄 Returning empty results as fallback")
        return []
def getNewsDataSerpAPIWithPagination(query: str, start_date: str, end_date: str,
                                     max_results: int = 300, serpapi_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Get news data using SerpAPI with pagination support for more results.

    Args:
        query: Search query string.
        start_date: Start date in YYYY-MM-DD format (MM/DD/YYYY passed through).
        end_date: End date in YYYY-MM-DD format (MM/DD/YYYY passed through).
        max_results: Maximum number of results to fetch across pages.
        serpapi_key: SerpAPI key; falls back to the SERPAPI_API_KEY
            environment variable when not provided.

    Returns:
        List of dicts with keys: link, title, snippet, date, source,
        capped at max_results. On failure, returns whatever was collected
        before the error (possibly empty).

    Raises:
        ValueError: If no SerpAPI key is available.
    """
    if not serpapi_key:
        serpapi_key = os.getenv("SERPAPI_API_KEY")
        if not serpapi_key:
            logger.error("❌ SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")
            raise ValueError("SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")

    # Convert ISO dates (YYYY-MM-DD) to the MM/DD/YYYY form Google's
    # custom-date-range (cdr) filter expects.
    if "-" in start_date:
        start_date = datetime.strptime(start_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if "-" in end_date:
        end_date = datetime.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")

    all_news_results: List[Dict[str, Any]] = []
    start_time = time.time()
    page = 0
    try:
        while len(all_news_results) < max_results:
            # SerpAPI parameters for this page of the Google News search.
            params = {
                "engine": "google",
                "q": query,
                "tbm": "nws",  # News search
                "tbs": f"cdr:1,cd_min:{start_date},cd_max:{end_date}",  # Date range
                "api_key": serpapi_key,
                "num": 100,  # Get up to 100 results per page
                "start": page * 100,  # Pagination offset
                "hl": "en",  # Language
                "gl": "us",  # Country
            }
            logger.info("🔍 SerpAPI: Page %d - Searching for '%s' from %s to %s",
                        page + 1, query, start_date, end_date)
            results = GoogleSearch(params).get_dict()

            # Stop paginating on an API error; keep whatever was collected.
            if "error" in results:
                logger.error("❌ SerpAPI Error: %s", results["error"])
                break

            news_items = results.get("news_results", [])
            if not news_items:
                logger.info("📭 No more results found on page %d", page + 1)
                break

            # Normalize each raw item, tolerating malformed entries.
            for item in news_items:
                try:
                    all_news_results.append({
                        "link": item.get("link", ""),
                        "title": item.get("title", "No title"),
                        "snippet": item.get("snippet", "No snippet"),
                        "date": item.get("date", "No date"),
                        "source": item.get("source", "Unknown source"),
                    })
                    if len(all_news_results) >= max_results:
                        break
                except Exception as e:
                    logger.warning("⚠️ Error processing news item: %s", e)
                    continue

            page += 1
            # Done once the cap is reached — skip the courtesy delay below
            # (the original slept 0.5s even after the final page).
            if len(all_news_results) >= max_results:
                break
            # Small delay between requests to be respectful to the API.
            time.sleep(0.5)

        duration = time.time() - start_time
        logger.info("✅ SerpAPI: Retrieved %d news items in %.2fs across %d pages",
                    len(all_news_results), duration, page)
        return all_news_results[:max_results]  # Ensure we don't exceed max_results
    except Exception as e:
        duration = time.time() - start_time
        logger.error("❌ SerpAPI Error after %.2fs: %s", duration, e)
        # Return whatever we managed to collect before the failure.
        logger.info("🔄 Returning %d partial results as fallback", len(all_news_results))
        return all_news_results