import os
import time
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

from serpapi import GoogleSearch

logger = logging.getLogger(__name__)


def getNewsDataSerpAPI(
    query: str,
    start_date: str,
    end_date: str,
    serpapi_key: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Get news data using SerpAPI (much faster than web scraping).

    Args:
        query: Search query string
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        serpapi_key: SerpAPI key (if not provided, the SERPAPI_API_KEY environment variable is used)

    Returns:
        List of dictionaries with news data
    """
    if not serpapi_key:
        serpapi_key = os.getenv("SERPAPI_API_KEY")

    if not serpapi_key:
        logger.error("❌ SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")
        raise ValueError("SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")

    # Convert dates from YYYY-MM-DD to the MM/DD/YYYY format Google's date-range filter expects
    if "-" in start_date:
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        start_date = start_dt.strftime("%m/%d/%Y")
    if "-" in end_date:
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        end_date = end_dt.strftime("%m/%d/%Y")

    news_results = []
    start_time = time.time()

    try:
        # SerpAPI parameters for a Google News search
        params = {
            "engine": "google",
            "q": query,
            "tbm": "nws",  # News search
            "tbs": f"cdr:1,cd_min:{start_date},cd_max:{end_date}",  # Date range
            "api_key": serpapi_key,
            "num": 100,  # Get up to 100 results
            "hl": "en",  # Language
            "gl": "us",  # Country
        }

        logger.info(f"🔍 SerpAPI: Searching for '{query}' from {start_date} to {end_date}")

        search = GoogleSearch(params)
        results = search.get_dict()

        # Check for errors reported by the API
        if "error" in results:
            logger.error(f"❌ SerpAPI Error: {results['error']}")
            raise RuntimeError(f"SerpAPI Error: {results['error']}")

        # Extract news results
        news_items = results.get("news_results", [])

        for item in news_items:
            try:
                news_result = {
                    "link": item.get("link", ""),
                    "title": item.get("title", "No title"),
                    "snippet": item.get("snippet", "No snippet"),
                    "date": item.get("date", "No date"),
                    "source": item.get("source", "Unknown source"),
                }
                news_results.append(news_result)

            except Exception as e:
                logger.warning(f"⚠️ Error processing news item: {e}")
                continue

        duration = time.time() - start_time
        logger.info(f"✅ SerpAPI: Retrieved {len(news_results)} news items in {duration:.2f}s")

        return news_results

    except Exception as e:
        duration = time.time() - start_time
        logger.error(f"❌ SerpAPI Error after {duration:.2f}s: {str(e)}")

        # Fall back to empty results rather than crashing
        logger.info("🔄 Returning empty results as fallback")
        return []
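

# For reference, each returned item is a flat dict with the shape sketched below.
# The values here are purely illustrative placeholders; the actual content depends
# on what SerpAPI returns for the given query and date range:
# {
#     "link": "https://example.com/some-article",
#     "title": "Example headline",
#     "snippet": "Short preview text from the article...",
#     "date": "01/15/2024",
#     "source": "Example News",
# }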


def getNewsDataSerpAPIWithPagination(
    query: str,
    start_date: str,
    end_date: str,
    max_results: int = 300,
    serpapi_key: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Get news data using SerpAPI with pagination support for more results.

    Args:
        query: Search query string
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        max_results: Maximum number of results to fetch
        serpapi_key: SerpAPI key (if not provided, the SERPAPI_API_KEY environment variable is used)

    Returns:
        List of dictionaries with news data
    """
    if not serpapi_key:
        serpapi_key = os.getenv("SERPAPI_API_KEY")

    if not serpapi_key:
        logger.error("❌ SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")
        raise ValueError("SerpAPI key not found. Please set SERPAPI_API_KEY environment variable.")

    # Convert dates from YYYY-MM-DD to the MM/DD/YYYY format Google's date-range filter expects
    if "-" in start_date:
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        start_date = start_dt.strftime("%m/%d/%Y")
    if "-" in end_date:
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        end_date = end_dt.strftime("%m/%d/%Y")

    all_news_results = []
    start_time = time.time()
    page = 0

    try:
        while len(all_news_results) < max_results:
            # SerpAPI parameters for a Google News search
            params = {
                "engine": "google",
                "q": query,
                "tbm": "nws",  # News search
                "tbs": f"cdr:1,cd_min:{start_date},cd_max:{end_date}",  # Date range
                "api_key": serpapi_key,
                "num": 100,  # Get up to 100 results per page
                "start": page * 100,  # Pagination offset
                "hl": "en",  # Language
                "gl": "us",  # Country
            }

            logger.info(f"🔍 SerpAPI: Page {page + 1} - Searching for '{query}' from {start_date} to {end_date}")

            search = GoogleSearch(params)
            results = search.get_dict()

            # Stop paginating if the API reports an error
            if "error" in results:
                logger.error(f"❌ SerpAPI Error: {results['error']}")
                break

            # Extract news results
            news_items = results.get("news_results", [])

            if not news_items:
                logger.info(f"📭 No more results found on page {page + 1}")
                break

            for item in news_items:
                try:
                    news_result = {
                        "link": item.get("link", ""),
                        "title": item.get("title", "No title"),
                        "snippet": item.get("snippet", "No snippet"),
                        "date": item.get("date", "No date"),
                        "source": item.get("source", "Unknown source"),
                    }
                    all_news_results.append(news_result)

                    if len(all_news_results) >= max_results:
                        break

                except Exception as e:
                    logger.warning(f"⚠️ Error processing news item: {e}")
                    continue

            page += 1

            # Small delay between requests to be respectful
            time.sleep(0.5)

        duration = time.time() - start_time
        logger.info(f"✅ SerpAPI: Retrieved {len(all_news_results)} news items in {duration:.2f}s across {page} pages")

        return all_news_results[:max_results]  # Ensure we don't exceed max_results

    except Exception as e:
        duration = time.time() - start_time
        logger.error(f"❌ SerpAPI Error after {duration:.2f}s: {str(e)}")

        # Return whatever we managed to collect
        logger.info(f"🔄 Returning {len(all_news_results)} partial results as fallback")
        return all_news_results
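

if __name__ == "__main__":
    # Minimal usage sketch. The query and dates below are illustrative placeholders,
    # and a valid SERPAPI_API_KEY must be set in the environment for either call to work.
    logging.basicConfig(level=logging.INFO)

    # Single request (up to ~100 results)
    articles = getNewsDataSerpAPI("artificial intelligence", "2024-01-01", "2024-01-31")
    print(f"Single-page search returned {len(articles)} items")

    # Paginated search for larger result sets
    more_articles = getNewsDataSerpAPIWithPagination(
        "artificial intelligence", "2024-01-01", "2024-01-31", max_results=300
    )
    print(f"Paginated search returned {len(more_articles)} items")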