TradingAgents/tradingagents/dataflows/googlenews_utils.py

127 lines
4.1 KiB
Python

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
import logging
from urllib.parse import quote_plus
logger = logging.getLogger(__name__)
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_result,
)
def is_rate_limited(response):
    """Return True when the response's status code signals that we should back off.

    The codes treated as "back off" are 429, 403 and 503. ``response`` only
    needs to expose a ``status_code`` attribute.
    """
    backoff_statuses = (429, 403, 503)
    return response.status_code in backoff_statuses
def _add_jitter(retry_state):
    """Block for a random 1-3 seconds; used as the retry ``before_sleep`` hook.

    ``retry_state`` is accepted because the hook is called with it, but it is
    not used. The random pause keeps retries from following a fixed, easily
    detected timing pattern.
    """
    delay_seconds = random.uniform(1, 3)
    time.sleep(delay_seconds)
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_result(is_rate_limited),
    before_sleep=_add_jitter,
)
def make_request(url, headers):
    """Issue a GET request, retrying while the response looks rate-limited.

    The decorator retries whenever ``is_rate_limited`` returns True for the
    response, for up to 5 attempts, with exponential backoff configured with
    ``min=4`` and ``max=60`` plus the random pause added by ``_add_jitter``.
    Exceptions raised by ``requests.get`` are not part of the retry condition.

    url: str - address to fetch
    headers: dict - HTTP headers sent with the request
    Returns the ``requests`` response object of the last attempt.
    """
    # timeout is a (connect, read) pair in seconds
    return requests.get(url, headers=headers, timeout=(5, 20))
def _normalize_date(date_str):
    """Return date_str as mm/dd/yyyy, converting from yyyy-mm-dd when it contains '-'."""
    if "-" in date_str:
        return datetime.strptime(date_str, "%Y-%m-%d").strftime("%m/%d/%Y")
    return date_str


def _parse_result(el):
    """Build the news-item dict for one result element, or None if link/title is missing."""
    link_tag = el.find("a")
    title_el = el.select_one("div.MBeuO")
    if not link_tag or not title_el:
        return None

    snippet_el = el.select_one(".GI74Re")
    date_el = el.select_one(".LfVVr")
    source_el = el.select_one(".NUnG9d span")
    return {
        "link": link_tag.get("href"),
        "title": title_el.get_text(strip=True),
        # Optional fields fall back to an empty string when absent
        "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
        "date": date_el.get_text(strip=True) if date_el else "",
        "source": source_el.get_text(strip=True) if source_el else "",
    }


def getNewsData(query, start_date, end_date):
    """
    Scrape Google News search results for a given query and date range.

    Pages are fetched ten results at a time until a page has no results, has
    no "Next" link, or a request/parsing error occurs. Errors are logged and
    end the scrape; whatever was collected so far is returned.

    query: str - search query
    start_date: str - start date in the format yyyy-mm-dd or mm/dd/yyyy
    end_date: str - end date in the format yyyy-mm-dd or mm/dd/yyyy

    Returns a list of dicts with the keys "link", "title", "snippet",
    "date" and "source".
    Raises ValueError if a date contains "-" but is not a valid yyyy-mm-dd date.
    """
    start_date = _normalize_date(start_date)
    end_date = _normalize_date(end_date)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/101.0.4951.54 Safari/537.36"
        )
    }

    # Only the result offset changes between pages, so build the rest once.
    base_url = (
        f"https://www.google.com/search?q={quote_plus(query)}"
        f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
        f"&tbm=nws"
    )

    news_results = []
    page = 0
    while True:
        url = f"{base_url}&start={page * 10}"
        try:
            response = make_request(url, headers)
            soup = BeautifulSoup(response.content, "html.parser")
            results_on_page = soup.select("div.SoaBEf")
            if not results_on_page:
                break  # No more results found

            for el in results_on_page:
                try:
                    item = _parse_result(el)
                except Exception as e:
                    # Best effort: one malformed result must not abort the page
                    logger.warning("Error processing result: %s", e)
                    continue
                if item is not None:
                    news_results.append(item)

            # Stop when there is no "Next" link (pagination)
            if not soup.find("a", id="pnnext"):
                break
            page += 1
        except Exception as e:
            logger.error("Failed after multiple retries: %s", e)
            break

    return news_results