import logging
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    retry_if_result,
)

logger = logging.getLogger(__name__)

# Hard cap on how long a single HTTP request may block (seconds).
REQUEST_TIMEOUT = 30


def is_rate_limited(response):
    """Return True when the server answered HTTP 429 (Too Many Requests)."""
    return response.status_code == 429


@retry(
    # Retry both on a 429 result AND on transient network failures
    # (connection errors, timeouts).  The original imported
    # retry_if_exception_type but never used it, so network hiccups
    # aborted the scrape instead of being retried.
    retry=(
        retry_if_result(is_rate_limited)
        | retry_if_exception_type(requests.RequestException)
    ),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
)
def make_request(url, headers):
    """GET *url* with a polite random delay.

    Retried by tenacity (exponential backoff, max 5 attempts) when the
    response is rate-limited or the request raises a network error.
    Returns the ``requests.Response``.
    """
    # Random jitter between requests to reduce the chance of being
    # flagged as a bot and to space out traffic.
    time.sleep(random.uniform(2, 6))
    # Fix: the original call had no timeout, so a stalled connection
    # could hang the scraper indefinitely.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return response


def getNewsData(query, start_date, end_date):
    """Scrape Google News search results for *query* within a date range.

    Parameters
    ----------
    query : str
        Search query (assumed already URL-safe — TODO confirm callers
        encode it, otherwise spaces/special chars will corrupt the URL).
    start_date, end_date : str
        Either ``YYYY-MM-DD`` (converted) or ``MM/DD/YYYY`` (passed
        through) — Google expects ``MM/DD/YYYY`` in ``cd_min``/``cd_max``.

    Returns
    -------
    list[dict]
        One dict per result with keys ``link``, ``title``, ``snippet``,
        ``date`` and ``source``.  Partial results are returned if a page
        fails after all retries.
    """
    # Normalize ISO dates to the MM/DD/YYYY form Google expects.
    if "-" in start_date:
        start_date = datetime.strptime(start_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if "-" in end_date:
        end_date = datetime.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")

    # Desktop browser User-Agent: Google serves a different (harder to
    # parse) page layout to unknown clients.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/101.0.4951.54 Safari/537.36"
        )
    }

    news_results = []
    page = 0
    while True:
        offset = page * 10  # Google News paginates 10 results per page
        url = (
            f"https://www.google.com/search?q={query}"
            f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
            f"&tbm=nws&start={offset}"
        )
        try:
            response = make_request(url, headers)
            soup = BeautifulSoup(response.content, "html.parser")

            # NOTE(review): these CSS class names are Google-internal and
            # change without notice; an empty selection may mean "no more
            # results" OR "layout changed".
            results_on_page = soup.select("div.SoaBEf")
            if not results_on_page:
                break

            for el in results_on_page:
                try:
                    link = el.find("a")["href"]
                    title = el.select_one("div.MBeuO").get_text()
                    snippet = el.select_one(".GI74Re").get_text()
                    date = el.select_one(".LfVVr").get_text()
                    source = el.select_one(".NUnG9d span").get_text()
                    news_results.append(
                        {
                            "link": link,
                            "title": title,
                            "snippet": snippet,
                            "date": date,
                            "source": source,
                        }
                    )
                except Exception as e:
                    # Best-effort per result: a malformed card is skipped
                    # rather than aborting the whole page.
                    logger.debug("Error processing result: %s", e)
                    continue

            # Stop when Google shows no "Next" pagination link.
            next_link = soup.find("a", id="pnnext")
            if not next_link:
                break
            page += 1
        except Exception as e:
            # All retries exhausted (tenacity RetryError) or parsing blew
            # up: return whatever was collected so far.
            logger.debug("Failed after multiple retries: %s", e)
            break

    return news_results