TradingAgents/tradingagents/dataflows/googlenews_utils.py

108 lines
3.3 KiB
Python

import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
retry_if_result,
)
def is_rate_limited(response):
"""檢查回應是否表示速率限制 (狀態碼 429)"""
return response.status_code == 429
@retry(
retry=(retry_if_result(is_rate_limited)),
wait=wait_exponential(multiplier=1, min=4, max=60),
stop=stop_after_attempt(5),
)
def make_request(url, headers):
"""使用重試邏輯發出請求以處理速率限制"""
# 在每個請求前隨機延遲以避免被偵測
time.sleep(random.uniform(2, 6))
response = requests.get(url, headers=headers)
return response
def getNewsData(query, start_date, end_date):
"""
抓取給定查詢和日期範圍的 Google 新聞搜索結果。
query: str - 搜索查詢
start_date: str - 開始日期,格式為 yyyy-mm-dd 或 mm/dd/yyyy
end_date: str - 結束日期,格式為 yyyy-mm-dd 或 mm/dd/yyyy
"""
if "-" in start_date:
start_date = datetime.strptime(start_date, "%Y-%m-%d")
start_date = start_date.strftime("%m/%d/%Y")
if "-" in end_date:
end_date = datetime.strptime(end_date, "%Y-%m-%d")
end_date = end_date.strftime("%m/%d/%Y")
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/101.0.4951.54 Safari/537.36"
)
}
news_results = []
page = 0
while True:
offset = page * 10
url = (
f"https://www.google.com/search?q={query}"
f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
f"&tbm=nws&start={offset}"
)
try:
response = make_request(url, headers)
soup = BeautifulSoup(response.content, "html.parser")
results_on_page = soup.select("div.SoaBEf")
if not results_on_page:
break # 找不到更多結果
for el in results_on_page:
try:
link = el.find("a")["href"]
title = el.select_one("div.MBeuO").get_text()
snippet = el.select_one(".GI74Re").get_text()
date = el.select_one(".LfVVr").get_text()
source = el.select_one(".NUnG9d span").get_text()
news_results.append(
{
"link": link,
"title": title,
"snippet": snippet,
"date": date,
"source": source,
}
)
except Exception as e:
print(f"處理結果時出錯:{e}")
# 如果找不到其中一個欄位,則跳過此結果
continue
# 使用當前抓取的結果數量更新進度條
# 檢查「下一頁」連結 (分頁)
next_link = soup.find("a", id="pnnext")
if not next_link:
break
page += 1
except Exception as e:
print(f"多次重試後失敗:{e}")
break
return news_results