TradingAgents/tradingagents/dataflows/google.py

115 lines
3.5 KiB
Python

import logging
import re
from typing import Annotated, List, Dict, Any
from datetime import datetime, timedelta

from dateutil.relativedelta import relativedelta
from dateutil import parser as dateutil_parser

from .googlenews_utils import getNewsData
def _parse_google_news_date(date_str: str) -> datetime:
    """Convert a Google News date label into a ``datetime``.

    Three shapes of label are recognised, checked in this order:

    1. Relative ages such as ``"3 hours ago"`` or ``"2 weeks ago"``
       (seconds, minutes, hours, days, weeks, months and years, including
       the ``sec``/``min``/``hr``/``yr`` abbreviations).
    2. Any label containing ``"yesterday"``.
    3. Anything else is handed to ``dateutil``'s fuzzy parser.

    Args:
        date_str: Raw label. Matching is case-insensitive and ignores
            surrounding whitespace.

    Returns:
        The resolved timestamp. Relative labels are resolved against
        ``datetime.now()``. Empty, unparseable, or out-of-range labels fall
        back to the current local time rather than raising.

    NOTE(review): the dateutil fallback may return a timezone-aware value
    when the label carries an offset, while every other path returns a naive
    datetime -- confirm callers never compare the two.
    """
    if not date_str:
        return datetime.now()

    date_str = date_str.strip().lower()

    # The first *pattern* in this list that matches wins (not the first
    # position in the text). Seconds and years are appended last so the
    # precedence among the pre-existing units is unchanged.
    relative_patterns = [
        (r"(\d+)\s*(?:hour|hr)s?\s*ago", "hours"),
        (r"(\d+)\s*(?:minute|min)s?\s*ago", "minutes"),
        (r"(\d+)\s*(?:day)s?\s*ago", "days"),
        (r"(\d+)\s*(?:week)s?\s*ago", "weeks"),
        (r"(\d+)\s*(?:month)s?\s*ago", "months"),
        (r"(\d+)\s*(?:second|sec)s?\s*ago", "seconds"),
        (r"(\d+)\s*(?:year|yr)s?\s*ago", "years"),
    ]

    for pattern, unit in relative_patterns:
        match = re.search(pattern, date_str)
        if not match:
            continue

        value = int(match.group(1))
        now = datetime.now()
        try:
            if unit in ("months", "years"):
                # Calendar units have no fixed length, so timedelta cannot
                # express them; relativedelta does calendar arithmetic.
                return now - relativedelta(**{unit: value})
            return now - timedelta(**{unit: value})
        except (OverflowError, ValueError):
            # An absurdly large count lands outside datetime's range; treat
            # it like any other unparseable label.
            return now

    if "yesterday" in date_str:
        return datetime.now() - timedelta(days=1)

    try:
        return dateutil_parser.parse(date_str, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # dateutil raises OverflowError for values beyond the platform's
        # integer range, in addition to its ValueError-based ParserError.
        return datetime.now()
def get_google_news(
    query: Annotated[str, "Query to search with"],
    curr_date: Annotated[str, "Curr date in yyyy-mm-dd format"],
    look_back_days: Annotated[int, "how many days to look back"],
) -> str:
    """Fetch Google News results for a query and render them as Markdown.

    The search window ends at ``curr_date`` and starts ``look_back_days``
    days earlier. Spaces in the query are replaced with ``+`` before it is
    passed to ``getNewsData``; the same ``+``-joined form appears in the
    report heading.

    Args:
        query: Search terms.
        curr_date: End of the window, formatted ``YYYY-MM-DD``.
        look_back_days: Size of the window in days.

    Returns:
        A Markdown report with one ``###`` section per article (title,
        source and snippet), or an empty string when no articles come back.

    Raises:
        ValueError: If ``curr_date`` is not in ``YYYY-MM-DD`` format.
    """
    query = query.replace(" ", "+")

    window_end = datetime.strptime(curr_date, "%Y-%m-%d")
    before = (window_end - relativedelta(days=look_back_days)).strftime("%Y-%m-%d")

    articles = getNewsData(query, before, curr_date)

    # Render every article first, then decide whether there is anything to
    # report -- the same order of operations as a plain accumulate-then-check.
    sections = [
        f"### {item['title']} (source: {item['source']}) \n\n{item['snippet']}\n\n"
        for item in articles
    ]

    if len(articles) == 0:
        return ""

    news_str = "".join(sections)
    return f"## {query} Google News, from {before} to {curr_date}:\n\n{news_str}"
def get_bulk_news_google(lookback_hours: int) -> List[Dict[str, Any]]:
    """Collect de-duplicated market headlines from Google News.

    Runs a fixed set of broad market queries through ``getNewsData`` and
    merges the results, keeping the first article seen for each title.

    Args:
        lookback_hours: How far back from now the search window starts. The
            window is handed to ``getNewsData`` as ``YYYY-MM-DD`` strings,
            so sub-day precision is dropped.

    Returns:
        A list of dicts with the keys ``title``, ``source``, ``url``,
        ``published_at`` (ISO-8601 string) and ``content_snippet`` (at most
        500 characters). A query whose fetch fails, or an individual article
        that is malformed, is logged and skipped; the remaining results are
        still returned.
    """
    logger = logging.getLogger(__name__)

    end_date = datetime.now()
    start_date = end_date - timedelta(hours=lookback_hours)
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")

    queries = [
        "stock market",
        "trading news",
        "earnings report",
    ]

    all_articles: List[Dict[str, Any]] = []
    seen_titles = set()

    for query in queries:
        # Only the fetch is best-effort at this level: a failing query must
        # not abort the others, but the failure should leave a trace.
        try:
            news_results = getNewsData(query.replace(" ", "+"), start_str, end_str)
        except Exception:
            logger.warning(
                "Google News fetch failed for query %r; skipping",
                query,
                exc_info=True,
            )
            continue

        for news in news_results or []:
            # Guard each article separately so one malformed entry does not
            # discard the rest of this query's results.
            try:
                title = news.get("title", "")
                if not title or title in seen_titles:
                    continue

                published_at = _parse_google_news_date(news.get("date", ""))
                article = {
                    "title": title,
                    "source": news.get("source", "Google News"),
                    "url": news.get("link", ""),
                    "published_at": published_at.isoformat(),
                    # `or ""` covers a snippet key that is present but None.
                    "content_snippet": (news.get("snippet") or "")[:500],
                }
            except (AttributeError, TypeError, ValueError, OverflowError):
                logger.debug(
                    "Skipping malformed Google News item for query %r",
                    query,
                    exc_info=True,
                )
                continue

            # Record the title only once the article is actually kept.
            seen_titles.add(title)
            all_articles.append(article)

    return all_articles