# TradingAgents/tradingagents/dataflows/korea_news.py
"""Korean news data source using Naver Finance API and web scraping.
Provides Korean financial news, company-specific news, and macro news
relevant to the Korean market.
"""
import requests
from datetime import datetime, timedelta
from typing import Annotated
from dateutil.relativedelta import relativedelta
# Naver Search API headers (user should set NAVER_CLIENT_ID and NAVER_CLIENT_SECRET env vars)
_NAVER_HEADERS = None
def _get_naver_headers():
"""Get Naver API headers, lazy-loaded."""
global _NAVER_HEADERS
if _NAVER_HEADERS is None:
import os
client_id = os.environ.get("NAVER_CLIENT_ID", "")
client_secret = os.environ.get("NAVER_CLIENT_SECRET", "")
if client_id and client_secret:
_NAVER_HEADERS = {
"X-Naver-Client-Id": client_id,
"X-Naver-Client-Secret": client_secret,
}
else:
_NAVER_HEADERS = {}
return _NAVER_HEADERS
def _get_stock_name_from_code(code: str) -> str:
"""Try to resolve stock name from code for better news search."""
try:
import FinanceDataReader as fdr
listing = fdr.StockListing("KRX")
if listing is not None and not listing.empty:
match = listing[listing["Code"] == code]
if match.empty:
match = listing[listing["Symbol"] == code]
if not match.empty:
return match.iloc[0].get("Name", code)
except Exception:
pass
return code
def get_korean_news(
    ticker: Annotated[str, "KRX ticker symbol or company name"],
    start_date: Annotated[str, "Start date in yyyy-mm-dd format"],
    end_date: Annotated[str, "End date in yyyy-mm-dd format"],
) -> str:
    """Retrieve Korean financial news for a specific stock.

    Prefers the Naver Search API when NAVER_CLIENT_ID / NAVER_CLIENT_SECRET
    credentials are configured; otherwise falls back to RSS-based fetching.
    """
    # All-digit tickers are KRX codes; resolve them to a company name so
    # the news search query is meaningful.
    search_name = _get_stock_name_from_code(ticker) if ticker.isdigit() else ticker

    api_headers = _get_naver_headers()
    if not api_headers:
        return _fetch_rss_news(search_name, ticker, start_date, end_date)
    return _fetch_naver_api_news(search_name, ticker, start_date, end_date, api_headers)
def _fetch_naver_api_news(
    company_name: str,
    ticker: str,
    start_date: str,
    end_date: str,
    headers: dict,
) -> str:
    """Fetch stock news via the Naver Search API.

    Args:
        company_name: Resolved company name used in the search query.
        ticker: Original ticker symbol, echoed in output headings.
        start_date: Inclusive lower date bound, yyyy-mm-dd.
        end_date: Inclusive upper date bound, yyyy-mm-dd.
        headers: Client-id/secret headers from _get_naver_headers().

    Returns:
        A markdown-formatted news digest, or a human-readable message when
        no articles match or an error occurs (this function never raises).
    """
    try:
        query = f"{company_name} 주가"
        url = "https://openapi.naver.com/v1/search/news.json"
        params = {
            "query": query,
            "display": 20,
            "sort": "date",
        }
        response = requests.get(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        items = data.get("items", [])
        if not items:
            return f"No Korean news found for {company_name} ({ticker})"
        # Half-open window [start_dt, end_dt) covering all of end_date.
        # The original compared with `<=`, which also admitted articles
        # stamped exactly 00:00:00 on end_date + 1.
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
        news_str = ""
        count = 0
        for item in items:
            pub_date_str = item.get("pubDate", "")
            try:
                # Naver uses RFC-822 style dates, e.g.
                # "Mon, 01 Jan 2024 09:00:00 +0900".
                pub_date = datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %z")
                # Drop the offset and compare KST wall-clock time against
                # the naive date bounds.
                pub_date_naive = pub_date.replace(tzinfo=None)
                if not (start_dt <= pub_date_naive < end_dt):
                    continue
            except (ValueError, TypeError):
                # Unparseable/missing date: keep the article rather than drop it.
                pass
            title = _clean_html(item.get("title", ""))
            description = _clean_html(item.get("description", ""))
            link = item.get("originallink", item.get("link", ""))
            news_str += f"### {title}\n"
            if description:
                news_str += f"{description}\n"
            if link:
                news_str += f"Link: {link}\n"
            news_str += "\n"
            count += 1
        if count == 0:
            return f"No Korean news found for {company_name} ({ticker}) between {start_date} and {end_date}"
        return f"## {company_name} ({ticker}) 한국 뉴스 ({start_date} ~ {end_date}):\n\n{news_str}"
    except Exception as e:
        return f"Error fetching Korean news for {company_name}: {str(e)}"
def _fetch_rss_news(
    company_name: str,
    ticker: str,
    start_date: str,
    end_date: str,
) -> str:
    """Fallback: fetch Korean news via a Google News RSS search.

    Used when Naver API credentials are not configured. Returns a markdown
    digest, or a human-readable message on empty results or any error
    (this function never raises).
    """
    try:
        import urllib.parse
        import xml.etree.ElementTree as ET

        encoded = urllib.parse.quote(f"{company_name} 주식")
        rss_url = f"https://news.google.com/rss/search?q={encoded}&hl=ko&gl=KR&ceid=KR:ko"
        response = requests.get(rss_url, timeout=10)
        response.raise_for_status()

        root = ET.fromstring(response.text)
        items = root.findall(".//item")
        if not items:
            return f"No Korean news found for {company_name} ({ticker})"

        # Render at most 15 articles, one markdown section per article.
        sections = []
        for item in items[:15]:
            header = f"### {item.findtext('title', '')}"
            source_name = item.findtext("source", "")
            if source_name:
                header += f" (source: {source_name})"
            lines = [header]
            published = item.findtext("pubDate", "")
            if published:
                lines.append(f"Published: {published}")
            article_url = item.findtext("link", "")
            if article_url:
                lines.append(f"Link: {article_url}")
            sections.append("\n".join(lines) + "\n\n")

        if not sections:
            return f"No Korean news found for {company_name} ({ticker})"
        news_str = "".join(sections)
        return f"## {company_name} ({ticker}) 한국 뉴스 ({start_date} ~ {end_date}):\n\n{news_str}"
    except Exception as e:
        return f"Error fetching Korean news via RSS: {str(e)}"
def get_korean_global_news(
    curr_date: Annotated[str, "Current date in yyyy-mm-dd format"],
    look_back_days: Annotated[int, "Number of days to look back"] = 7,
    limit: Annotated[int, "Maximum number of articles to return"] = 10,
) -> str:
    """Retrieve Korean macro/global economic news.

    Searches a fixed set of key Korean market topics (BOK base rate, KOSPI
    outlook, USD/KRW exchange rate, Korean economy, foreign investment) via
    the Naver Search API when credentials are configured, otherwise via
    Google News RSS.

    NOTE: look_back_days only determines the start date shown in the report
    header; articles are requested sorted by recency but are not filtered
    by publication date here.

    Args:
        curr_date: Current date in yyyy-mm-dd format.
        look_back_days: Days to look back (affects the header display only).
        limit: Maximum number of articles included in the digest.

    Returns:
        A markdown digest of deduplicated headlines, or a "no news found"
        message when every query came back empty or failed.
    """
    search_queries = [
        "한국은행 기준금리",  # BOK base rate
        "코스피 전망",  # KOSPI outlook
        "원달러 환율",  # USD/KRW exchange rate
        "한국 경제 전망",  # Korean economic outlook
        "외국인 투자 한국",  # foreign investment in Korea
    ]
    headers = _get_naver_headers()
    curr_dt = datetime.strptime(curr_date, "%Y-%m-%d")
    start_dt = curr_dt - timedelta(days=look_back_days)
    start_date = start_dt.strftime("%Y-%m-%d")
    all_news = []
    seen_titles = set()  # dedupe identical headlines across queries
    for query in search_queries:
        try:
            if headers:
                # Naver Search API path: up to 5 newest items per query.
                url = "https://openapi.naver.com/v1/search/news.json"
                params = {"query": query, "display": 5, "sort": "date"}
                resp = requests.get(url, headers=headers, params=params, timeout=10)
                resp.raise_for_status()
                items = resp.json().get("items", [])
                for item in items:
                    title = _clean_html(item.get("title", ""))
                    if title and title not in seen_titles:
                        seen_titles.add(title)
                        all_news.append({
                            "title": title,
                            "description": _clean_html(item.get("description", "")),
                            # Prefer the original publisher URL over the Naver link.
                            "link": item.get("originallink", item.get("link", "")),
                            "pubDate": item.get("pubDate", ""),
                        })
            else:
                # Fallback to Google News RSS: up to 3 items per query.
                import urllib.parse
                encoded_query = urllib.parse.quote(query)
                rss_url = f"https://news.google.com/rss/search?q={encoded_query}&hl=ko&gl=KR&ceid=KR:ko"
                resp = requests.get(rss_url, timeout=10)
                if resp.status_code == 200:
                    import xml.etree.ElementTree as ET
                    root = ET.fromstring(resp.text)
                    for item in root.findall(".//item")[:3]:
                        title = item.findtext("title", "")
                        if title and title not in seen_titles:
                            seen_titles.add(title)
                            all_news.append({
                                "title": title,
                                "description": "",
                                "link": item.findtext("link", ""),
                                "pubDate": item.findtext("pubDate", ""),
                            })
        except Exception:
            # Best-effort: a failed query must not abort the remaining ones.
            continue
        if len(all_news) >= limit:
            # Enough articles collected; skip the remaining queries.
            break
    if not all_news:
        return f"No Korean global/macro news found for {curr_date}"
    news_str = ""
    for article in all_news[:limit]:
        news_str += f"### {article['title']}\n"
        if article["description"]:
            news_str += f"{article['description']}\n"
        if article["pubDate"]:
            news_str += f"Published: {article['pubDate']}\n"
        if article["link"]:
            news_str += f"Link: {article['link']}\n"
        news_str += "\n"
    return f"## 한국 시장/거시경제 뉴스 ({start_date} ~ {curr_date}):\n\n{news_str}"
def _clean_html(text: str) -> str:
"""Remove HTML tags from text."""
import re
clean = re.sub(r"<[^>]+>", "", text)
clean = clean.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
return clean.strip()