"""Korean news data source using Naver Finance API and web scraping.

Provides Korean financial news, company-specific news, and macro news
relevant to the Korean market.
"""


import requests
from datetime import datetime, timedelta
from typing import Annotated

from dateutil.relativedelta import relativedelta


# Naver Search API headers (user should set NAVER_CLIENT_ID and NAVER_CLIENT_SECRET env vars)
|
|
_NAVER_HEADERS = None
|
|
|
|
|
|
def _get_naver_headers():
|
|
"""Get Naver API headers, lazy-loaded."""
|
|
global _NAVER_HEADERS
|
|
if _NAVER_HEADERS is None:
|
|
import os
|
|
|
|
client_id = os.environ.get("NAVER_CLIENT_ID", "")
|
|
client_secret = os.environ.get("NAVER_CLIENT_SECRET", "")
|
|
if client_id and client_secret:
|
|
_NAVER_HEADERS = {
|
|
"X-Naver-Client-Id": client_id,
|
|
"X-Naver-Client-Secret": client_secret,
|
|
}
|
|
else:
|
|
_NAVER_HEADERS = {}
|
|
return _NAVER_HEADERS
|
|
|
|
|
|
def _get_stock_name_from_code(code: str) -> str:
|
|
"""Try to resolve stock name from code for better news search."""
|
|
try:
|
|
import FinanceDataReader as fdr
|
|
|
|
listing = fdr.StockListing("KRX")
|
|
if listing is not None and not listing.empty:
|
|
match = listing[listing["Code"] == code]
|
|
if match.empty:
|
|
match = listing[listing["Symbol"] == code]
|
|
if not match.empty:
|
|
return match.iloc[0].get("Name", code)
|
|
except Exception:
|
|
pass
|
|
return code
|
|
|
|
|
|
def get_korean_news(
    ticker: Annotated[str, "KRX ticker symbol or company name"],
    start_date: Annotated[str, "Start date in yyyy-mm-dd format"],
    end_date: Annotated[str, "End date in yyyy-mm-dd format"],
) -> str:
    """Retrieve Korean financial news for a specific stock.

    Uses the Naver Search API when credentials are configured, otherwise
    falls back to Google News RSS fetching.
    """
    # Numeric tickers are KRX codes; resolving them to a company name
    # produces a much better news search query.
    company_name = _get_stock_name_from_code(ticker) if ticker.isdigit() else ticker

    headers = _get_naver_headers()
    if not headers:
        return _fetch_rss_news(company_name, ticker, start_date, end_date)
    return _fetch_naver_api_news(company_name, ticker, start_date, end_date, headers)


def _fetch_naver_api_news(
|
|
company_name: str,
|
|
ticker: str,
|
|
start_date: str,
|
|
end_date: str,
|
|
headers: dict,
|
|
) -> str:
|
|
"""Fetch news using Naver Search API."""
|
|
try:
|
|
query = f"{company_name} 주가"
|
|
url = "https://openapi.naver.com/v1/search/news.json"
|
|
params = {
|
|
"query": query,
|
|
"display": 20,
|
|
"sort": "date",
|
|
}
|
|
|
|
response = requests.get(url, headers=headers, params=params, timeout=10)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
items = data.get("items", [])
|
|
if not items:
|
|
return f"No Korean news found for {company_name} ({ticker})"
|
|
|
|
# Parse date range
|
|
start_dt = datetime.strptime(start_date, "%Y-%m-%d")
|
|
end_dt = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
|
|
|
|
news_str = ""
|
|
count = 0
|
|
|
|
for item in items:
|
|
# Parse pubDate
|
|
pub_date_str = item.get("pubDate", "")
|
|
try:
|
|
pub_date = datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %z")
|
|
pub_date_naive = pub_date.replace(tzinfo=None)
|
|
if not (start_dt <= pub_date_naive <= end_dt):
|
|
continue
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
title = _clean_html(item.get("title", ""))
|
|
description = _clean_html(item.get("description", ""))
|
|
link = item.get("originallink", item.get("link", ""))
|
|
|
|
news_str += f"### {title}\n"
|
|
if description:
|
|
news_str += f"{description}\n"
|
|
if link:
|
|
news_str += f"Link: {link}\n"
|
|
news_str += "\n"
|
|
count += 1
|
|
|
|
if count == 0:
|
|
return f"No Korean news found for {company_name} ({ticker}) between {start_date} and {end_date}"
|
|
|
|
return f"## {company_name} ({ticker}) 한국 뉴스 ({start_date} ~ {end_date}):\n\n{news_str}"
|
|
|
|
except Exception as e:
|
|
return f"Error fetching Korean news for {company_name}: {str(e)}"
|
|
|
|
|
|
def _fetch_rss_news(
    company_name: str,
    ticker: str,
    start_date: str,
    end_date: str,
) -> str:
    """Fallback: fetch news via Google News RSS for Korean content.

    NOTE(review): start_date/end_date appear only in the report heading;
    the feed items are not filtered by publication date — confirm whether
    filtering was intended.
    """
    try:
        import urllib.parse
        import xml.etree.ElementTree as ET

        encoded = urllib.parse.quote(f"{company_name} 주식")
        rss_url = f"https://news.google.com/rss/search?q={encoded}&hl=ko&gl=KR&ceid=KR:ko"

        response = requests.get(rss_url, timeout=10)
        response.raise_for_status()

        items = ET.fromstring(response.text).findall(".//item")
        if not items:
            return f"No Korean news found for {company_name} ({ticker})"

        chunks = []
        # Cap at 15 articles to keep the digest readable.
        for item in items[:15]:
            title = item.findtext("title", "")
            link = item.findtext("link", "")
            pub_date = item.findtext("pubDate", "")
            source = item.findtext("source", "")

            heading = f"### {title}"
            if source:
                heading += f" (source: {source})"
            entry = [heading]
            if pub_date:
                entry.append(f"Published: {pub_date}")
            if link:
                entry.append(f"Link: {link}")
            chunks.append("\n".join(entry) + "\n\n")

        if not chunks:
            return f"No Korean news found for {company_name} ({ticker})"

        news_str = "".join(chunks)
        return f"## {company_name} ({ticker}) 한국 뉴스 ({start_date} ~ {end_date}):\n\n{news_str}"

    except Exception as e:
        return f"Error fetching Korean news via RSS: {str(e)}"


def _collect_naver_macro_news(query: str, headers: dict, seen_titles: set, all_news: list) -> None:
    """Fetch one macro query via the Naver Search API and append new items.

    Deduplicates by cleaned title using *seen_titles*; items are appended
    to *all_news* in API order. Network/HTTP errors propagate to the caller,
    which skips the failing query.
    """
    url = "https://openapi.naver.com/v1/search/news.json"
    params = {"query": query, "display": 5, "sort": "date"}
    resp = requests.get(url, headers=headers, params=params, timeout=10)
    resp.raise_for_status()
    for item in resp.json().get("items", []):
        title = _clean_html(item.get("title", ""))
        if title and title not in seen_titles:
            seen_titles.add(title)
            all_news.append(
                {
                    "title": title,
                    "description": _clean_html(item.get("description", "")),
                    "link": item.get("originallink", item.get("link", "")),
                    "pubDate": item.get("pubDate", ""),
                }
            )


def _collect_rss_macro_news(query: str, seen_titles: set, all_news: list) -> None:
    """Fallback: fetch one macro query via Google News RSS, appending up to 3 new items."""
    import urllib.parse
    import xml.etree.ElementTree as ET

    encoded_query = urllib.parse.quote(query)
    rss_url = f"https://news.google.com/rss/search?q={encoded_query}&hl=ko&gl=KR&ceid=KR:ko"
    resp = requests.get(rss_url, timeout=10)
    if resp.status_code != 200:
        # Non-200 is silently skipped (best effort), matching API-path behavior.
        return
    root = ET.fromstring(resp.text)
    for item in root.findall(".//item")[:3]:
        title = item.findtext("title", "")
        if title and title not in seen_titles:
            seen_titles.add(title)
            all_news.append(
                {
                    "title": title,
                    "description": "",
                    "link": item.findtext("link", ""),
                    "pubDate": item.findtext("pubDate", ""),
                }
            )


def get_korean_global_news(
    curr_date: Annotated[str, "Current date in yyyy-mm-dd format"],
    look_back_days: Annotated[int, "Number of days to look back"] = 7,
    limit: Annotated[int, "Maximum number of articles to return"] = 10,
) -> str:
    """Retrieve Korean macro/global economic news.

    Searches for key Korean market topics: BOK base rate, KOSPI outlook,
    USD/KRW exchange rate, Korean economy, etc. Uses the Naver Search API
    when credentials exist, otherwise Google News RSS.

    Args:
        curr_date: Current date in yyyy-mm-dd format.
        look_back_days: Days before curr_date shown in the report heading.
        limit: Maximum number of articles in the output.

    Returns:
        A markdown digest of macro news, or a "not found" message.

    NOTE(review): the look-back window is only rendered in the heading;
    articles are not filtered by publication date — confirm whether date
    filtering was intended.
    """
    search_queries = [
        "한국은행 기준금리",
        "코스피 전망",
        "원달러 환율",
        "한국 경제 전망",
        "외국인 투자 한국",
    ]

    headers = _get_naver_headers()

    curr_dt = datetime.strptime(curr_date, "%Y-%m-%d")
    start_date = (curr_dt - timedelta(days=look_back_days)).strftime("%Y-%m-%d")

    all_news: list = []
    seen_titles: set = set()

    for query in search_queries:
        try:
            if headers:
                _collect_naver_macro_news(query, headers, seen_titles, all_news)
            else:
                _collect_rss_macro_news(query, seen_titles, all_news)
        except Exception:
            # Best effort: one failing query must not abort the whole scan.
            continue

        if len(all_news) >= limit:
            break

    if not all_news:
        return f"No Korean global/macro news found for {curr_date}"

    sections = []
    for article in all_news[:limit]:
        lines = [f"### {article['title']}"]
        if article["description"]:
            lines.append(article["description"])
        if article["pubDate"]:
            lines.append(f"Published: {article['pubDate']}")
        if article["link"]:
            lines.append(f"Link: {article['link']}")
        sections.append("\n".join(lines) + "\n\n")

    news_str = "".join(sections)
    return f"## 한국 시장/거시경제 뉴스 ({start_date} ~ {curr_date}):\n\n{news_str}"


def _clean_html(text: str) -> str:
|
|
"""Remove HTML tags from text."""
|
|
import re
|
|
|
|
clean = re.sub(r"<[^>]+>", "", text)
|
|
clean = clean.replace(""", '"').replace("&", "&").replace("<", "<").replace(">", ">")
|
|
return clean.strip()
|