feat: add Naver Finance as KRX universe data source
FDR and pykrx APIs are currently blocked by KRX servers. Added Naver Finance scraping (KOSPI + KOSDAQ market cap ranking) as secondary source before falling back to hardcoded universe. Fallback chain: FDR → Naver Finance (720+ stocks) → Hardcoded (86 stocks) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
57d82164d2
commit
d066820307
|
|
@ -5,13 +5,24 @@ bypassing LangChain tool wrappers for performance.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
|
import time
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from io import StringIO
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_NAVER_HEADERS = {
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_krx_universe(
|
def get_krx_universe(
|
||||||
min_market_cap: float = 500_000_000_000,
|
min_market_cap: float = 500_000_000_000,
|
||||||
|
|
@ -34,7 +45,18 @@ def get_krx_universe(
|
||||||
logger.warning(f"FDR KRX listing failed: {e}")
|
logger.warning(f"FDR KRX listing failed: {e}")
|
||||||
|
|
||||||
if listing is None or listing.empty:
|
if listing is None or listing.empty:
|
||||||
logger.info("Using fallback KRX universe (top stocks by market cap)")
|
logger.info("FDR failed, trying Naver Finance...")
|
||||||
|
try:
|
||||||
|
listing = _get_naver_krx_universe(
|
||||||
|
min_market_cap=min_market_cap,
|
||||||
|
)
|
||||||
|
if listing is not None and not listing.empty:
|
||||||
|
logger.info(f"Naver Finance universe: {len(listing)} stocks")
|
||||||
|
return listing.reset_index(drop=True)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Naver Finance failed: {e}")
|
||||||
|
|
||||||
|
logger.info("Using fallback KRX universe (hardcoded top stocks)")
|
||||||
listing = _get_krx_fallback_universe()
|
listing = _get_krx_fallback_universe()
|
||||||
is_fallback = True
|
is_fallback = True
|
||||||
|
|
||||||
|
|
@ -302,6 +324,94 @@ def _get_nasdaq100_tickers() -> list[str]:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_naver_krx_universe(
|
||||||
|
min_market_cap: float = 500_000_000_000,
|
||||||
|
max_pages: int = 10,
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Get KRX stocks from Naver Finance market cap ranking.
|
||||||
|
|
||||||
|
Scrapes KOSPI (sosok=0) and KOSDAQ (sosok=1) pages.
|
||||||
|
Returns DataFrame with columns: Code, Name, Market, Sector, MarketCap, Volume.
|
||||||
|
"""
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
all_rows = []
|
||||||
|
|
||||||
|
for sosok, market_name in [(0, "KOSPI"), (1, "KOSDAQ")]:
|
||||||
|
for page in range(1, max_pages + 1):
|
||||||
|
url = f"https://finance.naver.com/sise/sise_market_sum.naver?sosok={sosok}&page={page}"
|
||||||
|
try:
|
||||||
|
resp = requests.get(url, headers=_NAVER_HEADERS, timeout=10)
|
||||||
|
resp.raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Naver Finance request failed (sosok={sosok}, page={page}): {e}")
|
||||||
|
break
|
||||||
|
|
||||||
|
soup = BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
# Parse table rows
|
||||||
|
table = soup.select_one("table.type_2")
|
||||||
|
if not table:
|
||||||
|
break
|
||||||
|
|
||||||
|
rows = table.select("tr")
|
||||||
|
found_any = False
|
||||||
|
for row in rows:
|
||||||
|
tds = row.select("td")
|
||||||
|
if len(tds) < 10:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract ticker code from link
|
||||||
|
link = tds[1].select_one("a[href*='/item/main.naver?code=']")
|
||||||
|
if not link:
|
||||||
|
continue
|
||||||
|
|
||||||
|
code_match = re.search(r"code=(\d+)", link["href"])
|
||||||
|
if not code_match:
|
||||||
|
continue
|
||||||
|
|
||||||
|
code = code_match.group(1)
|
||||||
|
name = link.text.strip()
|
||||||
|
|
||||||
|
# Parse numeric values (remove commas)
|
||||||
|
def parse_num(td):
|
||||||
|
text = td.text.strip().replace(",", "").replace("%", "")
|
||||||
|
try:
|
||||||
|
return float(text)
|
||||||
|
except ValueError:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
market_cap = parse_num(tds[6]) * 1_0000_0000 # 억 → 원
|
||||||
|
volume = parse_num(tds[9])
|
||||||
|
|
||||||
|
all_rows.append({
|
||||||
|
"Code": code,
|
||||||
|
"Name": name,
|
||||||
|
"Market": market_name,
|
||||||
|
"Sector": "",
|
||||||
|
"MarketCap": market_cap,
|
||||||
|
"Volume": volume,
|
||||||
|
})
|
||||||
|
found_any = True
|
||||||
|
|
||||||
|
if not found_any:
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(0.3) # Rate limiting
|
||||||
|
|
||||||
|
if not all_rows:
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
df = pd.DataFrame(all_rows)
|
||||||
|
|
||||||
|
# Filter by market cap
|
||||||
|
if min_market_cap > 0:
|
||||||
|
df = df[df["MarketCap"] >= min_market_cap]
|
||||||
|
|
||||||
|
logger.info(f"Naver Finance: {len(df)} stocks loaded")
|
||||||
|
return df.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
def _get_krx_fallback_universe() -> pd.DataFrame:
|
def _get_krx_fallback_universe() -> pd.DataFrame:
|
||||||
"""Fallback KRX universe when API listing is unavailable.
|
"""Fallback KRX universe when API listing is unavailable.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue