TradingAgents/tradingagents/dataflows/markets/vn/news.py

"""Vietnam news provider using vnstock and VnExpress RSS."""

import feedparser
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta

from . import config as vn_config

_RSS_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
}


def _parse_rss_articles(feed_url: str, limit: int = 20) -> list:
    """Parse articles from an RSS feed URL using requests for proper headers."""
    articles = []
    try:
        resp = requests.get(feed_url, headers=_RSS_HEADERS, timeout=10)
        if resp.status_code != 200:
            return articles

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[:limit]:
            pub_date = None
            if hasattr(entry, "published_parsed") and entry.published_parsed:
                try:
                    pub_date = datetime(*entry.published_parsed[:6])
                except Exception:
                    pass

            articles.append({
                "title": entry.get("title", "No title"),
                "summary": entry.get("summary", ""),
                "link": entry.get("link", ""),
                "publisher": feed.feed.get("title", "VnExpress"),
                "pub_date": pub_date,
            })
    except Exception as e:
        print(f"Warning: Could not parse RSS feed {feed_url}: {e}")

    return articles


def _get_vnstock_news(ticker: str, source: str = "VCI") -> list:
    """Fetch news from vnstock company.news() API."""
    articles = []
    try:
        from vnstock import Vnstock
        stock = Vnstock().stock(symbol=ticker.upper(), source=source)
        news_df = stock.company.news()

        if news_df is not None and not news_df.empty:
            for _, row in news_df.iterrows():
                pub_date = None
                date_str = row.get("public_date") or row.get("created_at")
                if date_str:
                    try:
                        pub_date = datetime.fromisoformat(str(date_str).replace("Z", "+00:00"))
                        pub_date = pub_date.replace(tzinfo=None)
                    except (ValueError, AttributeError):
                        pass

                articles.append({
                    "title": row.get("news_title", "No title"),
                    "summary": row.get("news_short_content", ""),
                    "link": row.get("news_source_link", ""),
                    "publisher": "vnstock",
                    "pub_date": pub_date,
                })
    except Exception as e:
        print(f"Warning: vnstock news fetch failed for {ticker}: {e}")

    return articles


def _filter_by_date(articles: list, start_dt: datetime, end_dt: datetime) -> list:
    """Filter articles by date range."""
    filtered = []
    for article in articles:
        if article["pub_date"]:
            pub = article["pub_date"]
            if isinstance(pub, datetime) and not (start_dt <= pub <= end_dt + relativedelta(days=1)):
                continue
        filtered.append(article)
    return filtered


def _strip_html(text: str) -> str:
    """Strip HTML tags from text."""
    try:
        from parsel import Selector
        sel = Selector(text=text)
        clean = sel.css("::text").getall()
        return " ".join(clean) if clean else text
    except ImportError:
        import re
        return re.sub(r"<[^>]+>", "", text)


def _format_articles(articles: list, header: str) -> str:
    """Format articles into the standard output string."""
    if not articles:
        return header + "No articles found.\n"

    news_str = ""
    seen_titles = set()
    for article in articles:
        title = article["title"]
        if title in seen_titles:
            continue
        seen_titles.add(title)

        news_str += f"### {title} (source: {article['publisher']})\n"
        if article.get("summary"):
            summary = _strip_html(article["summary"])
            news_str += f"{summary}\n"
        if article.get("link"):
            news_str += f"Link: {article['link']}\n"
        news_str += "\n"

    return header + news_str


def get_news_vn(
    ticker: str,
    start_date: str,
    end_date: str,
    source: str = "VCI",
) -> str:
    """
    Retrieve news for a VN stock ticker.

    Priority chain:
    1. vnstock company.news() API
    2. VnExpress RSS filtered by ticker keyword
    """
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    all_articles = []

    # 1. Try vnstock built-in news
    vnstock_articles = _get_vnstock_news(ticker, source=source)
    all_articles.extend(vnstock_articles)

    # 2. Try VnExpress RSS filtered by ticker
    for url in vn_config.VNEXPRESS_RSS_URLS:
        rss_articles = _parse_rss_articles(url, limit=30)
        ticker_upper = ticker.upper()
        for article in rss_articles:
            title_upper = article["title"].upper()
            summary_upper = article.get("summary", "").upper()
            if ticker_upper in title_upper or ticker_upper in summary_upper:
                all_articles.append(article)

    # Filter by date range
    all_articles = _filter_by_date(all_articles, start_dt, end_dt)

    if not all_articles:
        return f"No news found for {ticker} between {start_date} and {end_date}"

    header = f"## {ticker} News (Vietnam Market), from {start_date} to {end_date}:\n\n"
    return _format_articles(all_articles, header)


def get_global_news_vn(
    curr_date: str,
    look_back_days: int = 7,
    limit: int = 10,
) -> str:
    """
    Retrieve global/macro Vietnam market news from VnExpress RSS.
    """
    curr_dt = datetime.strptime(curr_date, "%Y-%m-%d")
    start_dt = curr_dt - relativedelta(days=look_back_days)
    start_date = start_dt.strftime("%Y-%m-%d")

    all_articles = []
    seen_titles = set()

    # Fetch from VnExpress RSS
    for url in vn_config.VNEXPRESS_RSS_URLS:
        rss_articles = _parse_rss_articles(url, limit=limit * 2)
        for article in rss_articles:
            title = article["title"]
            if title not in seen_titles:
                seen_titles.add(title)
                all_articles.append(article)

        if len(all_articles) >= limit:
            break

    # Filter by date range
    all_articles = _filter_by_date(all_articles, start_dt, curr_dt)

    # Limit results
    all_articles = all_articles[:limit]

    if not all_articles:
        return f"No Vietnam market news found for {curr_date}"

    header = f"## Vietnam Market News, from {start_date} to {curr_date}:\n\n"
    return _format_articles(all_articles, header)