# TradingAgents/tradingagents/dataflows/opencli_news.py

from __future__ import annotations
import json
import shutil
import subprocess
from datetime import datetime, timedelta
from .exceptions import DataVendorUnavailable
def _parse_date(date_str: str) -> datetime:
return datetime.strptime(date_str, "%Y-%m-%d")
def _ensure_opencli() -> str:
    """Return the filesystem path of the ``opencli-rs`` executable.

    Raises ``DataVendorUnavailable`` when the binary cannot be found on PATH.
    """
    located = shutil.which("opencli-rs")
    if located is None:
        raise DataVendorUnavailable("opencli-rs is not installed or not on PATH.")
    return located
def _run_opencli(args: list[str], timeout: float = 60) -> list[dict]:
    """Run ``opencli-rs`` with *args* and return its decoded JSON list.

    Parameters
    ----------
    args:
        CLI arguments appended after the binary name.
    timeout:
        Seconds to wait before aborting the subprocess (default 60);
        previously hard-coded, now parameterized for slow vendors.

    Raises
    ------
    DataVendorUnavailable
        When the binary is missing, the process fails to start, times
        out, exits non-zero, or emits output that is not a JSON list.
    """
    binary = _ensure_opencli()
    try:
        # check=False: a non-zero exit is converted to DataVendorUnavailable
        # below rather than raising CalledProcessError here.
        result = subprocess.run(
            [binary, *args],
            check=False,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # SubprocessError covers TimeoutExpired; OSError covers exec failures.
        raise DataVendorUnavailable(f"opencli-rs execution failed: {exc}") from exc
    if result.returncode != 0:
        # Prefer stderr, fall back to stdout, for the diagnostic message.
        stderr = (result.stderr or result.stdout or "").strip()
        raise DataVendorUnavailable(f"opencli-rs command failed: {stderr}")
    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        raise DataVendorUnavailable("opencli-rs returned non-JSON output.") from exc
    if not isinstance(payload, list):
        raise DataVendorUnavailable("opencli-rs returned an unexpected payload format.")
    return payload
def _safe_run_opencli(args: list[str]) -> tuple[list[dict], str | None]:
    """Run ``_run_opencli`` without raising.

    Returns ``(payload, None)`` on success, or ``([], error_message)``
    when the vendor is unavailable.
    """
    try:
        payload = _run_opencli(args)
    except DataVendorUnavailable as exc:
        return [], str(exc)
    return payload, None
def _format_block(title: str, records: list[str]) -> str:
if not records:
return f"### {title}\nNo results."
return f"### {title}\n" + "\n\n".join(records)
def _dedupe_records(items: list[dict], keys: tuple[str, ...]) -> list[dict]:
seen: set[str] = set()
output: list[dict] = []
for item in items:
identity = " | ".join(str(item.get(key, "")).strip() for key in keys).strip()
if not identity or identity in seen:
continue
seen.add(identity)
output.append(item)
return output
def _clean_symbol(symbol: str) -> str:
return symbol.strip().upper()
def _symbol_without_suffix(symbol: str) -> str:
clean = _clean_symbol(symbol)
return clean.split(".", 1)[0]
def _resolve_company_aliases(ticker: str) -> list[str]:
    """Build an ordered, de-duplicated list of search keywords for *ticker*.

    Company names are pulled from tushare on a best-effort basis (A-share,
    HK, or US basics depending on market classification); any failure there
    is deliberately swallowed so we always fall back to the raw symbol
    forms. Names carrying Chinese corporate suffixes additionally yield
    shortened aliases.
    """
    candidates: list[str] = []
    try:
        from .tushare import _classify_market, _get_pro_client, _normalize_ts_code

        ts_code = _normalize_ts_code(ticker)
        market = _classify_market(ts_code)
        pro = _get_pro_client()
        if market == "a_share":
            basic = pro.stock_basic(ts_code=ts_code, fields="ts_code,name")
        elif market == "hk":
            basic = pro.hk_basic(ts_code=ts_code)
        else:
            basic = pro.us_basic(ts_code=ts_code)
        if basic is not None and not basic.empty:
            first_row = basic.iloc[0]
            for column in ("name", "fullname", "enname"):
                cell = first_row.get(column)
                if cell:
                    candidates.append(str(cell).strip())
    except Exception:
        # Best-effort by design: tushare may be unconfigured or offline.
        pass
    candidates.append(_clean_symbol(ticker))
    candidates.append(_symbol_without_suffix(ticker))

    expanded: list[str] = []
    for candidate in candidates:
        candidate = candidate.strip()
        if not candidate:
            continue
        expanded.append(candidate)
        # A name ending in 股份有限公司 also ends in 有限公司, so such a name
        # emits both shortened forms — matching the original behavior.
        for suffix in ("股份有限公司", "有限公司"):
            if candidate.endswith(suffix):
                shortened = candidate[: -len(suffix)].strip()
                if shortened:
                    expanded.append(shortened)
    # Order-preserving de-duplication.
    return list(dict.fromkeys(expanded))
def _build_google_queries(ticker: str) -> list[str]:
    """Expand each company alias into two queries: ``"<alias> stock"`` and the bare alias."""
    queries: list[str] = []
    for alias in _resolve_company_aliases(ticker):
        queries.extend((f"{alias} stock", alias))
    return queries
def _collect_google_news(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Fetch Google News items for *ticker* via alias-derived queries.

    Stops querying as soon as *limit* unique (url, title) records have
    accumulated. Returns the deduped records capped at *limit*, together
    with any per-query error messages.
    """
    identity_keys = ("url", "title")
    gathered: list[dict] = []
    failures: list[str] = []
    for query in _build_google_queries(ticker):
        payload, error = _safe_run_opencli(
            ["google", "news", query, "--limit", str(limit), "--format", "json"]
        )
        if error:
            failures.append(f"{query}: {error}")
            continue
        gathered.extend(payload)
        if len(_dedupe_records(gathered, identity_keys)) >= limit:
            break
    return _dedupe_records(gathered, identity_keys)[:limit], failures
def _collect_google_search_results(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Fetch Chinese-language Google Search results for *ticker*.

    Same accumulation strategy as ``_collect_google_news``: query per
    alias, stop at *limit* unique (url, title) records, report per-query
    errors alongside the deduped results.
    """
    identity_keys = ("url", "title")
    gathered: list[dict] = []
    failures: list[str] = []
    for query in _build_google_queries(ticker):
        payload, error = _safe_run_opencli(
            ["google", "search", query, "--lang", "zh", "--limit", str(limit), "--format", "json"]
        )
        if error:
            failures.append(f"{query}: {error}")
            continue
        gathered.extend(payload)
        if len(_dedupe_records(gathered, identity_keys)) >= limit:
            break
    return _dedupe_records(gathered, identity_keys)[:limit], failures
def _collect_xueqiu_results(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Search Xueqiu for each company alias of *ticker*.

    Dedupe identity is (symbol, name); keyword iteration stops once
    *limit* unique records exist. Returns (records, per-keyword errors).
    """
    identity_keys = ("symbol", "name")
    gathered: list[dict] = []
    failures: list[str] = []
    for keyword in _resolve_company_aliases(ticker):
        payload, error = _safe_run_opencli(
            ["xueqiu", "search", keyword, "--limit", str(limit), "--format", "json"]
        )
        if error:
            failures.append(f"{keyword}: {error}")
            continue
        gathered.extend(payload)
        if len(_dedupe_records(gathered, identity_keys)) >= limit:
            break
    return _dedupe_records(gathered, identity_keys)[:limit], failures
def _collect_weibo_results(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Search Weibo for each company alias of *ticker*.

    Dedupe identity is (url, text, word); stops early once *limit*
    unique records exist. Returns (records, per-keyword errors).
    """
    identity_keys = ("url", "text", "word")
    gathered: list[dict] = []
    failures: list[str] = []
    for keyword in _resolve_company_aliases(ticker):
        payload, error = _safe_run_opencli(
            ["weibo", "search", keyword, "--limit", str(limit), "--format", "json"]
        )
        if error:
            failures.append(f"{keyword}: {error}")
            continue
        gathered.extend(payload)
        if len(_dedupe_records(gathered, identity_keys)) >= limit:
            break
    return _dedupe_records(gathered, identity_keys)[:limit], failures
def _collect_xiaohongshu_results(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Search Xiaohongshu for each company alias of *ticker*.

    Dedupe identity is (id, note_id, url, title); stops early once
    *limit* unique records exist. Returns (records, per-keyword errors).
    """
    identity_keys = ("id", "note_id", "url", "title")
    gathered: list[dict] = []
    failures: list[str] = []
    for keyword in _resolve_company_aliases(ticker):
        payload, error = _safe_run_opencli(
            ["xiaohongshu", "search", keyword, "--limit", str(limit), "--format", "json"]
        )
        if error:
            failures.append(f"{keyword}: {error}")
            continue
        gathered.extend(payload)
        if len(_dedupe_records(gathered, identity_keys)) >= limit:
            break
    return _dedupe_records(gathered, identity_keys)[:limit], failures
def _collect_sinafinance_results(ticker: str, limit: int = 8) -> tuple[list[dict], list[str]]:
    """Pull the Sina Finance flash feed and keep items mentioning *ticker*.

    The feed is not query-driven, so a fixed batch of 50 items is fetched
    and filtered locally against the resolved company aliases. Returns
    (matching records capped at *limit*, errors) — errors is non-empty
    only when the feed itself could not be fetched.
    """
    aliases = _resolve_company_aliases(ticker)
    payload, error = _safe_run_opencli(
        ["sinafinance", "news", "--type", "1", "--limit", "50", "--format", "json"]
    )
    if error:
        return [], [error]
    matching: list[dict] = []
    for item in payload:
        searchable = " ".join(
            str(item.get(field, "")).strip()
            for field in ("content", "title", "symbol", "name")
        )
        if any(alias and alias in searchable for alias in aliases):
            matching.append(item)
    return _dedupe_records(matching, ("time", "content", "title"))[:limit], []
def get_news(ticker: str, start_date: str, end_date: str) -> str:
    """Aggregate news and social-media signals for *ticker* into markdown.

    *start_date* and *end_date* (``YYYY-MM-DD``) are validated up front and
    echoed in the report header; the individual vendor queries themselves
    are not date-filtered. Failing sources are skipped with their errors
    collected; if every source comes back empty, a diagnostic string is
    returned instead of a report.

    Raises ``ValueError`` if either date string is malformed.
    """
    _parse_date(start_date)
    _parse_date(end_date)

    def _render_xueqiu(item: dict) -> str:
        return (
            f"- {item.get('name', item.get('symbol', 'Unknown'))} "
            f"(symbol: {item.get('symbol', 'Unknown')})"
        )

    def _render_weibo(item: dict) -> str:
        return (
            f"- {item.get('text', item.get('word', 'No text'))}\n"
            f" Link: {item.get('url', '')}"
        )

    def _render_xiaohongshu(item: dict) -> str:
        return (
            f"- {item.get('title', item.get('desc', 'No title'))}\n"
            f" Link: {item.get('url', '')}"
        )

    def _render_sina(item: dict) -> str:
        return (
            f"- {item.get('content', item.get('title', 'No content'))} "
            f"(time: {item.get('time', 'Unknown')}, views: {item.get('views', 'Unknown')})"
        )

    def _render_google_news(item: dict) -> str:
        return (
            f"- {item.get('title', 'No title')} "
            f"(source: {item.get('source', 'Unknown')}, date: {item.get('date', 'Unknown')})\n"
            f" Link: {item.get('url', '')}"
        )

    def _render_google_search(item: dict) -> str:
        return (
            f"- {item.get('title', 'No title')}\n"
            f" Link: {item.get('url', '')}"
        )

    # (section title, collector, per-item renderer) in report order.
    specs = (
        ("Xueqiu Search", _collect_xueqiu_results, _render_xueqiu),
        ("Weibo Search", _collect_weibo_results, _render_weibo),
        ("Xiaohongshu Search", _collect_xiaohongshu_results, _render_xiaohongshu),
        ("Sina Finance A-Share Flash", _collect_sinafinance_results, _render_sina),
        ("Google News", _collect_google_news, _render_google_news),
        ("Google Search (ZH)", _collect_google_search_results, _render_google_search),
    )

    sections: list[str] = []
    errors: list[str] = []
    for title, collect, render in specs:
        items, source_errors = collect(ticker, limit=6)
        errors.extend(source_errors)
        if items:
            sections.append(_format_block(title, [render(item) for item in items]))

    if not sections:
        aliases = ", ".join(_resolve_company_aliases(ticker))
        detail = (
            f"No relevant news found via opencli-rs for {ticker} "
            f"between {start_date} and {end_date}. "
            f"Queries tried: {aliases or ticker}."
        )
        if errors:
            # Keep the diagnostic short: only the first three source errors.
            detail += f" Source errors: {'; '.join(errors[:3])}."
        return detail

    header = f"## {ticker} News and Social Signals, from {start_date} to {end_date}:\n\n"
    return header + "\n\n".join(sections)
def get_global_news(curr_date: str, look_back_days: int = 7, limit: int = 10) -> str:
    """Build a markdown digest of global market headlines and hot topics.

    *curr_date* is ``YYYY-MM-DD``; the report header spans *look_back_days*
    days back from it (the underlying feeds are "latest", not date-filtered).
    Unlike :func:`get_news`, a vendor failure here propagates as
    ``DataVendorUnavailable`` rather than being skipped.
    """
    end_dt = _parse_date(curr_date)
    start_date = (end_dt - timedelta(days=look_back_days)).strftime("%Y-%m-%d")

    def _render_google(item: dict) -> str:
        return (
            f"- {item.get('title', 'No title')} "
            f"(source: {item.get('source', 'Unknown')}, date: {item.get('date', 'Unknown')})\n"
            f" Link: {item.get('url', '')}"
        )

    def _render_sina(item: dict) -> str:
        return (
            f"- {item.get('content', 'No content')} "
            f"(time: {item.get('time', 'Unknown')}, views: {item.get('views', 'Unknown')})"
        )

    def _render_xueqiu(item: dict) -> str:
        return (
            f"- {item.get('text', 'No text')} "
            f"(author: {item.get('author', 'Unknown')}, likes: {item.get('likes', 'Unknown')})\n"
            f" Link: {item.get('url', '')}"
        )

    def _render_weibo(item: dict) -> str:
        return (
            f"- {item.get('word', 'No topic')} "
            f"(category: {item.get('category', 'Unknown')}, heat: {item.get('hot_value', 'Unknown')})\n"
            f" Link: {item.get('url', '')}"
        )

    # (section title, opencli-rs argv, per-item renderer) in report order.
    # The "hot" feeds cap the requested batch at 8 regardless of *limit*.
    feeds = (
        (
            "Google News Top Stories",
            ["google", "news", "--limit", str(limit), "--format", "json"],
            _render_google,
        ),
        (
            "Sina Finance Flash News",
            ["sinafinance", "news", "--limit", str(limit), "--format", "json"],
            _render_sina,
        ),
        (
            "Xueqiu Hot Discussions",
            ["xueqiu", "hot", "--limit", str(min(limit, 8)), "--format", "json"],
            _render_xueqiu,
        ),
        (
            "Weibo Hot Topics",
            ["weibo", "hot", "--limit", str(min(limit, 8)), "--format", "json"],
            _render_weibo,
        ),
    )

    sections: list[str] = []
    for title, argv, render in feeds:
        payload = _run_opencli(argv)
        sections.append(_format_block(title, [render(item) for item in payload[:limit]]))

    header = f"## Global Market News and Social Signals, from {start_date} to {curr_date}:\n\n"
    return header + "\n\n".join(sections)