"""Parsers for OpenAI-generated news text (global and stock-specific)."""
import re
from typing import List, Dict, Any
def _extract_urls(text: str) -> List[str]:
|
|
url_pattern = re.compile(r"https?://[^\s)]+")
|
|
return url_pattern.findall(text or "")
|
|
|
|
|
|
def _strip_md(s: str) -> str:
|
|
if not s:
|
|
return s
|
|
# Remove simple markdown bold/italics markers
|
|
return re.sub(r"[*_`]+", "", s).strip()
|
|
|
|
|
|
def parse_global_news(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parses global news text produced by get_global_news_openai into a list of items.

    Expected patterns include enumerated bold headings like:

        1. **October 25, 2025: "Headline"**
           - Trading Relevance: ...
           (source links)

    Args:
        raw_text: Raw model output text.

    Returns:
        A list of dicts with keys: date, headline, relevance, sources, raw.
        Empty list for empty or non-string input.
    """
    if not raw_text or not isinstance(raw_text, str):
        return []

    items: List[Dict[str, Any]] = []

    # Find each enumerated bold heading; an item's body runs from the end of
    # its heading to the start of the next heading (or end of text).
    headers = list(re.finditer(r"(?m)^\s*\d+\.\s+\*\*(.+?)\*\*\s*$", raw_text))
    if not headers:
        # Fallback: bold lines that start with a date-like pattern.
        headers = list(
            re.finditer(r"(?m)^\s*\*\*([A-Za-z]+\s+\d{1,2},\s+\d{4}.*)\*\*\s*$", raw_text)
        )

    boundaries = [(m.start(), m.end(), m.group(1)) for m in headers]
    text_len = len(raw_text)
    for i, (_, end, header_text) in enumerate(boundaries):
        next_start = boundaries[i + 1][0] if i + 1 < len(boundaries) else text_len
        block = raw_text[end:next_start].strip()

        header = header_text.strip()
        # Date like "October 25, 2025" anywhere in the header.
        date_match = re.search(r"([A-Za-z]+\s+\d{1,2},\s+\d{4})", header)
        # Headline in straight OR curly quotes — model output uses both, and
        # the colon fallback below already anticipates curly quotes.
        quoted_headline = re.search(r"[\"\u201c]([^\"\u201d]+)[\"\u201d]", header)
        headline_after_colon = None
        if ":" in header:
            # "<date>: <headline>" form; drop surrounding quotes if present.
            headline_after_colon = header.split(":", 1)[1].strip().strip("\"\u201c\u201d")

        date_str = date_match.group(1) if date_match else None
        headline = (
            quoted_headline.group(1)
            if quoted_headline
            else (headline_after_colon or _strip_md(header))
        )

        # First "Trading Relevance:" line — capture the rest of that line.
        rel_match = re.search(r"(?i)Trading\s+Relevance:\s*(.+)", block)
        relevance = rel_match.group(1).strip() if rel_match else ""

        sources = _extract_urls(block + " " + header)

        items.append(
            {
                "date": date_str,
                "headline": headline,
                "relevance": relevance,
                # De-duplicate while preserving first-seen order.
                "sources": list(dict.fromkeys(sources)),
                "raw": header + "\n" + block,
            }
        )
    return items
def parse_stock_news(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parses company-specific news text from get_stock_news_openai into a list of items.

    Expected patterns include bold enumerated sections like:

        **1. Topic**
        Description ... (url)

    Args:
        raw_text: Raw model output text.

    Returns:
        A list of dicts with keys: title, summary, sources, raw.
        Empty list for empty or non-string input.
    """
    if not raw_text or not isinstance(raw_text, str):
        return []

    items: List[Dict[str, Any]] = []

    # Headings like "**1. Something**" on their own line.
    headers = list(re.finditer(r"(?m)^\s*\*\*\s*\d+\.\s*(.+?)\s*\*\*\s*$", raw_text))
    if not headers:
        # Fallback: numbered lines even without bold markers.
        headers = list(re.finditer(r"(?m)^\s*\d+\.\s+(.+?)\s*$", raw_text))

    if headers:
        boundaries = [(m.start(), m.end(), m.group(1)) for m in headers]
        text_len = len(raw_text)
        for i, (_, end, header_text) in enumerate(boundaries):
            next_start = boundaries[i + 1][0] if i + 1 < len(boundaries) else text_len
            # Item body: from end of this heading to start of the next one.
            block = raw_text[end:next_start].strip()
            title = _strip_md(header_text)
            sources = _extract_urls(block + " " + title)
            items.append(
                {
                    "title": title,
                    "summary": block,
                    # De-duplicate while preserving first-seen order.
                    "sources": list(dict.fromkeys(sources)),
                    "raw": f"{title}\n{block}",
                }
            )
    else:
        # Last resort: split into paragraphs; keep any paragraph that has a
        # URL or enough text to plausibly be a news item.
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", raw_text) if p.strip()]
        for p in paragraphs:
            urls = _extract_urls(p)
            if urls or len(p) > 120:
                items.append(
                    {
                        # Strip markdown from the first line for consistency
                        # with the heading branch, then truncate.
                        "title": _strip_md(p.split("\n", 1)[0])[:80],
                        "summary": p,
                        "sources": list(dict.fromkeys(urls)),
                        "raw": p,
                    }
                )

    return items