"""
|
|
Article scraper client for extracting full content from news URLs.
|
|
"""
|
|
|
|
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import urlparse

from newspaper import Article
from newspaper.configuration import Configuration

logger = logging.getLogger(__name__)


@dataclass
class ScrapeResult:
    """Result of article scraping operation."""

    status: str  # 'SUCCESS', 'SCRAPE_FAILED', 'PAYWALL_DETECTED', 'NOT_FOUND'
    content: str = ""
    author: str = ""
    final_url: str = ""
    title: str = ""
    publish_date: str = ""
    is_paywall: bool = False
    keywords: list[str] | None = None  # Extracted keywords from newspaper4k
    summary: str = ""  # Article summary from newspaper4k
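

# Illustrative sketch of how a caller might branch on a ScrapeResult. This is
# not part of the original module; `index_article` and `queue_for_review` are
# hypothetical downstream helpers, named only to show the intended flow:
#
#   result = ArticleScraperClient().scrape_article(url)
#   if result.status == "SUCCESS":
#       index_article(result.title, result.content, result.keywords)
#   elif result.status == "PAYWALL_DETECTED":
#       queue_for_review(result.final_url)  # partial content may still be set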


class ArticleScraperClient:
    """Client for scraping article content using newspaper4k."""

    def __init__(self, user_agent: str | None = None, delay: float = 1.0):
        """
        Initialize article scraper.

        Args:
            user_agent: User agent string for requests (None for default)
            delay: Delay between requests in seconds
        """
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        self.delay = delay

        # Download the NLTK data that newspaper4k's nlp() step relies on
        # (sentence tokenizers, stopwords, and the POS tagger)
        try:
            import nltk

            nltk.download("punkt", quiet=True)
            nltk.download("punkt_tab", quiet=True)
            nltk.download("stopwords", quiet=True)
            nltk.download("averaged_perceptron_tagger", quiet=True)
        except ImportError:
            logger.warning("NLTK not available - NLP features will be limited")

        # Common paywall indicators, matched as lowercase substrings
        self.paywall_indicators = {
            "subscribe",
            "subscription",
            "premium",
            "paywall",
            "sign in to read",
            "log in to continue",
            "register to read",
            "become a member",
            "upgrade to premium",
            "this article is for subscribers",
            "limited free articles",
            "subscribe now",
            "create a free account",
            "read more with subscription",
            "unlock full access",
            "premium content",
            "subscriber exclusive",
            "behind paywall",
            "free trial",
        }

    def scrape_article(self, url: str) -> ScrapeResult:
        """
        Scrape article content from URL.

        Args:
            url: Article URL to scrape

        Returns:
            ScrapeResult: Scraping result with content and metadata
        """
        if not url or not self._is_valid_url(url):
            return ScrapeResult(status="NOT_FOUND", final_url=url)

        # Scrape from the original source
        return self._scrape_from_source(url)

    def _scrape_from_source(self, url: str) -> ScrapeResult:
        """Scrape article from original source using newspaper4k."""
        try:
            # Add delay to be respectful to the target server
            time.sleep(self.delay)

            # Configure newspaper4k: custom user agent, short timeout, and no
            # image fetching since only text is needed
            config = Configuration()
            config.browser_user_agent = self.user_agent
            config.request_timeout = 10
            config.fetch_images = False

            article = Article(url, config=config)
            article.download()
            article.parse()
            try:
                # Keyword/summary extraction is best-effort: missing NLTK
                # data should not fail an otherwise successful scrape
                article.nlp()
            except Exception as nlp_error:
                logger.debug(f"NLP step failed for {url}: {nlp_error}")

            # Validate content and check for paywall
            content = article.text.strip() if article.text else ""
            is_paywall = self._detect_paywall(content, article.title or "")

            if not content or len(content) < 100:
                if is_paywall:
                    logger.info(f"Paywall detected for {url}")
                    return ScrapeResult(
                        status="PAYWALL_DETECTED",
                        final_url=url,
                        is_paywall=True,
                        title=article.title or "",
                        content=content,  # Include any partial content
                    )
                logger.warning(f"Article content too short or empty for {url}")
                return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

            # publish_date may be a datetime, a string, or another type
            # depending on what newspaper4k extracted; normalize to a string
            publish_date_str = ""
            if article.publish_date:
                if isinstance(article.publish_date, datetime):
                    publish_date_str = article.publish_date.strftime("%Y-%m-%d")
                elif isinstance(article.publish_date, str):
                    publish_date_str = article.publish_date
                else:
                    publish_date_str = str(article.publish_date)

            return ScrapeResult(
                status="SUCCESS",
                content=content,
                author=", ".join(article.authors) if article.authors else "",
                final_url=url,
                title=article.title or "",
                publish_date=publish_date_str,
                is_paywall=is_paywall,
                keywords=list(article.keywords) if article.keywords else [],
                summary=article.summary or "",
            )

        except Exception as e:
            logger.warning(f"Error scraping article from {url}: {e}")
            return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

    def _detect_paywall(self, content: str, title: str) -> bool:
        """
        Detect if article is behind a paywall.

        Args:
            content: Article content text
            title: Article title

        Returns:
            bool: True if paywall indicators are found
        """
        if not content and not title:
            return False

        # Combine content and title for case-insensitive analysis
        text_to_check = f"{title} {content}".lower()

        # Check for known paywall indicator phrases
        for indicator in self.paywall_indicators:
            if indicator in text_to_check:
                return True

        # Additional heuristic: very short content that mentions
        # subscription-related words
        if len(content) < 200 and any(
            word in text_to_check
            for word in ["subscription", "subscribe", "member", "premium"]
        ):
            return True

        # Additional heuristic: content that ends abruptly with a
        # subscription prompt
        content_end = content[-200:].lower() if len(content) > 200 else content.lower()
        return any(
            phrase in content_end
            for phrase in ["to continue reading", "subscribe to", "become a member"]
        )

    def _is_valid_url(self, url: str) -> bool:
        """Check if URL is well-formed (http/https with a host)."""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc) and parsed.scheme in ("http", "https")
        except Exception:
            return False

    def scrape_multiple_articles(self, urls: list[str]) -> dict[str, ScrapeResult]:
        """
        Scrape multiple articles sequentially.

        Args:
            urls: List of article URLs to scrape

        Returns:
            Dict mapping URLs to ScrapeResults
        """
        results = {}

        for i, url in enumerate(urls):
            logger.info(f"Scraping article {i + 1}/{len(urls)}: {url}")
            results[url] = self.scrape_article(url)

            # Add delay between requests (scrape_article also sleeps before
            # each download, so the effective gap is about 2 * self.delay)
            if i < len(urls) - 1:
                time.sleep(self.delay)

        return results
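

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module's API. The URLs
    # below are hypothetical placeholders; any live news URLs would work,
    # assuming network access and downloadable NLTK data.
    logging.basicConfig(level=logging.INFO)

    client = ArticleScraperClient(delay=2.0)

    # Single article
    result = client.scrape_article("https://example.com/news/sample-article")
    print(f"status={result.status} title={result.title!r}")
    if result.status == "SUCCESS":
        print(f"keywords={result.keywords}")
        print(f"summary={result.summary[:120]!r}")

    # Small batch; results come back keyed by URL
    batch = client.scrape_multiple_articles(
        [
            "https://example.com/news/first",
            "https://example.com/news/second",
        ]
    )
    for url, res in batch.items():
        print(f"{url} -> {res.status}")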