# TradingAgents/tradingagents/domains/news/article_scraper_client.py
"""
Article scraper client for extracting full content from news URLs.
"""
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import urlparse
from newspaper import Article
from newspaper.configuration import Configuration
logger = logging.getLogger(__name__)
@dataclass
class ScrapeResult:
"""Result of article scraping operation."""
status: str # 'SUCCESS', 'SCRAPE_FAILED', 'PAYWALL_DETECTED', 'NOT_FOUND'
content: str = ""
author: str = ""
final_url: str = ""
title: str = ""
publish_date: str = ""
is_paywall: bool = False
keywords: list[str] | None = None # Extracted keywords from newspaper4k
summary: str = "" # Article summary from newspaper4k
class ArticleScraperClient:
"""Client for scraping article content using newspaper4k."""
def __init__(self, user_agent: str | None = None, delay: float = 1.0):
"""
Initialize article scraper.
Args:
user_agent: User agent string for requests (None for default)
delay: Delay between requests in seconds
"""
self.user_agent = user_agent or (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
self.delay = delay
# Download NLTK data for newspaper4k NLP
try:
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
except ImportError:
logger.warning("NLTK not available - NLP features will be limited")
# Common paywall indicators
self.paywall_indicators = {
"subscribe",
"subscription",
"premium",
"paywall",
"sign in to read",
"log in to continue",
"register to read",
"become a member",
"upgrade to premium",
"this article is for subscribers",
"limited free articles",
"subscribe now",
"create a free account",
"read more with subscription",
"unlock full access",
"premium content",
"subscriber exclusive",
"behind paywall",
"free trial",
}
def scrape_article(self, url: str) -> ScrapeResult:
"""
Scrape article content from URL.
Args:
url: Article URL to scrape
Returns:
ScrapeResult: Scraping result with content and metadata
"""
if not url or not self._is_valid_url(url):
return ScrapeResult(status="NOT_FOUND", final_url=url)
# Scrape from original source
return self._scrape_from_source(url)
def _scrape_from_source(self, url: str) -> ScrapeResult:
"""Scrape article from original source using newspaper4k."""
try:
# Add delay to be respectful
time.sleep(self.delay)
# Configure newspaper4k with optimizations
config = Configuration()
config.browser_user_agent = self.user_agent
config.request_timeout = 10
config.fetch_images = False
article = Article(url, config=config)
article.download()
article.parse()
article.nlp()
# Validate content and check for paywall
content = article.text.strip() if article.text else ""
is_paywall = self._detect_paywall(content, article.title or "")
if not content or len(content) < 100:
if is_paywall:
logger.info(f"Paywall detected for {url}")
return ScrapeResult(
status="PAYWALL_DETECTED",
final_url=url,
is_paywall=True,
title=article.title or "",
content=content, # Include partial content
)
else:
logger.warning(f"Article content too short or empty for {url}")
return ScrapeResult(status="SCRAPE_FAILED", final_url=url)
# Handle publish_date which can be datetime or string
publish_date_str = ""
if article.publish_date:
if isinstance(article.publish_date, datetime):
publish_date_str = article.publish_date.strftime("%Y-%m-%d")
elif isinstance(article.publish_date, str):
publish_date_str = article.publish_date
else:
# Try to convert to string
publish_date_str = str(article.publish_date)
return ScrapeResult(
status="SUCCESS",
content=content,
author=", ".join(article.authors) if article.authors else "",
final_url=url,
title=article.title or "",
publish_date=publish_date_str,
is_paywall=is_paywall,
keywords=list(article.keywords) if article.keywords else [],
summary=article.summary or "",
)
except Exception as e:
logger.warning(f"Error scraping article from {url}: {e}")
return ScrapeResult(status="SCRAPE_FAILED", final_url=url)
def _detect_paywall(self, content: str, title: str) -> bool:
"""
Detect if article is behind a paywall.
Args:
content: Article content text
title: Article title
Returns:
bool: True if paywall indicators are found
"""
if not content and not title:
return False
# Combine content and title for analysis
text_to_check = f"{title} {content}".lower()
# Check for paywall indicators
for indicator in self.paywall_indicators:
if indicator in text_to_check:
return True
# Additional heuristics
# Very short content with subscription-related words
if len(content) < 200 and any(
word in text_to_check
for word in ["subscription", "subscribe", "member", "premium"]
):
return True
# Content that ends abruptly with subscription prompts
content_end = content[-200:].lower() if len(content) > 200 else content.lower()
return any(
phrase in content_end
for phrase in ["to continue reading", "subscribe to", "become a member"]
)
def _is_valid_url(self, url: str) -> bool:
"""Check if URL is valid and accessible."""
try:
parsed = urlparse(url)
return bool(parsed.netloc) and parsed.scheme in ("http", "https")
except Exception:
return False
def scrape_multiple_articles(self, urls: list[str]) -> dict[str, ScrapeResult]:
"""
Scrape multiple articles sequentially.
Args:
urls: List of article URLs to scrape
Returns:
Dict mapping URLs to ScrapeResults
"""
results = {}
for i, url in enumerate(urls):
logger.info(f"Scraping article {i + 1}/{len(urls)}: {url}")
results[url] = self.scrape_article(url)
# Add delay between requests
if i < len(urls) - 1:
time.sleep(self.delay)
return results