TradingAgents/tradingagents/domains/news/article_scraper_client.py

"""
Article scraper client for extracting full content from news URLs.
"""
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import urlparse
from newspaper import Article, Config
logger = logging.getLogger(__name__)
@dataclass
class ScrapeResult:
"""Result of article scraping operation."""
status: str # 'SUCCESS', 'SCRAPE_FAILED', 'ARCHIVE_SUCCESS', 'NOT_FOUND'
content: str = ""
author: str = ""
final_url: str = ""
title: str = ""
publish_date: str = ""
class ArticleScraperClient:
    """Client for scraping article content with Internet Archive fallback."""

    def __init__(self, user_agent: str | None = None, delay: float = 1.0):
        """
        Initialize article scraper.

        Args:
            user_agent: User agent string for requests (None for default)
            delay: Delay between requests in seconds
        """
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        self.delay = delay

    def scrape_article(self, url: str) -> ScrapeResult:
        """
        Scrape article content from URL with fallback to Internet Archive.

        Args:
            url: Article URL to scrape

        Returns:
            ScrapeResult: Scraping result with content and metadata
        """
        if not url or not self._is_valid_url(url):
            return ScrapeResult(status="NOT_FOUND", final_url=url)

        # Try the original source first
        result = self._scrape_from_source(url)
        if result.status == "SUCCESS":
            return result

        # Fall back to the Internet Archive
        logger.info(f"Original scraping failed for {url}, trying Internet Archive")
        return self._scrape_from_wayback(url)

    def _scrape_from_source(self, url: str) -> ScrapeResult:
        """Scrape article from original source using newspaper4k."""
        try:
            # Add delay to be respectful
            time.sleep(self.delay)

            # Configure newspaper4k with optimizations
            config = Config()
            config.browser_user_agent = self.user_agent
            config.request_timeout = 10
            config.fetch_images = False

            article = Article(url, config=config)
            article.download()
            article.parse()

            # Validate content
            if not article.text or len(article.text.strip()) < 100:
                logger.warning(f"Article content too short or empty for {url}")
                return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

            # Handle publish_date, which can be a datetime or a string
            publish_date_str = ""
            if article.publish_date:
                if isinstance(article.publish_date, datetime):
                    publish_date_str = article.publish_date.strftime("%Y-%m-%d")
                elif isinstance(article.publish_date, str):
                    publish_date_str = article.publish_date
                else:
                    # Fall back to the string representation
                    publish_date_str = str(article.publish_date)

            return ScrapeResult(
                status="SUCCESS",
                content=article.text.strip(),
                author=", ".join(article.authors) if article.authors else "",
                final_url=url,
                title=article.title or "",
                publish_date=publish_date_str,
            )
        except Exception as e:
            logger.warning(f"Error scraping article from {url}: {e}")
            return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

    def _scrape_from_wayback(self, url: str) -> ScrapeResult:
        """Scrape article from the Internet Archive Wayback Machine."""
        try:
            import requests
        except ImportError:
            logger.error("requests not installed. Install with: pip install requests")
            return ScrapeResult(status="NOT_FOUND", final_url=url)

        try:
            # Query the Wayback Machine CDX API for snapshots of this URL
            cdx_url = "http://web.archive.org/cdx/search/cdx"
            params = {
                "url": url,
                "output": "json",
                "fl": "timestamp,original",
                "filter": "statuscode:200",
                "limit": "1",
            }
            response = requests.get(cdx_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            if len(data) < 2:  # First row is the header row
                logger.warning(f"No archived snapshots found for {url}")
                return ScrapeResult(status="NOT_FOUND", final_url=url)

            # Use the first snapshot returned (the CDX API lists the oldest capture first)
            timestamp, original_url = data[1]
            archive_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
            logger.info(f"Found archived snapshot: {archive_url}")

            # Scrape from the archive URL
            result = self._scrape_from_source(archive_url)
            if result.status == "SUCCESS":
                result.status = "ARCHIVE_SUCCESS"
                result.final_url = archive_url
            return result
        except Exception as e:
            logger.warning(f"Error accessing Internet Archive for {url}: {e}")
            return ScrapeResult(status="NOT_FOUND", final_url=url)

    def _is_valid_url(self, url: str) -> bool:
        """Check that the URL is well formed and uses an http(s) scheme."""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc) and parsed.scheme in ("http", "https")
        except Exception:
            return False

    def scrape_multiple_articles(self, urls: list[str]) -> dict[str, ScrapeResult]:
        """
        Scrape multiple articles sequentially.

        Args:
            urls: List of article URLs to scrape

        Returns:
            Dict mapping URLs to ScrapeResults
        """
        results = {}
        for i, url in enumerate(urls):
            logger.info(f"Scraping article {i + 1}/{len(urls)}: {url}")
            results[url] = self.scrape_article(url)

            # Add delay between requests
            if i < len(urls) - 1:
                time.sleep(self.delay)

        return results
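

# Usage sketch: a minimal, hedged example of driving the client from a script.
# The URL below is a placeholder, and running this makes live HTTP requests to
# the target site and, on failure, to web.archive.org.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    client = ArticleScraperClient(delay=1.0)
    demo_url = "https://example.com/some-news-article"  # placeholder URL

    # Single article: SUCCESS, ARCHIVE_SUCCESS, SCRAPE_FAILED, or NOT_FOUND
    single = client.scrape_article(demo_url)
    print(single.status, single.title, len(single.content))

    # Batch scraping returns a dict keyed by the original URLs
    batch = client.scrape_multiple_articles([demo_url])
    for url, result in batch.items():
        print(url, "->", result.status)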