"""
Article scraper client for extracting full content from news URLs.

The client first tries to scrape the live page with ``newspaper`` and, when
that fails, looks up a snapshot of the page in the Internet Archive Wayback
Machine and scrapes that instead.
"""

import logging
import time
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import urlparse

import newspaper

logger = logging.getLogger(__name__)

# User agent used when the caller passes an empty/None user agent.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)

# Timeout, in seconds, applied to every outgoing HTTP request.
_REQUEST_TIMEOUT = 10

# Extracted text shorter than this (after stripping) is treated as a failure.
_MIN_CONTENT_LENGTH = 100

# Wayback Machine CDX index endpoint used to look up archived snapshots.
_WAYBACK_CDX_URL = "https://web.archive.org/cdx/search/cdx"


@dataclass
class ScrapeResult:
    """Result of article scraping operation."""

    # One of 'SUCCESS', 'SCRAPE_FAILED', 'ARCHIVE_SUCCESS', 'NOT_FOUND'.
    status: str
    # Stripped article body text; empty unless scraping succeeded.
    content: str = ""
    # Comma-separated author names, or empty when none were detected.
    author: str = ""
    # URL the result refers to (the archive URL for archived snapshots).
    final_url: str = ""
    # Article headline, or empty when none was detected.
    title: str = ""
    # Publication date as a string ("YYYY-MM-DD" for datetime values).
    publish_date: str = ""


class ArticleScraperClient:
    """Client for scraping article content with Internet Archive fallback."""

    def __init__(self, user_agent: str, delay: float = 1.0):
        """
        Initialize article scraper.

        Args:
            user_agent: User agent string for requests. A built-in desktop
                browser user agent is used when this is empty or None.
            delay: Delay between requests in seconds
        """
        self.user_agent = user_agent or _DEFAULT_USER_AGENT
        self.delay = delay

    def scrape_article(self, url: str) -> ScrapeResult:
        """
        Scrape article content from URL with fallback to Internet Archive.

        Args:
            url: Article URL to scrape

        Returns:
            ScrapeResult: Scraping result with content and metadata. The
            status is "NOT_FOUND" for a missing/invalid URL, "SUCCESS" when
            the live page was scraped, and otherwise whatever the Wayback
            Machine fallback produced.
        """
        if not url or not self._is_valid_url(url):
            # Keep final_url a string even when the caller passed None.
            return ScrapeResult(status="NOT_FOUND", final_url=url or "")

        # Try original source first
        result = self._scrape_from_source(url)
        if result.status == "SUCCESS":
            return result

        # Fallback to Internet Archive
        logger.info(
            "Original scraping failed for %s, trying Internet Archive", url
        )
        return self._scrape_from_wayback(url)

    def _scrape_from_source(self, url: str) -> ScrapeResult:
        """
        Scrape article from the given URL using newspaper.

        Sleeps for ``self.delay`` seconds before downloading. Any exception
        raised while downloading or parsing is logged and reported as a
        "SCRAPE_FAILED" result rather than propagated.

        Args:
            url: URL to download and parse

        Returns:
            ScrapeResult: "SUCCESS" with content and metadata, or
            "SCRAPE_FAILED" when the download/parse failed or the extracted
            text is shorter than the minimum content length.
        """
        try:
            # Add delay to be respectful
            time.sleep(self.delay)

            # Configure newspaper article
            article = newspaper.Article(url)
            article.config.browser_user_agent = self.user_agent
            article.config.request_timeout = _REQUEST_TIMEOUT

            # Download and parse
            article.download()
            article.parse()

            # Validate content
            text = (article.text or "").strip()
            if len(text) < _MIN_CONTENT_LENGTH:
                logger.warning(
                    "Article content too short or empty for %s", url
                )
                return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

            return ScrapeResult(
                status="SUCCESS",
                content=text,
                author=", ".join(article.authors) if article.authors else "",
                final_url=url,
                title=article.title or "",
                publish_date=self._format_publish_date(article.publish_date),
            )

        except Exception as e:
            # Best-effort boundary: a failed scrape must not abort the caller,
            # which falls back to the Internet Archive on SCRAPE_FAILED.
            logger.warning("Error scraping article from %s: %s", url, e)
            return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

    @staticmethod
    def _format_publish_date(publish_date) -> str:
        """Return the publish date as a string ("" when it is missing)."""
        if not publish_date:
            return ""
        if isinstance(publish_date, datetime):
            return publish_date.strftime("%Y-%m-%d")
        # Already a string, or some other type: fall back to its str() form.
        return str(publish_date)

    def _scrape_from_wayback(self, url: str) -> ScrapeResult:
        """
        Scrape article from Internet Archive Wayback Machine.

        Queries the CDX index for the most recent HTTP 200 snapshot of the
        URL and scrapes that snapshot.

        Args:
            url: Original article URL to look up in the archive

        Returns:
            ScrapeResult: "ARCHIVE_SUCCESS" when the snapshot was scraped,
            "SCRAPE_FAILED" when a snapshot exists but could not be scraped,
            and "NOT_FOUND" when ``requests`` is unavailable, no snapshot
            exists, or the archive lookup raised an error.
        """
        try:
            import requests
        except ImportError:
            logger.error(
                "requests not installed. Install with: pip install requests"
            )
            return ScrapeResult(status="NOT_FOUND", final_url=url)

        try:
            # Query Wayback Machine CDX API for snapshots. The CDX server
            # lists captures oldest-first, so a negative limit is needed to
            # ask for the *last* (most recent) capture.
            params = {
                "url": url,
                "output": "json",
                "fl": "timestamp,original",
                "filter": "statuscode:200",
                "limit": "-1",
            }

            response = requests.get(
                _WAYBACK_CDX_URL, params=params, timeout=_REQUEST_TIMEOUT
            )
            response.raise_for_status()

            data = response.json()
            if len(data) < 2:  # First row is headers
                logger.warning("No archived snapshots found for %s", url)
                return ScrapeResult(status="NOT_FOUND", final_url=url)

            # Get the most recent snapshot: take the final row in case the
            # server returns more than one capture for a "last N" query.
            timestamp, original_url = data[-1]
            archive_url = (
                f"https://web.archive.org/web/{timestamp}/{original_url}"
            )

            logger.info("Found archived snapshot: %s", archive_url)

            # Scrape from archive URL
            result = self._scrape_from_source(archive_url)
            if result.status == "SUCCESS":
                result.status = "ARCHIVE_SUCCESS"
                result.final_url = archive_url

            return result

        except Exception as e:
            # Best-effort boundary: network errors, bad JSON and unexpected
            # row shapes all mean "no usable snapshot".
            logger.warning(
                "Error accessing Internet Archive for %s: %s", url, e
            )
            return ScrapeResult(status="NOT_FOUND", final_url=url)

    def _is_valid_url(self, url: str) -> bool:
        """
        Check that the URL is syntactically an http(s) URL with a host.

        This is a purely syntactic check; no network request is made.
        """
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # urlparse raises for malformed netlocs and non-string input.
            return False
        return bool(parsed.netloc) and parsed.scheme in ("http", "https")

    def scrape_multiple_articles(
        self, urls: list[str]
    ) -> dict[str, ScrapeResult]:
        """
        Scrape multiple articles sequentially.

        Duplicate URLs are scraped only once, since the result mapping can
        hold a single entry per URL anyway.

        Args:
            urls: List of article URLs to scrape

        Returns:
            Dict mapping URLs to ScrapeResults
        """
        # De-duplicate while preserving first-seen order.
        unique_urls = list(dict.fromkeys(urls))
        total = len(unique_urls)

        results = {}
        for position, url in enumerate(unique_urls, start=1):
            logger.info("Scraping article %d/%d: %s", position, total, url)
            results[url] = self.scrape_article(url)

            # Add delay between requests (in addition to the per-request
            # delay applied before each download).
            if position < total:
                time.sleep(self.delay)

        return results