"""
|
|
Article scraper client for extracting full content from news URLs.
|
|
"""
|
|
|
|
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import urlparse

from newspaper import Article
from newspaper.configuration import Configuration

logger = logging.getLogger(__name__)


@dataclass
class ScrapeResult:
    """Result of article scraping operation."""

    status: str  # 'SUCCESS', 'SCRAPE_FAILED', 'PAYWALL_DETECTED', 'NOT_FOUND'
    content: str = ""
    author: str = ""
    final_url: str = ""
    title: str = ""
    publish_date: str = ""
    is_paywall: bool = False
    keywords: list[str] | None = None  # Extracted keywords from newspaper4k
    summary: str = ""  # Article summary from newspaper4k
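

# Illustrative sketch of how a caller might branch on a ScrapeResult. This is
# not part of the original module; `index_article` and `queue_for_review` are
# hypothetical downstream helpers, named only to show the intended flow:
#
#   result = ArticleScraperClient().scrape_article(url)
#   if result.status == "SUCCESS":
#       index_article(result.title, result.content, result.keywords)
#   elif result.status == "PAYWALL_DETECTED":
#       queue_for_review(result.final_url)  # partial content may still be set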


class ArticleScraperClient:
    """Client for scraping article content using newspaper4k."""

    def __init__(self, user_agent: str | None = None, delay: float = 1.0):
        """
        Initialize article scraper.

        Args:
            user_agent: User agent string for requests (None for default)
            delay: Delay between requests in seconds
        """
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        self.delay = delay

        # Download the NLTK data that newspaper4k's nlp() step relies on
        # (sentence tokenizers, stopwords, and the POS tagger)
        try:
            import nltk

            nltk.download("punkt", quiet=True)
            nltk.download("punkt_tab", quiet=True)
            nltk.download("stopwords", quiet=True)
            nltk.download("averaged_perceptron_tagger", quiet=True)
        except ImportError:
            logger.warning("NLTK not available - NLP features will be limited")

        # Common paywall indicators, matched as lowercase substrings
        self.paywall_indicators = {
            "subscribe",
            "subscription",
            "premium",
            "paywall",
            "sign in to read",
            "log in to continue",
            "register to read",
            "become a member",
            "upgrade to premium",
            "this article is for subscribers",
            "limited free articles",
            "subscribe now",
            "create a free account",
            "read more with subscription",
            "unlock full access",
            "premium content",
            "subscriber exclusive",
            "behind paywall",
            "free trial",
        }

    def scrape_article(self, url: str) -> ScrapeResult:
        """
        Scrape article content from URL.

        Args:
            url: Article URL to scrape

        Returns:
            ScrapeResult: Scraping result with content and metadata
        """
        if not url or not self._is_valid_url(url):
            return ScrapeResult(status="NOT_FOUND", final_url=url)

        # Scrape from the original source
        return self._scrape_from_source(url)

    def _scrape_from_source(self, url: str) -> ScrapeResult:
        """Scrape article from original source using newspaper4k."""
        try:
            # Add delay to be respectful to the target server
            time.sleep(self.delay)

            # Configure newspaper4k: custom user agent, short timeout, and no
            # image fetching since only text is needed
            config = Configuration()
            config.browser_user_agent = self.user_agent
            config.request_timeout = 10
            config.fetch_images = False

            article = Article(url, config=config)
            article.download()
            article.parse()
            try:
                # Keyword/summary extraction is best-effort: missing NLTK
                # data should not fail an otherwise successful scrape
                article.nlp()
            except Exception as nlp_error:
                logger.debug(f"NLP step failed for {url}: {nlp_error}")

            # Validate content and check for paywall
            content = article.text.strip() if article.text else ""
            is_paywall = self._detect_paywall(content, article.title or "")

            if not content or len(content) < 100:
                if is_paywall:
                    logger.info(f"Paywall detected for {url}")
                    return ScrapeResult(
                        status="PAYWALL_DETECTED",
                        final_url=url,
                        is_paywall=True,
                        title=article.title or "",
                        content=content,  # Include any partial content
                    )
                logger.warning(f"Article content too short or empty for {url}")
                return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

            # publish_date may be a datetime, a string, or another type
            # depending on what newspaper4k extracted; normalize to a string
            publish_date_str = ""
            if article.publish_date:
                if isinstance(article.publish_date, datetime):
                    publish_date_str = article.publish_date.strftime("%Y-%m-%d")
                elif isinstance(article.publish_date, str):
                    publish_date_str = article.publish_date
                else:
                    publish_date_str = str(article.publish_date)

            return ScrapeResult(
                status="SUCCESS",
                content=content,
                author=", ".join(article.authors) if article.authors else "",
                final_url=url,
                title=article.title or "",
                publish_date=publish_date_str,
                is_paywall=is_paywall,
                keywords=list(article.keywords) if article.keywords else [],
                summary=article.summary or "",
            )

        except Exception as e:
            logger.warning(f"Error scraping article from {url}: {e}")
            return ScrapeResult(status="SCRAPE_FAILED", final_url=url)

    def _detect_paywall(self, content: str, title: str) -> bool:
        """
        Detect if article is behind a paywall.

        Args:
            content: Article content text
            title: Article title

        Returns:
            bool: True if paywall indicators are found
        """
        if not content and not title:
            return False

        # Combine content and title for case-insensitive analysis
        text_to_check = f"{title} {content}".lower()

        # Check for known paywall indicator phrases
        for indicator in self.paywall_indicators:
            if indicator in text_to_check:
                return True

        # Additional heuristic: very short content that mentions
        # subscription-related words
        if len(content) < 200 and any(
            word in text_to_check
            for word in ["subscription", "subscribe", "member", "premium"]
        ):
            return True

        # Additional heuristic: content that ends abruptly with a
        # subscription prompt
        content_end = content[-200:].lower() if len(content) > 200 else content.lower()
        return any(
            phrase in content_end
            for phrase in ["to continue reading", "subscribe to", "become a member"]
        )

    def _is_valid_url(self, url: str) -> bool:
        """Check if URL is well-formed (http/https with a host)."""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc) and parsed.scheme in ("http", "https")
        except Exception:
            return False

    def scrape_multiple_articles(self, urls: list[str]) -> dict[str, ScrapeResult]:
        """
        Scrape multiple articles sequentially.

        Args:
            urls: List of article URLs to scrape

        Returns:
            Dict mapping URLs to ScrapeResults
        """
        results = {}

        for i, url in enumerate(urls):
            logger.info(f"Scraping article {i + 1}/{len(urls)}: {url}")
            results[url] = self.scrape_article(url)

            # Add delay between requests (scrape_article also sleeps before
            # each download, so the effective gap is about 2 * self.delay)
            if i < len(urls) - 1:
                time.sleep(self.delay)

        return results
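

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module's API. The URLs
    # below are hypothetical placeholders; any live news URLs would work,
    # assuming network access and downloadable NLTK data.
    logging.basicConfig(level=logging.INFO)

    client = ArticleScraperClient(delay=2.0)

    # Single article
    result = client.scrape_article("https://example.com/news/sample-article")
    print(f"status={result.status} title={result.title!r}")
    if result.status == "SUCCESS":
        print(f"keywords={result.keywords}")
        print(f"summary={result.summary[:120]!r}")

    # Small batch; results come back keyed by URL
    batch = client.scrape_multiple_articles(
        [
            "https://example.com/news/first",
            "https://example.com/news/second",
        ]
    )
    for url, res in batch.items():
        print(f"{url} -> {res.status}")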