# TradingAgents/tests/domains/news/test_article_scraper_client.py
"""
Tests for ArticleScraperClient using pytest-vcr for HTTP interactions.
"""
import pytest
from tradingagents.domains.news.article_scraper_client import (
ArticleScraperClient,
ScrapeResult,
)
# Shared pytest-vcr marker: records each test's HTTP traffic once into a
# cassette, then replays it on subsequent runs so tests stay deterministic.
vcr = pytest.mark.vcr(
    cassette_library_dir="tests/fixtures/vcr_cassettes/news",
    record_mode="once",  # hit the network only when no cassette exists
    match_on=["uri", "method"],
    # Strip volatile / sensitive headers before they land in the cassette.
    filter_headers=["authorization", "cookie", "user-agent"],
)
@pytest.fixture
def scraper():
    """Provide an ArticleScraperClient with a short politeness delay for fast tests."""
    return ArticleScraperClient(user_agent="Test-Agent/1.0", delay=0.1)
class TestArticleScraperClient:
    """Behavioral tests for ArticleScraperClient: offline logic plus recorded HTTP."""

    @staticmethod
    def _check_basics(result, url):
        """Shared sanity checks: result has the right type and echoes the URL back."""
        assert isinstance(result, ScrapeResult)
        assert result.final_url == url

    def test_initialization(self):
        """Constructor stores the supplied configuration; None selects defaults."""
        custom = ArticleScraperClient("Custom-Agent/1.0", delay=2.0)
        assert custom.user_agent == "Custom-Agent/1.0"
        assert custom.delay == 2.0

        defaulted = ArticleScraperClient(None)
        # Default identity is a browser-like (Chrome) user agent with a 1s delay.
        assert "Chrome" in defaulted.user_agent
        assert defaulted.delay == 1.0

    def test_is_valid_url(self, scraper):
        """_is_valid_url accepts http(s) URLs and rejects everything else."""
        for good in ("https://example.com/article", "http://example.com/article"):
            assert scraper._is_valid_url(good) is True
        for bad in ("", "not-a-url", "ftp://example.com"):
            assert scraper._is_valid_url(bad) is False

    def test_scrape_article_invalid_url(self, scraper):
        """Invalid URLs short-circuit to a NOT_FOUND result without any HTTP call."""
        for bad_url in ("", "not-a-url", "ftp://example.com"):
            outcome = scraper.scrape_article(bad_url)
            assert outcome.status == "NOT_FOUND"
            assert outcome.final_url == bad_url

    def test_scrape_result_dataclass(self):
        """ScrapeResult keeps explicit fields and defaults everything else."""
        res = ScrapeResult(status="SUCCESS", content="Test content")
        assert res.status == "SUCCESS"
        assert res.content == "Test content"
        # Fields omitted from the constructor fall back to their defaults.
        assert res.author == ""
        assert res.final_url == ""
        assert res.is_paywall is False
        assert res.keywords is None
        assert res.summary == ""

    def test_paywall_detection_logic(self, scraper):
        """_detect_paywall flags subscription language in body or title."""
        detect = scraper._detect_paywall

        # Explicit subscription prompts in the body are flagged.
        assert detect("Please subscribe to continue reading", "News Title") is True
        assert detect("This article is for subscribers only", "Title") is True
        # A subscription prompt in the title alone is enough.
        assert detect("", "Subscribe now for premium content") is True

        # Ordinary editorial content passes through.
        assert (
            detect("Regular article content without any restrictions", "Normal Title")
            is False
        )

        # Short bodies mentioning subscriptions read like teaser text.
        assert detect("Short article. Subscribe now.", "Title") is True

        # Long content trailing off into a subscribe prompt is also flagged.
        truncated = (
            "A" * 300 + " To continue reading, please subscribe to our premium service."
        )
        assert detect(truncated, "Title") is True

    @vcr
    def test_scrape_article_cnbc(self, scraper):
        """CNBC technology section — a frequent Google News source (recorded)."""
        target = "https://www.cnbc.com/technology/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED")

    @vcr
    def test_scrape_article_yahoo_finance(self, scraper):
        """Yahoo Finance front page — common in Google News results (recorded)."""
        target = "https://finance.yahoo.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED")

    @vcr
    def test_scrape_article_seeking_alpha(self, scraper):
        """Seeking Alpha — frequently paywalled financial source (recorded)."""
        target = "https://seekingalpha.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED")

    @vcr
    def test_scrape_article_tip_ranks(self, scraper):
        """TipRanks — appears in financial news feeds (recorded)."""
        target = "https://www.tipranks.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED", "PAYWALL_DETECTED")

    @vcr
    def test_scrape_article_barchart(self, scraper):
        """Barchart — financial analysis site (recorded)."""
        target = "https://www.barchart.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED")

    @vcr
    def test_scrape_multiple_financial_sites(self, scraper):
        """Batch scraping keys every result by its requested URL (recorded)."""
        targets = [
            "https://www.cnbc.com/",
            "https://finance.yahoo.com/",
            "https://www.barchart.com/",
        ]
        outcomes = scraper.scrape_multiple_articles(targets)

        assert isinstance(outcomes, dict)
        assert len(outcomes) == len(targets)
        acceptable = {"SUCCESS", "SCRAPE_FAILED", "NOT_FOUND", "PAYWALL_DETECTED"}
        for target in targets:
            assert target in outcomes
            self._check_basics(outcomes[target], target)
            assert outcomes[target].status in acceptable

    @vcr
    def test_scrape_article_with_404(self, scraper):
        """A nonexistent page degrades gracefully instead of raising (recorded)."""
        target = "https://www.cnbc.com/this-page-does-not-exist-404-error"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SCRAPE_FAILED", "NOT_FOUND")

    @vcr
    def test_scrape_article_marketwatch(self, scraper):
        """MarketWatch — occasionally access-restricted (recorded)."""
        target = "https://www.marketwatch.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED", "PAYWALL_DETECTED")

    @vcr
    def test_scrape_article_reuters(self, scraper):
        """Reuters — generally accessible major news source (recorded)."""
        target = "https://www.reuters.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED")

    @vcr
    def test_scrape_article_bloomberg(self, scraper):
        """Bloomberg — a paywall status must carry the paywall flag (recorded)."""
        target = "https://www.bloomberg.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        if outcome.status == "PAYWALL_DETECTED":
            assert outcome.is_paywall is True

    @vcr
    def test_scrape_article_wsj(self, scraper):
        """WSJ — typically behind a strong paywall (recorded)."""
        target = "https://www.wsj.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED")
        if outcome.status == "PAYWALL_DETECTED":
            assert outcome.is_paywall is True

    @vcr
    def test_scrape_article_forbes(self, scraper):
        """Forbes — business news (recorded)."""
        target = "https://www.forbes.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "SCRAPE_FAILED")

    @vcr
    def test_scrape_article_business_insider(self, scraper):
        """Business Insider — intermittently paywalled (recorded)."""
        target = "https://www.businessinsider.com/"
        outcome = scraper.scrape_article(target)
        self._check_basics(outcome, target)
        assert outcome.status in ("SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED")
class TestIntegrationScenarios:
    """Recorded end-to-end scenarios spanning several news providers at once."""

    @pytest.fixture
    def scraper(self):
        """Client with a short politeness delay, suited to multi-URL batches."""
        return ArticleScraperClient(delay=0.1)

    @vcr
    def test_multiple_major_news_sources(self, scraper):
        """Every major outlet in the batch yields a result with a known status (recorded)."""
        # Mix of generally accessible and paywalled sources.
        targets = [
            "https://www.reuters.com/",
            "https://www.cnbc.com/",
            "https://www.bloomberg.com/",
            "https://finance.yahoo.com/",
        ]
        outcomes = scraper.scrape_multiple_articles(targets)

        assert len(outcomes) == len(targets)
        allowed = {"SUCCESS", "SCRAPE_FAILED", "PAYWALL_DETECTED", "NOT_FOUND"}
        for target, outcome in outcomes.items():
            assert isinstance(outcome, ScrapeResult)
            assert outcome.final_url == target
            assert outcome.status in allowed

    @vcr
    def test_financial_news_sources(self, scraper):
        """Financial data sites scrape cleanly despite varied paywall policies (recorded)."""
        targets = [
            "https://www.marketwatch.com/",
            "https://www.barchart.com/",
            "https://seekingalpha.com/",
            "https://www.tipranks.com/",
        ]
        outcomes = scraper.scrape_multiple_articles(targets)

        assert len(outcomes) == len(targets)
        for target, outcome in outcomes.items():
            assert isinstance(outcome, ScrapeResult)
            # Seeking Alpha is the most aggressively paywalled of this set.
            if "seekingalpha.com" in target and outcome.status == "PAYWALL_DETECTED":
                assert outcome.is_paywall is True
            elif outcome.status == "SUCCESS":
                assert isinstance(outcome.content, str)

    @vcr
    def test_business_news_sources(self, scraper):
        """Business outlets return URL-keyed results; WSJ paywall is flagged (recorded)."""
        targets = [
            "https://www.forbes.com/",
            "https://www.businessinsider.com/",
            "https://www.wsj.com/",
        ]
        outcomes = scraper.scrape_multiple_articles(targets)

        assert len(outcomes) == len(targets)
        for target, outcome in outcomes.items():
            assert isinstance(outcome, ScrapeResult)
            assert outcome.final_url == target
            # WSJ is well known for its paywall.
            if "wsj.com" in target and outcome.status == "PAYWALL_DETECTED":
                assert outcome.is_paywall is True