Martin C. Richards 2025-08-17 18:15:29 +02:00
parent 873ff99173
commit 4565a41600
2 changed files with 40 additions and 148 deletions

@@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
     ScrapeResult,
 )
 
-# VCR configuration
+# VCR configuration optimized for minimal cassette size
+def response_content_filter(response):
+    """Filter response content to reduce cassette size."""
+    # vcrpy passes header names in their original case, so normalise first.
+    headers = {k.lower(): v for k, v in response.get("headers", {}).items()}
+    if "text/html" in headers.get("content-type", [""])[0]:
+        # For HTML responses, keep only the first 1KB for basic structure
+        if "string" in response["body"]:
+            content = response["body"]["string"]
+            if isinstance(content, bytes):  # raw bodies are recorded as bytes
+                content = content.decode("utf-8", errors="replace")
+            if len(content) > 1024:
+                response["body"]["string"] = (
+                    content[:1024] + "... [TRUNCATED for test size]"
+                )
+    return response
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=response_content_filter,
 )
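
For reference, `before_record_response` is the vcrpy hook that rewrites each response dict just before it is serialised into the cassette, so the truncation can be exercised without any network traffic. A minimal sketch of such a check, assuming vcrpy's usual response-dict shape (the sample HTML body is invented):

# Run a fake oversized HTML response through the filter defined above.
sample = {
    "headers": {"Content-Type": ["text/html; charset=utf-8"]},
    "body": {"string": b"<html><body>" + b"x" * 5000 + b"</body></html>"},
}
filtered = response_content_filter(sample)
body = filtered["body"]["string"]
assert body.endswith("... [TRUNCATED for test size]")
assert len(body) == 1024 + len("... [TRUNCATED for test size]")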
@@ -277,81 +292,3 @@ class TestArticleScraperClient:
         # Business Insider sometimes has paywalls
         assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
-
-
-class TestIntegrationScenarios:
-    """Integration tests for ArticleScraperClient with real HTTP requests."""
-
-    @pytest.fixture
-    def scraper(self):
-        """Create ArticleScraperClient instance."""
-        return ArticleScraperClient(delay=0.1)
-
-    @vcr
-    def test_multiple_major_news_sources(self, scraper):
-        """Test scraping from various major news sources (recorded)."""
-        # Mix of generally accessible and paywalled sources
-        urls = [
-            "https://www.reuters.com/",
-            "https://www.cnbc.com/",
-            "https://www.bloomberg.com/",
-            "https://finance.yahoo.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-            assert result.status in [
-                "SUCCESS",
-                "SCRAPE_FAILED",
-                "PAYWALL_DETECTED",
-                "NOT_FOUND",
-            ]
-
-    @vcr
-    def test_financial_news_sources(self, scraper):
-        """Test various financial news sources (recorded)."""
-        urls = [
-            "https://www.marketwatch.com/",
-            "https://www.barchart.com/",
-            "https://seekingalpha.com/",
-            "https://www.tipranks.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            # Different sources have different paywall policies
-            if "seekingalpha.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True
-            elif result.status == "SUCCESS":
-                assert isinstance(result.content, str)
-
-    @vcr
-    def test_business_news_sources(self, scraper):
-        """Test business news sources (recorded)."""
-        urls = [
-            "https://www.forbes.com/",
-            "https://www.businessinsider.com/",
-            "https://www.wsj.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-            # WSJ is known for paywalls
-            if "wsj.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True

@@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
     GoogleNewsClient,
 )
 
-# VCR configuration
+# VCR configuration optimized for minimal cassette size
+def rss_content_filter(response):
+    """Filter RSS content to reduce cassette size while preserving test data."""
+    # vcrpy passes header names in their original case, so normalise first.
+    headers = {k.lower(): v for k, v in response.get("headers", {}).items()}
+    content_type = headers.get("content-type", [""])[0]
+    if "xml" in content_type and "string" in response["body"]:
+        content = response["body"]["string"]
+        if isinstance(content, bytes):  # raw bodies are recorded as bytes
+            content = content.decode("utf-8", errors="replace")
+        # For RSS feeds, keep only the first 5 items to reduce size
+        if len(content) > 5000 and content.count("<item>") > 5:
+            # split("</item>") yields one part per item (plus the feed tail),
+            # so rejoining the first 5 parts and re-closing the tags keeps
+            # exactly 5 complete <item> elements and well-formed XML.
+            parts = content.split("</item>")
+            response["body"]["string"] = (
+                "</item>".join(parts[:5]) + "</item></channel></rss>"
+            )
+    return response
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=rss_content_filter,
 )
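
The same kind of offline check works for the RSS filter. A minimal sketch, again assuming vcrpy's response-dict shape (the feed contents are made up; each item is padded so the body clears the 5000-character threshold):

# Build a fake 8-item RSS feed and confirm the filter keeps exactly 5 items.
items = "".join(
    f"<item><title>story {i}</title><description>{'x' * 900}</description></item>"
    for i in range(8)
)
sample = {
    "headers": {"Content-Type": ["application/xml"]},
    "body": {"string": ("<rss version='2.0'><channel>" + items + "</channel></rss>").encode("utf-8")},
}
filtered = rss_content_filter(sample)
assert filtered["body"]["string"].count("<item>") == 5
assert filtered["body"]["string"].endswith("</channel></rss>")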
@@ -302,69 +323,3 @@ class TestGoogleNewsClient:
             assert articles == []
             # Should log warning about failed parsing
             mock_logger.warning.assert_called()
-
-
-class TestIntegrationScenarios:
-    """Integration tests with multiple components."""
-
-    @pytest.fixture
-    def client(self):
-        """Create GoogleNewsClient instance."""
-        return GoogleNewsClient()
-
-    def test_empty_feed_response(self, client):
-        """Test handling of empty RSS feed."""
-        with patch("requests.get") as mock_get, patch("feedparser.parse") as mock_parse:
-            mock_response = Mock()
-            mock_response.content = b"""<?xml version="1.0"?>
-            <rss version="2.0">
-                <channel>
-                    <title>Empty Feed</title>
-                    <description>No items</description>
-                </channel>
-            </rss>"""
-            mock_response.raise_for_status.return_value = None
-            mock_get.return_value = mock_response
-
-            mock_feed = Mock()
-            mock_feed.bozo = False
-            mock_feed.entries = []
-            mock_parse.return_value = mock_feed
-
-            articles = client._get_rss_feed("EMPTY")
-
-            assert articles == []
-            assert mock_get.called
-            assert mock_parse.called
-
-    @vcr
-    def test_special_characters_in_query(self, client):
-        """Test query with special characters that need URL encoding."""
-        # Query with spaces and special chars
-        articles = client.get_company_news("S&P 500")
-        assert isinstance(articles, list)
-        # Should handle URL encoding properly
-
-    def test_concurrent_category_failures(self, client):
-        """Test that failures in one category don't affect others."""
-        successful_article = GoogleNewsArticle(
-            title="Success",
-            link="https://success.com",
-            published=datetime.now(timezone.utc).replace(tzinfo=None),
-            summary="Successful fetch",
-            source="GoodSource",
-            guid="success-1",
-        )
-
-        with patch.object(client, "_get_rss_feed") as mock_get_rss:
-            mock_get_rss.side_effect = [
-                Exception("Network timeout"),
-                [successful_article],
-                Exception("Parse error"),
-            ]
-
-            articles = client.get_global_news(["fail1", "success", "fail2"])
-
-            assert len(articles) == 1
-            assert articles[0].title == "Success"