diff --git a/tests/domains/news/test_article_scraper_client.py b/tests/domains/news/test_article_scraper_client.py
index d0ee94df..67ecf661 100644
--- a/tests/domains/news/test_article_scraper_client.py
+++ b/tests/domains/news/test_article_scraper_client.py
@@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
ScrapeResult,
)
-# VCR configuration
+
+# VCR configuration optimized for minimal cassette size
+def response_content_filter(response):
+ """Filter response content to reduce cassette size."""
+ if "text/html" in response.get("headers", {}).get("content-type", [""])[0]:
+ # For HTML responses, keep only the first 1KB for basic structure
+ if "string" in response["body"]:
+ content = response["body"]["string"]
+ if len(content) > 1024:
+ response["body"]["string"] = (
+ content[:1024] + "... [TRUNCATED for test size]"
+ )
+ return response
+
+
vcr = pytest.mark.vcr(
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
record_mode="once", # Record once, then replay
match_on=["uri", "method"],
- filter_headers=["authorization", "cookie", "user-agent"],
+ filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+ before_record_response=response_content_filter,
)
@@ -277,81 +292,3 @@ class TestArticleScraperClient:
# Business Insider sometimes has paywalls
assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
-
-
-class TestIntegrationScenarios:
- """Integration tests for ArticleScraperClient with real HTTP requests."""
-
- @pytest.fixture
- def scraper(self):
- """Create ArticleScraperClient instance."""
- return ArticleScraperClient(delay=0.1)
-
- @vcr
- def test_multiple_major_news_sources(self, scraper):
- """Test scraping from various major news sources (recorded)."""
- # Mix of generally accessible and paywalled sources
- urls = [
- "https://www.reuters.com/",
- "https://www.cnbc.com/",
- "https://www.bloomberg.com/",
- "https://finance.yahoo.com/",
- ]
-
- results = scraper.scrape_multiple_articles(urls)
-
- assert len(results) == len(urls)
-
- for url, result in results.items():
- assert isinstance(result, ScrapeResult)
- assert result.final_url == url
- assert result.status in [
- "SUCCESS",
- "SCRAPE_FAILED",
- "PAYWALL_DETECTED",
- "NOT_FOUND",
- ]
-
- @vcr
- def test_financial_news_sources(self, scraper):
- """Test various financial news sources (recorded)."""
- urls = [
- "https://www.marketwatch.com/",
- "https://www.barchart.com/",
- "https://seekingalpha.com/",
- "https://www.tipranks.com/",
- ]
-
- results = scraper.scrape_multiple_articles(urls)
-
- assert len(results) == len(urls)
-
- for url, result in results.items():
- assert isinstance(result, ScrapeResult)
-
- # Different sources have different paywall policies
- if "seekingalpha.com" in url and result.status == "PAYWALL_DETECTED":
- assert result.is_paywall is True
- elif result.status == "SUCCESS":
- assert isinstance(result.content, str)
-
- @vcr
- def test_business_news_sources(self, scraper):
- """Test business news sources (recorded)."""
- urls = [
- "https://www.forbes.com/",
- "https://www.businessinsider.com/",
- "https://www.wsj.com/",
- ]
-
- results = scraper.scrape_multiple_articles(urls)
-
- assert len(results) == len(urls)
-
- for url, result in results.items():
- assert isinstance(result, ScrapeResult)
- assert result.final_url == url
-
- # WSJ is known for paywalls
- if "wsj.com" in url and result.status == "PAYWALL_DETECTED":
- assert result.is_paywall is True
diff --git a/tests/domains/news/test_google_news_client.py b/tests/domains/news/test_google_news_client.py
index 057fce16..6a1ad8b2 100644
--- a/tests/domains/news/test_google_news_client.py
+++ b/tests/domains/news/test_google_news_client.py
@@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
GoogleNewsClient,
)
-# VCR configuration
+
+# VCR configuration optimized for minimal cassette size
+def rss_content_filter(response):
+ """Filter RSS content to reduce cassette size while preserving test data."""
+ content_type = response.get("headers", {}).get("content-type", [""])[0]
+ if "xml" in content_type and "string" in response["body"]:
+ content = response["body"]["string"]
+ # For RSS feeds, keep only first 5 items to reduce size
+ if len(content) > 5000: # Only truncate large RSS feeds
+ # Find closing tag of 5th item
+            item_count = content.count("<item>")
+            if item_count > 5:
+                # Keep RSS structure but limit to 5 items
+                parts = content.split("</item>")
+                if len(parts) > 6:  # 5 items + everything after
+                    response["body"]["string"] = (
+                        "</item>".join(parts[:6]) + "</item></channel></rss>"
+                    )
+ return response
+
+
vcr = pytest.mark.vcr(
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
record_mode="once", # Record once, then replay
match_on=["uri", "method"],
- filter_headers=["authorization", "cookie"],
+ filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+ before_record_response=rss_content_filter,
)
@@ -302,69 +323,3 @@ class TestGoogleNewsClient:
assert articles == []
# Should log warning about failed parsing
mock_logger.warning.assert_called()
-
-
-class TestIntegrationScenarios:
- """Integration tests with multiple components."""
-
- @pytest.fixture
- def client(self):
- """Create GoogleNewsClient instance."""
- return GoogleNewsClient()
-
- def test_empty_feed_response(self, client):
- """Test handling of empty RSS feed."""
- with patch("requests.get") as mock_get, patch("feedparser.parse") as mock_parse:
- mock_response = Mock()
-            mock_response.content = b"""<?xml version="1.0" encoding="UTF-8"?>
-            <rss version="2.0">
-                <channel>
-                    <title>Empty Feed</title>
-                    <description>No items</description>
-                </channel>
-            </rss>"""
- mock_response.raise_for_status.return_value = None
- mock_get.return_value = mock_response
-
- mock_feed = Mock()
- mock_feed.bozo = False
- mock_feed.entries = []
- mock_parse.return_value = mock_feed
-
- articles = client._get_rss_feed("EMPTY")
-
- assert articles == []
- assert mock_get.called
- assert mock_parse.called
-
- @vcr
- def test_special_characters_in_query(self, client):
- """Test query with special characters that need URL encoding."""
- # Query with spaces and special chars
- articles = client.get_company_news("S&P 500")
-
- assert isinstance(articles, list)
- # Should handle URL encoding properly
-
- def test_concurrent_category_failures(self, client):
- """Test that failures in one category don't affect others."""
- successful_article = GoogleNewsArticle(
- title="Success",
- link="https://success.com",
- published=datetime.now(timezone.utc).replace(tzinfo=None),
- summary="Successful fetch",
- source="GoodSource",
- guid="success-1",
- )
-
- with patch.object(client, "_get_rss_feed") as mock_get_rss:
- mock_get_rss.side_effect = [
- Exception("Network timeout"),
- [successful_article],
- Exception("Parse error"),
- ]
-
- articles = client.get_global_news(["fail1", "success", "fail2"])
-
- assert len(articles) == 1
- assert articles[0].title == "Success"