diff --git a/tests/domains/news/test_article_scraper_client.py b/tests/domains/news/test_article_scraper_client.py
index d0ee94df..67ecf661 100644
--- a/tests/domains/news/test_article_scraper_client.py
+++ b/tests/domains/news/test_article_scraper_client.py
@@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
     ScrapeResult,
 )
 
-# VCR configuration
+
+# VCR configuration optimized for minimal cassette size
+def response_content_filter(response):
+    """Filter response content to reduce cassette size."""
+    if "text/html" in response.get("headers", {}).get("content-type", [""])[0]:
+        # For HTML responses, keep only the first 1KB for basic structure
+        if "string" in response["body"]:
+            content = response["body"]["string"]
+            if len(content) > 1024:
+                response["body"]["string"] = (
+                    content[:1024] + "... [TRUNCATED for test size]"
+                )
+    return response
+
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=response_content_filter,
 )
 
 
@@ -277,81 +292,3 @@ class TestArticleScraperClient:
 
         # Business Insider sometimes has paywalls
         assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
-
-
-class TestIntegrationScenarios:
-    """Integration tests for ArticleScraperClient with real HTTP requests."""
-
-    @pytest.fixture
-    def scraper(self):
-        """Create ArticleScraperClient instance."""
-        return ArticleScraperClient(delay=0.1)
-
-    @vcr
-    def test_multiple_major_news_sources(self, scraper):
-        """Test scraping from various major news sources (recorded)."""
-        # Mix of generally accessible and paywalled sources
-        urls = [
-            "https://www.reuters.com/",
-            "https://www.cnbc.com/",
-            "https://www.bloomberg.com/",
-            "https://finance.yahoo.com/",
-        ]
-
-        results = scraper.scrape_multiple_articles(urls)
-
-        assert len(results) == len(urls)
-
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-            assert result.status in [
-                "SUCCESS",
-                "SCRAPE_FAILED",
-                "PAYWALL_DETECTED",
-                "NOT_FOUND",
-            ]
-
-    @vcr
-    def test_financial_news_sources(self, scraper):
-        """Test various financial news sources (recorded)."""
-        urls = [
-            "https://www.marketwatch.com/",
-            "https://www.barchart.com/",
-            "https://seekingalpha.com/",
-            "https://www.tipranks.com/",
-        ]
-
-        results = scraper.scrape_multiple_articles(urls)
-
-        assert len(results) == len(urls)
-
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-
-            # Different sources have different paywall policies
-            if "seekingalpha.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True
-            elif result.status == "SUCCESS":
-                assert isinstance(result.content, str)
-
-    @vcr
-    def test_business_news_sources(self, scraper):
-        """Test business news sources (recorded)."""
-        urls = [
-            "https://www.forbes.com/",
-            "https://www.businessinsider.com/",
-            "https://www.wsj.com/",
-        ]
-
-        results = scraper.scrape_multiple_articles(urls)
-
-        assert len(results) == len(urls)
-
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-
-            # WSJ is known for paywalls
-            if "wsj.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True
diff --git a/tests/domains/news/test_google_news_client.py b/tests/domains/news/test_google_news_client.py
index 057fce16..6a1ad8b2 100644
--- a/tests/domains/news/test_google_news_client.py
+++ b/tests/domains/news/test_google_news_client.py
@@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
     GoogleNewsClient,
 )
 
-# VCR configuration
+
+# VCR configuration optimized for minimal cassette size
+def rss_content_filter(response):
+    """Filter RSS content to reduce cassette size while preserving test data."""
+    content_type = response.get("headers", {}).get("content-type", [""])[0]
+    if "xml" in content_type and "string" in response["body"]:
+        content = response["body"]["string"]
+        # For RSS feeds, keep only first 5 items to reduce size
+        if len(content) > 5000:  # Only truncate large RSS feeds
+            # Find closing tag of 5th item
+            item_count = content.count("</item>")
+            if item_count > 5:
+                # Keep RSS structure but limit to 5 items
+                parts = content.split("</item>")
+                if len(parts) > 6:  # 5 items + everything after
+                    response["body"]["string"] = (
+                        "</item>".join(parts[:6]) + "</channel></rss>"
+                    )
+    return response
+
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=rss_content_filter,
 )
 
 
@@ -302,69 +323,3 @@
         assert articles == []
         # Should log warning about failed parsing
         mock_logger.warning.assert_called()
-
-
-class TestIntegrationScenarios:
-    """Integration tests with multiple components."""
-
-    @pytest.fixture
-    def client(self):
-        """Create GoogleNewsClient instance."""
-        return GoogleNewsClient()
-
-    def test_empty_feed_response(self, client):
-        """Test handling of empty RSS feed."""
-        with patch("requests.get") as mock_get, patch("feedparser.parse") as mock_parse:
-            mock_response = Mock()
-            mock_response.content = b"""<?xml version="1.0"?>
-            <rss version="2.0">
-            <channel>
-            <title>Empty Feed</title>
-            <description>No items</description>
-            </channel>
-            </rss>"""
-            mock_response.raise_for_status.return_value = None
-            mock_get.return_value = mock_response
-
-            mock_feed = Mock()
-            mock_feed.bozo = False
-            mock_feed.entries = []
-            mock_parse.return_value = mock_feed
-
-            articles = client._get_rss_feed("EMPTY")
-
-            assert articles == []
-            assert mock_get.called
-            assert mock_parse.called
-
-    @vcr
-    def test_special_characters_in_query(self, client):
-        """Test query with special characters that need URL encoding."""
-        # Query with spaces and special chars
-        articles = client.get_company_news("S&P 500")
-
-        assert isinstance(articles, list)
-        # Should handle URL encoding properly
-
-    def test_concurrent_category_failures(self, client):
-        """Test that failures in one category don't affect others."""
-        successful_article = GoogleNewsArticle(
-            title="Success",
-            link="https://success.com",
-            published=datetime.now(timezone.utc).replace(tzinfo=None),
-            summary="Successful fetch",
-            source="GoodSource",
-            guid="success-1",
-        )
-
-        with patch.object(client, "_get_rss_feed") as mock_get_rss:
-            mock_get_rss.side_effect = [
-                Exception("Network timeout"),
-                [successful_article],
-                Exception("Parse error"),
-            ]
-
-            articles = client.get_global_news(["fail1", "success", "fail2"])
-
-            assert len(articles) == 1
-            assert articles[0].title == "Success"
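Note on the before_record_response hooks added above: vcrpy calls the hook with a plain dict shaped like the cassette entry it is about to write (keys such as "status", "headers", and "body", where "body" carries a "string" field), and whatever the hook returns is what gets serialized. The snippet below is a rough standalone sanity check of that contract against the HTML filter; it is a sketch, not part of the change, and it assumes the test module is importable under the path shown and that the recorded header key is lowercase "content-type". Adjust both to the actual layout and recorded headers if they differ.

    # Hypothetical sanity check for the cassette-size filter (sketch only).
    # Assumes the tests package is importable; otherwise copy the function locally.
    from tests.domains.news.test_article_scraper_client import response_content_filter

    fake_response = {
        "status": {"code": 200, "message": "OK"},
        "headers": {"content-type": ["text/html; charset=utf-8"]},  # lowercase key assumed
        "body": {"string": "<html>" + "x" * 5000 + "</html>"},
    }

    filtered = response_content_filter(fake_response)

    # The recorded body should now be capped at 1 KB plus the truncation marker.
    suffix = "... [TRUNCATED for test size]"
    assert filtered["body"]["string"].endswith(suffix)
    assert len(filtered["body"]["string"]) == 1024 + len(suffix)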