Martin C. Richards 2025-08-17 18:15:29 +02:00
parent 873ff99173
commit 4565a41600
2 changed files with 40 additions and 148 deletions

@@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
     ScrapeResult,
 )
 
-# VCR configuration
+# VCR configuration optimized for minimal cassette size
+def response_content_filter(response):
+    """Filter response content to reduce cassette size."""
+    # vcrpy passes header names in their original case, so normalise first.
+    headers = {k.lower(): v for k, v in response.get("headers", {}).items()}
+    if "text/html" in headers.get("content-type", [""])[0]:
+        # For HTML responses, keep only the first 1KB for basic structure
+        if "string" in response["body"]:
+            content = response["body"]["string"]
+            if isinstance(content, bytes):  # raw bodies are recorded as bytes
+                content = content.decode("utf-8", errors="replace")
+            if len(content) > 1024:
+                response["body"]["string"] = (
+                    content[:1024] + "... [TRUNCATED for test size]"
+                )
+    return response
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=response_content_filter,
 )
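
For reference, `before_record_response` is the vcrpy hook that rewrites each response dict just before it is serialised into the cassette, so the truncation can be exercised without any network traffic. A minimal sketch of such a check, assuming vcrpy's usual response-dict shape (the sample HTML body is invented):

# Run a fake oversized HTML response through the filter defined above.
sample = {
    "headers": {"Content-Type": ["text/html; charset=utf-8"]},
    "body": {"string": b"<html><body>" + b"x" * 5000 + b"</body></html>"},
}
filtered = response_content_filter(sample)
body = filtered["body"]["string"]
assert body.endswith("... [TRUNCATED for test size]")
assert len(body) == 1024 + len("... [TRUNCATED for test size]")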
@@ -277,81 +292,3 @@ class TestArticleScraperClient:
         # Business Insider sometimes has paywalls
         assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
-
-
-class TestIntegrationScenarios:
-    """Integration tests for ArticleScraperClient with real HTTP requests."""
-
-    @pytest.fixture
-    def scraper(self):
-        """Create ArticleScraperClient instance."""
-        return ArticleScraperClient(delay=0.1)
-
-    @vcr
-    def test_multiple_major_news_sources(self, scraper):
-        """Test scraping from various major news sources (recorded)."""
-        # Mix of generally accessible and paywalled sources
-        urls = [
-            "https://www.reuters.com/",
-            "https://www.cnbc.com/",
-            "https://www.bloomberg.com/",
-            "https://finance.yahoo.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-            assert result.status in [
-                "SUCCESS",
-                "SCRAPE_FAILED",
-                "PAYWALL_DETECTED",
-                "NOT_FOUND",
-            ]
-
-    @vcr
-    def test_financial_news_sources(self, scraper):
-        """Test various financial news sources (recorded)."""
-        urls = [
-            "https://www.marketwatch.com/",
-            "https://www.barchart.com/",
-            "https://seekingalpha.com/",
-            "https://www.tipranks.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            # Different sources have different paywall policies
-            if "seekingalpha.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True
-            elif result.status == "SUCCESS":
-                assert isinstance(result.content, str)
-
-    @vcr
-    def test_business_news_sources(self, scraper):
-        """Test business news sources (recorded)."""
-        urls = [
-            "https://www.forbes.com/",
-            "https://www.businessinsider.com/",
-            "https://www.wsj.com/",
-        ]
-        results = scraper.scrape_multiple_articles(urls)
-        assert len(results) == len(urls)
-        for url, result in results.items():
-            assert isinstance(result, ScrapeResult)
-            assert result.final_url == url
-            # WSJ is known for paywalls
-            if "wsj.com" in url and result.status == "PAYWALL_DETECTED":
-                assert result.is_paywall is True

@@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
     GoogleNewsClient,
 )
 
-# VCR configuration
+# VCR configuration optimized for minimal cassette size
+def rss_content_filter(response):
+    """Filter RSS content to reduce cassette size while preserving test data."""
+    # vcrpy passes header names in their original case, so normalise first.
+    headers = {k.lower(): v for k, v in response.get("headers", {}).items()}
+    content_type = headers.get("content-type", [""])[0]
+    if "xml" in content_type and "string" in response["body"]:
+        content = response["body"]["string"]
+        if isinstance(content, bytes):  # raw bodies are recorded as bytes
+            content = content.decode("utf-8", errors="replace")
+        # For RSS feeds, keep only the first 5 items to reduce size
+        if len(content) > 5000 and content.count("<item>") > 5:
+            # split("</item>") yields one part per item (plus the feed tail),
+            # so rejoining the first 5 parts and re-closing the tags keeps
+            # exactly 5 complete <item> elements and well-formed XML.
+            parts = content.split("</item>")
+            response["body"]["string"] = (
+                "</item>".join(parts[:5]) + "</item></channel></rss>"
+            )
+    return response
+
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
+    before_record_response=rss_content_filter,
 )
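
The same kind of offline check works for the RSS filter. A minimal sketch, again assuming vcrpy's response-dict shape (the feed contents are made up; each item is padded so the body clears the 5000-character threshold):

# Build a fake 8-item RSS feed and confirm the filter keeps exactly 5 items.
items = "".join(
    f"<item><title>story {i}</title><description>{'x' * 900}</description></item>"
    for i in range(8)
)
sample = {
    "headers": {"Content-Type": ["application/xml"]},
    "body": {"string": ("<rss version='2.0'><channel>" + items + "</channel></rss>").encode("utf-8")},
}
filtered = rss_content_filter(sample)
assert filtered["body"]["string"].count("<item>") == 5
assert filtered["body"]["string"].endswith("</channel></rss>")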
@@ -302,69 +323,3 @@ class TestGoogleNewsClient:
             assert articles == []
             # Should log warning about failed parsing
             mock_logger.warning.assert_called()
-
-
-class TestIntegrationScenarios:
-    """Integration tests with multiple components."""
-
-    @pytest.fixture
-    def client(self):
-        """Create GoogleNewsClient instance."""
-        return GoogleNewsClient()
-
-    def test_empty_feed_response(self, client):
-        """Test handling of empty RSS feed."""
-        with patch("requests.get") as mock_get, patch("feedparser.parse") as mock_parse:
-            mock_response = Mock()
-            mock_response.content = b"""<?xml version="1.0"?>
-            <rss version="2.0">
-                <channel>
-                    <title>Empty Feed</title>
-                    <description>No items</description>
-                </channel>
-            </rss>"""
-            mock_response.raise_for_status.return_value = None
-            mock_get.return_value = mock_response
-
-            mock_feed = Mock()
-            mock_feed.bozo = False
-            mock_feed.entries = []
-            mock_parse.return_value = mock_feed
-
-            articles = client._get_rss_feed("EMPTY")
-
-            assert articles == []
-            assert mock_get.called
-            assert mock_parse.called
-
-    @vcr
-    def test_special_characters_in_query(self, client):
-        """Test query with special characters that need URL encoding."""
-        # Query with spaces and special chars
-        articles = client.get_company_news("S&P 500")
-        assert isinstance(articles, list)
-        # Should handle URL encoding properly
-
-    def test_concurrent_category_failures(self, client):
-        """Test that failures in one category don't affect others."""
-        successful_article = GoogleNewsArticle(
-            title="Success",
-            link="https://success.com",
-            published=datetime.now(timezone.utc).replace(tzinfo=None),
-            summary="Successful fetch",
-            source="GoodSource",
-            guid="success-1",
-        )
-
-        with patch.object(client, "_get_rss_feed") as mock_get_rss:
-            mock_get_rss.side_effect = [
-                Exception("Network timeout"),
-                [successful_article],
-                Exception("Parse error"),
-            ]
-
-            articles = client.get_global_news(["fail1", "success", "fail2"])
-
-            assert len(articles) == 1
-            assert articles[0].title == "Success"