rm vcrs

2025-08-17 18:15:29 +02:00 · 2025-08-17 18:15:29 +02:00 · 4565a41600
parent 873ff99173
commit 4565a41600
2 changed files with 40 additions and 148 deletions
--- a/tests/domains/news/test_article_scraper_client.py
+++ b/tests/domains/news/test_article_scraper_client.py
@@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
    ScrapeResult,
 )
-# VCR configuration
+
 # VCR configuration optimized for minimal cassette size
 def response_content_filter(response):
    """Truncate large HTML bodies so recorded cassettes stay small.

    Written to be used as a ``before_record_response`` hook: it receives the
    response dictionary, may shorten ``response["body"]["string"]`` in place,
    and always returns that same dictionary.

    Only responses whose ``Content-Type`` header contains ``text/html`` and
    whose body is longer than 1024 characters (or bytes) are modified; they
    keep their first 1024 units followed by a truncation marker.

    Args:
        response: Mapping with optional ``"headers"`` (header name mapped to
            a list of values, or a bare string) and ``"body"`` (mapping that
            may hold the payload under ``"string"`` as ``str`` or ``bytes``).

    Returns:
        The same ``response`` object, possibly with a shortened body.
    """
    content_type = ""
    content_encoding = ""
    # HTTP header names are case-insensitive, so "Content-Type" and
    # "content-type" must both be recognised.
    for name, values in (response.get("headers") or {}).items():
        # An empty value list means "no value" rather than an IndexError.
        if isinstance(values, (list, tuple)):
            value = values[0] if values else ""
        else:
            value = values
        if isinstance(value, bytes):
            value = value.decode("latin-1")
        key = str(name).lower()
        if key == "content-type":
            content_type = str(value).lower()
        elif key == "content-encoding":
            content_encoding = str(value).lower()
    if "text/html" not in content_type:
        return response
    if content_encoding not in ("", "identity"):
        # Slicing a compressed payload would leave an undecodable body.
        return response
    body = response.get("body")
    if not isinstance(body, dict):
        return response
    content = body.get("string")
    if isinstance(content, (str, bytes)) and len(content) > 1024:
        marker = "... [TRUNCATED for test size]"
        if isinstance(content, bytes):
            # Keep the marker the same type as the body it is appended to.
            marker = marker.encode("utf-8")
        body["string"] = content[:1024] + marker
    return response
 vcr = pytest.mark.vcr(
    cassette_library_dir="tests/fixtures/vcr_cassettes/news",
    record_mode="once",  # Record once, then replay
    match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
    before_record_response=response_content_filter,
 )
@@ -277,81 +292,3 @@ class TestArticleScraperClient:
        # Business Insider sometimes has paywalls
        assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
 class TestIntegrationScenarios:
    """Multi-URL scraping scenarios for ArticleScraperClient, run under the vcr marker."""
    @pytest.fixture
    def scraper(self):
        """Provide an ArticleScraperClient built with delay=0.1."""
        return ArticleScraperClient(delay=0.1)
    @vcr
    def test_multiple_major_news_sources(self, scraper):
        """Scrape several major news home pages and check each result's shape."""
        # Every status a result is allowed to report for these sources.
        acceptable_statuses = [
            "SUCCESS",
            "SCRAPE_FAILED",
            "PAYWALL_DETECTED",
            "NOT_FOUND",
        ]
        # Mix of generally accessible and paywalled sources
        targets = [
            "https://www.reuters.com/",
            "https://www.cnbc.com/",
            "https://www.bloomberg.com/",
            "https://finance.yahoo.com/",
        ]
        scraped = scraper.scrape_multiple_articles(targets)
        assert len(scraped) == len(targets)
        for address, outcome in scraped.items():
            assert isinstance(outcome, ScrapeResult)
            assert outcome.final_url == address
            assert outcome.status in acceptable_statuses
    @vcr
    def test_financial_news_sources(self, scraper):
        """Scrape financial news home pages and check status-dependent fields."""
        targets = [
            "https://www.marketwatch.com/",
            "https://www.barchart.com/",
            "https://seekingalpha.com/",
            "https://www.tipranks.com/",
        ]
        scraped = scraper.scrape_multiple_articles(targets)
        assert len(scraped) == len(targets)
        for address, outcome in scraped.items():
            assert isinstance(outcome, ScrapeResult)
            # Different sources have different paywall policies
            paywalled_seeking_alpha = (
                "seekingalpha.com" in address
                and outcome.status == "PAYWALL_DETECTED"
            )
            if paywalled_seeking_alpha:
                assert outcome.is_paywall is True
            elif outcome.status == "SUCCESS":
                assert isinstance(outcome.content, str)
    @vcr
    def test_business_news_sources(self, scraper):
        """Scrape business news home pages and check the paywall flag for WSJ."""
        targets = [
            "https://www.forbes.com/",
            "https://www.businessinsider.com/",
            "https://www.wsj.com/",
        ]
        scraped = scraper.scrape_multiple_articles(targets)
        assert len(scraped) == len(targets)
        for address, outcome in scraped.items():
            assert isinstance(outcome, ScrapeResult)
            assert outcome.final_url == address
            # WSJ is known for paywalls
            paywalled_wsj = (
                "wsj.com" in address and outcome.status == "PAYWALL_DETECTED"
            )
            if paywalled_wsj:
                assert outcome.is_paywall is True
--- a/tests/domains/news/test_google_news_client.py
+++ b/tests/domains/news/test_google_news_client.py
@@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
    GoogleNewsClient,
 )
-# VCR configuration
+
 # VCR configuration optimized for minimal cassette size
 def rss_content_filter(response):
    """Trim large RSS bodies to their first five items before recording.

    Written to be used as a ``before_record_response`` hook: it receives the
    response dictionary, may shorten ``response["body"]["string"]`` in place,
    and always returns that same dictionary.

    A body is rewritten only when the ``Content-Type`` header mentions
    ``xml``, the body is longer than 5000 characters (or bytes), it contains
    more than five ``<item>`` opening tags, and at least six ``</item>``
    closing tags. The result is everything up to and including the fifth
    ``</item>``, followed by ``</channel></rss>`` so the document stays
    well formed.

    Args:
        response: Mapping with optional ``"headers"`` (header name mapped to
            a list of values, or a bare string) and ``"body"`` (mapping that
            may hold the payload under ``"string"`` as ``str`` or ``bytes``).

    Returns:
        The same ``response`` object, possibly with a shortened body.
    """
    content_type = ""
    # HTTP header names are case-insensitive, so "Content-Type" and
    # "content-type" must both be recognised.
    for name, values in (response.get("headers") or {}).items():
        if str(name).lower() != "content-type":
            continue
        # An empty value list means "no value" rather than an IndexError.
        if isinstance(values, (list, tuple)):
            value = values[0] if values else ""
        else:
            value = values
        if isinstance(value, bytes):
            value = value.decode("latin-1")
        content_type = str(value).lower()
    if "xml" not in content_type:
        return response
    body = response.get("body")
    if not isinstance(body, dict):
        return response
    content = body.get("string")
    # Only truncate large RSS feeds
    if not isinstance(content, (str, bytes)) or len(content) <= 5000:
        return response
    open_tag, close_tag, tail = "<item>", "</item>", "</channel></rss>"
    if isinstance(content, bytes):
        # Search and join with bytes patterns when the body is bytes.
        open_tag = open_tag.encode("utf-8")
        close_tag = close_tag.encode("utf-8")
        tail = tail.encode("utf-8")
    if content.count(open_tag) <= 5:
        return response
    parts = content.split(close_tag)
    if len(parts) > 6:  # 5 items + everything after
        # parts[:5] are the first five items minus their closing tags; the
        # fifth closing tag is re-added so no half-open sixth item is kept.
        body["string"] = close_tag.join(parts[:5]) + close_tag + tail
    return response
 vcr = pytest.mark.vcr(
    cassette_library_dir="tests/fixtures/vcr_cassettes/news",
    record_mode="once",  # Record once, then replay
    match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie"],
+    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
    before_record_response=rss_content_filter,
 )
@@ -302,69 +323,3 @@ class TestGoogleNewsClient:
                assert articles == []
                # Should log warning about failed parsing
                mock_logger.warning.assert_called()
 class TestIntegrationScenarios:
    """Scenario tests for GoogleNewsClient: empty feeds, query encoding, partial failures."""
    @pytest.fixture
    def client(self):
        """Provide a GoogleNewsClient built with no arguments."""
        return GoogleNewsClient()
    def test_empty_feed_response(self, client):
        """An RSS feed without entries yields an empty article list."""
        empty_feed_xml = b"""<?xml version="1.0"?>
                <rss version="2.0">
                    <channel>
                        <title>Empty Feed</title>
                        <description>No items</description>
                    </channel>
                </rss>"""
        http_reply = Mock(content=empty_feed_xml)
        http_reply.raise_for_status.return_value = None
        parsed_feed = Mock(bozo=False, entries=[])
        with patch("requests.get", return_value=http_reply) as fake_get:
            with patch("feedparser.parse", return_value=parsed_feed) as fake_parse:
                articles = client._get_rss_feed("EMPTY")
                assert articles == []
                # Both the HTTP fetch and the feed parser must have been used.
                assert fake_get.called
                assert fake_parse.called
    @vcr
    def test_special_characters_in_query(self, client):
        """A query with a space and an ampersand still returns a list."""
        # Query with spaces and special chars
        found = client.get_company_news("S&P 500")
        # Should handle URL encoding properly
        assert isinstance(found, list)
    def test_concurrent_category_failures(self, client):
        """Categories that raise are skipped; the one that succeeds is returned."""
        published_at = datetime.now(timezone.utc).replace(tzinfo=None)
        good_article = GoogleNewsArticle(
            title="Success",
            link="https://success.com",
            published=published_at,
            summary="Successful fetch",
            source="GoodSource",
            guid="success-1",
        )
        # One outcome per requested category, in call order.
        per_call_outcomes = [
            Exception("Network timeout"),
            [good_article],
            Exception("Parse error"),
        ]
        with patch.object(
            client, "_get_rss_feed", side_effect=per_call_outcomes
        ):
            articles = client.get_global_news(["fail1", "success", "fail2"])
            assert len(articles) == 1
            assert articles[0].title == "Success"