rm vcrs
This commit is contained in:
parent
873ff99173
commit
4565a41600
|
|
@ -9,12 +9,27 @@ from tradingagents.domains.news.article_scraper_client import (
|
||||||
ScrapeResult,
|
ScrapeResult,
|
||||||
)
|
)
|
||||||
|
|
||||||
# VCR configuration
|
|
||||||
|
# VCR configuration optimized for minimal cassette size
|
||||||
|
def response_content_filter(response):
|
||||||
|
"""Filter response content to reduce cassette size."""
|
||||||
|
if "text/html" in response.get("headers", {}).get("content-type", [""])[0]:
|
||||||
|
# For HTML responses, keep only the first 1KB for basic structure
|
||||||
|
if "string" in response["body"]:
|
||||||
|
content = response["body"]["string"]
|
||||||
|
if len(content) > 1024:
|
||||||
|
response["body"]["string"] = (
|
||||||
|
content[:1024] + "... [TRUNCATED for test size]"
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
vcr = pytest.mark.vcr(
|
vcr = pytest.mark.vcr(
|
||||||
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
|
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
|
||||||
record_mode="once", # Record once, then replay
|
record_mode="once", # Record once, then replay
|
||||||
match_on=["uri", "method"],
|
match_on=["uri", "method"],
|
||||||
filter_headers=["authorization", "cookie", "user-agent"],
|
filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
|
||||||
|
before_record_response=response_content_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -277,81 +292,3 @@ class TestArticleScraperClient:
|
||||||
|
|
||||||
# Business Insider sometimes has paywalls
|
# Business Insider sometimes has paywalls
|
||||||
assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
|
assert result.status in ["SUCCESS", "PAYWALL_DETECTED", "SCRAPE_FAILED"]
|
||||||
|
|
||||||
|
|
||||||
class TestIntegrationScenarios:
|
|
||||||
"""Integration tests for ArticleScraperClient with real HTTP requests."""
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def scraper(self):
|
|
||||||
"""Create ArticleScraperClient instance."""
|
|
||||||
return ArticleScraperClient(delay=0.1)
|
|
||||||
|
|
||||||
@vcr
|
|
||||||
def test_multiple_major_news_sources(self, scraper):
|
|
||||||
"""Test scraping from various major news sources (recorded)."""
|
|
||||||
# Mix of generally accessible and paywalled sources
|
|
||||||
urls = [
|
|
||||||
"https://www.reuters.com/",
|
|
||||||
"https://www.cnbc.com/",
|
|
||||||
"https://www.bloomberg.com/",
|
|
||||||
"https://finance.yahoo.com/",
|
|
||||||
]
|
|
||||||
|
|
||||||
results = scraper.scrape_multiple_articles(urls)
|
|
||||||
|
|
||||||
assert len(results) == len(urls)
|
|
||||||
|
|
||||||
for url, result in results.items():
|
|
||||||
assert isinstance(result, ScrapeResult)
|
|
||||||
assert result.final_url == url
|
|
||||||
assert result.status in [
|
|
||||||
"SUCCESS",
|
|
||||||
"SCRAPE_FAILED",
|
|
||||||
"PAYWALL_DETECTED",
|
|
||||||
"NOT_FOUND",
|
|
||||||
]
|
|
||||||
|
|
||||||
@vcr
|
|
||||||
def test_financial_news_sources(self, scraper):
|
|
||||||
"""Test various financial news sources (recorded)."""
|
|
||||||
urls = [
|
|
||||||
"https://www.marketwatch.com/",
|
|
||||||
"https://www.barchart.com/",
|
|
||||||
"https://seekingalpha.com/",
|
|
||||||
"https://www.tipranks.com/",
|
|
||||||
]
|
|
||||||
|
|
||||||
results = scraper.scrape_multiple_articles(urls)
|
|
||||||
|
|
||||||
assert len(results) == len(urls)
|
|
||||||
|
|
||||||
for url, result in results.items():
|
|
||||||
assert isinstance(result, ScrapeResult)
|
|
||||||
|
|
||||||
# Different sources have different paywall policies
|
|
||||||
if "seekingalpha.com" in url and result.status == "PAYWALL_DETECTED":
|
|
||||||
assert result.is_paywall is True
|
|
||||||
elif result.status == "SUCCESS":
|
|
||||||
assert isinstance(result.content, str)
|
|
||||||
|
|
||||||
@vcr
|
|
||||||
def test_business_news_sources(self, scraper):
|
|
||||||
"""Test business news sources (recorded)."""
|
|
||||||
urls = [
|
|
||||||
"https://www.forbes.com/",
|
|
||||||
"https://www.businessinsider.com/",
|
|
||||||
"https://www.wsj.com/",
|
|
||||||
]
|
|
||||||
|
|
||||||
results = scraper.scrape_multiple_articles(urls)
|
|
||||||
|
|
||||||
assert len(results) == len(urls)
|
|
||||||
|
|
||||||
for url, result in results.items():
|
|
||||||
assert isinstance(result, ScrapeResult)
|
|
||||||
assert result.final_url == url
|
|
||||||
|
|
||||||
# WSJ is known for paywalls
|
|
||||||
if "wsj.com" in url and result.status == "PAYWALL_DETECTED":
|
|
||||||
assert result.is_paywall is True
|
|
||||||
|
|
|
||||||
|
|
@ -14,12 +14,33 @@ from tradingagents.domains.news.google_news_client import (
|
||||||
GoogleNewsClient,
|
GoogleNewsClient,
|
||||||
)
|
)
|
||||||
|
|
||||||
# VCR configuration
|
|
||||||
|
# VCR configuration optimized for minimal cassette size
|
||||||
|
def rss_content_filter(response):
|
||||||
|
"""Filter RSS content to reduce cassette size while preserving test data."""
|
||||||
|
content_type = response.get("headers", {}).get("content-type", [""])[0]
|
||||||
|
if "xml" in content_type and "string" in response["body"]:
|
||||||
|
content = response["body"]["string"]
|
||||||
|
# For RSS feeds, keep only first 5 items to reduce size
|
||||||
|
if len(content) > 5000: # Only truncate large RSS feeds
|
||||||
|
# Find closing tag of 5th item
|
||||||
|
item_count = content.count("<item>")
|
||||||
|
if item_count > 5:
|
||||||
|
# Keep RSS structure but limit to 5 items
|
||||||
|
parts = content.split("</item>")
|
||||||
|
if len(parts) > 6: # 5 items + everything after
|
||||||
|
response["body"]["string"] = (
|
||||||
|
"</item>".join(parts[:6]) + "</channel></rss>"
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
vcr = pytest.mark.vcr(
|
vcr = pytest.mark.vcr(
|
||||||
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
|
cassette_library_dir="tests/fixtures/vcr_cassettes/news",
|
||||||
record_mode="once", # Record once, then replay
|
record_mode="once", # Record once, then replay
|
||||||
match_on=["uri", "method"],
|
match_on=["uri", "method"],
|
||||||
filter_headers=["authorization", "cookie"],
|
filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
|
||||||
|
before_record_response=rss_content_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -302,69 +323,3 @@ class TestGoogleNewsClient:
|
||||||
assert articles == []
|
assert articles == []
|
||||||
# Should log warning about failed parsing
|
# Should log warning about failed parsing
|
||||||
mock_logger.warning.assert_called()
|
mock_logger.warning.assert_called()
|
||||||
|
|
||||||
|
|
||||||
class TestIntegrationScenarios:
|
|
||||||
"""Integration tests with multiple components."""
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def client(self):
|
|
||||||
"""Create GoogleNewsClient instance."""
|
|
||||||
return GoogleNewsClient()
|
|
||||||
|
|
||||||
def test_empty_feed_response(self, client):
|
|
||||||
"""Test handling of empty RSS feed."""
|
|
||||||
with patch("requests.get") as mock_get, patch("feedparser.parse") as mock_parse:
|
|
||||||
mock_response = Mock()
|
|
||||||
mock_response.content = b"""<?xml version="1.0"?>
|
|
||||||
<rss version="2.0">
|
|
||||||
<channel>
|
|
||||||
<title>Empty Feed</title>
|
|
||||||
<description>No items</description>
|
|
||||||
</channel>
|
|
||||||
</rss>"""
|
|
||||||
mock_response.raise_for_status.return_value = None
|
|
||||||
mock_get.return_value = mock_response
|
|
||||||
|
|
||||||
mock_feed = Mock()
|
|
||||||
mock_feed.bozo = False
|
|
||||||
mock_feed.entries = []
|
|
||||||
mock_parse.return_value = mock_feed
|
|
||||||
|
|
||||||
articles = client._get_rss_feed("EMPTY")
|
|
||||||
|
|
||||||
assert articles == []
|
|
||||||
assert mock_get.called
|
|
||||||
assert mock_parse.called
|
|
||||||
|
|
||||||
@vcr
|
|
||||||
def test_special_characters_in_query(self, client):
|
|
||||||
"""Test query with special characters that need URL encoding."""
|
|
||||||
# Query with spaces and special chars
|
|
||||||
articles = client.get_company_news("S&P 500")
|
|
||||||
|
|
||||||
assert isinstance(articles, list)
|
|
||||||
# Should handle URL encoding properly
|
|
||||||
|
|
||||||
def test_concurrent_category_failures(self, client):
|
|
||||||
"""Test that failures in one category don't affect others."""
|
|
||||||
successful_article = GoogleNewsArticle(
|
|
||||||
title="Success",
|
|
||||||
link="https://success.com",
|
|
||||||
published=datetime.now(timezone.utc).replace(tzinfo=None),
|
|
||||||
summary="Successful fetch",
|
|
||||||
source="GoodSource",
|
|
||||||
guid="success-1",
|
|
||||||
)
|
|
||||||
|
|
||||||
with patch.object(client, "_get_rss_feed") as mock_get_rss:
|
|
||||||
mock_get_rss.side_effect = [
|
|
||||||
Exception("Network timeout"),
|
|
||||||
[successful_article],
|
|
||||||
Exception("Parse error"),
|
|
||||||
]
|
|
||||||
|
|
||||||
articles = client.get_global_news(["fail1", "success", "fail2"])
|
|
||||||
|
|
||||||
assert len(articles) == 1
|
|
||||||
assert articles[0].title == "Success"
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue