""" Test ArticleScraperClient with pytest-vcr for HTTP recording/replay. Following pragmatic TDD principles: - Mock HTTP boundaries with VCR cassettes - Test real business logic and data transformations - Fast, deterministic tests """ from pathlib import Path from unittest.mock import Mock, patch import pytest from tradingagents.domains.news.article_scraper_client import ( ArticleScraperClient, ScrapeResult, ) @pytest.fixture def cassette_dir(): """Directory for VCR cassettes.""" return ( Path(__file__).parent.parent.parent / "fixtures" / "vcr_cassettes" / "article_scraper" ) @pytest.fixture def scraper(): """ArticleScraperClient instance for testing.""" return ArticleScraperClient( user_agent="Test-Agent/1.0", delay=0.1, # Faster tests ) @pytest.fixture def valid_urls(): """Valid test URLs.""" return [ "https://www.reuters.com/business/finance/", "https://www.bloomberg.com/markets/stocks", "https://techcrunch.com/2024/01/15/tech-news/", ] @pytest.fixture def invalid_urls(): """Invalid test URLs.""" return [ "", "not-a-url", "http://", "https://", "ftp://example.com/file.txt", "https://non-existent-domain-123456.com/article", ] class TestArticleScraperClient: """Test ArticleScraperClient functionality.""" def test_initialization(self): """Test scraper initializes with correct configuration.""" # Test with custom user agent scraper = ArticleScraperClient("Custom-Agent/1.0", delay=2.0) assert scraper.user_agent == "Custom-Agent/1.0" assert scraper.delay == 2.0 # Test with default user agent (None/empty) scraper_default = ArticleScraperClient(None) assert "Chrome" in scraper_default.user_agent assert scraper_default.delay == 1.0 def test_is_valid_url(self, scraper): """Test URL validation logic.""" # Valid URLs assert scraper._is_valid_url("https://example.com/article") is True assert scraper._is_valid_url("http://example.com/article") is True assert scraper._is_valid_url("https://sub.domain.com/path?query=value") is True # Invalid URLs assert scraper._is_valid_url("") is False assert scraper._is_valid_url("not-a-url") is False assert scraper._is_valid_url("ftp://example.com") is False assert scraper._is_valid_url("http://") is False assert scraper._is_valid_url("https://") is False def test_scrape_article_invalid_url(self, scraper, invalid_urls): """Test scraping with invalid URLs returns NOT_FOUND.""" for url in invalid_urls: result = scraper.scrape_article(url) assert result.status == "NOT_FOUND" assert result.content == "" assert result.final_url == url class TestArticleScrapingSuccess: """Test successful article scraping scenarios.""" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_success(self, mock_article_class, mock_sleep, scraper): """Test successful article scraping with mocked newspaper4k.""" # Setup mock article mock_article = Mock() mock_article.text = "This is a long article content that is definitely over 100 characters in length and should pass the validation check." 
mock_article.title = "Test Article Title" mock_article.authors = ["John Doe", "Jane Smith"] mock_article.publish_date = "2024-01-15" mock_article.download.return_value = None mock_article.parse.return_value = None mock_article_class.return_value = mock_article # Test scraping result = scraper.scrape_article("https://example.com/article") # Verify results assert result.status == "SUCCESS" assert result.content == mock_article.text assert result.title == "Test Article Title" assert result.author == "John Doe, Jane Smith" assert result.publish_date == "2024-01-15" assert result.final_url == "https://example.com/article" # Verify newspaper4k was configured correctly mock_article_class.assert_called_once() args, kwargs = mock_article_class.call_args assert args[0] == "https://example.com/article" config = ( kwargs["config"] if "config" in kwargs else args[1] if len(args) > 1 else None ) assert config is not None assert config.browser_user_agent == "Test-Agent/1.0" assert config.request_timeout == 10 # Verify delay was applied mock_sleep.assert_called_once_with(0.1) @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_with_datetime_publish_date( self, mock_article_class, mock_sleep, scraper ): """Test successful scraping with datetime publish_date.""" from datetime import datetime mock_article = Mock() mock_article.text = "Long article content over 100 characters for testing publish date handling in the newspaper4k client." mock_article.title = "DateTime Test Article" mock_article.authors = [] mock_article.publish_date = datetime(2024, 1, 15, 14, 30, 0) mock_article_class.return_value = mock_article result = scraper.scrape_article("https://example.com/datetime-article") assert result.status == "SUCCESS" assert result.publish_date == "2024-01-15" assert result.author == "" # Empty authors list @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_short_content_fails( self, mock_article_class, mock_sleep, scraper ): """Test that articles with content under 100 chars are rejected.""" mock_article = Mock() mock_article.text = "Short content" # Under 100 characters mock_article.title = "Short Article" mock_article.authors = [] mock_article.publish_date = None mock_article_class.return_value = mock_article result = scraper.scrape_article("https://example.com/short-article") assert result.status == "SCRAPE_FAILED" assert result.content == "" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_empty_content_fails( self, mock_article_class, mock_sleep, scraper ): """Test that articles with empty content are rejected.""" mock_article = Mock() mock_article.text = "" # Empty content mock_article.title = "" mock_article.authors = [] mock_article.publish_date = None mock_article_class.return_value = mock_article result = scraper.scrape_article("https://example.com/empty-article") assert result.status == "SCRAPE_FAILED" assert result.content == "" class TestArticleScrapingFailure: """Test article scraping failure scenarios.""" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_download_exception( self, mock_article_class, mock_sleep, scraper ): """Test scraping when 
class TestArticleScrapingFailure:
    """Test article scraping failure scenarios."""

    @patch("tradingagents.domains.news.article_scraper_client.time.sleep")
    @patch("tradingagents.domains.news.article_scraper_client.Article")
    def test_scrape_article_download_exception(
        self, mock_article_class, mock_sleep, scraper
    ):
        """Test scraping when newspaper4k download fails."""
        mock_article = Mock()
        mock_article.download.side_effect = Exception("Download failed")
        mock_article_class.return_value = mock_article

        result = scraper.scrape_article("https://example.com/failing-article")

        assert result.status == "SCRAPE_FAILED"
        assert result.content == ""
        assert result.final_url == "https://example.com/failing-article"

    @patch("tradingagents.domains.news.article_scraper_client.time.sleep")
    @patch("tradingagents.domains.news.article_scraper_client.Article")
    def test_scrape_article_parse_exception(
        self, mock_article_class, mock_sleep, scraper
    ):
        """Test scraping when newspaper4k parse fails."""
        mock_article = Mock()
        mock_article.download.return_value = None
        mock_article.parse.side_effect = Exception("Parse failed")
        mock_article_class.return_value = mock_article

        result = scraper.scrape_article("https://example.com/parse-fail-article")

        assert result.status == "SCRAPE_FAILED"
        assert result.content == ""


class TestWaybackMachineFallback:
    """Test Internet Archive Wayback Machine fallback functionality."""

    @patch("tradingagents.domains.news.article_scraper_client.requests.get")
    def test_scrape_from_wayback_no_requests(self, mock_get, scraper):
        """Test Wayback fallback when requests is not available."""
        with patch(
            "builtins.__import__",
            side_effect=ImportError("No module named 'requests'"),
        ):
            result = scraper._scrape_from_wayback("https://example.com/article")

        assert result.status == "NOT_FOUND"
        assert result.final_url == "https://example.com/article"

    @patch("tradingagents.domains.news.article_scraper_client.requests.get")
    def test_scrape_from_wayback_no_snapshots(self, mock_get, scraper):
        """Test Wayback fallback when no archived snapshots exist."""
        # Mock CDX API response with only headers (no snapshots)
        mock_response = Mock()
        mock_response.json.return_value = [["timestamp", "original"]]  # Only headers
        mock_response.raise_for_status.return_value = None
        mock_get.return_value = mock_response

        result = scraper._scrape_from_wayback("https://example.com/no-archive")

        assert result.status == "NOT_FOUND"
        assert result.final_url == "https://example.com/no-archive"
mock_article.title = "Archived Article" mock_article.authors = ["Archive Author"] mock_article.publish_date = "2024-01-15" mock_article_class.return_value = mock_article result = scraper._scrape_from_wayback("https://example.com/article") assert result.status == "ARCHIVE_SUCCESS" assert result.content == mock_article.text assert result.title == "Archived Article" assert ( result.final_url == "https://web.archive.org/web/20240115120000/https://example.com/article" ) # Verify CDX API was called correctly mock_get.assert_called_with( "http://web.archive.org/cdx/search/cdx", params={ "url": "https://example.com/article", "output": "json", "fl": "timestamp,original", "filter": "statuscode:200", "limit": "1", }, timeout=10, ) @patch("tradingagents.domains.news.article_scraper_client.requests.get") def test_scrape_from_wayback_requests_exception(self, mock_get, scraper): """Test Wayback fallback when requests fails.""" mock_get.side_effect = Exception("Request timeout") result = scraper._scrape_from_wayback("https://example.com/timeout") assert result.status == "NOT_FOUND" assert result.final_url == "https://example.com/timeout" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_scrape_article_fallback_to_wayback( self, mock_article_class, mock_sleep, scraper ): """Test full workflow: source fails, fallback to Wayback succeeds.""" # First call (original source) fails # Second call (Wayback source) succeeds mock_article_fail = Mock() mock_article_fail.download.side_effect = Exception("Download failed") mock_article_success = Mock() mock_article_success.text = "Successfully scraped content from Wayback Machine with enough length to pass validation tests." mock_article_success.title = "Wayback Success" mock_article_success.authors = ["Wayback Author"] mock_article_success.publish_date = "2024-01-15" mock_article_success.download.return_value = None mock_article_success.parse.return_value = None mock_article_class.side_effect = [mock_article_fail, mock_article_success] with patch( "tradingagents.domains.news.article_scraper_client.requests.get" ) as mock_get: # Mock successful CDX API response mock_response = Mock() mock_response.json.return_value = [ ["timestamp", "original"], ["20240115120000", "https://example.com/article"], ] mock_response.raise_for_status.return_value = None mock_get.return_value = mock_response result = scraper.scrape_article("https://example.com/article") assert result.status == "ARCHIVE_SUCCESS" assert ( result.content == "Successfully scraped content from Wayback Machine with enough length to pass validation tests." 
) assert "web.archive.org" in result.final_url class TestMultipleArticles: """Test scraping multiple articles functionality.""" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") def test_scrape_multiple_articles_empty_list(self, mock_sleep, scraper): """Test scraping empty list returns empty dict.""" results = scraper.scrape_multiple_articles([]) assert results == {} mock_sleep.assert_not_called() @patch("tradingagents.domains.news.article_scraper_client.time.sleep") def test_scrape_multiple_articles_single_url(self, mock_sleep, scraper): """Test scraping single URL in list.""" urls = ["https://example.com/single"] with patch.object(scraper, "scrape_article") as mock_scrape: mock_scrape.return_value = ScrapeResult( status="SUCCESS", content="Single article content" ) results = scraper.scrape_multiple_articles(urls) assert len(results) == 1 assert results["https://example.com/single"].status == "SUCCESS" mock_scrape.assert_called_once_with("https://example.com/single") # No delay needed for single article mock_sleep.assert_not_called() @patch("tradingagents.domains.news.article_scraper_client.time.sleep") def test_scrape_multiple_articles_with_delays(self, mock_sleep, scraper): """Test scraping multiple URLs with delays between requests.""" urls = [ "https://example.com/article1", "https://example.com/article2", "https://example.com/article3", ] with patch.object(scraper, "scrape_article") as mock_scrape: mock_scrape.side_effect = [ ScrapeResult(status="SUCCESS", content="Article 1"), ScrapeResult(status="SUCCESS", content="Article 2"), ScrapeResult(status="SCRAPE_FAILED", content=""), ] results = scraper.scrape_multiple_articles(urls) assert len(results) == 3 assert results["https://example.com/article1"].status == "SUCCESS" assert results["https://example.com/article2"].status == "SUCCESS" assert results["https://example.com/article3"].status == "SCRAPE_FAILED" # Verify delay called between requests (n-1 times) assert mock_sleep.call_count == 2 mock_sleep.assert_called_with(0.1) class TestDataTransformation: """Test data transformation and edge cases.""" @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_publish_date_edge_cases(self, mock_article_class, mock_sleep, scraper): """Test various publish_date formats are handled correctly.""" from datetime import datetime test_cases = [ (None, ""), ("", ""), ("2024-01-15", "2024-01-15"), (datetime(2024, 1, 15), "2024-01-15"), (12345, "12345"), # Numeric conversion ({"year": 2024}, "{'year': 2024}"), # Dict conversion ] for pub_date, expected in test_cases: mock_article = Mock() mock_article.text = "Long enough content for validation testing with various publish date formats and edge cases." 
mock_article.title = "Date Test" mock_article.authors = [] mock_article.publish_date = pub_date mock_article_class.return_value = mock_article result = scraper.scrape_article("https://example.com/date-test") assert result.status == "SUCCESS" assert result.publish_date == expected def test_scrape_result_dataclass_defaults(self): """Test ScrapeResult dataclass has correct defaults.""" result = ScrapeResult(status="TEST") assert result.status == "TEST" assert result.content == "" assert result.author == "" assert result.final_url == "" assert result.title == "" assert result.publish_date == "" def test_scrape_result_all_fields(self): """Test ScrapeResult with all fields populated.""" result = ScrapeResult( status="SUCCESS", content="Full article content", author="Test Author", final_url="https://final.com/url", title="Test Title", publish_date="2024-01-15", ) assert result.status == "SUCCESS" assert result.content == "Full article content" assert result.author == "Test Author" assert result.final_url == "https://final.com/url" assert result.title == "Test Title" assert result.publish_date == "2024-01-15" class TestErrorHandlingAndEdgeCases: """Test error handling and edge cases.""" def test_user_agent_fallback(self): """Test user agent fallback when None or empty is provided.""" scraper_none = ArticleScraperClient(None) scraper_empty = ArticleScraperClient("") # Both should use default Chrome user agent default_ua = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" ) assert scraper_none.user_agent == default_ua assert scraper_empty.user_agent == default_ua @patch("tradingagents.domains.news.article_scraper_client.time.sleep") @patch("tradingagents.domains.news.article_scraper_client.Article") def test_config_applied_correctly(self, mock_article_class, mock_sleep): """Test that newspaper4k Config is applied with correct settings.""" scraper = ArticleScraperClient("Custom-Agent/2.0", delay=0.5) mock_article = Mock() mock_article.text = "Test content that meets minimum length requirements for successful article scraping validation." mock_article_class.return_value = mock_article scraper.scrape_article("https://example.com/config-test") # Verify Article was created with correct config mock_article_class.assert_called_once() args, kwargs = mock_article_class.call_args assert args[0] == "https://example.com/config-test" config = kwargs.get("config") or (args[1] if len(args) > 1 else None) assert config is not None assert config.browser_user_agent == "Custom-Agent/2.0" assert config.request_timeout == 10 assert config.keep_article_html is True assert config.fetch_images is False