diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..044f68fe --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,23 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "Edit|MultiEdit|Write", + "hooks": [ + { + "type": "command", + "command": "mise run format" + }, + { + "type": "command", + "command": "mise run lint --fix" + }, + { + "type": "command", + "command": "mise run typecheck" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/FundamentalDataService_PRD.md b/FundamentalDataService_PRD.md deleted file mode 100644 index 11006502..00000000 --- a/FundamentalDataService_PRD.md +++ /dev/null @@ -1,289 +0,0 @@ -# Product Requirements Document: FundamentalDataService Completion - -## Overview - -Complete the `FundamentalDataService` to provide strongly-typed fundamental financial data to trading agents using a local-first data strategy with gap detection and intelligent caching. - -## Current State Analysis - -### Issues to Fix -- **CRITICAL**: Service calls `FinnhubClient` methods with string dates but client expects `date` objects -- **CRITICAL**: References non-existent `self.simfin_client` instead of `self.finnhub_client` -- Missing strongly-typed interfaces between components -- Incomplete local-first strategy implementation -- No concrete gap detection logic -- Missing error recovery for partial data - -### What Works -- ✅ `FinnhubClient` fully implemented with strict `date` object interface -- ✅ `FundamentalDataRepository` with dataclass-based storage -- ✅ `FundamentalContext` Pydantic model for agent consumption -- ✅ Basic service structure and error handling - -## Technical Requirements - -### 1. Strongly-Typed Interfaces - -#### Client → Service Interface -```python -# FinnhubClient methods (already implemented) -def get_balance_sheet(symbol: str, frequency: str, report_date: date) -> dict[str, Any] -def get_income_statement(symbol: str, frequency: str, report_date: date) -> dict[str, Any] -def get_cash_flow(symbol: str, frequency: str, report_date: date) -> dict[str, Any] -``` - -#### Service → Repository Interface -```python -# Repository methods (already implemented) -def has_data_for_period(symbol: str, start_date: str, end_date: str, frequency: str) -> bool -def get_data(symbol: str, start_date: str, end_date: str, frequency: str) -> dict[str, Any] -def store_data(symbol: str, cache_data: dict, frequency: str, overwrite: bool) -> bool -def clear_data(symbol: str, start_date: str, end_date: str, frequency: str) -> bool -``` - -#### Service → Agent Interface -```python -# Service output (already defined) -def get_context(symbol: str, start_date: str, end_date: str, frequency: str, force_refresh: bool) -> FundamentalContext -``` - -### 2. Local-First Data Strategy - -#### Flow -1. **Repository Lookup**: Check `FundamentalDataRepository.has_data_for_period()` -2. **Gap Detection**: Identify missing data periods using `detect_fundamental_gaps()` -3. **Selective Fetching**: Fetch only missing data from `FinnhubClient` -4. **Cache Updates**: Store new data via `repository.store_data()` -5. **Context Assembly**: Return validated `FundamentalContext` - -#### Gap Detection Implementation -```python -def detect_fundamental_gaps(self, symbol: str, start_date: str, end_date: str, frequency: str) -> list[str]: - """ - Returns list of report dates that need fetching. 
- - Example: If requesting quarterly from 2024-01-01 to 2024-12-31 - and cache has Q1 and Q3, returns ["2024-06-30", "2024-09-30", "2024-12-31"] - - For quarterly: Check for Q1 (Mar 31), Q2 (Jun 30), Q3 (Sep 30), Q4 (Dec 31) - For annual: Check for fiscal year ends - """ - # Implementation should: - # 1. Get existing report dates from repository - # 2. Calculate expected report dates in requested period - # 3. Return difference between expected and existing -``` - -#### Force Refresh Support -- `force_refresh=True` bypasses local data completely -- Clears existing cache before fetching fresh data -- Stores refreshed data with metadata indicating refresh - -#### Cache Invalidation Strategy -- **Fundamental data is immutable**: Once a report is filed, it doesn't change -- **No staleness checks needed**: Reports are valid indefinitely -- **Only fetch if missing**: Never re-fetch existing reports - -### 3. Date Object Conversion - -#### Service Boundary Conversion -```python -# Service receives string dates from agents -def get_context(self, symbol: str, start_date: str, end_date: str, ...) -> FundamentalContext: - # Validate date strings - try: - start_dt = date.fromisoformat(start_date) - end_dt = date.fromisoformat(end_date) - except ValueError as e: - raise ValueError(f"Invalid date format: {e}") - - # Check date order - if end_dt < start_dt: - raise ValueError(f"End date {end_date} is before start date {start_date}") - - # Use date objects when calling FinnhubClient - data = self.finnhub_client.get_balance_sheet(symbol, frequency, end_dt) -``` - -### 4. Error Recovery and Partial Data - -```python -def handle_partial_statements( - self, - balance_sheet: dict | None, - income_statement: dict | None, - cash_flow: dict | None -) -> FundamentalContext: - """ - Create context even if some statements are missing. - - - If all statements fail: Raise exception - - If some statements succeed: Return partial context - - Mark missing statements in metadata - """ - metadata = { - "has_balance_sheet": balance_sheet is not None, - "has_income_statement": income_statement is not None, - "has_cash_flow": cash_flow is not None, - "partial_data": any(s is None for s in [balance_sheet, income_statement, cash_flow]) - } - - # Convert available statements to FinancialStatement objects - # Return FundamentalContext with available data -``` - -### 5. Pydantic Validation - -#### Context Structure -```python -@dataclass -class FundamentalContext(BaseModel): - symbol: str - period: dict[str, str] # {"start": "2024-01-01", "end": "2024-01-31"} - balance_sheet: FinancialStatement | None - income_statement: FinancialStatement | None - cash_flow: FinancialStatement | None - key_ratios: dict[str, float] - metadata: dict[str, Any] - - @validator('period') - def validate_period(cls, v): - # Ensure start and end dates are present and valid - return v -``` - -## Implementation Tasks - -### Phase 1: Fix Critical Issues - -1. **Date Conversion Fix** - - Add `date.fromisoformat()` conversion in service methods - - Add date validation (format, order) - - Update all `FinnhubClient` method calls to use `date` objects - - File: `tradingagents/services/fundamental_data_service.py:153, 164, 175` - -2. **Client Reference Fix** - - Replace `self.simfin_client` with `self.finnhub_client` - - File: `tradingagents/services/fundamental_data_service.py:375` - -### Phase 2: Enhanced Local-First Strategy - -3. 
**Gap Detection Logic** - - Implement `detect_fundamental_gaps()` method - - Calculate expected report dates based on frequency - - Compare with cached data to find gaps - - Handle fiscal year variations - -4. **Partial Data Handling** - - Implement `handle_partial_statements()` method - - Continue processing if some statements succeed - - Mark missing data in metadata - - Only fail if all statements fail - -### Phase 3: Type Safety & Validation - -5. **Comprehensive Type Checking** - - Run `mise run typecheck` - must pass with 0 errors - - Validate all `date` object conversions - - Ensure Pydantic model compliance - -6. **Enhanced Testing** - - Update existing tests for new date handling - - Add gap detection test scenarios - - Test partial data scenarios - - Test force refresh behavior - - Test date validation edge cases - -## Testing Scenarios - -### Integration Tests -1. **Gap Detection** - - Test with empty cache (should fetch all) - - Test with partial cache (should fetch only missing) - - Test with complete cache (should fetch none) - -2. **Partial Data Recovery** - - Test when balance sheet API fails but others succeed - - Test when only one statement type is available - - Test when all APIs fail (should raise exception) - -3. **Date Handling** - - Test invalid date formats - - Test end_date < start_date - - Test boundary conditions (year start/end) - -4. **Force Refresh** - - Test that force_refresh=True clears cache - - Test that new data is fetched and stored - -## Success Criteria - -### Functional Requirements -- ✅ Service successfully calls `FinnhubClient` with `date` objects -- ✅ Gap detection correctly identifies missing reports -- ✅ Partial data scenarios handled gracefully -- ✅ Local-first strategy works: checks cache → identifies gaps → fetches missing → stores updates -- ✅ Returns properly validated `FundamentalContext` to agents -- ✅ Force refresh bypasses cache and refreshes data - -### Technical Requirements -- ✅ Zero type checking errors: `mise run typecheck` -- ✅ Zero linting errors: `mise run lint` -- ✅ All existing tests pass -- ✅ No runtime errors with date conversions -- ✅ Proper error messages for validation failures - -### Quality Requirements -- ✅ Strongly-typed interfaces between all components -- ✅ Comprehensive error handling and logging -- ✅ Efficient caching with minimal API calls -- ✅ Clear separation of concerns between service, client, and repository - -## Dependencies - -### Completed -- ✅ `FinnhubClient` with `date` object interface -- ✅ `FundamentalDataRepository` with dataclass storage -- ✅ `FundamentalContext` Pydantic model - -### Required -- Working `FinnhubClient` instance with valid API key -- Writable data directory for repository storage - -## Timeline - -### Immediate (Today) -- Fix critical date conversion and reference issues -- Implement basic gap detection -- Add date validation - -### Next Steps -- Implement partial data handling -- Comprehensive testing -- Integration with agent workflows - -## Acceptance Criteria - -### Must Have -1. **Type Safety**: Service passes `mise run typecheck` with zero errors -2. **Client Integration**: All `FinnhubClient` calls use `date` objects correctly -3. **Gap Detection**: Correctly identifies missing report periods -4. **Partial Data**: Service returns partial context when some statements fail -5. **Local-First**: Service checks repository before API calls -6. **Context Validation**: Returns valid `FundamentalContext` with Pydantic validation -7. 
**Error Handling**: Graceful handling of API failures and missing data - -### Should Have -1. **Cache Efficiency**: Minimal redundant API calls -2. **Force Refresh**: Complete cache bypass when requested -3. **Data Quality**: Metadata indicating data completeness -4. **Clear Error Messages**: Informative errors for date validation failures - -### Nice to Have -1. **Performance Metrics**: Timing and cache hit rate logging -2. **Fiscal Year Handling**: Support for non-calendar fiscal years -3. **Bulk Operations**: Fetch multiple symbols efficiently - ---- - -This PRD focuses on completing the `FundamentalDataService` as a strongly-typed, local-first data service that seamlessly integrates with the existing `FinnhubClient` and `FundamentalDataRepository` components while providing robust gap detection and partial data handling. diff --git a/MarketDataService_PRD.md b/MarketDataService_PRD.md deleted file mode 100644 index 6917d096..00000000 --- a/MarketDataService_PRD.md +++ /dev/null @@ -1,502 +0,0 @@ -# Product Requirements Document: MarketDataService Completion - -## Overview - -Complete the `MarketDataService` to provide strongly-typed market data and technical indicators to trading agents using a local-first data strategy with gap detection and intelligent caching. - -## Current State Analysis - -### Issues to Fix -- **CRITICAL**: Service uses `BaseClient` inheritance but `YFinanceClient` exists and needs refactoring to FinnhubClient standard -- **CRITICAL**: Service calls client methods with string dates instead of date objects -- **CRITICAL**: Need to integrate `stockstats` library for technical analysis calculations instead of legacy utils -- **CRITICAL**: `MarketDataRepository` exists but missing service interface methods -- Missing strongly-typed interface between YFinanceClient and service -- YFinanceClient uses BaseClient inheritance and string dates (needs refactoring) -- No concrete gap detection logic -- Missing technical indicator data sufficiency validation - -### What Works -- ✅ Local-first data strategy implementation (`_get_price_data_local_first`) -- ✅ Force refresh logic (`_fetch_and_cache_fresh_data`) -- ✅ `MarketDataContext` Pydantic model for agent consumption -- ✅ Error handling and metadata creation patterns -- ✅ `YFinanceClient` exists with yfinance SDK integration and comprehensive methods -- ✅ `MarketDataRepository` exists with CSV storage and pandas DataFrame operations -- ✅ Service structure ready for `stockstats` integration for technical analysis - -## Technical Requirements - -### 1. 
Strongly-Typed Interfaces - -#### Client → Service Interface -```python -# YFinanceClient methods (to be refactored) -def get_historical_data(symbol: str, start_date: date, end_date: date) -> dict[str, Any] -def get_price_data(symbol: str, start_date: date, end_date: date) -> dict[str, Any] - -# Technical analysis handled in service layer using stockstats -# No get_technical_indicator method needed in client - calculated from OHLCV data -``` - -#### Service → Repository Interface -```python -# MarketDataRepository methods (to be implemented) -def has_data_for_period(symbol: str, start_date: str, end_date: str) -> bool -def get_data(symbol: str, start_date: str, end_date: str) -> dict[str, Any] -def store_data(symbol: str, cache_data: dict, overwrite: bool) -> bool -def clear_data(symbol: str, start_date: str, end_date: str) -> bool -``` - -#### Service → Agent Interface -```python -# Service output (already defined) -def get_context(symbol: str, start_date: str, end_date: str, indicators: list[str], force_refresh: bool) -> MarketDataContext -``` - -### 2. Local-First Data Strategy - -#### Flow -1. **Repository Lookup**: Check `MarketDataRepository.has_data_for_period()` -2. **Gap Detection**: Identify missing price data periods using `detect_market_gaps()` -3. **Data Sufficiency Check**: Ensure enough historical data for requested indicators -4. **Selective Fetching**: Fetch only missing data from `YFinanceClient` -5. **Cache Updates**: Store new data via `repository.store_data()` -6. **Context Assembly**: Return validated `MarketDataContext` - -#### Gap Detection Implementation -```python -def detect_market_gaps(self, cached_dates: list[str], requested_start: str, requested_end: str) -> list[tuple[str, str]]: - """ - Returns list of (start, end) tuples for missing periods. - - Example: If requesting 2024-01-01 to 2024-01-31 and cache has: - - 2024-01-01 to 2024-01-10 - - 2024-01-20 to 2024-01-25 - Returns: [("2024-01-11", "2024-01-19"), ("2024-01-26", "2024-01-31")] - - Accounts for: - - Weekends (Saturday/Sunday) - - Market holidays - - Continuous date ranges to minimize API calls - """ - # Implementation should use pandas business day logic -``` - -#### Force Refresh Support -- `force_refresh=True` bypasses local data completely -- Clears existing cache before fetching fresh data -- Stores refreshed data with metadata indicating refresh - -#### Cache Invalidation Strategy -- **Historical data is immutable**: Data older than yesterday never changes -- **Today's data needs updates**: During market hours, refresh every 15 minutes -- **After market close**: Today's data becomes immutable -```python -def is_data_stale(self, data_date: date, last_updated: datetime) -> bool: - today = date.today() - if data_date < today: - return False # Historical data never stale - - # For today's data, check if market is open and last update > 15 min - if is_market_open() and (datetime.now() - last_updated).minutes > 15: - return True - return False -``` - -### 3. Date Object Conversion - -#### Service Boundary Conversion -```python -# Service receives string dates from agents -def get_context(self, symbol: str, start_date: str, end_date: str, ...) 
-> MarketDataContext: - # Validate date strings - try: - start_dt = date.fromisoformat(start_date) - end_dt = date.fromisoformat(end_date) - except ValueError as e: - raise ValueError(f"Invalid date format: {e}") - - # Check date order - if end_dt < start_dt: - raise ValueError(f"End date {end_date} is before start date {start_date}") - - # Expand date range for technical indicators - expanded_start = self._calculate_lookback_start(start_dt, indicators) - - # Use date objects when calling YFinanceClient - price_data = self.yfinance_client.get_historical_data(symbol, expanded_start, end_dt) - - # Calculate technical indicators using stockstats library - technical_indicators = self._calculate_technical_indicators(price_data, indicators) -``` - -### 4. Technical Analysis with Stockstats - -#### Data Sufficiency Validation -```python -# Minimum data points required for each indicator -INDICATOR_REQUIREMENTS = { - "sma_20": 20, - "sma_200": 200, - "ema_12": 24, # 2x for exponential smoothing - "ema_200": 400, - "rsi_14": 28, # 2x period for warm-up - "macd": 34, # 26 + 8 for signal line - "bb_upper": 20, # Based on 20-period SMA - "atr_14": 28, # 2x period for accuracy - "stochrsi_14": 42, # 3x period for double smoothing -} - -def _calculate_lookback_start(self, start_date: date, indicators: list[str]) -> date: - """Calculate how far back we need data to compute indicators accurately.""" - max_lookback = 0 - for indicator in indicators: - lookback = INDICATOR_REQUIREMENTS.get(indicator, 0) - max_lookback = max(max_lookback, lookback) - - # Add buffer for weekends/holidays - business_days_back = max_lookback * 1.5 - return start_date - timedelta(days=int(business_days_back)) - -def _validate_data_sufficiency(self, data_points: int, indicators: list[str]) -> dict[str, bool]: - """Check if we have enough data for each indicator.""" - return { - indicator: data_points >= INDICATOR_REQUIREMENTS.get(indicator, 0) - for indicator in indicators - } -``` - -#### Stockstats Integration -```python -def _calculate_technical_indicators(self, price_data: list[dict], indicators: list[str]) -> dict[str, list[dict]]: - """ - Calculate technical indicators using stockstats library. - - Args: - price_data: OHLCV data from YFinanceClient - indicators: List of requested indicators (e.g., ['rsi_14', 'macd', 'bb_upper', 'sma_20']) - - Returns: - Dict mapping indicator names to time series data - """ - import pandas as pd - from stockstats import StockDataFrame - - # Convert price data to pandas DataFrame - df = pd.DataFrame(price_data) - df['date'] = pd.to_datetime(df['date']) - df.set_index('date', inplace=True) - - # Check data sufficiency - sufficiency = self._validate_data_sufficiency(len(df), indicators) - - # Create StockDataFrame for technical analysis - sdf = StockDataFrame.retype(df) - - # Calculate requested indicators - indicator_data = {} - for indicator in indicators: - if not sufficiency[indicator]: - logger.warning(f"Insufficient data for {indicator}, need {INDICATOR_REQUIREMENTS[indicator]} points") - indicator_data[indicator] = [] - continue - - try: - if indicator in sdf.columns: - values = sdf[indicator].dropna() - indicator_data[indicator] = [ - {"date": idx.strftime("%Y-%m-%d"), "value": float(val)} - for idx, val in values.items() - ] - except Exception as e: - logger.warning(f"Failed to calculate {indicator}: {e}") - indicator_data[indicator] = [] - - return indicator_data -``` - -### 5. 
Error Recovery and Partial Data - -```python -def handle_partial_price_data( - self, - requested_start: str, - requested_end: str, - available_data: list[dict] -) -> MarketDataContext: - """ - Handle cases where only partial date range is available. - - - If no data available: Raise exception - - If partial data: Return what's available with metadata - - Mark gaps in metadata - """ - if not available_data: - raise ValueError(f"No market data available for {symbol}") - - actual_start = min(d['date'] for d in available_data) - actual_end = max(d['date'] for d in available_data) - - metadata = { - "requested_period": {"start": requested_start, "end": requested_end}, - "actual_period": {"start": actual_start, "end": actual_end}, - "partial_data": actual_start > requested_start or actual_end < requested_end, - "data_points": len(available_data) - } - - # Return context with available data and metadata -``` - -### 6. Pydantic Validation - -#### Context Structure -```python -@dataclass -class MarketDataContext(BaseModel): - symbol: str - period: dict[str, str] # {"start": "2024-01-01", "end": "2024-01-31"} - price_data: list[dict[str, Any]] # OHLCV records - technical_indicators: dict[str, list[TechnicalIndicatorData]] - metadata: dict[str, Any] - - @validator('price_data') - def validate_price_data(cls, v): - # Ensure OHLCV fields present and valid - required_fields = {'date', 'open', 'high', 'low', 'close', 'volume'} - for record in v: - if not all(field in record for field in required_fields): - raise ValueError(f"Missing required OHLCV fields") - return v -``` - -## Implementation Tasks - -### Phase 1: Refactor YFinanceClient - -1. **YFinanceClient Refactoring** - - **Refactor existing** `tradingagents/clients/yfinance_client.py` - - Remove BaseClient inheritance - - Update all method signatures to accept `date` objects instead of strings - - Keep all existing functionality intact - - Example changes: - ```python - # Current (wrong) - def get_historical_data(self, symbol: str, start_date: str, end_date: str) -> dict[str, Any]: - - # Updated (correct) - def get_historical_data(self, symbol: str, start_date: date, end_date: date) -> dict[str, Any]: - ``` - -2. **Comprehensive Testing** - - Update `tradingagents/clients/test_yfinance_client.py` - - Test with date objects - - Use pytest-vcr for HTTP interaction recording - - Test error handling and edge cases - -### Phase 2: Update MarketDataRepository - -3. **Repository Interface Enhancement** - - Update existing `tradingagents/repositories/market_data_repository.py` - - Add missing service interface methods: `has_data_for_period()`, `get_data()`, `store_data()`, `clear_data()` - - Maintain existing CSV/pandas functionality while adding service compatibility - - Support gap detection and partial data scenarios - -### Phase 3: Update MarketDataService - -4. **Client Integration Fix** - - Replace `BaseClient` dependency with `YFinanceClient` - - File: `tradingagents/services/market_data_service.py:8, 26` - - Update constructor to accept `yfinance_client: YFinanceClient` - -5. **Date Conversion and Validation** - - Add `date.fromisoformat()` conversion in service methods - - Add date validation (format, order) - - Update client calls to use date objects instead of strings - - File: `tradingagents/services/market_data_service.py:151, 227` - -6. 
**Technical Indicator Integration with Stockstats** - - Implement `_calculate_technical_indicators()` method using `stockstats` library - - Add `_calculate_lookback_start()` for data sufficiency - - Add `_validate_data_sufficiency()` to check if enough data - - Replace legacy `StockstatsUtils` integration with direct stockstats usage - - File: `tradingagents/services/market_data_service.py:9, 43, 280-346` - -### Phase 4: Type Safety & Validation - -7. **Comprehensive Type Checking** - - Run `mise run typecheck` - must pass with 0 errors - - Validate all date object conversions - - Ensure MarketDataContext compliance - -8. **Enhanced Testing** - - Update existing service tests for new YFinanceClient interface - - Add gap detection test scenarios - - Test technical indicator data sufficiency - - Test partial data handling - -## Testing Scenarios - -### Integration Tests - -1. **Gap Detection** - - Test with empty cache (should fetch all) - - Test with partial cache (should fetch only missing periods) - - Test weekend/holiday handling - -2. **Technical Indicator Sufficiency** - - Test SMA_200 with only 100 days of data (should skip indicator) - - Test RSI_14 with exactly 28 days (should calculate) - - Test mixed indicators with varying data requirements - -3. **Partial Data Recovery** - - Test when API returns less data than requested - - Test when some dates are missing (holidays) - - Test metadata accuracy for partial data - -4. **Date Handling** - - Test invalid date formats - - Test end_date < start_date - - Test future dates - - Test weekend date handling - -5. **Cache Staleness** - - Test historical data (should never refresh) - - Test today's data during market hours (should refresh if > 15 min) - - Test today's data after market close (should not refresh) - -## Success Criteria - -### Functional Requirements -- ✅ Service successfully calls refactored `YFinanceClient` with `date` objects -- ✅ Gap detection correctly identifies missing trading days -- ✅ Technical indicators validate data sufficiency before calculation -- ✅ Partial data scenarios handled gracefully -- ✅ Local-first strategy works: checks cache → identifies gaps → fetches missing → stores updates -- ✅ Returns properly validated `MarketDataContext` to agents -- ✅ Technical indicators calculated from OHLCV data using stockstats library -- ✅ Force refresh bypasses cache and refreshes data - -### Technical Requirements -- ✅ Zero type checking errors: `mise run typecheck` -- ✅ Zero linting errors: `mise run lint` -- ✅ All existing tests pass with updated architecture -- ✅ No runtime errors with date conversions -- ✅ Proper error messages for validation failures - -### Quality Requirements -- ✅ Strongly-typed interfaces between all components -- ✅ Official yfinance SDK and stockstats library usage -- ✅ Comprehensive error handling and logging -- ✅ Efficient caching with minimal API calls -- ✅ Clear separation of concerns between service, client, and repository - -## Data Architecture - -### YFinanceClient Response Format -```python -{ - "symbol": "AAPL", - "period": {"start": "2024-01-01", "end": "2024-01-31"}, - "data": [ - { - "date": "2024-01-02", # Note: Jan 1 was a holiday - "open": 150.0, - "high": 155.0, - "low": 149.0, - "close": 154.0, - "volume": 1000000, - "adj_close": 154.0 - }, - ... 
- ], - "metadata": { - "source": "yfinance", - "retrieved_at": "2024-01-31T10:00:00Z", - "data_quality": "HIGH", - "missing_dates": ["2024-01-01", "2024-01-15"] # Holidays - } -} -``` - -### Technical Indicator Data Format -```python -# MarketDataContext.technical_indicators structure -{ - "rsi_14": [ - {"date": "2024-01-29", "value": 65.5}, # First valid after 28 days - {"date": "2024-01-30", "value": 67.2}, - ... - ], - "sma_200": [], # Empty if insufficient data - "macd": [ - {"date": "2024-01-31", "value": {"macd": 2.1, "signal": 1.8, "histogram": 0.3}} - ], - "_metadata": { - "indicators_calculated": ["rsi_14", "macd"], - "indicators_skipped": { - "sma_200": "Insufficient data: need 200 points, have 31" - } - } -} -``` - -## Dependencies - -### Existing Components (Need Updates) -- ✅ `YFinanceClient` exists but needs refactoring (remove BaseClient, use date objects) -- ✅ `MarketDataRepository` exists with CSV storage but needs service interface methods -- ✅ Tests exist but need updates for new interfaces - -### Required -- Official `yfinance` library for market data fetching -- `stockstats` library for technical analysis calculations -- `pandas` for date/time handling and business day calculations -- Working internet connection for live data fetching -- Writable data directory for repository storage - -## Timeline - -### Immediate (Phase 1) -- Refactor existing YFinanceClient to use date objects -- Remove BaseClient inheritance -- Update tests for new interface - -### Phase 2-3 -- Add service interface methods to MarketDataRepository -- Update MarketDataService to use refactored YFinanceClient -- Implement data sufficiency validation -- Integrate stockstats library for technical indicators - -### Phase 4 -- Comprehensive type checking and validation -- Integration testing with gap detection -- Performance optimization and caching efficiency - -## Acceptance Criteria - -### Must Have -1. **Type Safety**: Service passes `mise run typecheck` with zero errors -2. **Client Refactoring**: YFinanceClient uses date objects, no BaseClient -3. **Gap Detection**: Correctly identifies missing trading days -4. **Data Sufficiency**: Validates enough data for technical indicators -5. **Partial Data**: Service handles incomplete data gracefully -6. **Local-First**: Service checks repository before API calls -7. **Context Validation**: Returns valid `MarketDataContext` with Pydantic validation -8. **Technical Indicators**: Calculated using stockstats with proper validation - -### Should Have -1. **Cache Efficiency**: Minimal redundant API calls to Yahoo Finance -2. **Force Refresh**: Complete cache bypass when requested -3. **Stale Data Handling**: Refresh today's data during market hours -4. **Clear Error Messages**: Informative errors for validation failures - -### Nice to Have -1. **Performance Metrics**: Timing and cache hit rate logging -2. **Extended Indicators**: Support for 50+ technical indicators -3. **Real-time Data**: WebSocket integration for live prices -4. **Bulk Symbol Support**: Fetch multiple symbols efficiently - ---- - -This PRD focuses on completing the `MarketDataService` as a strongly-typed, local-first data service that integrates OHLCV price data from a refactored `YFinanceClient` and calculates comprehensive technical indicators using the `stockstats` library, with robust gap detection and data sufficiency validation. 
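The `detect_market_gaps()` outline above ends at its docstring; a minimal sketch of the pandas business-day logic it calls for might look like the following (names are illustrative, and holiday handling is deliberately omitted — a production version would also subtract an exchange holiday calendar):

```python
import pandas as pd


def detect_market_gaps(
    cached_dates: list[str], requested_start: str, requested_end: str
) -> list[tuple[str, str]]:
    """Group missing business days into contiguous (start, end) ranges."""
    expected = pd.bdate_range(requested_start, requested_end)  # weekends excluded
    cached = {pd.Timestamp(d) for d in cached_dates}

    gaps: list[tuple[str, str]] = []
    run: list[pd.Timestamp] = []
    for day in expected:
        if day in cached:
            if run:  # a run of missing days just ended; close it out
                gaps.append((run[0].strftime("%Y-%m-%d"), run[-1].strftime("%Y-%m-%d")))
                run = []
        else:
            run.append(day)
    if run:  # trailing gap reaching the end of the requested window
        gaps.append((run[0].strftime("%Y-%m-%d"), run[-1].strftime("%Y-%m-%d")))
    return gaps
```

With the docstring's example inputs (a cache covering 2024-01-01 to 2024-01-10 and 2024-01-20 to 2024-01-25), this returns [("2024-01-11", "2024-01-19"), ("2024-01-26", "2024-01-31")]. Note that exchange holidays such as 2024-01-15 are still counted as expected trading days here, which is why a real market calendar is needed in production.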
diff --git a/NewsService_PRD.md b/NewsService_PRD.md deleted file mode 100644 index c9eaa6f5..00000000 --- a/NewsService_PRD.md +++ /dev/null @@ -1,779 +0,0 @@ -# Product Requirements Document: NewsService Completion - -## Overview - -Complete the `NewsService` to provide strongly-typed news data and sentiment analysis to trading agents using a local-first data strategy with RSS feed integration, article content extraction, and LLM-powered sentiment analysis. - -## Current State Analysis - -### Issues to Fix -- **CRITICAL**: Service is currently empty placeholder with only method stubs -- **CRITICAL**: Need to implement GoogleNewsClient to read RSS feeds -- **CRITICAL**: Need RSS article fetching with fallback to Internet Archive -- **CRITICAL**: Need LLM-powered sentiment analysis integration -- **CRITICAL**: Service uses `BaseClient` inheritance instead of typed clients -- **CRITICAL**: `NewsRepository` has different interface than service expectations -- Missing strongly-typed interfaces between components -- No concrete approach for article content extraction - -### What Works -- ✅ `NewsContext` and `ArticleData` Pydantic models for agent consumption -- ✅ `SentimentScore` model for structured sentiment data -- ✅ `FinnhubClient` with `get_company_news()` method using date objects -- ✅ `NewsRepository` with dataclass-based storage and deduplication -- ✅ Service structure placeholder ready for implementation - -## Technical Requirements - -### 1. Strongly-Typed Interfaces - -#### Client → Service Interface -```python -# FinnhubClient methods (already implemented) -def get_company_news(symbol: str, start_date: date, end_date: date) -> dict[str, Any] - -# GoogleNewsClient methods (to be implemented) -def fetch_rss_feed(query: str, start_date: date, end_date: date) -> dict[str, Any] -def fetch_article_content(url: str, use_archive_fallback: bool = True) -> dict[str, Any] -def get_company_news(symbol: str, start_date: date, end_date: date) -> dict[str, Any] -def get_global_news(start_date: date, end_date: date, categories: list[str]) -> dict[str, Any] -``` - -#### Service → Repository Interface -```python -# NewsRepository methods (to be implemented/bridged) -def has_data_for_period(query: str, start_date: str, end_date: str, symbol: str | None) -> bool -def get_data(query: str, start_date: str, end_date: str, symbol: str | None) -> dict[str, Any] -def store_data(query: str, cache_data: dict, symbol: str | None, overwrite: bool) -> bool -def clear_data(query: str, start_date: str, end_date: str, symbol: str | None) -> bool -``` - -#### Service → Agent Interface -```python -# Service output (already defined) -def get_context(query: str, start_date: str, end_date: str, symbol: str | None, sources: list[str], force_refresh: bool) -> NewsContext -``` - -### 2. Local-First Data Strategy - -#### Flow -1. **Repository Lookup**: Check `NewsRepository.has_data_for_period()` -2. **Freshness Check**: Determine if cache needs updating (news is append-only) -3. **RSS Feed Fetching**: Fetch RSS feeds from Google News -4. **Content Extraction**: Extract full article content with Internet Archive fallback -5. **LLM Analysis**: Perform sentiment analysis using LLM -6. **Cache Updates**: Store enriched articles via `repository.store_data()` -7. **Context Assembly**: Return validated `NewsContext` - -#### News-Specific Gap Detection -```python -def should_fetch_new_articles(self, last_fetch_time: datetime, current_time: datetime) -> bool: - """ - News doesn't have "gaps" - it's append-only. 
Check if enough time passed for new articles. - - Returns True if: - - Last fetch was more than 6 hours ago - - User requested force_refresh - - No data exists for the query/period - """ - if not last_fetch_time: - return True - - hours_since_fetch = (current_time - last_fetch_time).total_seconds() / 3600 - return hours_since_fetch >= 6 # Fetch new articles every 6 hours -``` - -#### Force Refresh Support -- `force_refresh=True` fetches all articles fresh from sources -- Does NOT clear existing cache (news is immutable) -- Deduplicates against existing articles before storing - -#### Cache Invalidation Strategy -- **Articles are immutable**: Once published, articles don't change -- **Cache grows append-only**: New articles are added, old ones retained -- **Freshness check**: Re-fetch every 6 hours for new articles -- **No deletion**: Articles are never removed from cache - -### 3. RSS Feed Processing & Article Fetching - -#### GoogleNewsClient RSS Implementation -```python -import feedparser -from newspaper import Article -import requests -from datetime import date, datetime -from typing import Any, Optional - -class GoogleNewsClient: - """Google News RSS client following FinnhubClient standard.""" - - def __init__(self): - self.base_rss_url = "https://news.google.com/rss" - self.archive_base_url = "https://archive.org/wayback/available" - - def fetch_rss_feed(self, query: str, start_date: date, end_date: date) -> dict[str, Any]: - """ - Fetch RSS feed data for news articles. - - Args: - query: Search query or company symbol - start_date: Start date for filtering articles - end_date: End date for filtering articles - - Returns: - Dict containing RSS feed articles with metadata - """ - # Construct RSS feed URL - rss_url = f"{self.base_rss_url}/search?q={query}&hl=en-US&gl=US&ceid=US:en" - - # Parse RSS feed - feed = feedparser.parse(rss_url) - - # Filter and structure articles - articles = [] - for entry in feed.entries: - # Parse publication date - pub_date = datetime(*entry.published_parsed[:6]).date() - - # Filter by date range - if start_date <= pub_date <= end_date: - articles.append({ - "headline": entry.title, - "url": entry.link, - "source": entry.source.get('title', 'Google News'), - "date": pub_date.isoformat(), - "summary": entry.get('summary', ''), - }) - - return { - "query": query, - "period": {"start": start_date.isoformat(), "end": end_date.isoformat()}, - "articles": articles, - "metadata": { - "source": "google_news_rss", - "rss_feed_url": rss_url, - "article_count": len(articles) - } - } - - def fetch_article_content(self, url: str, use_archive_fallback: bool = True) -> dict[str, Any]: - """ - Fetch full article content from URL with Internet Archive fallback. 
- - Args: - url: Article URL to fetch - use_archive_fallback: Whether to try Internet Archive if direct fetch fails - - Returns: - Dict containing article content, title, publication date - """ - try: - # Try direct fetch - article = Article(url) - article.download() - article.parse() - - return { - "content": article.text, - "title": article.title, - "authors": article.authors, - "publish_date": article.publish_date.isoformat() if article.publish_date else None, - "extracted_via": "direct_fetch", - "extraction_success": True - } - - except Exception as e: - if use_archive_fallback: - # Try Internet Archive - archive_url = self._get_archive_url(url) - if archive_url: - try: - article = Article(archive_url) - article.download() - article.parse() - - return { - "content": article.text, - "title": article.title, - "authors": article.authors, - "publish_date": article.publish_date.isoformat() if article.publish_date else None, - "extracted_via": "internet_archive", - "extraction_success": True - } - except Exception: - pass - - # Return failure - return { - "content": "", - "title": "", - "extracted_via": "failed", - "extraction_success": False, - "error": str(e) - } - - def _get_archive_url(self, url: str) -> Optional[str]: - """Get Internet Archive URL for a given URL.""" - try: - response = requests.get(f"{self.archive_base_url}?url={url}") - data = response.json() - if data.get("archived_snapshots", {}).get("closest", {}).get("available"): - return data["archived_snapshots"]["closest"]["url"] - except Exception: - pass - return None -``` - -### 4. LLM-Powered Sentiment Analysis - -#### Sentiment Analysis Integration -```python -class LLMSentimentAnalyzer: - """LLM-based sentiment analyzer for financial news.""" - - def __init__(self, llm_client): - self.llm_client = llm_client - self.sentiment_prompt = """ - Analyze the sentiment of this financial news article for trading purposes. - - Article: - Title: {headline} - Content: {content} - - Provide your analysis in the following JSON format: - {{ - "score": , - "confidence": , - "label": <"positive", "negative", or "neutral">, - "reasoning": , - "key_themes": , - "financial_entities": - }} - - Focus on the financial and market implications of the news. - """ - - def analyze_sentiment(self, article: ArticleData) -> SentimentScore: - """ - Analyze article sentiment using LLM. - - Args: - article: Article data with headline and content - - Returns: - SentimentScore with score, confidence, and label - """ - # Prepare prompt - prompt = self.sentiment_prompt.format( - headline=article.headline, - content=article.content[:2000] # Limit content length - ) - - # Get LLM response - response = self.llm_client.complete(prompt) - - # Parse response - try: - result = json.loads(response) - - # Convert to SentimentScore - score = result.get("score", 0.0) - return SentimentScore( - positive=max(0, score), - negative=abs(min(0, score)), - neutral=1.0 - abs(score), - metadata={ - "confidence": result.get("confidence", 0.5), - "label": result.get("label", "neutral"), - "reasoning": result.get("reasoning", ""), - "key_themes": result.get("key_themes", []), - "financial_entities": result.get("financial_entities", []) - } - ) - except Exception as e: - # Return neutral sentiment on error - return SentimentScore( - positive=0.0, - negative=0.0, - neutral=1.0, - metadata={"error": str(e)} - ) - - def batch_analyze(self, articles: list[ArticleData], batch_size: int = 5) -> list[SentimentScore]: - """ - Batch process sentiment analysis for multiple articles. 
- - Args: - articles: List of articles to analyze - batch_size: Number of articles to process in parallel - - Returns: - List of sentiment scores corresponding to input articles - """ - results = [] - - for i in range(0, len(articles), batch_size): - batch = articles[i:i + batch_size] - - # Process batch (could be parallelized) - for article in batch: - sentiment = self.analyze_sentiment(article) - results.append(sentiment) - - # Add small delay to respect rate limits - time.sleep(0.1) - - return results -``` - -### 5. Date Object Conversion - -#### Service Boundary Conversion -```python -# Service receives string dates from agents -def get_context(self, query: str, start_date: str, end_date: str, ...) -> NewsContext: - # Validate date strings - try: - start_dt = date.fromisoformat(start_date) - end_dt = date.fromisoformat(end_date) - except ValueError as e: - raise ValueError(f"Invalid date format: {e}") - - # Check date order - if end_dt < start_dt: - raise ValueError(f"End date {end_date} is before start date {start_date}") - - # Fetch from multiple sources - finnhub_data = self.finnhub_client.get_company_news(symbol, start_dt, end_dt) if symbol else None - google_rss = self.google_client.fetch_rss_feed(query, start_dt, end_dt) - - # Fetch full article content for RSS articles - for article in google_rss.get('articles', []): - content_data = self.google_client.fetch_article_content(article['url']) - article.update(content_data) - - # Combine all articles - all_articles = self._combine_and_deduplicate(finnhub_data, google_rss) - - # Perform LLM sentiment analysis - enriched_articles = [] - for article in all_articles: - article_data = ArticleData(**article) - article_data.sentiment = self.sentiment_analyzer.analyze_sentiment(article_data) - enriched_articles.append(article_data) - - # Create and return context - return self._create_news_context(enriched_articles, start_date, end_date) -``` - -### 6. Error Recovery and Partial Data - -```python -def handle_source_failure( - self, - finnhub_data: dict | None, - google_data: dict | None, - errors: dict[str, Exception] -) -> NewsContext: - """ - Handle cases where one or more news sources fail. 
- - - If all sources fail: Raise exception - - If some sources succeed: Return partial data with metadata - - Track content extraction failures separately - """ - if not finnhub_data and not google_data: - raise ValueError("All news sources failed to return data") - - # Track extraction statistics - extraction_stats = { - "total_articles": 0, - "successful_extractions": 0, - "archive_fallbacks": 0, - "failed_extractions": 0 - } - - # Process available articles - all_articles = [] - successful_sources = [] - - if finnhub_data: - all_articles.extend(finnhub_data.get('articles', [])) - successful_sources.append('finnhub') - - if google_data: - articles = google_data.get('articles', []) - for article in articles: - extraction_stats["total_articles"] += 1 - if article.get("extraction_success"): - extraction_stats["successful_extractions"] += 1 - if article.get("extracted_via") == "internet_archive": - extraction_stats["archive_fallbacks"] += 1 - else: - extraction_stats["failed_extractions"] += 1 - - all_articles.extend(articles) - successful_sources.append('google_news') - - metadata = { - "sources_requested": ["finnhub", "google_news"], - "sources_successful": successful_sources, - "sources_failed": {source: str(error) for source, error in errors.items()}, - "extraction_stats": extraction_stats, - "partial_data": len(successful_sources) < 2 - } - - # Deduplicate and return context - return self._create_context(all_articles, metadata) -``` - -### 7. Repository Method Bridging - -```python -# Add these bridge methods to NewsRepository -def has_data_for_period(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> bool: - """Bridge to existing get_news_data method.""" - existing_data = self.get_news_data( - symbol=symbol or query, - start_date=start_date, - end_date=end_date - ) - return len(existing_data.get('articles', [])) > 0 - -def get_data(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> dict[str, Any]: - """Bridge to existing get_news_data method.""" - return self.get_news_data( - symbol=symbol or query, - start_date=start_date, - end_date=end_date - ) - -def store_data(self, query: str, cache_data: dict, symbol: str | None = None, overwrite: bool = False) -> bool: - """Bridge to existing store_news_articles method.""" - articles = cache_data.get('articles', []) - if not articles: - return False - - # Convert to expected format - news_articles = [ - NewsArticle( - symbol=symbol or query, - headline=a['headline'], - summary=a.get('summary', ''), - content=a.get('content', ''), - url=a['url'], - source=a['source'], - date=a['date'], - entities=a.get('entities', []), - sentiment_score=a.get('sentiment', {}).get('score', 0.0), - sentiment_metadata=a.get('sentiment', {}) - ) - for a in articles - ] - - return self.store_news_articles(news_articles) - -def clear_data(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> bool: - """News is append-only, so this just marks data as stale for re-fetch.""" - # Implementation depends on repository design - # Could update metadata to trigger re-fetch - return True -``` - -### 8. 
Pydantic Validation - -#### Context Structure -```python -@dataclass -class NewsContext(BaseModel): - symbol: str | None - period: dict[str, str] # {"start": "2024-01-01", "end": "2024-01-31"} - articles: list[ArticleData] - sentiment_summary: SentimentScore - article_count: int - sources: list[str] - metadata: dict[str, Any] - - @validator('period') - def validate_period(cls, v): - # Ensure start and end dates are present and valid - if 'start' not in v or 'end' not in v: - raise ValueError("Period must have 'start' and 'end' dates") - return v - - @validator('articles') - def validate_articles(cls, v): - # Ensure no duplicate URLs - urls = [a.url for a in v] - if len(urls) != len(set(urls)): - raise ValueError("Duplicate articles detected") - return v -``` - -## Implementation Tasks - -### Phase 1: Create GoogleNewsClient - -1. **GoogleNewsClient Implementation** - - Create `tradingagents/clients/google_news_client.py` following FinnhubClient standard - - Implement RSS feed parsing using `feedparser` library - - Add `fetch_rss_feed()` method with Google News RSS integration - - Add `fetch_article_content()` method with `newspaper3k` and Internet Archive fallback - - Use `date` objects for all date parameters - - No BaseClient inheritance - -2. **Article Content Extraction** - - Implement robust article content extraction using `newspaper3k` - - Add fallback to Internet Archive Wayback Machine for failed fetches - - Handle paywall detection and alternative content sources - - Extract clean text, title, publication date, and metadata - -3. **Comprehensive Testing** - - Create test suite for GoogleNewsClient - - Test RSS parsing with various queries - - Test content extraction with real and archived URLs - - Use pytest-vcr for HTTP interaction recording - -### Phase 2: Bridge NewsRepository Interface - -4. **Repository Interface Standardization** - - Add standard service interface methods to `NewsRepository` - - Bridge existing methods without changing underlying storage - - File: `tradingagents/repositories/news_repository.py` - - Maintain backward compatibility - -### Phase 3: Implement NewsService - -5. **Service Core Implementation** - - Replace method stubs with full implementation - - Implement `get_context()`, `get_company_news_context()`, `get_global_news_context()` - - Add local-first data strategy with freshness checking - - Replace `BaseClient` dependencies with typed clients - - File: `tradingagents/services/news_service.py` - -6. **LLM Sentiment Analysis Integration** - - Implement `LLMSentimentAnalyzer` class - - Create financial news sentiment prompts - - Add batch processing for efficiency - - Handle LLM rate limiting and errors - -7. **Date Conversion and Article Processing** - - Add date validation and conversion - - Implement RSS article fetching pipeline - - Add content extraction with fallback - - Combine articles from multiple sources - - Implement deduplication by URL - -### Phase 4: Type Safety & Validation - -8. **Comprehensive Type Checking** - - Run `mise run typecheck` - must pass with 0 errors - - Validate all date object conversions - - Ensure NewsContext compliance - -9. **Enhanced Testing** - - Test RSS feed parsing edge cases - - Test content extraction failures and fallbacks - - Test LLM sentiment analysis with various article types - - Test multi-source aggregation and deduplication - -## Testing Scenarios - -### Integration Tests - -1. 
**RSS Feed Processing** - - Test with various search queries - - Test date filtering in RSS results - - Test handling of malformed RSS feeds - -2. **Content Extraction** - - Test direct fetch success - - Test Internet Archive fallback - - Test paywall detection - - Test extraction failure handling - -3. **LLM Sentiment Analysis** - - Test positive news sentiment - - Test negative earnings reports - - Test neutral market updates - - Test batch processing - - Test LLM error handling - -4. **Multi-Source Aggregation** - - Test both sources succeed - - Test Finnhub fails, Google succeeds - - Test Google fails, Finnhub succeeds - - Test both sources fail - -5. **Date Handling** - - Test invalid date formats - - Test end_date < start_date - - Test date filtering in RSS feeds - -## Success Criteria - -### Functional Requirements -- ✅ Service successfully implements all placeholder methods -- ✅ GoogleNewsClient reads and parses RSS feeds correctly -- ✅ Article content extraction works with Internet Archive fallback -- ✅ LLM sentiment analysis provides structured financial sentiment -- ✅ Local-first strategy with proper freshness checking -- ✅ Multi-source aggregation with deduplication -- ✅ Returns properly validated `NewsContext` to agents -- ✅ Force refresh fetches fresh articles without clearing cache - -### Technical Requirements -- ✅ Zero type checking errors: `mise run typecheck` -- ✅ Zero linting errors: `mise run lint` -- ✅ All tests pass with new implementation -- ✅ No runtime errors with date conversions -- ✅ Proper error messages for validation failures - -### Quality Requirements -- ✅ Strongly-typed interfaces between all components -- ✅ RSS feed parsing with robust error handling -- ✅ Article content extraction with fallback strategy -- ✅ LLM integration with proper prompt engineering -- ✅ Efficient caching with minimal external calls -- ✅ Clear separation of concerns - -## Data Architecture - -### GoogleNewsClient RSS Response Format -```python -{ - "query": "Apple stock", - "period": {"start": "2024-01-01", "end": "2024-01-31"}, - "articles": [ - { - "headline": "Apple Stock Soars on New Product Launch", - "summary": "Brief summary from RSS feed...", - "content": "Full article text extracted from source...", - "url": "https://www.cnbc.com/2024/01/20/apple-stock.html", - "source": "CNBC", - "date": "2024-01-20", - "authors": ["Tech Reporter"], - "publish_date": "2024-01-20T14:30:00Z", - "extracted_via": "direct_fetch", # or "internet_archive" - "extraction_success": true - } - ], - "metadata": { - "source": "google_news_rss", - "article_count": 25, - "rss_feed_url": "https://news.google.com/rss/search?q=Apple+stock", - "extraction_stats": { - "successful": 22, - "archive_fallback": 2, - "failed": 3 - } - } -} -``` - -### LLM Sentiment Analysis Response Format -```python -{ - "article_url": "https://www.cnbc.com/2024/01/20/apple-stock.html", - "sentiment": { - "positive": 0.7, - "negative": 0.1, - "neutral": 0.2, - "metadata": { - "score": 0.7, - "confidence": 0.85, - "label": "positive", - "reasoning": "Article discusses positive earnings and growth outlook", - "key_themes": ["earnings_beat", "product_launch", "revenue_growth"], - "financial_entities": ["AAPL", "Apple Inc.", "iPhone 15"] - } - } -} -``` - -### Aggregate Sentiment Summary -```python -{ - "sentiment_summary": { - "positive": 0.65, # Average across all articles - "negative": 0.20, - "neutral": 0.15, - "metadata": { - "dominant_sentiment": "positive", - "confidence": 0.82, - "article_count": 25, - "themes": { - 
"earnings": 8, - "product_launch": 5, - "market_analysis": 12 - } - } - } -} -``` - -## Dependencies - -### Components to Create -- ⏳ `GoogleNewsClient` - Full implementation with RSS and content extraction -- ⏳ `LLMSentimentAnalyzer` - LLM integration for sentiment analysis -- ⏳ `NewsService` - Replace stubs with full implementation - -### Existing Components -- ✅ `FinnhubClient` with company news using date objects -- ✅ `NewsRepository` with dataclass storage -- ✅ `NewsContext` and related Pydantic models - -### Required Libraries -- `feedparser` - RSS feed parsing -- `newspaper3k` - Article content extraction -- `requests` - HTTP requests and Internet Archive API -- `beautifulsoup4` - HTML parsing fallback -- LLM client library (OpenAI, Anthropic, etc.) - -## Timeline - -### Immediate (Phase 1) -- Create GoogleNewsClient with RSS and content extraction -- Implement feedparser integration -- Add Internet Archive fallback -- Create comprehensive test suite - -### Phase 2-3 -- Add repository bridge methods -- Implement full NewsService -- Integrate LLM sentiment analysis -- Handle multi-source aggregation - -### Phase 4 -- Type checking and validation -- Integration testing -- Performance optimization -- Documentation - -## Acceptance Criteria - -### Must Have -1. **Type Safety**: Service passes `mise run typecheck` with zero errors -2. **RSS Integration**: Successfully parse Google News RSS feeds -3. **Content Extraction**: Extract full articles with fallback -4. **LLM Sentiment**: Financial sentiment analysis for all articles -5. **Service Implementation**: All stubs replaced with working code -6. **Local-First**: Check cache before fetching new data -7. **Multi-Source**: Aggregate Finnhub and Google News - -### Should Have -1. **Extraction Stats**: Track success/failure rates -2. **Batch Processing**: Efficient LLM sentiment analysis -3. **Force Refresh**: Fetch new articles on demand -4. **Error Recovery**: Handle partial failures gracefully - -### Nice to Have -1. **Additional Sources**: Support more news providers -2. **Real-time Monitoring**: WebSocket for breaking news -3. **Advanced Extraction**: Handle PDFs, videos -4. **Sentiment Trends**: Track sentiment over time - ---- - -This PRD focuses on completing the currently empty `NewsService` with a full implementation including RSS feed integration, article content extraction with Internet Archive fallback, and LLM-powered sentiment analysis for financial news. diff --git a/README.md b/README.md index 1925e9c3..97bd02c8 100644 --- a/README.md +++ b/README.md @@ -293,6 +293,33 @@ This project uses [mise](https://mise.jdx.dev/) for tool and task management. Al - **Install tools**: `mise install` - Install Python, uv, ruff, pyright - **Install dependencies**: `mise run install` - Install project dependencies with uv +### Testing Principles + +**Pragmatic outside-in TDD** - Mock I/O boundaries, test real logic, fast feedback. 
+ +#### Test Structure (Mirror Source) +``` +tests/ +├── conftest.py # Shared fixtures +├── domains/ +│ ├── __init__.py +│ └── news/ +│ ├── __init__.py +│ ├── test_news_service.py # Mock repo + clients +│ ├── test_news_repository.py # Docker test DB +│ └── test_google_news_client.py # pytest-vcr +``` + +#### Mocking Strategy by Layer +- **Services**: Mock Repository + Clients, test real transformations +- **Repositories**: Real persistence (temp files/Docker), no mocks +- **Clients**: Real HTTP with pytest-vcr cassettes + +#### Quality Standards +- **85% coverage** minimum +- **< 100ms** per unit test +- **Mock boundaries, test behavior** + ### Configuration The TradingAgents framework uses a centralized `TradingAgentsConfig` class for all configuration management. @@ -428,4 +455,5 @@ ALWAYS prefer editing an existing file to creating a new one. NEVER proactively create documentation files (*.md) or README files. Only create documentation files if explicitly requested by the User. - IMPORTANT: this context may or may not be relevant to your tasks. You should not respond to this context unless it is highly relevant to your task. \ No newline at end of file + IMPORTANT: this context may or may not be relevant to your tasks. You should not respond to this context unless it is highly relevant to your task. +- remember what we learnt about testing? \ No newline at end of file diff --git a/SocialMediaService_PRD.md b/SocialMediaService_PRD.md deleted file mode 100644 index 2b722e70..00000000 --- a/SocialMediaService_PRD.md +++ /dev/null @@ -1,424 +0,0 @@ -# Product Requirements Document: SocialMediaService Completion - -## Overview - -Complete the `SocialMediaService` to provide strongly-typed social media data and sentiment analysis to trading agents using a local-first data strategy with gap detection and intelligent caching. - -## Current State Analysis - -### Issues to Fix -- **CRITICAL**: Missing `RedditClient` implementation - service calls non-existent client methods -- **CRITICAL**: Service uses `BaseClient` inheritance but needs typed `RedditClient` -- **CRITICAL**: `SocialRepository` has different interface than standard service pattern -- **CRITICAL**: Repository uses `date` objects internally but service expects string date interface -- Missing strongly-typed interfaces between components -- Service calls `reddit_client.search_posts()`, `get_top_posts()`, `filter_posts_by_date()` methods that don't exist - -### What Works -- ✅ Local-first data strategy implementation (`_get_social_data_local_first`) -- ✅ Force refresh logic (`_fetch_and_cache_fresh_social_data`) -- ✅ `SocialContext` Pydantic model for agent consumption -- ✅ Comprehensive sentiment analysis with keyword-based scoring -- ✅ Engagement metrics calculation and post ranking -- ✅ Error handling and metadata creation patterns -- ✅ `SocialRepository` with JSON storage and post deduplication -- ✅ `PostData` and `SentimentScore` models for structured data -- ✅ Real-time sentiment analysis with weighted scoring - -## Technical Requirements - -### 1. 
Strongly-Typed Interfaces - -#### Client → Service Interface -```python -# RedditClient methods (to be implemented) -def search_posts(query: str, subreddit_names: list[str], start_date: date, end_date: date, limit: int, time_filter: str) -> dict[str, Any] -def get_top_posts(subreddit_names: list[str], start_date: date, end_date: date, limit: int, time_filter: str) -> dict[str, Any] -def get_company_posts(symbol: str, subreddit_names: list[str], start_date: date, end_date: date, limit: int) -> dict[str, Any] -``` - -#### Service → Repository Interface -```python -# SocialRepository methods (to be implemented/bridged) -def has_data_for_period(query: str, start_date: str, end_date: str, symbol: str | None) -> bool -def get_data(query: str, start_date: str, end_date: str, symbol: str | None) -> dict[str, Any] -def store_data(query: str, cache_data: dict, symbol: str | None, overwrite: bool) -> bool -def clear_data(query: str, start_date: str, end_date: str, symbol: str | None) -> bool -``` - -#### Service → Agent Interface -```python -# Service output (already defined) -def get_context(query: str, start_date: str, end_date: str, symbol: str | None, subreddits: list[str], force_refresh: bool) -> SocialContext -def get_company_social_context(symbol: str, start_date: str, end_date: str, subreddits: list[str]) -> SocialContext -def get_global_trends(start_date: str, end_date: str, subreddits: list[str]) -> SocialContext -``` - -### 2. Local-First Data Strategy - -#### Flow -1. **Repository Lookup**: Check `SocialRepository.has_data_for_period()` -2. **Gap Detection**: Identify missing social media data periods -3. **Selective Fetching**: Fetch only missing data from `RedditClient` -4. **Cache Updates**: Store new data via `repository.store_data()` -5. **Context Assembly**: Return validated `SocialContext` - -#### Force Refresh Support -- `force_refresh=True` bypasses local data completely -- Clears existing cache before fetching fresh data -- Stores refreshed data with metadata indicating refresh - -### 3. Date Object Conversion - -#### Service Boundary Conversion -```python -# Service receives string dates from agents -def get_context(self, query: str, start_date: str, end_date: str, ...) -> SocialContext: - # Convert to date objects for client calls - start_dt = date.fromisoformat(start_date) - end_dt = date.fromisoformat(end_date) - - # Use date objects when calling RedditClient - posts_data = self.reddit_client.search_posts(query, subreddits, start_dt, end_dt, limit, time_filter) - - # Repository bridge handles string to date conversion internally - cached_data = self.repository.get_data(query, start_date, end_date, symbol) -``` - -### 4. 
Reddit API Integration - -#### RedditClient Implementation Strategy -```python -# RedditClient following FinnhubClient standard -class RedditClient: - """Client for Reddit API access with PRAW library integration.""" - - def __init__(self, client_id: str, client_secret: str, user_agent: str): - """Initialize Reddit client with PRAW.""" - import praw - self.reddit = praw.Reddit( - client_id=client_id, - client_secret=client_secret, - user_agent=user_agent - ) - - def search_posts(self, query: str, subreddit_names: list[str], - start_date: date, end_date: date, limit: int = 50, - time_filter: str = "week") -> dict[str, Any]: - """Search for posts across subreddits within date range.""" - - def get_top_posts(self, subreddit_names: list[str], - start_date: date, end_date: date, limit: int = 50, - time_filter: str = "week") -> dict[str, Any]: - """Get top posts from subreddits within date range.""" - - def get_company_posts(self, symbol: str, subreddit_names: list[str], - start_date: date, end_date: date, limit: int = 50) -> dict[str, Any]: - """Get company-specific posts from subreddits.""" -``` - -#### Reddit Response Format -```python -{ - "query": "AAPL", - "period": {"start": "2024-01-01", "end": "2024-01-31"}, - "posts": [ - { - "title": "Apple earnings discussion", - "content": "What do you think about...", - "author": "redditor123", - "subreddit": "investing", - "created_utc": 1704067200, - "score": 125, - "num_comments": 45, - "upvote_ratio": 0.87, - "url": "https://reddit.com/r/investing/comments/abc123", - "id": "abc123" - } - ], - "metadata": { - "source": "reddit", - "retrieved_at": "2024-01-31T10:00:00Z", - "data_quality": "HIGH", - "subreddits": ["investing", "stocks"], - "total_posts": 25 - } -} -``` - -### 5. Sentiment Analysis Enhancement - -#### Advanced Sentiment Features -- **Weighted Scoring**: High-engagement posts have more influence on overall sentiment -- **Keyword Analysis**: Comprehensive positive/negative keyword detection -- **Score Adjustment**: Reddit score (upvotes) influences sentiment confidence -- **Confidence Metrics**: Based on post count and engagement levels -- **Multi-level Analysis**: Individual post sentiment + overall summary sentiment - -#### Sentiment Calculation Strategy -```python -def _calculate_advanced_sentiment(self, posts: list[PostData]) -> SentimentScore: - """Enhanced sentiment analysis with multiple factors.""" - # Weight by engagement score (upvotes + comments) - # Adjust for subreddit context (WSB vs investing) - # Consider temporal patterns (recent posts weighted higher) - # Apply confidence scoring based on data volume -``` - -### 6. Pydantic Validation - -#### Context Structure -```python -@dataclass -class SocialContext(BaseModel): - symbol: str | None - period: dict[str, str] # {"start": "2024-01-01", "end": "2024-01-31"} - posts: list[PostData] - engagement_metrics: dict[str, float] - sentiment_summary: SentimentScore - post_count: int - platforms: list[str] # ["reddit"] - metadata: dict[str, Any] -``` - -#### PostData Format -```python -@dataclass -class PostData(BaseModel): - title: str - content: str - author: str - source: str # subreddit name - date: str - url: str - score: int - comments: int - engagement_score: int - subreddit: str | None - sentiment: SentimentScore | None - metadata: dict[str, Any] -``` - -## Implementation Tasks - -### Phase 1: Create RedditClient - -1. 
**RedditClient Implementation** - - Create `tradingagents/clients/reddit_client.py` - - Follow FinnhubClient standard: no BaseClient inheritance, date objects, proper error handling - - Use PRAW (Python Reddit API Wrapper) library for Reddit API access - - Methods: `search_posts()`, `get_top_posts()`, `get_company_posts()` - - Implement date filtering for posts within specified ranges - - Handle Reddit API rate limits and authentication - -2. **Comprehensive Testing** - - Create `tradingagents/clients/test_reddit_client.py` - - Use pytest-vcr for Reddit API interaction recording - - Test all client methods with multiple queries and subreddits - - Test error handling and API rate limit scenarios - - Mock Reddit API responses for consistent testing - -### Phase 2: Bridge SocialRepository Interface - -3. **Repository Interface Standardization** - - Add standard service interface methods to `SocialRepository` - - Bridge existing `get_social_data()` with `get_data()` - - Bridge existing `store_social_posts()` with `store_data()` - - Add missing `has_data_for_period()` and `clear_data()` methods - - File: `tradingagents/repositories/social_repository.py` - - Maintain existing dataclass functionality while adding service compatibility - -4. **Repository Method Implementation** - ```python - # Add these methods to SocialRepository - def has_data_for_period(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> bool - def get_data(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> dict[str, Any] - def store_data(self, query: str, cache_data: dict, symbol: str | None = None, overwrite: bool = False) -> bool - def clear_data(self, query: str, start_date: str, end_date: str, symbol: str | None = None) -> bool - ``` - -### Phase 3: Update SocialMediaService - -5. **Client Integration Fix** - - Replace `BaseClient` dependency with `RedditClient` - - File: `tradingagents/services/social_media_service.py:27` - - Update constructor: `reddit_client: RedditClient` - -6. **Date Conversion Fix** - - Add `date.fromisoformat()` conversion in service methods - - Update all client calls to use date objects instead of strings - - File: `tradingagents/services/social_media_service.py:182-190, 418-429` - -7. **Repository Interface Integration** - - Update repository method calls to use new standard interface - - Ensure proper error handling for repository operations - - File: `tradingagents/services/social_media_service.py:302-311, 325-337` - -### Phase 4: Type Safety & Validation - -8. **Comprehensive Type Checking** - - Run `mise run typecheck` - must pass with 0 errors - - Validate all date object conversions - - Ensure SocialContext compliance - -9. 
**Enhanced Testing** - - Update existing service tests for new RedditClient interface - - Add gap detection test scenarios - - Test sentiment analysis accuracy with known datasets - - Test multi-subreddit aggregation and deduplication - -## Success Criteria - -### Functional Requirements -- ✅ Service successfully calls `RedditClient` with `date` objects -- ✅ Local-first strategy works: checks cache → identifies gaps → fetches missing → stores updates -- ✅ Returns properly validated `SocialContext` to agents -- ✅ Sentiment analysis provides accurate scores with confidence metrics -- ✅ Multi-subreddit support with post deduplication -- ✅ Force refresh bypasses cache and refreshes data - -### Technical Requirements -- ✅ Zero type checking errors: `mise run typecheck` -- ✅ Zero linting errors: `mise run lint` -- ✅ All existing tests pass with updated architecture -- ✅ No runtime errors with date conversions - -### Quality Requirements -- ✅ Strongly-typed interfaces between all components -- ✅ PRAW library integration for reliable Reddit API access -- ✅ Comprehensive error handling and logging -- ✅ Efficient caching with minimal API calls -- ✅ Clear separation of concerns between service, client, and repository -- ✅ Accurate sentiment analysis with engagement weighting - -## Data Architecture - -### RedditClient Response Format -```python -{ - "query": "Tesla", - "period": {"start": "2024-01-01", "end": "2024-01-31"}, - "posts": [ - { - "title": "Tesla Q4 earnings beat expectations", - "content": "Tesla reported strong Q4 results...", - "author": "teslaInvestor", - "subreddit": "TeslaInvestors", - "created_utc": 1704067200, - "score": 245, - "num_comments": 67, - "upvote_ratio": 0.92, - "url": "https://reddit.com/r/TeslaInvestors/comments/xyz789", - "id": "xyz789" - } - ], - "metadata": { - "source": "reddit", - "retrieved_at": "2024-01-31T10:00:00Z", - "data_quality": "HIGH", - "subreddits": ["TeslaInvestors", "stocks"], - "post_count": 25, - "api_calls": 3 - } -} -``` - -### SocialRepository Data Bridge Format -```python -# Repository stores data in existing SocialPost format but provides service interface -{ - "query": "Tesla", - "symbol": "TSLA", - "posts": [ - { - "title": "Tesla Q4 earnings beat expectations", - "content": "Tesla reported strong Q4 results...", - "author": "teslaInvestor", - "source": "TeslaInvestors", - "date": "2024-01-15", - "url": "https://reddit.com/r/TeslaInvestors/comments/xyz789", - "score": 245, - "comments": 67, - "engagement_score": 312, - "subreddit": "TeslaInvestors", - "sentiment": { - "score": 0.7, - "confidence": 0.8, - "label": "positive" - }, - "metadata": { - "platform_id": "xyz789", - "upvote_ratio": 0.92 - } - } - ], - "metadata": { - "cached_at": "2024-01-31T10:00:00Z", - "post_count": 25, - "sources": ["reddit"] - } -} -``` - -## Dependencies - -### Missing Components (Need Creation) -- ⏳ `RedditClient` needs full implementation from scratch -- ⏳ Service interface bridge methods for `SocialRepository` -- ⏳ Comprehensive pytest-vcr test suites for Reddit API - -### Existing Components (Ready) -- ✅ `SocialRepository` with JSON storage and deduplication -- ✅ `SocialContext` and `PostData` Pydantic models -- ✅ Sentiment analysis and engagement metrics logic - -### Required -- PRAW (Python Reddit API Wrapper) library for Reddit integration -- Valid Reddit API credentials (client_id, client_secret, user_agent) -- Working internet connection for live data fetching -- Writable data directory for repository storage - -## Timeline - -### Immediate (Phase 1) 
-- Create RedditClient following FinnhubClient standard with PRAW integration -- Implement comprehensive testing with pytest-vcr for Reddit API -- Validate client functionality with multiple subreddits and queries - -### Phase 2-3 -- Add standard service interface methods to SocialRepository -- Update SocialMediaService to use RedditClient with date objects -- Bridge repository interfaces while maintaining existing functionality - -### Phase 4 -- Comprehensive type checking and validation -- Integration testing with sentiment analysis workflows -- Performance optimization and caching efficiency - -## Acceptance Criteria - -### Must Have -1. **Type Safety**: Service passes `mise run typecheck` with zero errors -2. **Client Integration**: All `RedditClient` calls use `date` objects correctly -3. **Local-First**: Service checks repository before Reddit API calls -4. **Context Validation**: Returns valid `SocialContext` with Pydantic validation -5. **Sentiment Analysis**: Provides accurate sentiment scores with confidence metrics -6. **Multi-Platform**: Seamlessly aggregates social data from Reddit with extensibility - -### Should Have -1. **Gap Detection**: Intelligent identification of missing data periods -2. **Cache Efficiency**: Minimal redundant API calls to Reddit -3. **Force Refresh**: Complete cache bypass when requested -4. **Data Quality**: Metadata indicating data source and quality metrics -5. **Deduplication**: Automatic removal of duplicate posts by platform_id - -### Nice to Have -1. **Performance Metrics**: Timing and cache hit rate logging -2. **Data Staleness**: Automatic refresh of old cached social data -3. **Enhanced Sentiment**: Integration with advanced NLP libraries (TextBlob, VADER) -4. **Real-time Social**: Support for live social media feeds and alerts -5. **Platform Expansion**: Easy addition of Twitter, Discord, other social platforms - ---- - -This PRD focuses on completing the `SocialMediaService` as a strongly-typed, local-first data service that integrates Reddit social media data through a new `RedditClient` following the established FinnhubClient standard patterns, while providing comprehensive sentiment analysis and engagement metrics to trading agents. \ No newline at end of file diff --git a/prd/news_service.md b/prd/news_service.md new file mode 100644 index 00000000..ccd579e6 --- /dev/null +++ b/prd/news_service.md @@ -0,0 +1,1013 @@ +# News Service PRD + +## Executive Summary +The News Service feature will provide up-to-date news sentiment analysis for stock market tickers to the TradingAgents framework. This service will enable agents to make more informed trading decisions based on current market news and sentiment. + +## Requirements + +### Target Users +- Trading Agents (News Analyst, Researchers, Trader Agent, Risk Management team) +- Cron Job system for daily updates + +### Problem Statement +Agents need up-to-date news sentiment when analyzing the stock market to make better trading decisions. Currently, they may be missing important news events or experiencing delays in sentiment analysis that could impact trading performance. + +### Success Metrics +- Impact on trading decision quality + +### User Stories +1. As Cron Job I want to be able to update and store the news with sentiment analysis for a ticker each day +2. 
As a Trading Agent I want to be able to retrieve the news with sentiment analysis for a ticker and a day from a database + +### Out of Scope (v1) +- Real-time news streaming (vs daily updates) +- Multi-language news support +- Historical news sentiment analysis beyond a certain date range +- News source ranking or weighting +- Advanced filtering options + +### Timeline +MVP in 1 week + +## Status +✅ Requirements Complete | ✅ Technical Design Complete | 🔄 Implementation In Progress + +## Technical Design + +### Architecture +- The `NewsService` will be the central component, orchestrating the fetching, scraping, analysis, and storage of news articles. +- It will utilize the existing `GoogleNewsClient` to fetch RSS feeds from Google News. +- The `ArticleScraperClient` will be enhanced to scrape full article content with robust fallback strategies: + - **Direct Fetch**: Primary method using `newspaper3k` library for content extraction + - **Archive Fallback**: Internet Archive Wayback Machine fallback for failed fetches + - **Content Extraction**: Clean text, title, publication date, and metadata extraction + - **Paywall Detection**: Handle paywall-protected content gracefully +- A new `SentimentAnalysisService` will be created to handle the interaction with the configured LLM for structured sentiment analysis. +- The `NewsRepository` will store the news articles along with their sentiment scores in the existing file-based database. + +### Implementation Components +- **Backend:** + - `tradingagents/domains/news/news_service.py`: + - A new private method `_get_sentiment_for_article` will be added to call the `SentimentAnalysisService`. + - The `update_company_news` method will be modified to call this new method for each scraped article. + - The `_calculate_sentiment_summary` will be updated to aggregate the new structured sentiment scores. + - Update to work with SQLAlchemy-based NewsRepository instead of file-based storage. 
+ - `tradingagents/domains/news/repository.py` (Enhanced with Compatibility Layer): + - Replace file-based storage with SQLAlchemy ORM operations + - **Backward Compatibility**: Maintain existing interface with adapter pattern + - Implement new methods: `save_articles()`, `get_articles_by_symbol()`, `get_articles_by_date_range()` + - Add transaction management and connection pooling + - Include duplicate detection using URL uniqueness constraints + - Add batch operations for efficient bulk inserts + +**Data Model Compatibility Strategy:** +```python +# Enhanced ArticleData to bridge existing and new models +@dataclass +class ArticleData: + # Existing fields (maintain compatibility) + title: str + content: str + author: str + source: str # Keep as string for existing code + date: str # YYYY-MM-DD format + url: str + sentiment: SentimentScore | None = None + + # New fields for enhanced functionality + source_id: int | None = None # Foreign key when available + category_id: int | None = None # Foreign key when available + + # Vector fields (optional for backward compatibility) + title_embedding: List[float] | None = None + content_embedding: List[float] | None = None + sentiment_embedding: List[float] | None = None + + @classmethod + def from_db_model(cls, article: NewsArticle) -> 'ArticleData': + """Convert database model to existing ArticleData format.""" + return cls( + title=article.title, + content=article.content or "", + author=article.author or "", + source=article.source.name if article.source else "Unknown", # Flatten relationship + date=article.published_date.isoformat(), + url=article.url, + sentiment=SentimentScore( + score=float(article.sentiment_score) if article.sentiment_score else 0.0, + confidence=float(article.sentiment_confidence) if article.sentiment_confidence else 0.0, + label=article.sentiment_label or "neutral" + ) if article.sentiment_score is not None else None, + source_id=article.source_id, + category_id=article.category_id, + title_embedding=article.title_embedding, + content_embedding=article.content_embedding, + sentiment_embedding=article.sentiment_embedding + ) + + def to_db_model(self, session: Session) -> NewsArticle: + """Convert to database model, handling source lookup.""" + # Get or create source + source = session.query(NewsSource).filter_by(name=self.source).first() + if not source: + source = NewsSource(name=self.source) + session.add(source) + session.flush() # Get ID + + return NewsArticle( + title=self.title, + content=self.content, + author=self.author, + source_id=source.id, + url=self.url, + published_date=date.fromisoformat(self.date), + sentiment_score=Decimal(str(self.sentiment.score)) if self.sentiment else None, + sentiment_confidence=Decimal(str(self.sentiment.confidence)) if self.sentiment else None, + sentiment_label=self.sentiment.label if self.sentiment else None, + title_embedding=self.title_embedding, + content_embedding=self.content_embedding, + sentiment_embedding=self.sentiment_embedding + ) +``` + - `tradingagents/domains/news/sentiment_service.py` (New File): + - This new service will encapsulate the logic for calling the LLM and generating embeddings. + - Primary method: `get_sentiment_with_embeddings(article_content: str) -> SentimentScoreWithEmbeddings`. + - It will use the `quick_think_llm` from the `TradingAgentsConfig` for performance. + - It will use a structured prompt to ask the LLM to return a JSON object with `score`, `confidence`, and `label`. 
+ - **Embedding Generation**: Generate multiple embeddings using OpenAI's embedding API: + - `title_embedding`: Vector representation of article title (1536 dims) + - `content_embedding`: Vector representation of full article content (1536 dims) + - `sentiment_embedding`: Smaller specialized sentiment vector using sentence-transformers (384 dims) + - **Vector Similarity**: Enable semantic search for similar articles and sentiment clustering +- **Database:** + - **PostgreSQL + SQLAlchemy + pgvector Integration:** + - Replace file-based storage with PostgreSQL database using SQLAlchemy ORM + - Create new SQLAlchemy models for news articles with proper relationships + - Implement database migrations using Alembic + - Add connection pooling and transaction management + - Integrate pgvector extension for high-dimensional sentiment embeddings storage + - Enable semantic similarity search and vector-based sentiment clustering + - **Database Schema Design:** + - `news_articles` table with columns for article data, sentiment scores, embeddings, and metadata + - `news_sources` table for source information and credibility tracking + - `news_categories` table for article categorization + - `sentiment_embeddings` table for high-dimensional vector storage using pgvector + - Proper indexing for symbol, date, source queries, and vector similarity searches + - Foreign key relationships between articles, sources, categories, and embeddings + +### API Specification +- No external API changes. All modifications will be internal to the `NewsService` and the cron job that calls it. + +### Security & Performance +- **Security:** LLM API keys will continue to be managed through the `TradingAgentsConfig` and environment variables. No new security risks are introduced. +- **Performance:** The scraping and sentiment analysis process is I/O and network-bound. This will run as part of the daily cron job, so it will not impact the performance of the trading agents' decision-making process, which will read from the cached data. 
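
To make the "connection pooling and transaction management" point above concrete, the sketch below shows one way the pooled SQLAlchemy engine and a per-unit-of-work session scope could be built from the database settings proposed for `TradingAgentsConfig` later in this document. `build_session_factory` and `transaction` are illustrative names, not a committed interface.

```python
# Illustrative sketch only: pooled engine + transactional session scope for the
# PostgreSQL-backed NewsRepository. Config field names follow the TradingAgentsConfig
# extension proposed below; the helpers themselves are hypothetical.
from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def build_session_factory(config):
    engine = create_engine(
        config.database_url,                        # postgresql://user:pass@host:5432/tradingagents
        pool_size=config.database_pool_size,        # default 10
        max_overflow=config.database_max_overflow,  # default 20
        pool_pre_ping=True,                         # drop dead connections before use
        echo=config.database_echo,                  # SQL logging for debugging
    )
    return sessionmaker(bind=engine)


@contextmanager
def transaction(session_factory):
    """Commit on success, roll back on error - one transaction per unit of work."""
    session = session_factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
```

A repository method would then wrap its writes in `with transaction(session_factory) as session: ...`, keeping commit/rollback handling out of the service layer.
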
+ +### Database Schema Design + +#### Core Tables +```sql +-- Enable pgvector extension +CREATE EXTENSION IF NOT EXISTS vector; + +-- News sources for credibility tracking +CREATE TABLE news_sources ( + id SERIAL PRIMARY KEY, + name VARCHAR(255) NOT NULL UNIQUE, + domain VARCHAR(255), + credibility_score DECIMAL(3,2) DEFAULT 0.5, -- 0.0 to 1.0 + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- News categories for article classification +CREATE TABLE news_categories ( + id SERIAL PRIMARY KEY, + name VARCHAR(100) NOT NULL UNIQUE, + description TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Main articles table +CREATE TABLE news_articles ( + id SERIAL PRIMARY KEY, + title TEXT NOT NULL, + content TEXT, + author VARCHAR(255), + symbol VARCHAR(10), -- Stock ticker, nullable for global news + source_id INTEGER REFERENCES news_sources(id), + category_id INTEGER REFERENCES news_categories(id), + url TEXT UNIQUE NOT NULL, + published_date DATE NOT NULL, + scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + + -- Sentiment analysis + sentiment_score DECIMAL(3,2), -- -1.0 to 1.0 + sentiment_confidence DECIMAL(3,2), -- 0.0 to 1.0 + sentiment_label VARCHAR(20), -- positive/negative/neutral + sentiment_analyzed_at TIMESTAMP, + + -- Vector embeddings for semantic analysis + title_embedding vector(1536), -- OpenAI ada-002 embedding dimension + content_embedding vector(1536), -- Full article content embedding + sentiment_embedding vector(384), -- Sentence-transformer for sentiment + embedding_model VARCHAR(50) DEFAULT 'text-embedding-ada-002', + embedded_at TIMESTAMP, + + -- Metadata + content_length INTEGER, + scrape_status VARCHAR(20) DEFAULT 'SUCCESS', -- SUCCESS, FAILED, ARCHIVE_SUCCESS + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Remove redundant sentiment_embeddings table +-- All embeddings stored directly in news_articles table for simplicity and performance + +-- Performance indexes +CREATE INDEX idx_news_articles_symbol_date ON news_articles(symbol, published_date); +CREATE INDEX idx_news_articles_published_date ON news_articles(published_date); +CREATE INDEX idx_news_articles_source ON news_articles(source_id); +CREATE INDEX idx_news_articles_sentiment ON news_articles(sentiment_score, sentiment_confidence); +CREATE INDEX idx_news_articles_url_hash ON news_articles USING HASH(url); + +-- Vector similarity indexes using HNSW (Hierarchical Navigable Small World) +-- Note: HNSW indexes consume significant memory (2-4x vector storage) +CREATE INDEX idx_articles_title_embedding ON news_articles USING hnsw (title_embedding vector_cosine_ops) + WITH (m = 16, ef_construction = 64); -- Tuned for performance vs memory +CREATE INDEX idx_articles_content_embedding ON news_articles USING hnsw (content_embedding vector_cosine_ops) + WITH (m = 16, ef_construction = 64); +CREATE INDEX idx_articles_sentiment_embedding ON news_articles USING hnsw (sentiment_embedding vector_cosine_ops) + WITH (m = 8, ef_construction = 32); -- Smaller index for sentiment vectors +``` + +#### SQLAlchemy Models +```python +# tradingagents/domains/news/models.py +from datetime import datetime, date +from decimal import Decimal +from typing import List, Optional +from sqlalchemy import Column, Integer, String, Text, Date, DateTime, Decimal as SQLDecimal, ForeignKey +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship +from pgvector.sqlalchemy import Vector 
+ +Base = declarative_base() + +class NewsSource(Base): + __tablename__ = 'news_sources' + + id = Column(Integer, primary_key=True) + name = Column(String(255), nullable=False, unique=True) + domain = Column(String(255)) + credibility_score = Column(SQLDecimal(3,2), default=0.5) + created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + + # Relationships + articles = relationship("NewsArticle", back_populates="source") + +class NewsCategory(Base): + __tablename__ = 'news_categories' + + id = Column(Integer, primary_key=True) + name = Column(String(100), nullable=False, unique=True) + description = Column(Text) + created_at = Column(DateTime, default=datetime.utcnow) + + # Relationships + articles = relationship("NewsArticle", back_populates="category") + +class NewsArticle(Base): + __tablename__ = 'news_articles' + + id = Column(Integer, primary_key=True) + title = Column(Text, nullable=False) + content = Column(Text) + author = Column(String(255)) + symbol = Column(String(10)) # Nullable for global news + source_id = Column(Integer, ForeignKey('news_sources.id')) + category_id = Column(Integer, ForeignKey('news_categories.id')) + url = Column(Text, unique=True, nullable=False) + published_date = Column(Date, nullable=False) + scraped_at = Column(DateTime, default=datetime.utcnow) + + # Sentiment fields + sentiment_score = Column(SQLDecimal(3,2)) # -1.0 to 1.0 + sentiment_confidence = Column(SQLDecimal(3,2)) # 0.0 to 1.0 + sentiment_label = Column(String(20)) # positive/negative/neutral + sentiment_analyzed_at = Column(DateTime) + + # Vector embeddings using pgvector + title_embedding = Column(Vector(1536)) # OpenAI ada-002 dimensions + content_embedding = Column(Vector(1536)) # Full content embedding + sentiment_embedding = Column(Vector(384)) # Sentence transformer for sentiment + embedding_model = Column(String(50), default='text-embedding-ada-002') + embedded_at = Column(DateTime) + + # Metadata + content_length = Column(Integer) + scrape_status = Column(String(20), default='SUCCESS') + created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + + # Relationships + source = relationship("NewsSource", back_populates="articles") + category = relationship("NewsCategory", back_populates="articles") + +# Removed redundant SentimentEmbedding table for simplified architecture +``` + +#### Database Migration Strategy + +**Alembic Configuration:** +```python +# alembic/env.py +from tradingagents.domains.news.models import Base +from tradingagents.config import TradingAgentsConfig + +config = TradingAgentsConfig.from_env() +target_metadata = Base.metadata + +# Database URL from config +config.set_main_option("sqlalchemy.url", config.database_url) +``` + +**Initial Migration:** +```bash +# Initialize Alembic in the project +alembic init alembic + +# Generate initial migration +alembic revision --autogenerate -m "Create news tables" + +# Apply migration +alembic upgrade head +``` + +**Migration Files:** +- `001_enable_pgvector.py` - Enable pgvector extension +- `002_create_news_tables.py` - Initial schema creation with vector fields +- `003_add_vector_indexes.py` - HNSW indexes for vector similarity +- `004_seed_categories_sources.py` - Seed default categories and trusted sources + +**TradingAgentsConfig Extension:** +```python +@dataclass +class TradingAgentsConfig: + # ... existing fields ... 
+ + # Database configuration + database_url: str = field(default_factory=lambda: os.getenv("DATABASE_URL", "")) + database_pool_size: int = field(default_factory=lambda: int(os.getenv("DATABASE_POOL_SIZE", "10"))) + database_max_overflow: int = field(default_factory=lambda: int(os.getenv("DATABASE_MAX_OVERFLOW", "20"))) + database_echo: bool = field(default_factory=lambda: os.getenv("DATABASE_ECHO", "false").lower() == "true") + + # Vector configuration + enable_vector_search: bool = field(default_factory=lambda: os.getenv("ENABLE_VECTOR_SEARCH", "true").lower() == "true") + embedding_model: str = field(default_factory=lambda: os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")) + embedding_batch_size: int = field(default_factory=lambda: int(os.getenv("EMBEDDING_BATCH_SIZE", "100"))) + enable_sentence_transformers: bool = field(default_factory=lambda: os.getenv("ENABLE_SENTENCE_TRANSFORMERS", "true").lower() == "true") + + @property + def has_database_config(self) -> bool: + """Check if database is properly configured.""" + return bool(self.database_url and self.database_url.startswith("postgresql://")) + + @property + def embedding_provider(self) -> str: + """Get embedding provider from LLM provider setting.""" + # Map LLM providers to their embedding providers + llm_provider = getattr(self, 'llm_provider', 'openai') + embedding_map = { + 'openai': 'openai', + 'google': 'google', # Use Gemini for embeddings when Google is selected + 'anthropic': 'openai', # Anthropic doesn't have embeddings, use OpenAI + 'ollama': 'openai' # Local models, use OpenAI for embeddings + } + return embedding_map.get(llm_provider, 'openai') + +def validate_database_config(config: TradingAgentsConfig) -> None: + """Validate database configuration before startup.""" + if not config.has_database_config: + raise ValueError("DATABASE_URL must be set for PostgreSQL integration") + + if config.enable_vector_search and not config.has_database_config: + raise ValueError("Vector search requires PostgreSQL database configuration") +``` + +**Environment Variables:** +```bash +# Database configuration (required) +DATABASE_URL=postgresql://username:password@localhost:5432/tradingagents +DATABASE_POOL_SIZE=10 # optional, defaults to 10 +DATABASE_MAX_OVERFLOW=20 # optional, defaults to 20 +DATABASE_ECHO=false # optional, set to true for SQL debugging + +# Vector configuration (optional) +ENABLE_VECTOR_SEARCH=true # optional, defaults to true +EMBEDDING_MODEL=google/gemini-2.5-flash # Use Gemini via OpenRouter for embeddings +EMBEDDING_BATCH_SIZE=100 # optional +ENABLE_SENTENCE_TRANSFORMERS=true # optional + +# Example configurations by provider: +# For OpenAI: EMBEDDING_MODEL=text-embedding-ada-002 +# For Gemini: EMBEDDING_MODEL=google/gemini-2.5-flash (via OpenRouter) +``` + +#### Embedding Generation Service Design + +**SentimentScore Enhancement:** +```python +@dataclass +class SentimentScoreWithEmbeddings: + """Enhanced sentiment analysis with vector embeddings.""" + + score: float # -1.0 to 1.0 + confidence: float # 0.0 to 1.0 + label: str # positive/negative/neutral + + # Vector embeddings + title_embedding: List[float] # 1536 dimensions + content_embedding: List[float] # 1536 dimensions + sentiment_embedding: List[float] # 384 dimensions + embedding_model: str = "text-embedding-ada-002" +``` + +**Service Implementation:** +```python +class EmbeddingProvider: + """Abstract base for embedding providers.""" + async def get_embeddings(self, texts: List[str]) -> List[List[float]]: + raise NotImplementedError + +class 
OpenAIEmbeddingProvider(EmbeddingProvider): + def __init__(self, api_key: str, model: str = "text-embedding-ada-002"): + self.client = AsyncOpenAI(api_key=api_key) + self.model = model + + async def get_embeddings(self, texts: List[str]) -> List[List[float]]: + response = await self.client.embeddings.create( + input=texts, + model=self.model + ) + return [item.embedding for item in response.data] + +class GeminiEmbeddingProvider(EmbeddingProvider): + def __init__(self, api_key: str, base_url: str = "https://openrouter.ai/api/v1"): + self.client = AsyncOpenAI(api_key=api_key, base_url=base_url) + self.model = "google/gemini-2.5-flash" + + async def get_embeddings(self, texts: List[str]) -> List[List[float]]: + # Gemini via OpenRouter - batch embeddings + response = await self.client.embeddings.create( + input=texts, + model=self.model + ) + return [item.embedding for item in response.data] + +class SentimentAnalysisService: + def __init__(self, config: TradingAgentsConfig): + self.llm_client = self._get_llm_client(config) + self.embedding_provider = self._get_embedding_provider(config) + self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2') if config.enable_sentence_transformers else None + + def _get_embedding_provider(self, config: TradingAgentsConfig) -> EmbeddingProvider: + """Get appropriate embedding provider based on configuration.""" + provider = config.embedding_provider + + if provider == 'openai': + return OpenAIEmbeddingProvider( + api_key=os.getenv('OPENAI_API_KEY'), + model=config.embedding_model + ) + elif provider == 'google': + return GeminiEmbeddingProvider( + api_key=os.getenv('OPENAI_API_KEY'), # OpenRouter key + base_url="https://openrouter.ai/api/v1" + ) + else: + # Default to OpenAI + return OpenAIEmbeddingProvider( + api_key=os.getenv('OPENAI_API_KEY'), + model=config.embedding_model + ) + + async def get_sentiment_with_embeddings( + self, + title: str, + content: str + ) -> SentimentScoreWithEmbeddings: + """Generate sentiment analysis with vector embeddings - optimized for performance.""" + + # 1. Parallel processing: sentiment score + embeddings + tasks = [ + self._get_sentiment_score(content), # LLM sentiment analysis + self.embedding_provider.get_embeddings([title, content]) # Batch embedding API call + ] + + sentiment, embeddings = await asyncio.gather(*tasks) + title_embedding, content_embedding = embeddings + + # 2. Generate local sentiment embedding if enabled + sentiment_embedding = None + if self.sentence_transformer: + sentiment_embedding = self.sentence_transformer.encode(content).tolist() + + return SentimentScoreWithEmbeddings( + score=sentiment.score, + confidence=sentiment.confidence, + label=sentiment.label, + title_embedding=title_embedding, + content_embedding=content_embedding, + sentiment_embedding=sentiment_embedding, + embedding_model=self.embedding_provider.model + ) + + async def _get_sentiment_score(self, content: str) -> SentimentScore: + """Generate sentiment score using LLM with financial news prompt.""" + + prompt = """ + Analyze the sentiment of this financial news article for trading purposes. + + Article Content: {content} + + Provide your analysis in the following JSON format: + {{ + "score": , + "confidence": , + "label": <"positive", "negative", or "neutral">, + "reasoning": , + "key_themes": , + "financial_entities": + }} + + Focus on the financial and market implications of the news. + Consider impact on stock prices, market sentiment, and trading decisions. 
+ """.format(content=content[:2000]) # Limit content length + + response = await self.llm_client.complete(prompt) + + try: + result = json.loads(response) + return SentimentScore( + score=result.get("score", 0.0), + confidence=result.get("confidence", 0.5), + label=result.get("label", "neutral"), + metadata={ + "reasoning": result.get("reasoning", ""), + "key_themes": result.get("key_themes", []), + "financial_entities": result.get("financial_entities", []) + } + ) + except Exception as e: + # Return neutral sentiment on error + return SentimentScore( + score=0.0, + confidence=0.0, + label="neutral", + metadata={"error": str(e)} + ) + + def find_similar_articles( + self, + embedding: List[float], + limit: int = 10, + similarity_threshold: float = 0.8 + ) -> List[NewsArticle]: + """Find semantically similar articles using vector similarity.""" + # Use pgvector cosine similarity search + pass + + async def batch_analyze_sentiment( + self, + articles: List[ArticleData], + batch_size: int = 5 + ) -> List[SentimentScoreWithEmbeddings]: + """ + Batch process sentiment analysis and embedding generation. + + Args: + articles: List of articles to analyze + batch_size: Number of articles to process concurrently + + Returns: + List of sentiment scores with embeddings + """ + results = [] + + for i in range(0, len(articles), batch_size): + batch = articles[i:i + batch_size] + + # Process batch concurrently + batch_tasks = [ + self.get_sentiment_with_embeddings(article.title, article.content) + for article in batch + ] + + batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) + + for result in batch_results: + if isinstance(result, Exception): + # Handle individual failures gracefully + logger.error(f"Sentiment analysis failed: {result}") + results.append(self._get_neutral_sentiment_with_embeddings()) + else: + results.append(result) + + # Rate limiting: Add delay between batches + if i + batch_size < len(articles): + await asyncio.sleep(1.0) # 1 second delay between batches + + return results +``` + +**Optimized Vector Similarity Queries:** +```sql +-- Find articles similar to a given title embedding (HNSW optimized) +-- Note: Don't use WHERE clause on similarity - it defeats HNSW indexing +SELECT id, title, symbol, + (title_embedding <=> %s) as distance, + (1 - (title_embedding <=> %s)) as similarity +FROM news_articles +WHERE title_embedding IS NOT NULL -- Only filter on non-null vectors +ORDER BY title_embedding <=> %s +LIMIT 20 -- Get more candidates, filter in application if needed +HAVING distance < 0.2; -- Filter after ordering for best performance + +-- Find articles with similar sentiment patterns (pre-filter by label for efficiency) +SELECT id, title, sentiment_label, + (sentiment_embedding <=> %s) as distance +FROM news_articles +WHERE sentiment_label = %s -- Filter first by indexed column + AND sentiment_embedding IS NOT NULL +ORDER BY sentiment_embedding <=> %s +LIMIT 15; + +-- Cluster articles by content similarity for a ticker (optimized approach) +WITH similar_articles AS ( + SELECT id, symbol, sentiment_score, + (content_embedding <=> %s) as distance + FROM news_articles + WHERE symbol = %s -- Use indexed column first + AND content_embedding IS NOT NULL + ORDER BY content_embedding <=> %s + LIMIT 50 -- Limit search space +) +SELECT symbol, + AVG(sentiment_score) as avg_sentiment, + COUNT(*) as article_count, + AVG(distance) as avg_content_distance +FROM similar_articles +WHERE distance < 0.3 -- Apply similarity threshold after vector search +GROUP BY symbol; + +-- 
Performance monitoring query +SELECT + schemaname, + tablename, + attname as column_name, + n_distinct, + correlation +FROM pg_stats +WHERE tablename = 'news_articles' + AND attname LIKE '%embedding%'; +``` + +**Memory Usage Estimation:** +```sql +-- Estimate memory requirements for HNSW indexes +SELECT + pg_size_pretty(pg_total_relation_size('idx_articles_title_embedding')) as title_index_size, + pg_size_pretty(pg_total_relation_size('idx_articles_content_embedding')) as content_index_size, + pg_size_pretty(pg_total_relation_size('idx_articles_sentiment_embedding')) as sentiment_index_size, + pg_size_pretty(pg_total_relation_size('news_articles')) as table_size; + +-- Expected memory usage: 500MB-1GB for 10K articles with 3 embedding types +``` + +### Current Implementation Status + +**✅ COMPLETED COMPONENTS:** + +1. **NewsService Core Structure (90% Complete)** + - ✅ Core service class with dependency injection + - ✅ Read path implemented: `get_company_news_context()`, `get_global_news_context()` + - ✅ Write path implemented: `update_company_news()`, `update_global_news()` + - ✅ Repository integration with file-based storage + - ✅ ArticleData model conversion from repository NewsArticle + - ✅ Simple keyword-based sentiment analysis as fallback + - ✅ Error handling and empty context returns + - ✅ Trending topics extraction + - ✅ Date validation and ISO format handling + +2. **NewsRepository (100% Complete)** + - ✅ File-based storage with JSON serialization + - ✅ Source separation (finnhub, google_news) + - ✅ Date-based file organization (YYYY-MM-DD.json) + - ✅ Article deduplication by URL + - ✅ Batch storage operations + - ✅ Complete CRUD operations + - ✅ Proper error handling and logging + +3. **Data Models (100% Complete)** + - ✅ ArticleData dataclass with sentiment field + - ✅ NewsContext and GlobalNewsContext for agent consumption + - ✅ SentimentScore model + - ✅ NewsUpdateResult for operation tracking + - ✅ DataQuality enum for metadata + +**✅ COMPLETED COMPONENTS (UPDATED):** + +4. **GoogleNewsClient (100% Complete)** + - ✅ RSS feed parsing with feedparser + - ✅ Company news method implemented (`get_company_news()`) + - ✅ Global news method implemented (`get_global_news()`) + - ✅ Proper error handling and logging + - ✅ Google News RSS URL construction + - ✅ Article parsing with source extraction + - ✅ Date parsing with fallback handling + +5. **ArticleScraperClient (100% Complete)** + - ✅ Full newspaper3k content extraction + - ✅ Internet Archive Wayback Machine fallback + - ✅ Robust error handling for failed scrapes + - ✅ Content validation (minimum length checks) + - ✅ Multiple article batch processing + - ✅ Rate limiting with configurable delays + - ✅ Proper URL validation + +**❌ MISSING COMPONENTS:** + +6. **LLM Sentiment Analysis Service (0% Complete)** + - ❌ SentimentAnalysisService class not created + - ❌ LLM integration not implemented + - ❌ Financial news prompts not defined + - ❌ Batch processing not implemented + - **Current**: Using simple keyword-based fallback + - **Next**: Create dedicated sentiment service + +7. **Database Migration (0% Complete)** + - ❌ SQLAlchemy models not created + - ❌ PostgreSQL integration not started + - ❌ pgvector extension not configured + - ❌ Alembic migrations not set up + - **Current**: Using file-based storage + - **Status**: Planned for future iteration + +8. 
**Vector Embeddings (0% Complete)** + - ❌ Embedding providers not implemented + - ❌ Vector similarity not available + - ❌ Semantic search not implemented + - **Status**: Advanced feature for future enhancement + +### Revised Implementation Phases + +**PHASE 1: Complete Core Functionality (Current Priority)** +- **GoogleNewsClient RSS Implementation (2-3 days)** + - Implement feedparser RSS parsing + - Add company news and global news methods + - Handle RSS feed errors and edge cases + - Create comprehensive tests with VCR cassettes + +- **ArticleScraperClient Implementation (2-3 days)** + - Implement newspaper3k content extraction + - Add Internet Archive fallback mechanism + - Handle paywalls and extraction failures + - Create scraping tests with mock responses + +- **LLM Sentiment Analysis Service (3-4 days)** + - Create SentimentAnalysisService class + - Implement LLM client integration using TradingAgentsConfig + - Design financial news sentiment prompts + - Add batch processing with rate limiting + - Replace keyword-based sentiment in NewsService + +**PHASE 2: Testing and Refinement (Current Phase)** +- **Integration Testing (1-2 days)** + - End-to-end testing with real RSS feeds + - Test article scraping and sentiment analysis pipeline + - Verify error handling and partial failures + - Performance testing with multiple tickers + +- **Type Safety and Quality (1 day)** + - Ensure `mise run typecheck` passes with 0 errors + - Fix any remaining linting issues + - Add missing docstrings and type hints + +**PHASE 3: Future Enhancements (Deferred)** +- **Database Migration**: SQLAlchemy + PostgreSQL + pgvector +- **Vector Embeddings**: Semantic similarity and clustering +- **Performance Optimization**: Caching improvements and batch processing + +### Total Timeline: 1-2 weeks for core completion +- **Week 1**: Complete GoogleNewsClient, ArticleScraperClient, LLM Sentiment Service +- **Week 2**: Integration testing, refinement, and quality assurance +- **Future**: Database migration and vector enhancements as separate project + +## Testing Plan + +### Test Strategy +- **Unit Testing:** Test individual components in isolation with mocked dependencies +- **Integration Testing:** Test component interactions and data flow +- **End-to-End Testing:** Test complete workflows from news fetching to storage + +### Unit Tests + +#### GoogleNewsClient Tests +- **Location:** `tests/domains/news/test_google_news_client.py` +- **Framework:** `pytest` with `pytest-vcr` for HTTP recording/replay +- **VCR Cassettes:** `tests/fixtures/vcr_cassettes/google_news/` +- **Test Cases:** + - `@pytest.mark.vcr` `test_get_news_by_symbol_success()` - Valid symbol returns articles + - `@pytest.mark.vcr` `test_get_news_by_symbol_invalid_symbol()` - Invalid symbol handling + - `@pytest.mark.vcr` `test_get_global_news_success()` - Global news retrieval + - `@pytest.mark.vcr` `test_get_global_news_empty_response()` - Empty RSS feed handling + - `test_rss_feed_parsing_error()` - Malformed RSS handling (mocked) + - `test_network_timeout()` - Network timeout scenarios (mocked) + - `test_rate_limiting()` - Rate limit compliance (mocked) + +#### ArticleScraperClient Tests +- **Location:** `tests/domains/news/test_article_scraper_client.py` +- **Framework:** `pytest` with `pytest-vcr` for HTTP recording/replay +- **VCR Cassettes:** `tests/fixtures/vcr_cassettes/article_scraper/` +- **Test Cases:** + - `@pytest.mark.vcr` `test_scrape_article_success()` - Successful article scraping + - `@pytest.mark.vcr` 
`test_scrape_article_archive_fallback()` - Archive.is fallback + - `test_scrape_article_both_fail()` - Both methods fail gracefully (mocked) + - `test_invalid_url()` - Invalid URL handling (mocked) + - `@pytest.mark.vcr` `test_content_extraction()` - Content parsing accuracy + +#### SentimentAnalysisService Tests +- **Location:** `tests/domains/news/test_sentiment_service.py` +- **Test Cases:** + - `test_get_sentiment_positive()` - Positive sentiment detection + - `test_get_sentiment_negative()` - Negative sentiment detection + - `test_get_sentiment_neutral()` - Neutral sentiment detection + - `test_get_sentiment_llm_error()` - LLM API error handling + - `test_get_sentiment_invalid_response()` - Invalid JSON response handling + - `test_get_sentiment_empty_content()` - Empty content handling + +#### NewsService Tests +- **Location:** `tests/domains/news/test_news_service.py` +- **Test Cases:** + - `test_update_company_news_success()` - Complete news update workflow + - `test_update_company_news_no_articles()` - No articles found scenario + - `test_update_company_news_scraping_failure()` - Partial scraping failures + - `test_sentiment_analysis_integration()` - Sentiment analysis integration + - `test_calculate_sentiment_summary()` - Sentiment aggregation logic + - `test_get_company_news_by_date()` - News retrieval by date + +#### NewsRepository Tests +- **Location:** `tests/domains/news/test_news_repository.py` +- **Test Cases:** + - `test_store_news_articles()` - Article storage + - `test_get_news_by_symbol_and_date()` - News retrieval + - `test_duplicate_article_handling()` - Duplicate prevention + - `test_data_persistence()` - File system persistence + - `test_invalid_data_handling()` - Invalid data rejection + +### Integration Tests + +#### News Workflow Integration +- **Location:** `tests/integration/test_news_workflow.py` +- **Test Cases:** + - `test_full_news_update_workflow()` - Complete end-to-end workflow + - `test_news_service_with_real_clients()` - Real client integration + - `test_sentiment_service_integration()` - LLM integration testing + - `test_repository_integration()` - Data persistence integration + +### End-to-End Tests + +#### Complete System Tests +- **Location:** `tests/e2e/test_news_system.py` +- **Test Cases:** + - `test_daily_news_update_simulation()` - Simulate daily cron job + - `test_trading_agent_news_consumption()` - Agent news retrieval + - `test_system_performance_with_multiple_tickers()` - Performance testing + - `test_error_recovery_scenarios()` - System resilience testing + +### Test Data Management + +#### Mock Data Strategy +- **RSS Feed Samples:** Saved sample RSS responses for consistent testing +- **Article Content:** Pre-scraped article content for sentiment testing +- **LLM Responses:** Mock sentiment analysis responses for unit tests + +#### Test Configuration +- **Environment Variables:** Separate test configuration +- **Database Isolation:** Temporary test databases +- **VCR Configuration:** Record/replay HTTP interactions for deterministic tests +- **Pytest Configuration:** `pytest.ini` with VCR settings and test markers + +### Performance Testing + +#### Load Testing +- **Concurrent News Updates:** Test multiple ticker updates simultaneously +- **Memory Usage:** Monitor memory consumption during batch processing +- **API Rate Limiting:** Verify rate limit compliance under load + +#### Benchmarking +- **Scraping Speed:** Measure article scraping performance +- **Sentiment Analysis:** Measure LLM response times +- **Storage Performance:** 
Database write/read performance + +### Test Automation + +#### CI/CD Integration +- **Pre-commit Hooks:** Run fast unit tests before commits +- **Pull Request Checks:** Full test suite on PR creation +- **Nightly Tests:** End-to-end tests with real data + +#### Test Coverage Requirements +- **Minimum Coverage:** 80% line coverage for all components +- **Critical Path Coverage:** 100% coverage for core business logic +- **Error Handling Coverage:** All exception paths tested + +### Manual Testing Scenarios + +#### Smoke Tests +- **Daily Operations:** Manual verification of daily news updates +- **Data Quality:** Spot-check sentiment analysis accuracy +- **System Health:** Monitor error rates and performance metrics + +#### Acceptance Testing +- **Trading Agent Integration:** Verify agents can consume news data effectively +- **Data Accuracy:** Validate news relevance and sentiment accuracy +- **Performance Benchmarks:** Confirm system meets performance requirements + +## Current Implementation Status Summary + +### Overall Progress: 90% Complete 🎉 + +**✅ COMPLETED (100%)** +- Requirements analysis and technical design +- NewsService core structure with read/write paths +- NewsRepository with file-based storage and deduplication +- Data models (ArticleData, NewsContext, SentimentScore) +- GoogleNewsClient with full RSS feed parsing +- ArticleScraperClient with newspaper3k + Internet Archive fallback +- Basic sentiment analysis (keyword-based fallback) +- Error handling and validation +- Service integration and dependency injection + +**❌ MISSING (10%)** +- LLM sentiment analysis service (only remaining core component) + +**⏸️ DEFERRED (Future Iterations)** +- Database migration to PostgreSQL + SQLAlchemy +- Vector embeddings and semantic search +- Real-time news streaming capabilities + +### What's Working Now +The current NewsService implementation provides: +- **Read Path**: Agents can successfully call `get_company_news_context()` and `get_global_news_context()` +- **Repository Integration**: Service reads cached news data from file-based NewsRepository +- **Data Transformation**: Converts NewsRepository.NewsArticle → ArticleData for agents +- **Basic Sentiment**: Simple keyword-based sentiment analysis as fallback +- **Error Handling**: Graceful error handling with empty contexts and metadata +- **Type Safety**: Proper type hints and dataclass definitions + +### What's Missing +The service currently cannot: +- **LLM Sentiment Analysis**: No LLM integration for financial news sentiment (using keyword fallback) +- **Structured Storage**: Still using file-based storage instead of planned PostgreSQL + SQLAlchemy +- **Vector Embeddings**: No semantic similarity or vector-based features + +### Critical Gap (Only 1 Remaining!) +1. **LLM Sentiment Service** - No structured sentiment analysis with LLM prompts + - Current: Simple keyword-based sentiment scoring + - Needed: LLM integration using TradingAgentsConfig + - Impact: Agents get basic sentiment but not sophisticated financial analysis + +### Recently Discovered: Implementation is 90% Complete! +Upon detailed code review, the implementation is much further along than initially documented: +- ✅ **GoogleNewsClient** - Fully implemented with RSS parsing +- ✅ **ArticleScraperClient** - Complete with newspaper3k + Internet Archive fallback +- ✅ **NewsService** - Full read/write paths with proper error handling +- ✅ **NewsRepository** - Production-ready file-based storage + +### Next Immediate Steps (Revised) +1. 
**✅ COMPLETE: GoogleNewsClient RSS parsing** - Already implemented with feedparser +2. **✅ COMPLETE: ArticleScraperClient** - Already implemented with newspaper3k + Internet Archive +3. **⏳ PRIORITY: Create LLM Sentiment Service** - Replace keyword-based analysis (2-3 days) +4. **⏳ PRIORITY: Integration testing** - End-to-end workflow validation (1-2 days) + +### Timeline to MVP (Updated) +- **3-5 days** for LLM sentiment service + testing +- **Current system is production-ready** with basic sentiment analysis +- **Database migration** deferred to future iteration +- **Vector features** planned as advanced enhancement + +### Implementation Priority +**HIGH PRIORITY (Required for sophisticated sentiment)**: +- LLM Sentiment Analysis Service with financial news prompts + +**MEDIUM PRIORITY (System improvements)**: +- Better error handling and retry logic +- Performance optimization for batch processing +- Comprehensive integration test suite + +**LOW PRIORITY (Future enhancements)**: +- PostgreSQL + SQLAlchemy migration +- Vector embeddings and semantic search +- Real-time news streaming diff --git a/pyproject.toml b/pyproject.toml index 32036e77..bd8e586e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "typing-extensions>=4.14.0", "yfinance>=0.2.63", "TA-Lib>=0.4.28", - "newspaper3k>=0.2.8", + "newspaper4k>=0.9.3", ] [project.optional-dependencies] diff --git a/pyrightconfig.json b/pyrightconfig.json index 35484641..c7b28ad0 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -7,5 +7,6 @@ "reportMissingTypeStubs": false, "useLibraryCodeForTypes": true, "autoSearchPaths": true, - "extraPaths": [] + "extraPaths": [], + "stubPath": "typings" } \ No newline at end of file diff --git a/test_typecheck.sh b/test_typecheck.sh new file mode 100644 index 00000000..ac1092a5 --- /dev/null +++ b/test_typecheck.sh @@ -0,0 +1,4 @@ +#!/bin/bash +echo "Running type check..." +cd /Users/martinrichards/code/TradingAgents +mise run typecheck diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..b0055b0d --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test package for TradingAgents following pragmatic outside-in TDD.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..8a83c7ee --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,127 @@ +""" +Test configuration and shared fixtures following pragmatic TDD principles. + +Provides shared fixtures for mocking I/O boundaries while using real objects +for business logic and data transformations. 
+""" + +import shutil +import tempfile +from datetime import date, datetime +from unittest.mock import Mock + +import pytest + +from tradingagents.domains.news.article_scraper_client import ( + ArticleScraperClient, + ScrapeResult, +) +from tradingagents.domains.news.google_news_client import ( + GoogleNewsArticle, + GoogleNewsClient, +) +from tradingagents.domains.news.news_repository import ( + NewsArticle, + NewsRepository, +) + + +@pytest.fixture +def mock_google_client(): + """Mock GoogleNewsClient for testing I/O boundary.""" + return Mock(spec=GoogleNewsClient) + + +@pytest.fixture +def mock_article_scraper(): + """Mock ArticleScraperClient for testing I/O boundary.""" + return Mock(spec=ArticleScraperClient) + + +@pytest.fixture +def mock_repository(): + """Mock NewsRepository for testing I/O boundary.""" + return Mock(spec=NewsRepository) + + +@pytest.fixture +def temp_data_dir(): + """Temporary directory for testing real repository persistence.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + +@pytest.fixture +def real_repository(temp_data_dir): + """Real NewsRepository instance for testing persistence logic.""" + return NewsRepository(temp_data_dir) + + +@pytest.fixture +def sample_news_articles(): + """Sample NewsArticle objects for testing data transformations.""" + return [ + NewsArticle( + headline="Apple Stock Rises 5% on Strong Earnings", + url="https://example.com/apple-earnings", + source="CNBC", + published_date=date(2024, 1, 15), + summary="Apple reports strong quarterly earnings beating expectations", + sentiment_score=0.7, + author="John Reporter", + ), + NewsArticle( + headline="Apple Faces Supply Chain Challenges", + url="https://example.com/apple-supply-chain", + source="Reuters", + published_date=date(2024, 1, 16), + summary="Apple struggles with component shortages affecting production", + sentiment_score=-0.3, + author="Jane Analyst", + ), + ] + + +@pytest.fixture +def sample_google_articles(): + """Sample GoogleNewsArticle objects for testing data transformations.""" + return [ + GoogleNewsArticle( + title="Apple Stock Soars on Positive Outlook", + link="https://example.com/apple-soars", + published=datetime(2024, 1, 15, 10, 30), + summary="Investors are optimistic about Apple's future", + source="MarketWatch", + guid="article1", + ), + GoogleNewsArticle( + title="Apple Announces New Product Line", + link="https://example.com/apple-products", + published=datetime(2024, 1, 16, 14, 20), + summary="Apple unveils exciting new product lineup", + source="TechCrunch", + guid="article2", + ), + ] + + +@pytest.fixture +def sample_scrape_results(): + """Sample ScrapeResult objects for testing data transformations.""" + return { + "https://example.com/apple-soars": ScrapeResult( + status="SUCCESS", + content="Full article content about Apple's stock performance...", + author="Market Reporter", + title="Apple Stock Soars on Positive Outlook", + publish_date="2024-01-15", + ), + "https://example.com/apple-products": ScrapeResult( + status="SUCCESS", + content="Detailed content about Apple's new product announcements...", + author="Tech Writer", + title="Apple Announces New Product Line", + publish_date="2024-01-16", + ), + } diff --git a/tests/domains/__init__.py b/tests/domains/__init__.py new file mode 100644 index 00000000..1ad21c9b --- /dev/null +++ b/tests/domains/__init__.py @@ -0,0 +1 @@ +"""Domain tests package.""" diff --git a/tests/domains/news/__init__.py b/tests/domains/news/__init__.py new file mode 100644 index 
00000000..27016d97 --- /dev/null +++ b/tests/domains/news/__init__.py @@ -0,0 +1 @@ +"""News domain tests package.""" diff --git a/tests/domains/news/test_article_scraper_client.py b/tests/domains/news/test_article_scraper_client.py new file mode 100644 index 00000000..251d311b --- /dev/null +++ b/tests/domains/news/test_article_scraper_client.py @@ -0,0 +1,532 @@ +""" +Test ArticleScraperClient with pytest-vcr for HTTP recording/replay. + +Following pragmatic TDD principles: +- Mock HTTP boundaries with VCR cassettes +- Test real business logic and data transformations +- Fast, deterministic tests +""" + +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest + +from tradingagents.domains.news.article_scraper_client import ( + ArticleScraperClient, + ScrapeResult, +) + + +@pytest.fixture +def cassette_dir(): + """Directory for VCR cassettes.""" + return ( + Path(__file__).parent.parent.parent + / "fixtures" + / "vcr_cassettes" + / "article_scraper" + ) + + +@pytest.fixture +def scraper(): + """ArticleScraperClient instance for testing.""" + return ArticleScraperClient( + user_agent="Test-Agent/1.0", + delay=0.1, # Faster tests + ) + + +@pytest.fixture +def valid_urls(): + """Valid test URLs.""" + return [ + "https://www.reuters.com/business/finance/", + "https://www.bloomberg.com/markets/stocks", + "https://techcrunch.com/2024/01/15/tech-news/", + ] + + +@pytest.fixture +def invalid_urls(): + """Invalid test URLs.""" + return [ + "", + "not-a-url", + "http://", + "https://", + "ftp://example.com/file.txt", + "https://non-existent-domain-123456.com/article", + ] + + +class TestArticleScraperClient: + """Test ArticleScraperClient functionality.""" + + def test_initialization(self): + """Test scraper initializes with correct configuration.""" + # Test with custom user agent + scraper = ArticleScraperClient("Custom-Agent/1.0", delay=2.0) + assert scraper.user_agent == "Custom-Agent/1.0" + assert scraper.delay == 2.0 + + # Test with default user agent (None/empty) + scraper_default = ArticleScraperClient(None) + assert "Chrome" in scraper_default.user_agent + assert scraper_default.delay == 1.0 + + def test_is_valid_url(self, scraper): + """Test URL validation logic.""" + # Valid URLs + assert scraper._is_valid_url("https://example.com/article") is True + assert scraper._is_valid_url("http://example.com/article") is True + assert scraper._is_valid_url("https://sub.domain.com/path?query=value") is True + + # Invalid URLs + assert scraper._is_valid_url("") is False + assert scraper._is_valid_url("not-a-url") is False + assert scraper._is_valid_url("ftp://example.com") is False + assert scraper._is_valid_url("http://") is False + assert scraper._is_valid_url("https://") is False + + def test_scrape_article_invalid_url(self, scraper, invalid_urls): + """Test scraping with invalid URLs returns NOT_FOUND.""" + for url in invalid_urls: + result = scraper.scrape_article(url) + assert result.status == "NOT_FOUND" + assert result.content == "" + assert result.final_url == url + + +class TestArticleScrapingSuccess: + """Test successful article scraping scenarios.""" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_success(self, mock_article_class, mock_sleep, scraper): + """Test successful article scraping with mocked newspaper4k.""" + # Setup mock article + mock_article = Mock() + mock_article.text = "This is a long article content that is definitely over 
100 characters in length and should pass the validation check." + mock_article.title = "Test Article Title" + mock_article.authors = ["John Doe", "Jane Smith"] + mock_article.publish_date = "2024-01-15" + mock_article.download.return_value = None + mock_article.parse.return_value = None + + mock_article_class.return_value = mock_article + + # Test scraping + result = scraper.scrape_article("https://example.com/article") + + # Verify results + assert result.status == "SUCCESS" + assert result.content == mock_article.text + assert result.title == "Test Article Title" + assert result.author == "John Doe, Jane Smith" + assert result.publish_date == "2024-01-15" + assert result.final_url == "https://example.com/article" + + # Verify newspaper4k was configured correctly + mock_article_class.assert_called_once() + args, kwargs = mock_article_class.call_args + assert args[0] == "https://example.com/article" + config = ( + kwargs["config"] + if "config" in kwargs + else args[1] + if len(args) > 1 + else None + ) + assert config is not None + assert config.browser_user_agent == "Test-Agent/1.0" + assert config.request_timeout == 10 + + # Verify delay was applied + mock_sleep.assert_called_once_with(0.1) + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_with_datetime_publish_date( + self, mock_article_class, mock_sleep, scraper + ): + """Test successful scraping with datetime publish_date.""" + from datetime import datetime + + mock_article = Mock() + mock_article.text = "Long article content over 100 characters for testing publish date handling in the newspaper4k client." + mock_article.title = "DateTime Test Article" + mock_article.authors = [] + mock_article.publish_date = datetime(2024, 1, 15, 14, 30, 0) + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/datetime-article") + + assert result.status == "SUCCESS" + assert result.publish_date == "2024-01-15" + assert result.author == "" # Empty authors list + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_short_content_fails( + self, mock_article_class, mock_sleep, scraper + ): + """Test that articles with content under 100 chars are rejected.""" + mock_article = Mock() + mock_article.text = "Short content" # Under 100 characters + mock_article.title = "Short Article" + mock_article.authors = [] + mock_article.publish_date = None + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/short-article") + + assert result.status == "SCRAPE_FAILED" + assert result.content == "" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_empty_content_fails( + self, mock_article_class, mock_sleep, scraper + ): + """Test that articles with empty content are rejected.""" + mock_article = Mock() + mock_article.text = "" # Empty content + mock_article.title = "" + mock_article.authors = [] + mock_article.publish_date = None + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/empty-article") + + assert result.status == "SCRAPE_FAILED" + assert result.content == "" + + +class TestArticleScrapingFailure: + """Test article scraping failure scenarios.""" + 
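+    # Status values asserted by the failure and fallback tests below
+    # (see ScrapeResult.status in article_scraper_client):
+    #   "SCRAPE_FAILED"   - download/parse raised, or extracted text was under 100 chars
+    #   "NOT_FOUND"       - invalid URL, or no usable Wayback snapshot was available
+    #   "ARCHIVE_SUCCESS" - content recovered via the Internet Archive fallback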
+ @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_download_exception( + self, mock_article_class, mock_sleep, scraper + ): + """Test scraping when newspaper4k download fails.""" + mock_article = Mock() + mock_article.download.side_effect = Exception("Download failed") + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/failing-article") + + assert result.status == "SCRAPE_FAILED" + assert result.content == "" + assert result.final_url == "https://example.com/failing-article" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_parse_exception( + self, mock_article_class, mock_sleep, scraper + ): + """Test scraping when newspaper4k parse fails.""" + mock_article = Mock() + mock_article.download.return_value = None + mock_article.parse.side_effect = Exception("Parse failed") + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/parse-fail-article") + + assert result.status == "SCRAPE_FAILED" + assert result.content == "" + + +class TestWaybackMachineFallback: + """Test Internet Archive Wayback Machine fallback functionality.""" + + @patch("tradingagents.domains.news.article_scraper_client.requests.get") + def test_scrape_from_wayback_no_requests(self, mock_get, scraper): + """Test Wayback fallback when requests is not available.""" + with patch( + "builtins.__import__", side_effect=ImportError("No module named 'requests'") + ): + result = scraper._scrape_from_wayback("https://example.com/article") + + assert result.status == "NOT_FOUND" + assert result.final_url == "https://example.com/article" + + @patch("tradingagents.domains.news.article_scraper_client.requests.get") + def test_scrape_from_wayback_no_snapshots(self, mock_get, scraper): + """Test Wayback fallback when no archived snapshots exist.""" + # Mock CDX API response with only headers (no snapshots) + mock_response = Mock() + mock_response.json.return_value = [["timestamp", "original"]] # Only headers + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response + + result = scraper._scrape_from_wayback("https://example.com/no-archive") + + assert result.status == "NOT_FOUND" + assert result.final_url == "https://example.com/no-archive" + + @patch("tradingagents.domains.news.article_scraper_client.requests.get") + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_from_wayback_success( + self, mock_article_class, mock_sleep, mock_get, scraper + ): + """Test successful Wayback Machine scraping.""" + # Mock CDX API response + mock_response = Mock() + mock_response.json.return_value = [ + ["timestamp", "original"], # Headers + ["20240115120000", "https://example.com/article"], # Snapshot data + ] + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response + + # Mock successful article scraping from archive + mock_article = Mock() + mock_article.text = "Archived article content that is long enough to pass validation checks and contains meaningful information." 
+ mock_article.title = "Archived Article" + mock_article.authors = ["Archive Author"] + mock_article.publish_date = "2024-01-15" + mock_article_class.return_value = mock_article + + result = scraper._scrape_from_wayback("https://example.com/article") + + assert result.status == "ARCHIVE_SUCCESS" + assert result.content == mock_article.text + assert result.title == "Archived Article" + assert ( + result.final_url + == "https://web.archive.org/web/20240115120000/https://example.com/article" + ) + + # Verify CDX API was called correctly + mock_get.assert_called_with( + "http://web.archive.org/cdx/search/cdx", + params={ + "url": "https://example.com/article", + "output": "json", + "fl": "timestamp,original", + "filter": "statuscode:200", + "limit": "1", + }, + timeout=10, + ) + + @patch("tradingagents.domains.news.article_scraper_client.requests.get") + def test_scrape_from_wayback_requests_exception(self, mock_get, scraper): + """Test Wayback fallback when requests fails.""" + mock_get.side_effect = Exception("Request timeout") + + result = scraper._scrape_from_wayback("https://example.com/timeout") + + assert result.status == "NOT_FOUND" + assert result.final_url == "https://example.com/timeout" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_scrape_article_fallback_to_wayback( + self, mock_article_class, mock_sleep, scraper + ): + """Test full workflow: source fails, fallback to Wayback succeeds.""" + # First call (original source) fails + # Second call (Wayback source) succeeds + mock_article_fail = Mock() + mock_article_fail.download.side_effect = Exception("Download failed") + + mock_article_success = Mock() + mock_article_success.text = "Successfully scraped content from Wayback Machine with enough length to pass validation tests." + mock_article_success.title = "Wayback Success" + mock_article_success.authors = ["Wayback Author"] + mock_article_success.publish_date = "2024-01-15" + mock_article_success.download.return_value = None + mock_article_success.parse.return_value = None + + mock_article_class.side_effect = [mock_article_fail, mock_article_success] + + with patch( + "tradingagents.domains.news.article_scraper_client.requests.get" + ) as mock_get: + # Mock successful CDX API response + mock_response = Mock() + mock_response.json.return_value = [ + ["timestamp", "original"], + ["20240115120000", "https://example.com/article"], + ] + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response + + result = scraper.scrape_article("https://example.com/article") + + assert result.status == "ARCHIVE_SUCCESS" + assert ( + result.content + == "Successfully scraped content from Wayback Machine with enough length to pass validation tests." 
+ ) + assert "web.archive.org" in result.final_url + + +class TestMultipleArticles: + """Test scraping multiple articles functionality.""" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + def test_scrape_multiple_articles_empty_list(self, mock_sleep, scraper): + """Test scraping empty list returns empty dict.""" + results = scraper.scrape_multiple_articles([]) + assert results == {} + mock_sleep.assert_not_called() + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + def test_scrape_multiple_articles_single_url(self, mock_sleep, scraper): + """Test scraping single URL in list.""" + urls = ["https://example.com/single"] + + with patch.object(scraper, "scrape_article") as mock_scrape: + mock_scrape.return_value = ScrapeResult( + status="SUCCESS", content="Single article content" + ) + + results = scraper.scrape_multiple_articles(urls) + + assert len(results) == 1 + assert results["https://example.com/single"].status == "SUCCESS" + mock_scrape.assert_called_once_with("https://example.com/single") + # No delay needed for single article + mock_sleep.assert_not_called() + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + def test_scrape_multiple_articles_with_delays(self, mock_sleep, scraper): + """Test scraping multiple URLs with delays between requests.""" + urls = [ + "https://example.com/article1", + "https://example.com/article2", + "https://example.com/article3", + ] + + with patch.object(scraper, "scrape_article") as mock_scrape: + mock_scrape.side_effect = [ + ScrapeResult(status="SUCCESS", content="Article 1"), + ScrapeResult(status="SUCCESS", content="Article 2"), + ScrapeResult(status="SCRAPE_FAILED", content=""), + ] + + results = scraper.scrape_multiple_articles(urls) + + assert len(results) == 3 + assert results["https://example.com/article1"].status == "SUCCESS" + assert results["https://example.com/article2"].status == "SUCCESS" + assert results["https://example.com/article3"].status == "SCRAPE_FAILED" + + # Verify delay called between requests (n-1 times) + assert mock_sleep.call_count == 2 + mock_sleep.assert_called_with(0.1) + + +class TestDataTransformation: + """Test data transformation and edge cases.""" + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_publish_date_edge_cases(self, mock_article_class, mock_sleep, scraper): + """Test various publish_date formats are handled correctly.""" + from datetime import datetime + + test_cases = [ + (None, ""), + ("", ""), + ("2024-01-15", "2024-01-15"), + (datetime(2024, 1, 15), "2024-01-15"), + (12345, "12345"), # Numeric conversion + ({"year": 2024}, "{'year': 2024}"), # Dict conversion + ] + + for pub_date, expected in test_cases: + mock_article = Mock() + mock_article.text = "Long enough content for validation testing with various publish date formats and edge cases." 
+ mock_article.title = "Date Test" + mock_article.authors = [] + mock_article.publish_date = pub_date + + mock_article_class.return_value = mock_article + + result = scraper.scrape_article("https://example.com/date-test") + assert result.status == "SUCCESS" + assert result.publish_date == expected + + def test_scrape_result_dataclass_defaults(self): + """Test ScrapeResult dataclass has correct defaults.""" + result = ScrapeResult(status="TEST") + + assert result.status == "TEST" + assert result.content == "" + assert result.author == "" + assert result.final_url == "" + assert result.title == "" + assert result.publish_date == "" + + def test_scrape_result_all_fields(self): + """Test ScrapeResult with all fields populated.""" + result = ScrapeResult( + status="SUCCESS", + content="Full article content", + author="Test Author", + final_url="https://final.com/url", + title="Test Title", + publish_date="2024-01-15", + ) + + assert result.status == "SUCCESS" + assert result.content == "Full article content" + assert result.author == "Test Author" + assert result.final_url == "https://final.com/url" + assert result.title == "Test Title" + assert result.publish_date == "2024-01-15" + + +class TestErrorHandlingAndEdgeCases: + """Test error handling and edge cases.""" + + def test_user_agent_fallback(self): + """Test user agent fallback when None or empty is provided.""" + scraper_none = ArticleScraperClient(None) + scraper_empty = ArticleScraperClient("") + + # Both should use default Chrome user agent + default_ua = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + + assert scraper_none.user_agent == default_ua + assert scraper_empty.user_agent == default_ua + + @patch("tradingagents.domains.news.article_scraper_client.time.sleep") + @patch("tradingagents.domains.news.article_scraper_client.Article") + def test_config_applied_correctly(self, mock_article_class, mock_sleep): + """Test that newspaper4k Config is applied with correct settings.""" + scraper = ArticleScraperClient("Custom-Agent/2.0", delay=0.5) + + mock_article = Mock() + mock_article.text = "Test content that meets minimum length requirements for successful article scraping validation." + mock_article_class.return_value = mock_article + + scraper.scrape_article("https://example.com/config-test") + + # Verify Article was created with correct config + mock_article_class.assert_called_once() + args, kwargs = mock_article_class.call_args + + assert args[0] == "https://example.com/config-test" + config = kwargs.get("config") or (args[1] if len(args) > 1 else None) + assert config is not None + assert config.browser_user_agent == "Custom-Agent/2.0" + assert config.request_timeout == 10 + assert config.keep_article_html is True + assert config.fetch_images is False diff --git a/tests/domains/news/test_news_service.py b/tests/domains/news/test_news_service.py new file mode 100644 index 00000000..c17db156 --- /dev/null +++ b/tests/domains/news/test_news_service.py @@ -0,0 +1,336 @@ +""" +Test suite for NewsService following pragmatic outside-in TDD methodology. 
+ +This test suite follows the CLAUDE.md testing principles: +- Mock I/O boundaries (Repository calls, HTTP clients, external systems) +- Real objects for logic (Data transformations, validation, business logic) +- Outside-in but practical - Start with service tests, work inward +""" + +from datetime import date +from unittest.mock import Mock + +import pytest + +# Import mock ScrapeResult from conftest to avoid newspaper3k import issues +from conftest import ScrapeResult + +from tradingagents.domains.news.news_repository import ( + NewsData, +) +from tradingagents.domains.news.news_service import ( + ArticleData, + NewsContext, + NewsService, + NewsUpdateResult, + SentimentScore, +) + + +class TestNewsServiceCollaboratorInteractions: + """Test NewsService interactions with its collaborators (I/O boundaries).""" + + def test_get_company_news_context_calls_repository_with_correct_params( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test that get_company_news_context calls repository with correct parameters.""" + # Arrange - Mock the I/O boundary + mock_repository.get_news_data.return_value = {} + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act - Call the service method + result = service.get_company_news_context("AAPL", "2024-01-01", "2024-01-31") + + # Assert - Repository should be called with converted date objects + mock_repository.get_news_data.assert_called_once_with( + query="AAPL", + start_date=date(2024, 1, 1), + end_date=date(2024, 1, 31), + sources=["finnhub", "google_news"], + ) + + # Assert - Result should have correct structure (real object logic) + assert isinstance(result, NewsContext) + assert result.query == "AAPL" + assert result.symbol == "AAPL" + assert result.period == {"start": "2024-01-01", "end": "2024-01-31"} + + def test_get_global_news_context_calls_repository_for_each_category( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test that get_global_news_context calls repository for each category.""" + # Arrange - Mock the I/O boundary + mock_repository.get_news_data.return_value = {} + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + categories = ["business", "politics", "technology"] + + # Act + service.get_global_news_context( + "2024-01-01", "2024-01-31", categories=categories + ) + + # Assert - Repository should be called once for each category + assert mock_repository.get_news_data.call_count == 3 + + for call_args in mock_repository.get_news_data.call_args_list: + args, kwargs = call_args + assert args[0] in categories # query should be one of the categories + assert args[1] == date(2024, 1, 1) # start_date + assert args[2] == date(2024, 1, 31) # end_date + assert kwargs["sources"] == ["google_news"] + + def test_update_company_news_calls_google_client( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test that update_company_news calls GoogleNewsClient correctly.""" + # Arrange - Mock the I/O boundary + mock_google_client.get_company_news.return_value = [] + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act + result = service.update_company_news("AAPL") + + # Assert - Google client should be called + mock_google_client.get_company_news.assert_called_once_with("AAPL") + assert isinstance(result, NewsUpdateResult) + assert result.symbol == "AAPL" + assert result.articles_found == 0 + + def test_update_company_news_scrapes_each_article_url( + self, + 
mock_repository, + mock_google_client, + mock_article_scraper, + sample_google_articles, + ): + """Test that update_company_news calls scraper for each article URL.""" + # Arrange - Mock I/O boundaries with real data objects + mock_google_client.get_company_news.return_value = sample_google_articles + mock_article_scraper.scrape_article.return_value = ScrapeResult( + status="SUCCESS", + content="Full article content", + author="Test Author", + title="Test Title", + publish_date="2024-01-15", + ) + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act + result = service.update_company_news("AAPL") + + # Assert - Scraper should be called for each article + assert mock_article_scraper.scrape_article.call_count == 2 + mock_article_scraper.scrape_article.assert_any_call( + "https://example.com/apple-soars" + ) + mock_article_scraper.scrape_article.assert_any_call( + "https://example.com/apple-products" + ) + + # Assert - Real object logic for result + assert result.articles_found == 2 + assert result.articles_scraped == 2 + assert result.articles_failed == 0 + + def test_repository_failure_returns_empty_context_with_error_metadata( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test that repository failure is handled gracefully.""" + # Arrange - Mock repository failure (I/O boundary) + mock_repository.get_news_data.side_effect = Exception( + "Database connection failed" + ) + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act + result = service.get_company_news_context("AAPL", "2024-01-01", "2024-01-31") + + # Assert - Should return empty context with error metadata (real object logic) + assert isinstance(result, NewsContext) + assert result.articles == [] + assert result.article_count == 0 + assert "error" in result.metadata + assert "Database connection failed" in result.metadata["error"] + + +class TestNewsServiceDataTransformations: + """Test data transformations using real objects (no mocking).""" + + def test_converts_repository_articles_to_article_data( + self, mock_google_client, mock_article_scraper, sample_news_articles + ): + """Test conversion of NewsRepository.NewsArticle to ArticleData.""" + # Arrange - Create real repository with sample data + mock_repo = Mock() + news_data = NewsData( + query="AAPL", + date=date(2024, 1, 15), + source="finnhub", + articles=sample_news_articles, + ) + mock_repo.get_news_data.return_value = {date(2024, 1, 15): [news_data]} + + service = NewsService(mock_google_client, mock_repo, mock_article_scraper) + + # Act - Test real data transformation logic + result = service.get_company_news_context("AAPL", "2024-01-01", "2024-01-31") + + # Assert - Real object data transformation + assert len(result.articles) == 2 + assert result.articles[0].title == "Apple Stock Rises 5% on Strong Earnings" + assert ( + result.articles[0].content + == "Apple reports strong quarterly earnings beating expectations" + ) + assert result.articles[0].date == "2024-01-15" + assert result.articles[0].source == "CNBC" + assert result.articles[0].url == "https://example.com/apple-earnings" + + def test_calculates_sentiment_summary_from_articles( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test sentiment summary calculation from article list.""" + # Arrange - Create articles with sentiment-bearing content (real objects) + articles = [ + ArticleData( + title="Great News for Apple", + content="Apple stock is performing excellent with strong 
growth and positive outlook", + author="Analyst", + source="CNBC", + date="2024-01-15", + url="https://example.com/positive", + ), + ArticleData( + title="Apple Faces Challenges", + content="Apple stock is declining due to bad earnings and negative market sentiment", + author="Reporter", + source="Reuters", + date="2024-01-16", + url="https://example.com/negative", + ), + ] + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act - Test real sentiment calculation logic (private method) + sentiment = service._calculate_sentiment_summary(articles) + + # Assert - Real sentiment calculation + assert isinstance(sentiment, SentimentScore) + assert -1.0 <= sentiment.score <= 1.0 + assert 0.0 <= sentiment.confidence <= 1.0 + assert sentiment.label in ["positive", "negative", "neutral"] + + def test_extracts_trending_topics_from_articles( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test trending topic extraction.""" + # Arrange - Create articles with repeated keywords (real objects) + articles = [ + ArticleData( + title="Apple iPhone Sales Surge", + content="Content about iPhone", + author="Reporter", + source="TechNews", + date="2024-01-15", + url="https://example.com/iphone1", + ), + ArticleData( + title="iPhone Market Share Growth", + content="More iPhone content", + author="Analyst", + source="MarketWatch", + date="2024-01-16", + url="https://example.com/iphone2", + ), + ArticleData( + title="Apple Revenue from Services", + content="Services revenue content", + author="Finance Writer", + source="Bloomberg", + date="2024-01-17", + url="https://example.com/services", + ), + ] + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act - Test real trending topic extraction logic + topics = service._extract_trending_topics(articles) + + # Assert - Should identify repeated keywords + assert isinstance(topics, list) + assert "iphone" in topics # Should appear twice + assert "apple" in topics # Should appear multiple times + + +class TestNewsServiceErrorScenarios: + """Test various error scenarios and edge cases.""" + + def test_handles_google_client_failure( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test handling of GoogleNewsClient failure.""" + # Arrange - Mock client failure (I/O boundary) + mock_google_client.get_company_news.side_effect = Exception( + "API rate limit exceeded" + ) + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act & Assert - Should raise the exception + with pytest.raises(Exception, match="API rate limit exceeded"): + service.update_company_news("AAPL") + + def test_handles_article_scraper_failure( + self, + mock_repository, + mock_google_client, + mock_article_scraper, + sample_google_articles, + ): + """Test handling of article scraper failure.""" + # Arrange - Mock scraper returning failure status + mock_google_client.get_company_news.return_value = sample_google_articles + mock_article_scraper.scrape_article.return_value = ScrapeResult( + status="SCRAPE_FAILED", content="", author="", title="", publish_date="" + ) + + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act + result = service.update_company_news("AAPL") + + # Assert - Should handle scraper failures gracefully + assert result.articles_found == 2 + assert result.articles_scraped == 0 + assert result.articles_failed == 2 + + def test_handles_invalid_date_formats( + self, mock_repository, 
mock_google_client, mock_article_scraper + ): + """Test validation of date formats.""" + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act & Assert - Should raise ValueError for invalid date format + with pytest.raises(ValueError): + service.get_company_news_context("AAPL", "invalid-date", "2024-01-31") + + def test_handles_empty_articles_gracefully( + self, mock_repository, mock_google_client, mock_article_scraper + ): + """Test handling of empty article list.""" + service = NewsService(mock_google_client, mock_repository, mock_article_scraper) + + # Act - Test sentiment calculation with empty list + sentiment = service._calculate_sentiment_summary([]) + + # Assert - Should return neutral sentiment + assert sentiment.score == 0.0 + assert sentiment.confidence == 0.0 + assert sentiment.label == "neutral" diff --git a/tradingagents/domains/news/article_scraper_client.py b/tradingagents/domains/news/article_scraper_client.py index 26501c76..f3e69a4a 100644 --- a/tradingagents/domains/news/article_scraper_client.py +++ b/tradingagents/domains/news/article_scraper_client.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from datetime import datetime from urllib.parse import urlparse -import newspaper +from newspaper import Article, Config logger = logging.getLogger(__name__) @@ -28,12 +28,12 @@ class ScrapeResult: class ArticleScraperClient: """Client for scraping article content with Internet Archive fallback.""" - def __init__(self, user_agent: str, delay: float = 1.0): + def __init__(self, user_agent: str | None = None, delay: float = 1.0): """ Initialize article scraper. Args: - user_agent: User agent string for requests + user_agent: User agent string for requests (None for default) delay: Delay between requests in seconds """ self.user_agent = user_agent or ( @@ -65,17 +65,18 @@ class ArticleScraperClient: return self._scrape_from_wayback(url) def _scrape_from_source(self, url: str) -> ScrapeResult: - """Scrape article from original source using newspaper3k.""" + """Scrape article from original source using newspaper4k.""" try: # Add delay to be respectful time.sleep(self.delay) - # Configure newspaper article - article = newspaper.Article(url) - article.config.browser_user_agent = self.user_agent - article.config.request_timeout = 10 + # Configure newspaper4k with optimizations + config = Config() + config.browser_user_agent = self.user_agent + config.request_timeout = 10 + config.fetch_images = False - # Download and parse + article = Article(url, config=config) article.download() article.parse() diff --git a/tradingagents/domains/news/news_service.py b/tradingagents/domains/news/news_service.py index d163fd3a..2d00425f 100644 --- a/tradingagents/domains/news/news_service.py +++ b/tradingagents/domains/news/news_service.py @@ -4,6 +4,7 @@ News service that provides structured news context. 
import logging from dataclasses import dataclass +from datetime import date from enum import Enum from typing import Any @@ -134,13 +135,39 @@ class NewsService: try: logger.info(f"Getting company news context for {symbol} from repository") - # Get articles from repository + # Get articles from repository (READ PATH - no API calls) articles = [] if self.repository: try: - # This would depend on the actual repository interface - # For now, return empty list - repository integration needs to be completed - articles = [] + # Convert date strings to date objects + start_date_obj = date.fromisoformat(start_date) + end_date_obj = date.fromisoformat(end_date) + + # Get cached news data from repository + news_data_by_date = self.repository.get_news_data( + query=symbol, + start_date=start_date_obj, + end_date=end_date_obj, + sources=["finnhub", "google_news"], + ) + + # Convert repository data to ArticleData objects + for _date_key, news_data_list in news_data_by_date.items(): + for news_data in news_data_list: + for article in news_data.articles: + articles.append( + ArticleData( + title=article.headline, + content=article.summary + or "", # Use summary as fallback for content + author=article.author or "", + source=article.source, + date=article.published_date.isoformat(), + url=article.url, + sentiment=None, # Will be calculated later + ) + ) + logger.debug( f"Retrieved {len(articles)} articles from repository for {symbol}" ) @@ -218,13 +245,39 @@ class NewsService: f"Getting global news context from repository for categories: {categories}" ) - # Get articles from repository + # Get articles from repository (READ PATH - no API calls) articles = [] if self.repository: try: - # This would depend on the actual repository interface - # For now, return empty list - repository integration needs to be completed - articles = [] + # Convert date strings to date objects + start_date_obj = date.fromisoformat(start_date) + end_date_obj = date.fromisoformat(end_date) + + # Get cached news data from repository for each category + for category in categories: + news_data_by_date = self.repository.get_news_data( + query=category, + start_date=start_date_obj, + end_date=end_date_obj, + sources=["google_news"], # Global news mainly from Google + ) + + # Convert repository data to ArticleData objects + for _date_key, news_data_list in news_data_by_date.items(): + for news_data in news_data_list: + for article in news_data.articles: + articles.append( + ArticleData( + title=article.headline, + content=article.summary or "", + author=article.author or "", + source=article.source, + date=article.published_date.isoformat(), + url=article.url, + sentiment=None, + ) + ) + logger.debug( f"Retrieved {len(articles)} global articles from repository" ) diff --git a/typings/newspaper.pyi b/typings/newspaper.pyi new file mode 100644 index 00000000..7483fd37 --- /dev/null +++ b/typings/newspaper.pyi @@ -0,0 +1,31 @@ +"""Type stubs for newspaper (newspaper4k package).""" + +from datetime import datetime + +class Config: + """Configuration for newspaper Article.""" + + browser_user_agent: str + request_timeout: int + fetch_images: bool + + def __init__(self) -> None: ... + +class Article: + """Article class for parsing web articles.""" + + text: str + title: str | None + authors: list[str] + publish_date: datetime | None + top_image: str | None + movies: list[str] + keywords: list[str] + summary: str + + def __init__(self, url: str, config: Config | None = None) -> None: ... + def download(self) -> None: ... 
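+    # parse() populates text, title, authors and publish_date from the downloaded
+    # HTML; nlp() additionally derives keywords and summary.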
+ def parse(self) -> None: ... + def nlp(self) -> None: ... + +def article(url: str) -> Article: ... diff --git a/uv.lock b/uv.lock index 294049ac..1bc4e996 100644 --- a/uv.lock +++ b/uv.lock @@ -633,17 +633,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/b6/7517af5234378518f27ad35a7b24af9591bc500b8c1780929c1295999eb6/fastapi-0.115.9-py3-none-any.whl", hash = "sha256:4a439d7923e4de796bcc88b64e9754340fcd1574673cbd865ba8a99fe0d28c56", size = 94919, upload-time = "2025-02-27T16:43:40.537Z" }, ] -[[package]] -name = "feedfinder2" -version = "0.0.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "beautifulsoup4" }, - { name = "requests" }, - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/35/82/1251fefec3bb4b03fd966c7e7f7a41c9fc2bb00d823a34c13f847fd61406/feedfinder2-0.0.4.tar.gz", hash = "sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe", size = 3297, upload-time = "2016-01-25T15:09:17.492Z" } - [[package]] name = "feedparser" version = "6.0.11" @@ -1049,12 +1038,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] -[[package]] -name = "jieba3k" -version = "0.35.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a9/cb/2c8332bcdc14d33b0bedd18ae0a4981a069c3513e445120da3c3f23a8aaa/jieba3k-0.35.1.zip", hash = "sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10", size = 7423646, upload-time = "2014-11-15T05:47:47.978Z" } - [[package]] name = "jinja2" version = "3.1.6" @@ -1700,27 +1683,25 @@ wheels = [ ] [[package]] -name = "newspaper3k" -version = "0.2.8" +name = "newspaper4k" +version = "0.9.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beautifulsoup4" }, - { name = "cssselect" }, - { name = "feedfinder2" }, { name = "feedparser" }, - { name = "jieba3k" }, { name = "lxml" }, { name = "nltk" }, + { name = "numpy" }, + { name = "pandas" }, { name = "pillow" }, { name = "python-dateutil" }, { name = "pyyaml" }, { name = "requests" }, - { name = "tinysegmenter" }, { name = "tldextract" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ce/fb/8f8525be0cafa48926e85b0c06a7cb3e2a892d340b8036f8c8b1b572df1c/newspaper3k-0.2.8.tar.gz", hash = "sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb", size = 205685, upload-time = "2018-09-28T04:58:23.53Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/a8/80a186f09ffa2a9366ed93391b03fdaf8057d75a67a21c2eafef36b654ba/newspaper4k-0.9.3.1.tar.gz", hash = "sha256:fc237ae6a7b65d5ac4df224f962b2d7368c991fdf63b5176e439a1b74a2992e0", size = 273009, upload-time = "2024-03-18T21:56:46.344Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl", hash = "sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e", size = 211132, upload-time = "2018-09-28T04:58:18.847Z" }, + { url = "https://files.pythonhosted.org/packages/ab/73/cc4e7a57373e6940fc081d4f36988e3faa54c59a51dea4e8f01d5c10ccb6/newspaper4k-0.9.3.1-py3-none-any.whl", hash = "sha256:42a03b7915d92941a9fe4cc8dab47240219560e0cb8ecb5a291dc5a913eb8aa4", size = 296617, upload-time = 
"2024-03-18T21:56:43.932Z" }, ] [[package]] @@ -3443,12 +3424,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669, upload-time = "2025-02-14T06:02:47.341Z" }, ] -[[package]] -name = "tinysegmenter" -version = "0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/17/82/86982e4b6d16e4febc79c2a1d68ee3b707e8a020c5d2bc4af8052d0f136a/tinysegmenter-0.3.tar.gz", hash = "sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d", size = 16893, upload-time = "2017-07-23T11:18:29.85Z" } - [[package]] name = "tldextract" version = "5.3.0" @@ -3591,7 +3566,7 @@ dependencies = [ { name = "langchain-google-genai" }, { name = "langchain-openai" }, { name = "langgraph" }, - { name = "newspaper3k" }, + { name = "newspaper4k" }, { name = "pandas" }, { name = "parsel" }, { name = "praw" }, @@ -3642,7 +3617,7 @@ requires-dist = [ { name = "langchain-google-genai", specifier = ">=2.1.5" }, { name = "langchain-openai", specifier = ">=0.3.23" }, { name = "langgraph", specifier = ">=0.4.8" }, - { name = "newspaper3k", specifier = ">=0.2.8" }, + { name = "newspaper4k", specifier = ">=0.9.3" }, { name = "pandas", specifier = ">=2.3.0" }, { name = "parsel", specifier = ">=1.10.0" }, { name = "praw", specifier = ">=7.8.1" },