# Source: TradingAgents/tradingagents/services/fundamental_data_service.py
"""
Fundamental Data Service for aggregating and analyzing financial statement data.
"""
import logging
from datetime import date, datetime, timezone
from typing import Any

from tradingagents.clients import FinnhubClient
from tradingagents.models.context import (
    DataQuality,
    FinancialStatement,
    FundamentalContext,
)
from tradingagents.repositories.fundamental_repository import FundamentalDataRepository
from tradingagents.services.base import BaseService
logger = logging.getLogger(__name__)
class FundamentalDataService(BaseService):
    """Service for fundamental financial data aggregation and analysis."""

    def __init__(
        self,
        finnhub_client: FinnhubClient,
        repository: FundamentalDataRepository,
        data_dir: str = "data",
        **kwargs,
    ):
        """Initialize the fundamental data service.

        Args:
            finnhub_client: Client for Finnhub/financial API access.
            repository: Repository for cached fundamental data.
            data_dir: Directory for data storage.
        """
        # Base service is constructed in online mode; per-call availability
        # is checked later through self.is_online() (defined in BaseService).
        super().__init__(online_mode=True, data_dir=data_dir, **kwargs)
        self.repository = repository
        self.finnhub_client = finnhub_client
def get_fundamental_context(
self,
symbol: str,
start_date: str,
end_date: str,
frequency: str = "quarterly",
force_refresh: bool = False,
**kwargs,
) -> FundamentalContext:
"""Get fundamental analysis context for a company.
Args:
symbol: Stock ticker symbol
start_date: Start date in YYYY-MM-DD format
end_date: End date in YYYY-MM-DD format
frequency: Reporting frequency ('quarterly' or 'annual')
force_refresh: If True, skip local data and fetch fresh from APIs
Returns:
FundamentalContext with financial statements and key ratios
"""
# Validate date strings first
try:
start_dt = date.fromisoformat(start_date)
end_dt = date.fromisoformat(end_date)
except ValueError as e:
raise ValueError(f"Invalid date format: {e}")
# Check date order
if end_dt < start_dt:
raise ValueError(f"End date {end_date} is before start date {start_date}")
balance_sheet = None
income_statement = None
cash_flow = None
error_info = {}
errors = []
data_source = "unknown"
try:
# Local-first data strategy with force refresh option
if force_refresh:
# Skip local data, fetch fresh from APIs
balance_sheet, income_statement, cash_flow, data_source = (
self._fetch_and_cache_fresh_fundamental_data(
symbol, start_date, end_date, frequency
)
)
else:
# Check local data first, fetch missing if needed
balance_sheet, income_statement, cash_flow, data_source = (
self._get_fundamental_data_local_first(
symbol, start_date, end_date, frequency
)
)
except Exception as e:
logger.error(f"Error fetching fundamental data: {e}")
errors.append(str(e))
# Add error info if there were any errors
if errors:
error_info = {"error": "; ".join(errors)}
# Calculate key financial ratios
key_ratios = self._calculate_key_ratios(
balance_sheet, income_statement, cash_flow
)
# Determine data quality based on data source
data_quality = self._determine_data_quality(
data_source=data_source,
statement_count=sum(
[
balance_sheet is not None,
income_statement is not None,
cash_flow is not None,
]
),
has_errors=bool(errors),
)
# Handle partial data scenarios gracefully
context = self._handle_partial_statements(
symbol=symbol,
start_date=start_date,
end_date=end_date,
frequency=frequency,
balance_sheet=balance_sheet,
income_statement=income_statement,
cash_flow=cash_flow,
key_ratios=key_ratios,
data_quality=data_quality,
data_source=data_source,
force_refresh=force_refresh,
error_info=error_info,
)
return context
def get_context(
self,
symbol: str,
start_date: str,
end_date: str,
frequency: str = "quarterly",
**kwargs,
) -> FundamentalContext:
"""Alias for get_fundamental_context for consistency with other services."""
return self.get_fundamental_context(
symbol=symbol,
start_date=start_date,
end_date=end_date,
frequency=frequency,
**kwargs,
)
def _get_balance_sheet(
self, symbol: str, frequency: str, report_date: date
) -> FinancialStatement | None:
"""Get balance sheet data from client."""
try:
data = self.finnhub_client.get_balance_sheet(symbol, frequency, report_date)
return self._convert_to_financial_statement(data)
except Exception as e:
logger.warning(f"Failed to get balance sheet for {symbol}: {e}")
return None
def _get_income_statement(
self, symbol: str, frequency: str, report_date: date
) -> FinancialStatement | None:
"""Get income statement data from client."""
try:
data = self.finnhub_client.get_income_statement(
symbol, frequency, report_date
)
return self._convert_to_financial_statement(data)
except Exception as e:
logger.warning(f"Failed to get income statement for {symbol}: {e}")
return None
def _get_cash_flow(
self, symbol: str, frequency: str, report_date: date
) -> FinancialStatement | None:
"""Get cash flow statement data from client."""
try:
data = self.finnhub_client.get_cash_flow(symbol, frequency, report_date)
return self._convert_to_financial_statement(data)
except Exception as e:
logger.warning(f"Failed to get cash flow for {symbol}: {e}")
return None
def _convert_to_financial_statement(
self, data: dict[str, Any]
) -> FinancialStatement | None:
"""Convert raw financial data to FinancialStatement object."""
if not data or "data" not in data or not data["data"]:
return None
try:
return FinancialStatement(
period=data.get("period", "Unknown"),
report_date=data.get("report_date", ""),
publish_date=data.get("publish_date", ""),
currency=data.get("currency", "USD"),
data=data["data"],
)
except Exception as e:
logger.warning(f"Failed to convert financial statement: {e}")
return None
def _parse_cached_statements(self, cached_data: dict[str, Any]) -> tuple:
"""Parse cached repository data into financial statements."""
balance_sheet = None
income_statement = None
cash_flow = None
if cached_data and "financial_statements" in cached_data:
statements = cached_data["financial_statements"]
if "balance_sheet" in statements:
balance_sheet = FinancialStatement(**statements["balance_sheet"])
if "income_statement" in statements:
income_statement = FinancialStatement(**statements["income_statement"])
if "cash_flow" in statements:
cash_flow = FinancialStatement(**statements["cash_flow"])
return balance_sheet, income_statement, cash_flow
def _get_fundamental_data_local_first(
self, symbol: str, start_date: str, end_date: str, frequency: str
) -> tuple[
FinancialStatement | None,
FinancialStatement | None,
FinancialStatement | None,
str,
]:
"""Get fundamental data using local-first strategy: check local data first, fetch missing if needed."""
try:
# Check if we have sufficient local data
if self.repository.has_data_for_period(
symbol, start_date, end_date, frequency=frequency
):
logger.info(
f"Using local fundamental data for {symbol} ({start_date} to {end_date})"
)
cached_data = self.repository.get_data(
symbol=symbol,
start_date=start_date,
end_date=end_date,
frequency=frequency,
)
balance_sheet, income_statement, cash_flow = (
self._parse_cached_statements(cached_data)
)
return balance_sheet, income_statement, cash_flow, "local_cache"
# We don't have sufficient local data - need to fetch from APIs
logger.info(
f"Local data insufficient, fetching from APIs for {symbol} ({start_date} to {end_date})"
)
balance_sheet, income_statement, cash_flow, _ = (
self._fetch_fresh_fundamental_data(
symbol, start_date, end_date, frequency
)
)
# Cache the fresh data
if any([balance_sheet, income_statement, cash_flow]):
try:
cache_data = {
"symbol": symbol,
"frequency": frequency,
"financial_statements": {},
"metadata": {"cached_at": datetime.utcnow().isoformat()},
}
if balance_sheet:
cache_data["financial_statements"]["balance_sheet"] = (
balance_sheet.model_dump()
)
if income_statement:
cache_data["financial_statements"]["income_statement"] = (
income_statement.model_dump()
)
if cash_flow:
cache_data["financial_statements"]["cash_flow"] = (
cash_flow.model_dump()
)
self.repository.store_data(symbol, cache_data, frequency=frequency)
logger.debug(f"Cached fresh fundamental data for {symbol}")
except Exception as e:
logger.warning(
f"Failed to cache fundamental data for {symbol}: {e}"
)
return balance_sheet, income_statement, cash_flow, "live_api"
except Exception as e:
logger.error(f"Error fetching fundamental data for {symbol}: {e}")
return None, None, None, "error"
def _fetch_and_cache_fresh_fundamental_data(
self, symbol: str, start_date: str, end_date: str, frequency: str
) -> tuple[
FinancialStatement | None,
FinancialStatement | None,
FinancialStatement | None,
str,
]:
"""Force fetch fresh fundamental data from APIs and cache it, bypassing local data."""
try:
logger.info(
f"Force refreshing fundamental data from APIs for {symbol} ({start_date} to {end_date})"
)
# Clear existing data
try:
self.repository.clear_data(
symbol, start_date, end_date, frequency=frequency
)
logger.debug(f"Cleared existing fundamental data for {symbol}")
except Exception as e:
logger.warning(
f"Failed to clear existing fundamental data for {symbol}: {e}"
)
# Fetch fresh data
balance_sheet, income_statement, cash_flow, _ = (
self._fetch_fresh_fundamental_data(
symbol, start_date, end_date, frequency
)
)
# Cache the fresh data
if any([balance_sheet, income_statement, cash_flow]):
try:
cache_data = {
"symbol": symbol,
"frequency": frequency,
"financial_statements": {},
"metadata": {"refreshed_at": datetime.utcnow().isoformat()},
}
if balance_sheet:
cache_data["financial_statements"]["balance_sheet"] = (
balance_sheet.model_dump()
)
if income_statement:
cache_data["financial_statements"]["income_statement"] = (
income_statement.model_dump()
)
if cash_flow:
cache_data["financial_statements"]["cash_flow"] = (
cash_flow.model_dump()
)
self.repository.store_data(
symbol, cache_data, frequency=frequency, overwrite=True
)
logger.debug(f"Cached refreshed fundamental data for {symbol}")
except Exception as e:
logger.warning(
f"Failed to cache refreshed fundamental data for {symbol}: {e}"
)
return balance_sheet, income_statement, cash_flow, "live_api_refresh"
except Exception as e:
logger.error(f"Error force refreshing fundamental data for {symbol}: {e}")
return None, None, None, "refresh_error"
def _fetch_fresh_fundamental_data(
self, symbol: str, start_date: str, end_date: str, frequency: str
) -> tuple[
FinancialStatement | None,
FinancialStatement | None,
FinancialStatement | None,
str,
]:
"""Fetch fresh fundamental data from APIs."""
balance_sheet = None
income_statement = None
cash_flow = None
if self.is_online() and self.finnhub_client:
# Parse end_date string to date object for client calls
try:
end_date_obj = date.fromisoformat(end_date)
except ValueError as e:
logger.error(f"Invalid end_date format '{end_date}': {e}")
return balance_sheet, income_statement, cash_flow, "date_error"
# Get financial statements from Finnhub client
balance_sheet = self._get_balance_sheet(symbol, frequency, end_date_obj)
income_statement = self._get_income_statement(
symbol, frequency, end_date_obj
)
cash_flow = self._get_cash_flow(symbol, frequency, end_date_obj)
return balance_sheet, income_statement, cash_flow, "live_api"
def _calculate_key_ratios(
self,
balance_sheet: FinancialStatement | None,
income_statement: FinancialStatement | None,
cash_flow: FinancialStatement | None,
) -> dict[str, float]:
"""Calculate key financial ratios from financial statements."""
ratios = {}
try:
# Extract data from statements
bs_data = balance_sheet.data if balance_sheet else {}
is_data = income_statement.data if income_statement else {}
# Liquidity Ratios
if (
"Total Current Assets" in bs_data
and "Total Current Liabilities" in bs_data
):
current_liabilities = bs_data["Total Current Liabilities"]
if current_liabilities > 0:
ratios["current_ratio"] = (
bs_data["Total Current Assets"] / current_liabilities
)
# Quick ratio (more conservative)
if all(
k in bs_data
for k in [
"Cash and Cash Equivalents",
"Short-term Investments",
"Accounts Receivable",
"Total Current Liabilities",
]
):
quick_assets = (
bs_data["Cash and Cash Equivalents"]
+ bs_data.get("Short-term Investments", 0)
+ bs_data["Accounts Receivable"]
)
current_liabilities = bs_data["Total Current Liabilities"]
if current_liabilities > 0:
ratios["quick_ratio"] = quick_assets / current_liabilities
# Cash ratio
if (
"Cash and Cash Equivalents" in bs_data
and "Total Current Liabilities" in bs_data
):
current_liabilities = bs_data["Total Current Liabilities"]
if current_liabilities > 0:
cash_and_equivalents = bs_data[
"Cash and Cash Equivalents"
] + bs_data.get("Short-term Investments", 0)
ratios["cash_ratio"] = cash_and_equivalents / current_liabilities
# Leverage Ratios
if "Long-term Debt" in bs_data and "Total Shareholders Equity" in bs_data:
equity = bs_data["Total Shareholders Equity"]
if equity > 0:
ratios["debt_to_equity"] = bs_data["Long-term Debt"] / equity
if "Long-term Debt" in bs_data and "Total Assets" in bs_data:
assets = bs_data["Total Assets"]
if assets > 0:
ratios["debt_to_assets"] = bs_data["Long-term Debt"] / assets
if "Total Assets" in bs_data and "Total Shareholders Equity" in bs_data:
equity = bs_data["Total Shareholders Equity"]
if equity > 0:
ratios["equity_multiplier"] = bs_data["Total Assets"] / equity
# Profitability Ratios
if "Total Revenue" in is_data and "Cost of Revenue" in is_data:
revenue = is_data["Total Revenue"]
if revenue > 0:
ratios["gross_margin"] = (
revenue - is_data["Cost of Revenue"]
) / revenue
if "Operating Income" in is_data and "Total Revenue" in is_data:
revenue = is_data["Total Revenue"]
if revenue > 0:
ratios["operating_margin"] = is_data["Operating Income"] / revenue
if "Net Income" in is_data and "Total Revenue" in is_data:
revenue = is_data["Total Revenue"]
if revenue > 0:
ratios["net_margin"] = is_data["Net Income"] / revenue
# Return on Equity (ROE)
if "Net Income" in is_data and "Total Shareholders Equity" in bs_data:
equity = bs_data["Total Shareholders Equity"]
if equity > 0:
ratios["roe"] = is_data["Net Income"] / equity
# Return on Assets (ROA)
if "Net Income" in is_data and "Total Assets" in bs_data:
assets = bs_data["Total Assets"]
if assets > 0:
ratios["roa"] = is_data["Net Income"] / assets
# Efficiency Ratios
if "Total Revenue" in is_data and "Total Assets" in bs_data:
assets = bs_data["Total Assets"]
if assets > 0:
ratios["asset_turnover"] = is_data["Total Revenue"] / assets
# Inventory turnover
if "Cost of Revenue" in is_data and "Inventory" in bs_data:
inventory = bs_data["Inventory"]
if inventory > 0:
ratios["inventory_turnover"] = (
is_data["Cost of Revenue"] / inventory
)
# Receivables turnover
if "Total Revenue" in is_data and "Accounts Receivable" in bs_data:
receivables = bs_data["Accounts Receivable"]
if receivables > 0:
ratios["receivables_turnover"] = (
is_data["Total Revenue"] / receivables
)
except Exception as e:
logger.warning(f"Error calculating financial ratios: {e}")
return ratios
def _handle_partial_statements(
self,
symbol: str,
start_date: str,
end_date: str,
frequency: str,
balance_sheet: FinancialStatement | None,
income_statement: FinancialStatement | None,
cash_flow: FinancialStatement | None,
key_ratios: dict[str, float],
data_quality: DataQuality,
data_source: str,
force_refresh: bool,
error_info: dict[str, Any],
) -> FundamentalContext:
"""Create context even if some statements are missing.
- If all statements fail: Raise exception
- If some statements succeed: Return partial context
- Mark missing statements in metadata
"""
statement_count = sum(
[
balance_sheet is not None,
income_statement is not None,
cash_flow is not None,
]
)
# If all statements failed, raise exception
if statement_count == 0 and data_source not in ["local_cache"]:
error_msg = f"Failed to fetch any financial statements for {symbol}"
if error_info:
error_msg += f": {error_info.get('error', 'Unknown error')}"
raise ValueError(error_msg)
# Create metadata with partial data information
metadata = {
"data_quality": data_quality,
"service": "fundamental_data",
"online_mode": self.is_online(),
"frequency": frequency,
"data_source": data_source,
"force_refresh": force_refresh,
"has_balance_sheet": balance_sheet is not None,
"has_income_statement": income_statement is not None,
"has_cash_flow": cash_flow is not None,
"partial_data": statement_count < 3,
"statement_count": statement_count,
**error_info,
}
return FundamentalContext(
symbol=symbol,
period={"start": start_date, "end": end_date},
balance_sheet=balance_sheet,
income_statement=income_statement,
cash_flow=cash_flow,
key_ratios=key_ratios,
metadata=metadata,
)
def detect_fundamental_gaps(
self, symbol: str, start_date: str, end_date: str, frequency: str
) -> list[str]:
"""
Returns list of report dates that need fetching.
Example: If requesting quarterly from 2024-01-01 to 2024-12-31
and cache has Q1 and Q3, returns ["2024-06-30", "2024-09-30", "2024-12-31"]
For quarterly: Check for Q1 (Mar 31), Q2 (Jun 30), Q3 (Sep 30), Q4 (Dec 31)
For annual: Check for fiscal year ends
"""
try:
start_dt = date.fromisoformat(start_date)
end_dt = date.fromisoformat(end_date)
except ValueError:
logger.error(
f"Invalid date format in gap detection: {start_date}, {end_date}"
)
return []
# Get existing data from repository
try:
cached_data = self.repository.get_data(
symbol, start_date, end_date, frequency
)
existing_dates = set()
if cached_data and "financial_statements" in cached_data:
for statement_type in [
"balance_sheet",
"income_statement",
"cash_flow",
]:
if statement_type in cached_data["financial_statements"]:
stmt = cached_data["financial_statements"][statement_type]
if "report_date" in stmt:
existing_dates.add(stmt["report_date"])
except Exception as e:
logger.warning(f"Error checking cached data for gap detection: {e}")
existing_dates = set()
# Calculate expected report dates based on frequency
expected_dates = []
current_year = start_dt.year
end_year = end_dt.year
if frequency == "quarterly":
# Standard quarterly dates: Mar 31, Jun 30, Sep 30, Dec 31
quarter_dates = [
(3, 31), # Q1
(6, 30), # Q2
(9, 30), # Q3
(12, 31), # Q4
]
for year in range(current_year, end_year + 1):
for month, day in quarter_dates:
report_date = date(year, month, day)
if start_dt <= report_date <= end_dt:
expected_dates.append(report_date.isoformat())
elif frequency == "annual":
# Standard fiscal year end: Dec 31
for year in range(current_year, end_year + 1):
report_date = date(year, 12, 31)
if start_dt <= report_date <= end_dt:
expected_dates.append(report_date.isoformat())
# Return dates that are expected but not in cache
missing_dates = [d for d in expected_dates if d not in existing_dates]
if missing_dates:
logger.info(
f"Gap detection for {symbol}: missing {len(missing_dates)} report periods"
)
return missing_dates
def _determine_data_quality(
self, data_source: str, statement_count: int, has_errors: bool = False
) -> DataQuality:
"""Determine data quality based on source, statement count, and errors."""
if has_errors or statement_count == 0:
return DataQuality.LOW
if data_source in ["local_cache", "error", "refresh_error"]:
return DataQuality.LOW
elif data_source in ["live_api", "live_api_refresh"]:
if statement_count == 3:
return DataQuality.HIGH # All three statements available
elif statement_count == 2:
return DataQuality.MEDIUM # Two statements available
else:
return DataQuality.LOW # One or no statements
else:
return DataQuality.MEDIUM