496 lines
17 KiB
Python
496 lines
17 KiB
Python
"""
|
|
UAT (User Acceptance Testing) for Agent Output Quality.
|
|
|
|
This module provides end-to-end tests for complete agent workflows:
|
|
1. Complete analysis workflow (BUY/SELL/HOLD scenarios)
|
|
2. Edge case handling (missing data, conflicting reports)
|
|
3. Content quality validation (length, structure, clarity)
|
|
4. State integrity checks (field presence, debate coherence)
|
|
|
|
All tests use mocked data to avoid real API calls.
|
|
"""
|
|
|
|
import pytest
|
|
from typing import Dict, Any
|
|
|
|
from tradingagents.utils.output_validator import (
|
|
validate_agent_state,
|
|
validate_decision_quality,
|
|
validate_debate_state,
|
|
validate_report_completeness,
|
|
)
|
|
|
|
# Module-level pytest mark: tags every test collected from this file as `e2e`.
pytestmark = pytest.mark.e2e
|
|
|
|
|
|
# ============================================================================
|
|
# Test Complete Analysis Workflow
|
|
# ============================================================================
|
|
|
|
class TestCompleteAnalysisWorkflow:
    """End-to-end checks of a full agent run for each trading outcome."""

    def test_buy_scenario_complete_workflow(self, sample_agent_state_buy):
        """
        A complete BUY state must validate cleanly.

        Checks the validator verdict and the metrics it reports: ticker,
        number of reports present, final signal, and the validity flags of
        the investment and risk debates.
        """
        outcome = validate_agent_state(sample_agent_state_buy)

        assert outcome.is_valid is True

        metrics = outcome.metrics
        assert metrics["company_of_interest"] == "AAPL"
        assert metrics["reports_present"] == 4
        assert metrics["final_signal"] == "BUY"
        assert metrics["investment_debate_valid"] is True
        assert metrics["risk_debate_valid"] is True

    def test_sell_scenario_complete_workflow(self, sample_agent_state_sell):
        """
        A complete SELL state must validate cleanly.

        Checks the validator verdict, the extracted SELL signal, and that
        four reports are counted as present.
        """
        outcome = validate_agent_state(sample_agent_state_sell)

        assert outcome.is_valid is True
        assert outcome.metrics["final_signal"] == "SELL"
        assert outcome.metrics["reports_present"] == 4

    def test_hold_scenario_complete_workflow(self, sample_agent_state_hold):
        """
        A complete HOLD state must validate cleanly.

        Checks the validator verdict and the extracted HOLD signal.
        """
        outcome = validate_agent_state(sample_agent_state_hold)

        assert outcome.is_valid is True
        assert outcome.metrics["final_signal"] == "HOLD"

    def test_workflow_preserves_debate_history(self, sample_agent_state_buy):
        """Both debates in the BUY state report a non-empty history and count."""
        invest_state, risk_state = (
            sample_agent_state_buy["investment_debate_state"],
            sample_agent_state_buy["risk_debate_state"],
        )

        invest_check = validate_debate_state(invest_state, debate_type="invest")
        risk_check = validate_debate_state(risk_state, debate_type="risk")

        # Each metric is checked on the investment debate first, then the risk debate.
        for metric_name in ("history_length", "count"):
            for check in (invest_check, risk_check):
                assert check.metrics[metric_name] > 0

    def test_workflow_all_reports_meet_quality_standards(self, sample_agent_state_buy):
        """Every analyst report in the BUY state passes the 500-character length check."""
        report_keys = (
            "market_report",
            "sentiment_report",
            "news_report",
            "fundamentals_report",
        )
        # Pull all four reports up front so a missing key fails before any validation runs.
        report_texts = [sample_agent_state_buy[key] for key in report_keys]

        for text in report_texts:
            check = validate_report_completeness(
                text,
                min_length=500,
                require_markdown_tables=False,
                require_sections=False,
            )
            assert check.is_valid is True
            assert check.metrics["length"] >= 500
|
|
|
|
|
|
# ============================================================================
|
|
# Test Edge Case Scenarios
|
|
# ============================================================================
|
|
|
|
class TestEdgeCaseScenarios:
    """Validator behaviour on incomplete, conflicting, or oddly shaped states."""

    def test_missing_single_report_graceful_degradation(self):
        """A state without fundamentals_report stays valid but carries a warning."""
        invest_debate = {
            "history": "Debate based on available data",
            "count": 3,
            "judge_decision": "HOLD: Incomplete data, proceeding cautiously",
        }
        risk_debate = {
            "history": "Risk assessment",
            "count": 2,
            "judge_decision": "HOLD: Missing fundamentals increases uncertainty",
        }
        partial_state = {
            "company_of_interest": "TSLA",
            "trade_date": "2024-01-20",
            "market_report": "Market analysis. " * 100,
            "sentiment_report": "Sentiment analysis. " * 100,
            "news_report": "News analysis. " * 100,
            # fundamentals_report is deliberately left out
            "investment_debate_state": invest_debate,
            "risk_debate_state": risk_debate,
            "final_trade_decision": "HOLD: Awaiting fundamental data",
        }

        outcome = validate_agent_state(partial_state)

        # Three of four reports: expected to validate, with at least one warning.
        assert outcome.is_valid is True
        assert outcome.metrics["reports_present"] == 3
        assert len(outcome.warnings) > 0

    def test_conflicting_debate_conclusions_warning(self):
        """
        A state whose two debates disagree still validates and resolves to HOLD.

        NOTE(review): despite the test name, no warning is asserted here; only
        validity and the final signal are checked.
        """
        filler = "Report. " * 100
        bullish_debate = {
            "history": "Bullish debate",
            "count": 2,
            "judge_decision": "BUY: Strong upside potential",
        }
        bearish_debate = {
            "history": "Risk concerns",
            "count": 2,
            # Opposite verdict to the investment debate
            "judge_decision": "SELL: Risk too high",
        }
        conflicted_state = {
            "company_of_interest": "GOOGL",
            "trade_date": "2024-01-22",
            "market_report": filler,
            "sentiment_report": filler,
            "news_report": filler,
            "fundamentals_report": filler,
            "investment_debate_state": bullish_debate,
            "risk_debate_state": bearish_debate,
            "final_trade_decision": "HOLD: Conflicting signals from teams",
        }

        outcome = validate_agent_state(conflicted_state)

        assert outcome.is_valid is True
        # The final decision, not either debate, determines the reported signal.
        assert outcome.metrics.get("final_signal") == "HOLD"

    def test_empty_debate_history_but_valid_decision(self):
        """A state with an empty, zero-round debate stays valid but carries a warning."""
        silent_debate = {
            "history": "",  # nothing was recorded
            "count": 0,
            "judge_decision": "HOLD: Insufficient deliberation",
        }
        sparse_state = {
            "company_of_interest": "MSFT",
            "trade_date": "2024-01-25",
            "market_report": "Report. " * 100,
            "investment_debate_state": silent_debate,
            "final_trade_decision": "HOLD: More analysis needed",
        }

        outcome = validate_agent_state(sparse_state)

        assert outcome.is_valid is True
        # At least one warning is expected for this state.
        assert len(outcome.warnings) > 0

    def test_very_long_debate_convergence_issue(self):
        """A 15-round debate keeps the state valid but makes the debate validator warn."""
        marathon_debate = {
            "history": "Round 1...\nRound 2...\n" * 15,
            "count": 15,  # unusually many rounds
            "judge_decision": "BUY: Finally reached consensus",
        }
        long_state = {
            "company_of_interest": "NVDA",
            "trade_date": "2024-01-28",
            "market_report": "Report. " * 100,
            "investment_debate_state": marathon_debate,
            "final_trade_decision": "BUY: After extensive deliberation",
        }

        outcome = validate_agent_state(long_state)

        assert outcome.is_valid is True

        # The debate-level validator is the one expected to raise a warning.
        debate_check = validate_debate_state(
            long_state["investment_debate_state"],
            debate_type="invest",
        )
        assert len(debate_check.warnings) > 0

    def test_malformed_but_extractable_decision(self):
        """Loosely worded buy recommendations all yield the BUY signal."""
        sloppy_decisions = (
            "i think we should BUY this stock",
            "recommendation: buy",
            "buy!!!",
            "Final call is to buy the position",
        )

        for text in sloppy_decisions:
            check = validate_decision_quality(text)
            assert check.metrics["signal"] == "BUY"

    def test_missing_all_debate_states(self):
        """A state with no debate entries stays valid but warns with 'incomplete'."""
        debate_free_state = {
            "company_of_interest": "META",
            "trade_date": "2024-02-01",
            "market_report": "Report. " * 100,
            # neither debate state is supplied
            "final_trade_decision": "HOLD: No consensus reached",
        }

        outcome = validate_agent_state(debate_free_state)

        assert outcome.is_valid is True
        assert len(outcome.warnings) > 0
        assert any("incomplete" in message.lower() for message in outcome.warnings)
|
|
|
|
|
|
# ============================================================================
|
|
# Test Content Quality
|
|
# ============================================================================
|
|
|
|
class TestContentQuality:
    """Quality checks on report text and decision wording."""

    def test_report_minimum_length_enforcement(self):
        """Reports far below the 500-character minimum are rejected."""
        for stub in ("Too short", "Also short", "Brief"):
            check = validate_report_completeness(stub, min_length=500)
            assert check.is_valid is False

    def test_report_markdown_structure_quality(self):
        """A report with headers, a table, and bullets passes the strict checks."""
        # Built line by line; the value starts with a newline and ends after
        # the "## Conclusion" header, followed by repeated filler text.
        structured_head = (
            "\n"
            "# Market Analysis for AAPL\n"
            "\n"
            "## Executive Summary\n"
            "Strong buy signal based on comprehensive analysis.\n"
            "\n"
            "## Technical Indicators\n"
            "| Indicator | Value | Signal |\n"
            "|-----------|-------|--------|\n"
            "| RSI | 45 | Neutral|\n"
            "| MACD | +2.3 | Buy |\n"
            "\n"
            "## Fundamental Analysis\n"
            "- Revenue growth: 15% YoY\n"
            "- P/E ratio: 25 (reasonable for tech)\n"
            "- Strong balance sheet\n"
            "\n"
            "## Conclusion\n"
        )
        full_report = structured_head + "Detailed conclusion. " * 50

        check = validate_report_completeness(
            full_report,
            min_length=500,
            require_markdown_tables=True,
            require_sections=True,
        )

        assert check.is_valid is True

        metrics = check.metrics
        assert metrics["markdown_tables"] > 0
        assert metrics["section_headers"] >= 3
        assert metrics["has_bullet_points"] is True

    def test_decision_clarity_with_reasoning(self):
        """Decisions that state a signal and its rationale validate without warnings."""
        well_argued = (
            "BUY: Strong fundamentals (P/E 20), positive momentum (RSI 55), bullish sentiment",
            "SELL: Overvalued at current P/E of 45, declining revenue, negative news",
            "HOLD: Mixed signals - good fundamentals but uncertain market conditions",
        )

        for text in well_argued:
            check = validate_decision_quality(text)
            assert check.is_valid is True
            assert check.metrics["has_reasoning"] is True
            # A clear decision is expected to produce no warnings at all.
            assert len(check.warnings) == 0

    def test_decision_ambiguity_detection(self):
        """Decisions naming more than one signal still yield a signal, plus a warning."""
        hedged = (
            "BUY or SELL, not sure",
            "Maybe HOLD, could be BUY",
            "SELL but also considering BUY",
        )

        for text in hedged:
            check = validate_decision_quality(text)
            # Some signal must be extracted ...
            assert check.metrics["signal"] is not None
            # ... and at least one warning must accompany it.
            assert len(check.warnings) > 0

    def test_report_content_variety_indicators(self):
        """A report mixing headers, a table, and bullets raises no 'structured' warning."""
        # Built line by line; the value starts with a newline and ends after
        # the "## Details" header, followed by repeated filler text.
        varied_head = (
            "\n"
            "# Comprehensive Analysis\n"
            "\n"
            "## Overview\n"
            "Multiple content types present.\n"
            "\n"
            "## Data Table\n"
            "| Metric | Q1 | Q2 | Q3 | Q4 |\n"
            "|--------|----|----|----|----|\n"
            "| Revenue| 10M| 12M| 15M| 18M|\n"
            "\n"
            "## Key Points\n"
            "- Point 1\n"
            "- Point 2\n"
            "* Point 3\n"
            "\n"
            "## Details\n"
        )
        full_report = varied_head + "Additional detailed analysis. " * 50

        check = validate_report_completeness(full_report, min_length=500)

        assert check.is_valid is True

        metrics = check.metrics
        assert metrics["markdown_tables"] > 0
        assert metrics["section_headers"] > 0
        assert metrics["has_bullet_points"] is True
        # None of the warnings may mention structure.
        assert all("structured" not in message.lower() for message in check.warnings)
|
|
|
|
|
|
# ============================================================================
|
|
# Test State Integrity
|
|
# ============================================================================
|
|
|
|
class TestStateIntegrity:
    """Structural and cross-field consistency checks on the agent state."""

    def test_all_required_fields_present(self, sample_agent_state_buy):
        """The BUY state contains every expected top-level key."""
        expected_keys = (
            "company_of_interest",
            "trade_date",
            "market_report",
            "sentiment_report",
            "news_report",
            "fundamentals_report",
            "investment_debate_state",
            "risk_debate_state",
            "final_trade_decision",
        )

        for key in expected_keys:
            assert key in sample_agent_state_buy, f"Missing required field: {key}"

    def test_debate_state_internal_consistency(self, sample_invest_debate):
        """The sample investment debate validates with a positive count and history length."""
        check = validate_debate_state(sample_invest_debate, debate_type="invest")

        assert check.is_valid is True
        # Only positivity is checked; count and history length are not compared.
        assert check.metrics["count"] > 0
        assert check.metrics["history_length"] > 0

    def test_final_decision_aligns_with_debates(self, sample_agent_state_buy):
        """In the BUY state, debate verdicts and the final decision point the same way."""
        invest_state, risk_state, decision_text = (
            sample_agent_state_buy["investment_debate_state"],
            sample_agent_state_buy["risk_debate_state"],
            sample_agent_state_buy["final_trade_decision"],
        )

        invest_check = validate_debate_state(invest_state, debate_type="invest")
        risk_check = validate_debate_state(risk_state, debate_type="risk")
        decision_check = validate_decision_quality(decision_text)

        # The investment debate and final decision must be BUY; the risk debate
        # may be BUY or HOLD.
        assert invest_check.metrics.get("judge_signal") == "BUY"
        assert risk_check.metrics.get("judge_signal") in ("BUY", "HOLD")
        assert decision_check.metrics["signal"] == "BUY"

    def test_state_preserves_company_context(self, sample_agent_state_buy):
        """Ticker and trade date are both non-empty strings."""
        ticker = sample_agent_state_buy["company_of_interest"]
        traded_on = sample_agent_state_buy["trade_date"]

        for value in (ticker, traded_on):
            assert isinstance(value, str)
            assert len(value) > 0

    def test_debate_history_chronological_consistency(self, sample_invest_debate):
        """The amount of recorded history is plausible for the number of rounds."""
        transcript = sample_invest_debate["history"]
        rounds = sample_invest_debate["count"]

        # Any round at all implies some recorded history.
        if rounds > 0:
            assert len(transcript) > 0

        # Two or more rounds must leave more than 50 characters of history.
        if rounds >= 2:
            assert len(transcript) > 50

    def test_type_consistency_across_state(self, sample_agent_state_buy):
        """Text fields hold strings and debate fields hold dicts, where present."""
        text_keys = (
            "company_of_interest",
            "trade_date",
            "market_report",
            "sentiment_report",
            "news_report",
            "fundamentals_report",
            "final_trade_decision",
        )
        debate_keys = ("investment_debate_state", "risk_debate_state")

        # Keys absent from the state are skipped rather than failed.
        for keys, expected_type, label in (
            (text_keys, str, "string"),
            (debate_keys, dict, "dict"),
        ):
            for key in keys:
                if key not in sample_agent_state_buy:
                    continue
                assert isinstance(
                    sample_agent_state_buy[key], expected_type
                ), f"{key} should be {label}"

    def test_empty_state_detection(self):
        """An empty state is invalid and reports at least two errors."""
        outcome = validate_agent_state({})

        assert outcome.is_valid is False
        assert len(outcome.errors) >= 2
|