feat: improve error handling

2025-09-29 21:29:03 +02:00 · 2025-09-29 21:29:03 +02:00 · 350c18dc0b
parent 51fc6c23de
commit 350c18dc0b
8 changed files with 139 additions and 14 deletions
--- a/debug_llm_call.py
+++ b/debug_llm_call.py
@ -0,0 +1,42 @@
 """Standalone diagnostic script to test a single LLM call with resilience.
 Run: python debug_llm_call.py --provider openai --model gpt-4o-mini --message "Test message".
 It will respect environment variables for keys and SSL the same way the graph does.
 """
 import argparse
 import os
 from tradingagents.default_config import DEFAULT_CONFIG
 from tradingagents.graph.trading_graph import TradingAgentsGraph
 from langchain_core.messages import HumanMessage
 def main():
    """Run a single market-analyst call and print a short summary of the result.

    Command-line options:
        --provider  LLM provider name (default: DEFAULT_CONFIG['llm_provider']).
        --model     Model used for both the quick-think and deep-think slots
                    (default: DEFAULT_CONFIG['quick_think_llm']).
        --message   Text of the single human message placed in the state.
        --ticker    Value stored under 'company_of_interest' (default: AAPL).
        --date      Value stored under 'trade_date' (default: 2025-09-29).

    Prints the keys of the returned state and the first 500 characters of its
    'market_report' entry, or a notice when no 'market' node is registered.
    """
    parser = argparse.ArgumentParser(
        description='Test a single LLM call through the market analyst node.'
    )
    parser.add_argument('--provider', default=DEFAULT_CONFIG['llm_provider'])
    parser.add_argument('--model', default=DEFAULT_CONFIG['quick_think_llm'])
    parser.add_argument('--message', default='Say hello and include a short market summary placeholder.')
    # Previously hard-coded in the state below; the defaults keep the old behaviour.
    parser.add_argument('--ticker', default='AAPL', help='company_of_interest placed in the state')
    parser.add_argument('--date', default='2025-09-29', help='trade_date placed in the state')
    args = parser.parse_args()

    # Work on a copy so the module-level defaults stay untouched.
    cfg = DEFAULT_CONFIG.copy()
    cfg['llm_provider'] = args.provider
    cfg['quick_think_llm'] = args.model
    cfg['deep_think_llm'] = args.model
    graph = TradingAgentsGraph(config=cfg)

    # Build a minimal state for market analyst
    state = {
        'trade_date': args.date,
        'company_of_interest': args.ticker,
        'messages': [HumanMessage(content=args.message)],
    }

    # NOTE(review): assumes graph_setup.analyst_nodes is a dict of callables
    # keyed by analyst name -- confirm against TradingAgentsGraph.
    market_node = graph.graph_setup.analyst_nodes.get('market')
    if market_node is None:
        print('Market node not found in graph setup.')
        return

    # Directly invoke underlying function if possible
    result_state = market_node(state)
    print('Result keys:', list(result_state.keys()))
    print('Market report snippet:', str(result_state.get('market_report', ''))[:500])
 if __name__ == '__main__':
    main()
--- a/memory_store/chroma.sqlite3
+++ b/memory_store/chroma.sqlite3
--- a/tradingagents/agents/analysts/fundamentals_analyst.py
+++ b/tradingagents/agents/analysts/fundamentals_analyst.py
@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
 from tradingagents.agents.utils.llm_resilience import invoke_with_retries
 def create_fundamentals_analyst(llm, toolkit):
@ -55,13 +56,19 @@ def create_fundamentals_analyst(llm, toolkit):
        prompt = prompt.partial(ticker=ticker)
        chain = prompt | llm.bind_tools(tools)
-
+        try:
-        result = chain.invoke(state["messages"])
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
        except Exception as e:  # noqa: BLE001
            class DummyResult:
                def __init__(self, content):
                    self.content = content
                    self.tool_calls = []
            result = DummyResult(f"Fundamentals analyst failed after retries. Error: {e}")
        report = ""
-        if len(result.tool_calls) == 0:
+        if getattr(result, 'tool_calls', []) == []:
-            report = result.content
+            report = getattr(result, 'content', '')
        return {
            "messages": [result],
--- a/tradingagents/agents/analysts/market_analyst.py
+++ b/tradingagents/agents/analysts/market_analyst.py
@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
 from tradingagents.agents.utils.llm_resilience import invoke_with_retries
 def create_market_analyst(llm, toolkit):
@ -85,12 +86,22 @@ Bullish and Bearish Candlestick Patterns:
        chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        # Resilient invocation with retries
        try:
            result = invoke_with_retries(chain, state["messages"], toolkit.config)
        except Exception as e:  # noqa: BLE001
            # Provide a graceful degraded response so graph can continue / be logged
            fallback_content = f"Market analyst failed to retrieve a model response after retries. Error: {e}"
            class DummyResult:
                def __init__(self, content):
                    self.content = content
                    self.tool_calls = []
            result = DummyResult(fallback_content)
        report = ""
-        if len(result.tool_calls) == 0:
+        if getattr(result, 'tool_calls', []) == []:
-            report = result.content
+            report = getattr(result, 'content', '')
        return {
            "messages": [result],
--- a/tradingagents/agents/analysts/news_analyst.py
+++ b/tradingagents/agents/analysts/news_analyst.py
@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
 from tradingagents.agents.utils.llm_resilience import invoke_with_retries
 def create_news_analyst(llm, toolkit):
@ -45,12 +46,19 @@ def create_news_analyst(llm, toolkit):
        prompt = prompt.partial(ticker=ticker)
        chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        try:
            result = invoke_with_retries(chain, state["messages"], toolkit.config)
        except Exception as e:  # noqa: BLE001
            class DummyResult:
                def __init__(self, content):
                    self.content = content
                    self.tool_calls = []
            result = DummyResult(f"News analyst failed after retries. Error: {e}")
        report = ""
-        if len(result.tool_calls) == 0:
+        if getattr(result, 'tool_calls', []) == []:
-            report = result.content
+            report = getattr(result, 'content', '')
        return {
            "messages": [result],
--- a/tradingagents/agents/analysts/social_media_analyst.py
+++ b/tradingagents/agents/analysts/social_media_analyst.py
@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
 from tradingagents.agents.utils.llm_resilience import invoke_with_retries
 def create_social_media_analyst(llm, toolkit):
@ -44,13 +45,19 @@ def create_social_media_analyst(llm, toolkit):
        prompt = prompt.partial(ticker=ticker)
        chain = prompt | llm.bind_tools(tools)
-
+        try:
-        result = chain.invoke(state["messages"])
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
        except Exception as e:  # noqa: BLE001
            class DummyResult:
                def __init__(self, content):
                    self.content = content
                    self.tool_calls = []
            result = DummyResult(f"Social media analyst failed after retries. Error: {e}")
        report = ""
-        if len(result.tool_calls) == 0:
+        if getattr(result, 'tool_calls', []) == []:
-            report = result.content
+            report = getattr(result, 'content', '')
        return {
            "messages": [result],
--- a/tradingagents/agents/utils/llm_resilience.py
+++ b/tradingagents/agents/utils/llm_resilience.py
@ -0,0 +1,46 @@
 import time
 import json
 import logging
 from typing import Any, Callable, Dict
 from json import JSONDecodeError
 logger = logging.getLogger(__name__)
 def invoke_with_retries(
    chain: Any,
    messages: Any,
    config: Dict[str, Any],
    *,
    sleep: "Callable[[float], None] | None" = None,
 ):
    """Invoke a langchain chain with retries and detailed logging.

    Handles transient HTTP issues and JSON decode errors coming from provider SDKs.

    Args:
        chain: Object exposing ``invoke(messages)``.
        messages: Payload forwarded unchanged to ``chain.invoke``.
        config: Settings mapping. ``llm_max_retries`` (default 3) is the total
            number of attempts; ``llm_retry_backoff`` (default 2.0) is the base
            of the exponential delay, so the wait after attempt ``k`` is
            ``backoff ** (k - 1)`` seconds. ``None`` is treated as empty.
        sleep: Optional replacement for ``time.sleep`` (keyword-only); looked up
            at call time when omitted.

    Returns:
        Whatever ``chain.invoke(messages)`` returns on the first success.

    Raises:
        Exception: The last error seen, once attempts are exhausted or as soon
            as an error is classified as non-transient.
    """
    settings = config or {}
    # Always attempt at least once: with a zero/negative setting the loop would
    # never run and ``raise last_err`` would fail on ``None``.
    max_retries = max(1, int(settings.get("llm_max_retries", 3)))
    backoff = settings.get("llm_retry_backoff", 2.0)
    pause = time.sleep if sleep is None else sleep
    transient_markers = (
        "timeout", "temporarily", "rate limit", "connection reset", "503", "502", "jsondecodeerror",
    )
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            return chain.invoke(messages)
        except JSONDecodeError as e:
            # Malformed provider payloads are always worth another attempt.
            last_err = e
            logger.warning(
                "JSONDecodeError on attempt %s/%s: %s", attempt, max_retries, e
            )
        except Exception as e:  # noqa: BLE001
            # Search the class name as well as the message: wrapped decode
            # errors and timeout classes often carry the marker only in the
            # type name (e.g. a message that just says "timed out").
            described = f"{type(e).__name__}: {e}".lower()
            transient = any(kw in described for kw in transient_markers)
            last_err = e
            logger.warning(
                "LLM invocation error (transient=%s) attempt %s/%s: %s", transient, attempt, max_retries, e
            )
            if not transient:
                # Non transient -> abort early
                break
        # Exponential backoff, but only when another attempt will follow;
        # sleeping after the final failure merely delays the raise.
        if attempt < max_retries:
            pause(backoff ** (attempt - 1))
    # All attempts failed
    raise last_err  # propagate last error
--- a/tradingagents/default_config.py
+++ b/tradingagents/default_config.py
@ -33,4 +33,8 @@ DEFAULT_CONFIG = {
    # Proxy settings (if needed)
    "http_proxy": os.getenv("HTTP_PROXY"),
    "https_proxy": os.getenv("HTTPS_PROXY"),
    # LLM resilience settings
    "llm_max_retries": int(os.getenv("LLM_MAX_RETRIES", "3")),
    "llm_retry_backoff": float(os.getenv("LLM_RETRY_BACKOFF", "2")),  # seconds exponential base
    "debug_http": os.getenv("DEBUG_HTTP", "false").lower() in ("1", "true", "yes"),
 }