feat: improve error handling

Kevin Bruton 2025-09-29 21:29:03 +02:00
parent 51fc6c23de
commit 350c18dc0b
8 changed files with 139 additions and 14 deletions

debug_llm_call.py Normal file
View File

@@ -0,0 +1,42 @@
"""Standalone diagnostic script to test a single LLM call with resilience.

Run: python debug_llm_call.py --provider openai --model gpt-4o-mini --message "Test message"

It respects the same environment variables for API keys and SSL as the graph does.
"""
import argparse
import os

from tradingagents.default_config import DEFAULT_CONFIG
from tradingagents.graph.trading_graph import TradingAgentsGraph
from langchain_core.messages import HumanMessage


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--provider', default=DEFAULT_CONFIG['llm_provider'])
    parser.add_argument('--model', default=DEFAULT_CONFIG['quick_think_llm'])
    parser.add_argument('--message', default='Say hello and include a short market summary placeholder.')
    args = parser.parse_args()

    cfg = DEFAULT_CONFIG.copy()
    cfg['llm_provider'] = args.provider
    cfg['quick_think_llm'] = args.model
    cfg['deep_think_llm'] = args.model
    graph = TradingAgentsGraph(config=cfg)

    # Build a minimal state for the market analyst
    state = {
        'trade_date': '2025-09-29',
        'company_of_interest': 'AAPL',
        'messages': [HumanMessage(content=args.message)],
    }
    market_node = graph.graph_setup.analyst_nodes.get('market')
    if not market_node:
        print('Market node not found in graph setup.')
        return

    # Invoke the node function directly instead of running the whole graph
    result_state = market_node(state)
    print('Result keys:', list(result_state.keys()))
    print('Market report snippet:', str(result_state.get('market_report', ''))[:500])


if __name__ == '__main__':
    main()
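
If the graph wiring itself is in question, the resilience path can also be exercised without building a graph at all. A minimal sketch, assuming langchain-openai is installed and OPENAI_API_KEY is set (the model name is illustrative, not taken from this commit):

from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from tradingagents.agents.utils.llm_resilience import invoke_with_retries
from tradingagents.default_config import DEFAULT_CONFIG

# A chat model is itself a Runnable, so it satisfies the helper's chain.invoke() contract.
llm = ChatOpenAI(model="gpt-4o-mini")  # illustrative model name
reply = invoke_with_retries(llm, [HumanMessage(content="ping")], DEFAULT_CONFIG)
print(reply.content)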

Binary file not shown.

View File

@@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
+from tradingagents.agents.utils.llm_resilience import invoke_with_retries


 def create_fundamentals_analyst(llm, toolkit):
@@ -55,13 +56,19 @@ def create_fundamentals_analyst(llm, toolkit):
         prompt = prompt.partial(ticker=ticker)

         chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        try:
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
+        except Exception as e:  # noqa: BLE001
+            class DummyResult:
+                def __init__(self, content):
+                    self.content = content
+                    self.tool_calls = []
+            result = DummyResult(f"Fundamentals analyst failed after retries. Error: {e}")

         report = ""

-        if len(result.tool_calls) == 0:
-            report = result.content
+        if getattr(result, 'tool_calls', []) == []:
+            report = getattr(result, 'content', '')

         return {
             "messages": [result],

View File

@@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
+from tradingagents.agents.utils.llm_resilience import invoke_with_retries


 def create_market_analyst(llm, toolkit):
@@ -85,12 +86,22 @@ Bullish and Bearish Candlestick Patterns:
         chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        # Resilient invocation with retries
+        try:
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
+        except Exception as e:  # noqa: BLE001
+            # Provide a gracefully degraded response so the graph can continue and the failure is logged
+            fallback_content = f"Market analyst failed to retrieve a model response after retries. Error: {e}"
+            class DummyResult:
+                def __init__(self, content):
+                    self.content = content
+                    self.tool_calls = []
+            result = DummyResult(fallback_content)

         report = ""

-        if len(result.tool_calls) == 0:
-            report = result.content
+        if getattr(result, 'tool_calls', []) == []:
+            report = getattr(result, 'content', '')

         return {
             "messages": [result],

View File

@@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
+from tradingagents.agents.utils.llm_resilience import invoke_with_retries


 def create_news_analyst(llm, toolkit):
@@ -45,12 +46,19 @@ def create_news_analyst(llm, toolkit):
         prompt = prompt.partial(ticker=ticker)

         chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        try:
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
+        except Exception as e:  # noqa: BLE001
+            class DummyResult:
+                def __init__(self, content):
+                    self.content = content
+                    self.tool_calls = []
+            result = DummyResult(f"News analyst failed after retries. Error: {e}")

         report = ""

-        if len(result.tool_calls) == 0:
-            report = result.content
+        if getattr(result, 'tool_calls', []) == []:
+            report = getattr(result, 'content', '')

         return {
             "messages": [result],

View File

@@ -1,6 +1,7 @@
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 import time
 import json
+from tradingagents.agents.utils.llm_resilience import invoke_with_retries


 def create_social_media_analyst(llm, toolkit):
@@ -44,13 +45,19 @@ def create_social_media_analyst(llm, toolkit):
         prompt = prompt.partial(ticker=ticker)

         chain = prompt | llm.bind_tools(tools)
-        result = chain.invoke(state["messages"])
+        try:
+            result = invoke_with_retries(chain, state["messages"], toolkit.config)
+        except Exception as e:  # noqa: BLE001
+            class DummyResult:
+                def __init__(self, content):
+                    self.content = content
+                    self.tool_calls = []
+            result = DummyResult(f"Social media analyst failed after retries. Error: {e}")

         report = ""

-        if len(result.tool_calls) == 0:
-            report = result.content
+        if getattr(result, 'tool_calls', []) == []:
+            report = getattr(result, 'content', '')

         return {
             "messages": [result],

tradingagents/agents/utils/llm_resilience.py Normal file
View File

@@ -0,0 +1,46 @@
import time
import json
import logging
from typing import Any, Callable, Dict
from json import JSONDecodeError

logger = logging.getLogger(__name__)


def invoke_with_retries(chain: Any, messages: Any, config: Dict[str, Any]):
    """Invoke a langchain chain with retries and detailed logging.

    Handles transient HTTP issues and JSON decode errors coming from provider SDKs.
    """
    max_retries = config.get("llm_max_retries", 3)
    backoff = config.get("llm_retry_backoff", 2.0)
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            result = chain.invoke(messages)
            return result
        except JSONDecodeError as e:
            last_err = e
            logger.warning(
                "JSONDecodeError on attempt %s/%s: %s", attempt, max_retries, e
            )
        except Exception as e:  # noqa: BLE001
            # Treat errors whose message matches common network / HTTP keywords as transient
            transient = any(
                kw in str(e).lower() for kw in [
                    "timeout", "temporarily", "rate limit", "connection reset", "503", "502", "jsondecodeerror"
                ]
            )
            last_err = e
            logger.warning(
                "LLM invocation error (transient=%s) attempt %s/%s: %s", transient, attempt, max_retries, e
            )
            if not transient and not isinstance(e, JSONDecodeError):
                # Non-transient -> abort early
                break
        # Exponential backoff before the next attempt
        sleep_for = backoff ** (attempt - 1)
        time.sleep(sleep_for)
    # All attempts failed
    raise last_err  # propagate the last error
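
A quick sanity check of the retry behaviour, as a minimal sketch: FlakyChain below is a hypothetical stand-in for a langchain Runnable (only .invoke() is assumed), and with the default backoff base of 2.0 the sleeps between failed attempts are 1s, 2s, 4s, ...

class FlakyChain:
    """Hypothetical chain that fails a fixed number of times, then succeeds."""
    def __init__(self, failures):
        self.failures = failures
        self.calls = 0

    def invoke(self, messages):
        self.calls += 1
        if self.calls <= self.failures:
            raise RuntimeError("connection reset by peer")  # matches a transient keyword
        return f"ok after {self.calls} attempts"

config = {"llm_max_retries": 3, "llm_retry_backoff": 2.0}
print(invoke_with_retries(FlakyChain(failures=2), "hi", config))
# -> 'ok after 3 attempts', after sleeping 1s and then 2s between attempts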

tradingagents/default_config.py
View File

@@ -33,4 +33,8 @@ DEFAULT_CONFIG = {
     # Proxy settings (if needed)
     "http_proxy": os.getenv("HTTP_PROXY"),
     "https_proxy": os.getenv("HTTPS_PROXY"),
+    # LLM resilience settings
+    "llm_max_retries": int(os.getenv("LLM_MAX_RETRIES", "3")),
+    "llm_retry_backoff": float(os.getenv("LLM_RETRY_BACKOFF", "2")),  # exponential backoff base, in seconds
+    "debug_http": os.getenv("DEBUG_HTTP", "false").lower() in ("1", "true", "yes"),
 }
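
Since these settings are read from the environment at module import time, overrides must be in place before tradingagents.default_config is first imported. A minimal sketch:

import os
# Must run before the first import of tradingagents.default_config
os.environ["LLM_MAX_RETRIES"] = "5"
os.environ["LLM_RETRY_BACKOFF"] = "1.5"

from tradingagents.default_config import DEFAULT_CONFIG
assert DEFAULT_CONFIG["llm_max_retries"] == 5
assert DEFAULT_CONFIG["llm_retry_backoff"] == 1.5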