"""Robust JSON extraction from LLM responses that may wrap JSON in markdown or prose."""
from __future__ import annotations
import json
import re
from typing import Any
# Pre-compiled regex patterns, built once at import time.
# A complete <think>...</think> reasoning block. DOTALL lets "." cross newlines;
# the lazy quantifier stops at the first closing tag so separate blocks are
# removed individually instead of swallowing the text between them.
THINK_PATTERN = re.compile(r"<think>.*?</think>", re.DOTALL)
# A markdown code fence, optionally tagged "json"; group 1 is the fenced content.
FENCE_PATTERN = re.compile(r"```(?:json)?\s*\n?(.*?)\n?\s*```", re.DOTALL)


def extract_json(text: str) -> dict[str, Any]:
    """Extract a JSON object from LLM output.

    The response may contain markdown fences, preamble/postamble text, or
    ``<think>...</think>`` reasoning blocks around the JSON payload.

    Strategy (in order):
        1. Try a direct ``json.loads()`` — works if the LLM returned pure JSON.
        2. Strip ``<think>...</think>`` blocks (DeepSeek R1 reasoning) and
           try parsing again.
        3. Try the content of each markdown code fence
           (```` ```json ... ``` ```` or ```` ``` ... ``` ````), in order.
        4. Take the substring from the first ``{`` to the last ``}`` and try
           to parse that.
        5. Raise ``ValueError`` if nothing works.

    Args:
        text: Raw LLM response string.

    Returns:
        Parsed JSON dict.

    Raises:
        ValueError: If the input is empty/whitespace, if the whole response
            (before or after removing think blocks) is valid JSON but not an
            object, or if no valid JSON object could be extracted.
    """
    if not text or not text.strip():
        raise ValueError("Empty input — no JSON to extract")

    def _ensure_dict(obj: object) -> dict[str, Any]:
        """Return *obj* unchanged if it is a dict, otherwise raise ValueError."""
        if not isinstance(obj, dict):
            raise ValueError(
                f"Expected a JSON object (dict), got {type(obj).__name__}"
            )
        return obj

    # 1. Direct parse. Only JSONDecodeError is swallowed: if the whole response
    #    is valid JSON of the wrong type (list, number, ...), the ValueError
    #    from _ensure_dict propagates to the caller.
    try:
        return _ensure_dict(json.loads(text))
    except json.JSONDecodeError:
        pass

    # 2. Strip <think>...</think> blocks (DeepSeek R1) and try again. All
    #    later steps work on the stripped text so that braces or fences inside
    #    the reasoning cannot be mistaken for the payload.
    cleaned = THINK_PATTERN.sub("", text).strip()
    try:
        return _ensure_dict(json.loads(cleaned))
    except json.JSONDecodeError:
        pass

    # 3. Extract from markdown code fences; the first fence that parses to a
    #    dict wins.
    for block in FENCE_PATTERN.findall(cleaned):
        try:
            return _ensure_dict(json.loads(block.strip()))
        except (json.JSONDecodeError, ValueError):
            # JSONDecodeError = bad JSON; ValueError = parsed but not a dict
            continue

    # 4. Find first '{' to last '}' and parse that span.
    first_brace = cleaned.find("{")
    last_brace = cleaned.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        try:
            return _ensure_dict(json.loads(cleaned[first_brace : last_brace + 1]))
        except (json.JSONDecodeError, ValueError):
            # JSONDecodeError = bad JSON; ValueError = parsed but not a dict
            pass

    # 5. Nothing worked; report on the original (unstripped) response.
    raise ValueError(
        f"Could not extract valid JSON from LLM response (length={len(text)}, "
        f"preview={text[:200]!r})"
    )