import re
import warnings
from concurrent.futures import ThreadPoolExecutor

import requests
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the InsecureRequestWarning from urllib3: verify_url below
# deliberately uses verify=False (the equivalent of curl -k), which would
# otherwise emit this warning on every request.
warnings.simplefilter('ignore', InsecureRequestWarning)


def verify_url(url):
    try:
        # Use the requests library instead of curl for better portability
        # across OSes, mimicking the browser headers we found effective.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }

        # Use GET with stream=True to check the status line and headers
        # without downloading the full body; verify=False is the equivalent
        # of curl -k (skip TLS certificate verification).
        response = requests.get(url, headers=headers, timeout=15, stream=True, verify=False)
        status_code = response.status_code
        response.close()  # release the streamed (still-open) connection

        if 200 <= status_code < 400:
            return url, "VALID"
        elif status_code == 404:
            return url, "NOT FOUND (404)"
        elif status_code == 403:
            # Treat 403 as reachable but blocked by a WAF / bot protection.
            return url, "VALID (Protected/403)"
        else:
            return url, f"ACCESSIBLE (Status: {status_code})"

    except Exception as e:
        # DNS failures, timeouts, TLS errors, malformed URLs, etc.
        return url, f"ERROR (Could not access: {e})"
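
# Illustrative usage: verify_url always returns a (url, status) tuple, even on
# failure, so callers can unpack it without their own try/except. For example:
#   url, status = verify_url("https://example.com")
#   # -> ("https://example.com", "VALID") for a reachable site, or
#   # -> ("https://example.com", "ERROR (Could not access: ...)") on failure.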


def get_unique_urls(text):
    if not text:
        return []
    # Simple regex to find http/https URLs (note: [$-_@.&+] is a character
    # *range* covering most URL punctuation, digits, and uppercase letters).
    urls = re.findall(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    # Strip common trailing punctuation the pattern can capture (closing
    # parentheses, sentence-ending periods, etc.), then deduplicate.
    clean_urls = [url.rstrip(').,;]') for url in urls]
    return list(set(clean_urls))
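
# Illustrative behavior (hypothetical input):
#   get_unique_urls("See https://example.com/a, and (https://example.com/a).")
#   -> ['https://example.com/a']
# The trailing ',' and ').' are stripped and the duplicate collapses; note that
# set() deduplication does not preserve the order in which URLs appeared.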


def check_urls_and_get_data(text, source_label):
    unique_urls = get_unique_urls(text)

    if not unique_urls:
        return []

    results = []
    # Limit max_workers so the burst of parallel requests is not mistaken
    # for a DoS attempt by the target servers.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(verify_url, url) for url in unique_urls]
        for future in futures:
            url, status = future.result()
            results.append({
                "url": url,
                "status": status,
                "source": source_label,
            })
    return results
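
# Illustrative call (assumes network access; "News Analyst" is whatever label
# the caller wants echoed back in each record):
#   check_urls_and_get_data("Source: https://example.com", "News Analyst")
#   -> [{"url": "https://example.com", "status": "VALID", "source": "News Analyst"}]
# Iterating `futures` in submission order keeps results aligned with the input
# URLs; concurrent.futures.as_completed would yield them fastest-first instead.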


def create_fact_checker(llm):
    def fact_checker_node(state) -> dict:
        investment_debate_state = state["investment_debate_state"]
        current_response = investment_debate_state.get("current_response", "")

        # Reports to verify against
        market_research_report = state["market_report"]
        sentiment_report = state["sentiment_report"]
        news_report = state["news_report"]
        fundamentals_report = state["fundamentals_report"]

        # If there's no response to check, pass through unchanged.
        if not current_response:
            return {}

        # Verify URLs and collect structured results.
        verified_data = []

        # 1. Check URLs in the News Report.
        verified_data.extend(check_urls_and_get_data(news_report, "News Analyst"))

        # 2. Check URLs in the current debate response.
        verified_data.extend(check_urls_and_get_data(current_response, "Debate Speaker"))

        # Deduplicate by URL, merging source labels when the same URL appears
        # in more than one place.
        unique_verified_map = {}
        for item in verified_data:
            if item["url"] not in unique_verified_map:
                unique_verified_map[item["url"]] = item
            elif item["source"] not in unique_verified_map[item["url"]]["source"]:
                unique_verified_map[item["url"]]["source"] += f", {item['source']}"

        final_verified_list = list(unique_verified_map.values())

        # Render a plain-text report for the LLM prompt.
        url_check_report = ""
        if final_verified_list:
            url_check_report = "\n[URL Verification Report]\n"
            for item in final_verified_list:
                url_check_report += f"- {item['url']}: {item['status']} (Source: {item['source']})\n"

        print(f"DEBUG: URL Verification completed.\n{url_check_report}")

        prompt = f"""You are a strict Fact Checker for a financial analysis team.
Your job is to verify the claims made in the following statement against the provided source reports AND to verify the validity of the cited sources.

Statement to Verify:
"{current_response}"

Source Reports:
1. Market Research: {market_research_report}
2. Sentiment Report: {sentiment_report}
3. News Report: {news_report}
4. Fundamentals Report: {fundamentals_report}

URL Verification Status (Physical Check of Links):
{url_check_report}

Instructions:
1. Extract every factual claim (numbers, dates, specific news events) from the statement.
2. Check whether each claim exists in the Source Reports.
3. **CRITICAL**: Check the "URL Verification Status" section. If the statement relies on a news article whose URL is marked as "NOT FOUND", "ERROR", or "INVALID", you MUST flag this as a potentially invalid source or deleted article.
4. If a claim is NOT found in, or contradicts, the reports, flag it as a HALLUCINATION.
5. If the statement is mostly opinion/analysis, it is acceptable.
6. **ALWAYS** list the Verified Sources (URLs) at the end of your response, even when no corrections are needed.

Output Format:
If all facts are supported and sources are valid:
"VERIFIED: The statement is consistent with the provided data and sources appear valid.

[Verified Sources]
- [URL1] (Status: VALID)
...
"

If errors are found:
"CORRECTION NEEDED:
- Claim: [Claim] -> Error: [Not found / Contradiction / Source URL Invalid or Deleted]
...

[Verified Sources]
- [URL1] (Status: ...)
...
"

Be extremely strict about numbers. If the reports say revenue is 10B but the statement says 12B, flag it.
Be strict about source validity. If a key argument is based on a broken link, flag it.
"""

        response = llm.invoke(prompt)
        check_result = response.content

        # Log the fact-check result.
        print(f"\n\n[Fact Checker]:\n{check_result}\n")

        # Prepare the state update.
        new_state = investment_debate_state.copy()

        # Accumulate the unique verified URLs seen so far, merging the new
        # results into any URLs recorded in earlier rounds.
        existing_verified = new_state.get("verified_urls", [])
        existing_url_map = {item['url']: item for item in existing_verified}
        for item in final_verified_list:
            existing_url_map[item['url']] = item
        new_state["verified_urls"] = list(existing_url_map.values())

        if "CORRECTION NEEDED" in check_result:
            # Attach the warning to the response itself and to the debate history.
            new_state["current_response"] = (
                f"{current_response}\n\n[SYSTEM NOTE: Fact Check Warning]\n{check_result}"
            )
            new_state["history"] += f"\n\n[Fact Check Warning]: {check_result}"
            return {"investment_debate_state": new_state}

        elif "VERIFIED" in check_result and url_check_report:
            new_state["history"] += f"\n\n[Fact Checker]: {check_result}"
            return {"investment_debate_state": new_state}

        # Even with no text update, return the state so verified_urls is saved.
        return {"investment_debate_state": new_state}

    return fact_checker_node
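

# Minimal smoke test for the URL utilities. This is an illustrative sketch: it
# assumes outbound network access and uses example.com (a domain reserved for
# documentation); the fact-checker node itself additionally requires a
# LangChain-compatible `llm` exposing .invoke().
if __name__ == "__main__":
    sample_text = "Sources: https://example.com and https://example.com/missing-page."
    for entry in check_urls_and_get_data(sample_text, "Smoke Test"):
        print(entry)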