feat(hypotheses): add LLM analysis to hypothesis conclusion

When ANTHROPIC_API_KEY is set, conclude_hypothesis now:

- Loads the scanner domain file for context
- Calls claude-haiku-4-5-20251001 for a 3–5 sentence interpretation
- Embeds the analysis in the concluded .md doc and PR comment

The LLM enriches the conclusion with sample-size caveats, market context, and a follow-up hypothesis suggestion — without overriding the programmatic accept/reject decision.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 10:57:52 -07:00 · 2026-04-10 10:57:52 -07:00 · 26df957e37
parent 49175e3b0a
commit 26df957e37
1 changed files with 83 additions and 2 deletions
--- a/scripts/run_hypothesis_runner.py
+++ b/scripts/run_hypothesis_runner.py
@ -24,6 +24,7 @@ import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(ROOT))
@ -165,6 +166,75 @@ def run_hypothesis(hyp: dict) -> bool:
    return False
 def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.

    Args:
        hyp: Hypothesis record. ``id`` and ``scanner`` are required; ``title``,
            ``description``, ``created_at`` and ``days_elapsed`` are optional.
        conclusion: Comparison result. Must contain ``hypothesis`` and
            ``baseline`` (metric dicts), ``decision`` and ``reason``.
        scanner_domain: Scanner domain notes, embedded verbatim in the prompt
            (may be an empty string).

    Returns:
        A markdown string to embed in the PR comment, or None if
        ANTHROPIC_API_KEY is not set, the anthropic SDK is not installed, the
        API call fails, or the model returns no text.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return None
    try:
        import anthropic
    except ImportError:
        print("    anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    def _cell(metrics: dict, key: str):
        # Only a missing value gets the placeholder. A plain `or "—"` would also
        # hide a genuine 0 / 0.0 result and tell the model the data is absent.
        value = metrics.get(key)
        return "—" if value is None else value

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]
    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.
 ## Hypothesis
 **ID:** {hyp["id"]}
 **Title:** {hyp.get("title", "")}
 **Description:** {hyp.get("description", hyp.get("title", ""))}
 **Scanner:** {hyp["scanner"]}
 **Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)
 ## Statistical Results
 **Decision (programmatic):** {decision}
 **Reason:** {conclusion["reason"]}
 | Metric | Baseline | Experiment | Delta |
 |---|---|---|---|
 | 7d win rate | {_cell(base_metrics, "win_rate")}% | {_cell(hyp_metrics, "win_rate")}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
 | Avg 7d return | {_cell(base_metrics, "avg_return")}% | {_cell(hyp_metrics, "avg_return")}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
 | Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", "—"))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))} | — |
 ## Scanner Domain Knowledge
 {scanner_domain}
 ---
 Provide a concise analysis (3–5 sentences) covering:
 1. Whether the sample size is sufficient to trust the result, or if more data is needed
 2. Any caveats about the measurement period (e.g., unusual market conditions)
 3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
 4. One concrete follow-up hypothesis worth testing next
 Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""
    # The analysis is optional enrichment: any SDK/network failure is logged and
    # swallowed so it can never block the hypothesis conclusion itself.
    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        # Collect every text block instead of assuming content[0] exists and is text.
        text = "".join(
            getattr(block, "text", None) or "" for block in message.content
        ).strip()
    except Exception as e:
        print(f"    LLM analysis failed: {e}", flush=True)
        return None
    if not text:
        # Honour the "string or None" contract rather than returning "".
        print("    LLM analysis returned no text", flush=True)
        return None
    return text
 def conclude_hypothesis(hyp: dict) -> bool:
    """Run comparison, write conclusion doc, close/merge PR. Returns True."""
    hid = hyp["id"]
@ -208,6 +278,14 @@ def conclude_hypothesis(hyp: dict) -> bool:
    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    # Load scanner domain knowledge (may not exist yet — that's fine)
    scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
    scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
    # Optional LLM analysis — enriches the conclusion without overriding the decision
    analysis = llm_analysis(hyp, conclusion, scanner_domain)
    analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
    period_start = hyp.get("created_at", TODAY)
    concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
    concluded_doc.write_text(
@ -227,7 +305,8 @@ def conclude_hypothesis(hyp: dict) -> bool:
        f"{hyp_metrics.get('avg_return') or '—'}% | "
        f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
        f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
-        f"## Decision\n{conclusion['reason']}\n\n"
+        f"## Decision\n{conclusion['reason']}\n"
        f"{analysis_section}\n\n"
        f"## Action\n"
        f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
    )
@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool:
        # Mark PR ready for review (removes draft status) and post conclusion as a comment.
        # The PR is NOT merged or closed automatically — the user reviews and decides.
        outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
        analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
        comment = (
            f"**Hypothesis concluded: {outcome_emoji}**\n\n"
            f"{conclusion['reason']}\n\n"
            f"| Metric | Baseline | Experiment |\n"
            f"|---|---|---|\n"
            f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
-            f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n"
+            f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n"
            f"{analysis_block}\n\n"
            f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
        )
        subprocess.run(