From 26df957e3751053f5aceca553702c6632fd97700 Mon Sep 17 00:00:00 2001
From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com>
Date: Fri, 10 Apr 2026 10:57:52 -0700
Subject: [PATCH] feat(hypotheses): add LLM analysis to hypothesis conclusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When ANTHROPIC_API_KEY is set, conclude_hypothesis now:
- Loads the scanner domain file for context
- Calls claude-haiku-4-5-20251001 for a 3–5 sentence interpretation
- Embeds the analysis in the concluded .md doc and PR comment

The LLM enriches the conclusion with sample-size caveats, market
context, and a follow-up hypothesis suggestion — without overriding
the programmatic accept/reject decision.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/run_hypothesis_runner.py | 85 +++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py
index 38340617..179b8794 100644
--- a/scripts/run_hypothesis_runner.py
+++ b/scripts/run_hypothesis_runner.py
@@ -24,6 +24,7 @@ import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 
 ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(ROOT))
@@ -165,6 +166,75 @@ def run_hypothesis(hyp: dict) -> bool:
     return False
 
 
+def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
+    """
+    Ask Claude to interpret the experiment results and provide richer context.
+
+    Returns a markdown string to embed in the PR comment, or None if the API
+    call fails or ANTHROPIC_API_KEY is not set.
+
+    The LLM does NOT override the programmatic decision — it adds nuance:
+    sample-size caveats, market-condition context, follow-up hypotheses.
+    """
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        return None
+
+    try:
+        import anthropic
+    except ImportError:
+        print("    anthropic SDK not installed, skipping LLM analysis", flush=True)
+        return None
+
+    hyp_metrics = conclusion["hypothesis"]
+    base_metrics = conclusion["baseline"]
+    decision = conclusion["decision"]
+
+    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.
+
+## Hypothesis
+**ID:** {hyp["id"]}
+**Title:** {hyp.get("title", "")}
+**Description:** {hyp.get("description", hyp.get("title", ""))}
+**Scanner:** {hyp["scanner"]}
+**Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)
+
+## Statistical Results
+**Decision (programmatic):** {decision}
+**Reason:** {conclusion["reason"]}
+
+| Metric | Baseline | Experiment | Delta |
+|---|---|---|---|
+| 7d win rate | {base_metrics.get("win_rate") or "—"}% | {hyp_metrics.get("win_rate") or "—"}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
+| Avg 7d return | {base_metrics.get("avg_return") or "—"}% | {hyp_metrics.get("avg_return") or "—"}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
+| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", "—"))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))} | — |
+
+## Scanner Domain Knowledge
+{scanner_domain}
+
+---
+
+Provide a concise analysis (3–5 sentences) covering:
+1. Whether the sample size is sufficient to trust the result, or if more data is needed
+2. Any caveats about the measurement period (e.g., unusual market conditions)
+3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
+4. One concrete follow-up hypothesis worth testing next
+
+Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""
+
+    try:
+        client = anthropic.Anthropic(api_key=api_key)
+        message = client.messages.create(
+            model="claude-haiku-4-5-20251001",
+            max_tokens=512,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return message.content[0].text.strip()
+    except Exception as e:
+        print(f"    LLM analysis failed: {e}", flush=True)
+        return None
+
+
 def conclude_hypothesis(hyp: dict) -> bool:
     """Run comparison, write conclusion doc, close/merge PR. Returns True."""
     hid = hyp["id"]
@@ -208,6 +278,14 @@ def conclude_hypothesis(hyp: dict) -> bool:
     hyp_metrics = conclusion["hypothesis"]
     base_metrics = conclusion["baseline"]
 
+    # Load scanner domain knowledge (may not exist yet — that's fine)
+    scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
+    scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
+
+    # Optional LLM analysis — enriches the conclusion without overriding the decision
+    analysis = llm_analysis(hyp, conclusion, scanner_domain)
+    analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
+
     period_start = hyp.get("created_at", TODAY)
     concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
     concluded_doc.write_text(
@@ -227,7 +305,8 @@ def conclude_hypothesis(hyp: dict) -> bool:
         f"{hyp_metrics.get('avg_return') or '—'}% | "
         f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
         f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
-        f"## Decision\n{conclusion['reason']}\n\n"
+        f"## Decision\n{conclusion['reason']}\n"
+        f"{analysis_section}\n\n"
         f"## Action\n"
         f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
     )
@@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool:
         # Mark PR ready for review (removes draft status) and post conclusion as a comment.
         # The PR is NOT merged or closed automatically — the user reviews and decides.
         outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
+        analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
         comment = (
             f"**Hypothesis concluded: {outcome_emoji}**\n\n"
             f"{conclusion['reason']}\n\n"
             f"| Metric | Baseline | Experiment |\n"
             f"|---|---|---|\n"
             f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
-            f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n"
+            f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n"
+            f"{analysis_block}\n\n"
             f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
         )
         subprocess.run(