feat(hypotheses): add LLM analysis to hypothesis conclusion

When ANTHROPIC_API_KEY is set, conclude_hypothesis now:
- Loads the scanner domain file for context
- Calls claude-haiku-4-5-20251001 for a 3–5 sentence interpretation
- Embeds the analysis in the concluded .md doc and PR comment

The LLM enriches the conclusion with sample-size caveats, market
context, and a follow-up hypothesis suggestion — without overriding
the programmatic accept/reject decision.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-10 10:57:52 -07:00
parent 49175e3b0a
commit 26df957e37
1 changed file with 83 additions and 2 deletions

View File

@ -24,6 +24,7 @@ import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
@ -165,6 +166,75 @@ def run_hypothesis(hyp: dict) -> bool:
return False
def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    Args:
        hyp: Hypothesis record. Requires "id" and "scanner"; optionally
            "title", "description", "created_at", "days_elapsed".
        conclusion: Programmatic comparison result with "hypothesis",
            "baseline", "decision", and "reason" keys.
        scanner_domain: Markdown domain-knowledge text for the scanner
            (may be an empty string if no domain file exists yet).

    Returns:
        A markdown string to embed in the PR comment, or None if the API
        call fails or ANTHROPIC_API_KEY is not set.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        # LLM analysis is strictly optional; absence of the key is not an error.
        return None
    try:
        import anthropic
    except ImportError:
        print(" anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {hyp.get("title", "")}
**Description:** {hyp.get("description", hyp.get("title", ""))}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} — {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {base_metrics.get("win_rate") or ""}% | {hyp_metrics.get("win_rate") or ""}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
| Avg 7d return | {base_metrics.get("avg_return") or ""}% | {hyp_metrics.get("avg_return") or ""}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", ""))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", ""))} | |

## Scanner Domain Knowledge
{scanner_domain}

---
Provide a concise analysis (3-5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        return message.content[0].text.strip()
    except Exception as e:
        # Best-effort enrichment: any API/network failure degrades to "no
        # analysis" rather than aborting the conclusion flow.
        print(f" LLM analysis failed: {e}", flush=True)
        return None
def conclude_hypothesis(hyp: dict) -> bool:
"""Run comparison, write conclusion doc, close/merge PR. Returns True."""
hid = hyp["id"]
@ -208,6 +278,14 @@ def conclude_hypothesis(hyp: dict) -> bool:
hyp_metrics = conclusion["hypothesis"]
base_metrics = conclusion["baseline"]
# Load scanner domain knowledge (may not exist yet — that's fine)
scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
# Optional LLM analysis — enriches the conclusion without overriding the decision
analysis = llm_analysis(hyp, conclusion, scanner_domain)
analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
period_start = hyp.get("created_at", TODAY)
concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
concluded_doc.write_text(
@ -227,7 +305,8 @@ def conclude_hypothesis(hyp: dict) -> bool:
f"{hyp_metrics.get('avg_return') or ''}% | "
f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
f"| Picks | {base_metrics.get('count', '')} | {hyp_metrics.get('count', '')} | — |\n\n"
f"## Decision\n{conclusion['reason']}\n\n"
f"## Decision\n{conclusion['reason']}\n"
f"{analysis_section}\n\n"
f"## Action\n"
f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
)
@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool:
# Mark PR ready for review (removes draft status) and post conclusion as a comment.
# The PR is NOT merged or closed automatically — the user reviews and decides.
outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
comment = (
f"**Hypothesis concluded: {outcome_emoji}**\n\n"
f"{conclusion['reason']}\n\n"
f"| Metric | Baseline | Experiment |\n"
f"|---|---|---|\n"
f"| 7d win rate | {base_metrics.get('win_rate') or ''}% | {hyp_metrics.get('win_rate') or ''}% |\n"
f"| Avg return | {base_metrics.get('avg_return') or ''}% | {hyp_metrics.get('avg_return') or ''}% |\n\n"
f"| Avg return | {base_metrics.get('avg_return') or ''}% | {hyp_metrics.get('avg_return') or ''}% |\n"
f"{analysis_block}\n\n"
f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
)
subprocess.run(