feat(hypotheses): add LLM analysis to hypothesis conclusion

When ANTHROPIC_API_KEY is set, conclude_hypothesis now:
- Loads the scanner domain file for context
- Calls claude-haiku-4-5-20251001 for a 3–5 sentence interpretation
- Embeds the analysis in the concluded .md doc and PR comment

The LLM enriches the conclusion with sample-size caveats, market
context, and a follow-up hypothesis suggestion — without overriding
the programmatic accept/reject decision.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-10 10:57:52 -07:00
parent 49175e3b0a
commit 26df957e37
1 changed file with 83 additions and 2 deletions

View File

@ -24,6 +24,7 @@ import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
@ -165,6 +166,75 @@ def run_hypothesis(hyp: dict) -> bool:
return False
def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    Args:
        hyp: Hypothesis record. Requires "id" and "scanner"; optionally
            "title", "description", "created_at", "days_elapsed".
        conclusion: Programmatic comparison result with "hypothesis",
            "baseline", "decision", and "reason" keys.
        scanner_domain: Markdown domain-knowledge text for the scanner
            (may be an empty string if no domain file exists yet).

    Returns:
        A markdown string to embed in the PR comment, or None if the API
        call fails or ANTHROPIC_API_KEY is not set.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        # LLM analysis is strictly optional; absence of the key is not an error.
        return None
    try:
        import anthropic
    except ImportError:
        print(" anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {hyp.get("title", "")}
**Description:** {hyp.get("description", hyp.get("title", ""))}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} — {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {base_metrics.get("win_rate") or ""}% | {hyp_metrics.get("win_rate") or ""}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
| Avg 7d return | {base_metrics.get("avg_return") or ""}% | {hyp_metrics.get("avg_return") or ""}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", ""))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", ""))} | |

## Scanner Domain Knowledge
{scanner_domain}

---
Provide a concise analysis (3-5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        return message.content[0].text.strip()
    except Exception as e:
        # Best-effort enrichment: any API/network failure degrades to "no
        # analysis" rather than aborting the conclusion flow.
        print(f" LLM analysis failed: {e}", flush=True)
        return None
def conclude_hypothesis(hyp: dict) -> bool:
"""Run comparison, write conclusion doc, close/merge PR. Returns True."""
hid = hyp["id"]
@ -208,6 +278,14 @@ def conclude_hypothesis(hyp: dict) -> bool:
hyp_metrics = conclusion["hypothesis"]
base_metrics = conclusion["baseline"]
# Load scanner domain knowledge (may not exist yet — that's fine)
scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
# Optional LLM analysis — enriches the conclusion without overriding the decision
analysis = llm_analysis(hyp, conclusion, scanner_domain)
analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
period_start = hyp.get("created_at", TODAY)
concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
concluded_doc.write_text(
@ -227,7 +305,8 @@ def conclude_hypothesis(hyp: dict) -> bool:
f"{hyp_metrics.get('avg_return') or ''}% | "
f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
f"| Picks | {base_metrics.get('count', '')} | {hyp_metrics.get('count', '')} | — |\n\n"
f"## Decision\n{conclusion['reason']}\n\n"
f"## Decision\n{conclusion['reason']}\n"
f"{analysis_section}\n\n"
f"## Action\n"
f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
)
@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool:
# Mark PR ready for review (removes draft status) and post conclusion as a comment.
# The PR is NOT merged or closed automatically — the user reviews and decides.
outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
comment = (
f"**Hypothesis concluded: {outcome_emoji}**\n\n"
f"{conclusion['reason']}\n\n"
f"| Metric | Baseline | Experiment |\n"
f"|---|---|---|\n"
f"| 7d win rate | {base_metrics.get('win_rate') or ''}% | {hyp_metrics.get('win_rate') or ''}% |\n"
f"| Avg return | {base_metrics.get('avg_return') or ''}% | {hyp_metrics.get('avg_return') or ''}% |\n\n"
f"| Avg return | {base_metrics.get('avg_return') or ''}% | {hyp_metrics.get('avg_return') or ''}% |\n"
f"{analysis_block}\n\n"
f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
)
subprocess.run(