feat(hypotheses): add LLM analysis to hypothesis conclusion
When ANTHROPIC_API_KEY is set, conclude_hypothesis now:
- Loads the scanner domain file for context
- Calls claude-haiku-4-5-20251001 for a 3–5 sentence interpretation
- Embeds the analysis in the concluded .md doc and PR comment

The LLM enriches the conclusion with sample-size caveats, market context, and a follow-up hypothesis suggestion — without overriding the programmatic accept/reject decision.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
49175e3b0a
commit
26df957e37
|
|
@ -24,6 +24,7 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
ROOT = Path(__file__).resolve().parent.parent
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
sys.path.insert(0, str(ROOT))
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
@ -165,6 +166,75 @@ def run_hypothesis(hyp: dict) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def llm_analysis(
    hyp: dict,
    conclusion: dict,
    scanner_domain: str,
    *,
    model: str = "claude-haiku-4-5-20251001",
    max_tokens: int = 512,
) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.

    Args:
        hyp: Hypothesis record. Reads ``id`` and ``scanner`` (required) and
            ``title``, ``description``, ``created_at``, ``days_elapsed``
            (optional).
        conclusion: Comparison result. Reads ``hypothesis`` and ``baseline``
            (metric dicts), ``decision`` and ``reason``.
        scanner_domain: Scanner domain notes to include in the prompt; may be
            empty, in which case the prompt says so explicitly.
        model: Anthropic model identifier to call.
        max_tokens: Upper bound on the length of the generated analysis.

    Returns:
        A markdown string to embed in the PR comment, or None if
        ANTHROPIC_API_KEY is not set, the anthropic SDK is not installed,
        the API call fails, or the response contains no text.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return None

    # Imported lazily so the SDK remains an optional dependency.
    try:
        import anthropic
    except ImportError:
        print("  anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    def _pct(value) -> str:
        # Compare against None explicitly: a 0 win rate / 0 return is a real
        # measurement and must not be shown to the model as "missing".
        return "—" if value is None else f"{value}%"

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    title = hyp.get("title", "")
    description = hyp.get("description", title)

    base_win = base_metrics.get("win_rate")
    hyp_win = hyp_metrics.get("win_rate")
    base_ret = base_metrics.get("avg_return")
    hyp_ret = hyp_metrics.get("avg_return")
    base_picks = base_metrics.get("evaluated", base_metrics.get("count", "—"))
    hyp_picks = hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))

    # An empty section gives the model nothing to anchor on; state it plainly.
    domain_text = (scanner_domain or "").strip() or "(no scanner domain notes available)"

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {title}
**Description:** {description}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {_pct(base_win)} | {_pct(hyp_win)} | {_delta_str(hyp_win, base_win, "pp")} |
| Avg 7d return | {_pct(base_ret)} | {_pct(hyp_ret)} | {_delta_str(hyp_ret, base_ret, "%")} |
| Picks evaluated | {base_picks} | {hyp_picks} | — |

## Scanner Domain Knowledge
{domain_text}

---

Provide a concise analysis (3–5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    # Best-effort enrichment: any failure here (network, auth, rate limit,
    # unexpected response shape) must not abort the hypothesis conclusion, so
    # this boundary deliberately catches broadly and logs.
    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )
        # Collect every text block instead of assuming content[0] exists and
        # is a text block.
        text = "".join(
            block.text
            for block in message.content
            if getattr(block, "type", None) == "text"
        ).strip()
    except Exception as e:
        print(f"  LLM analysis failed: {e}", flush=True)
        return None

    # Honour the documented contract: no usable text means None, not "".
    return text or None
|
||||||
|
|
||||||
|
|
||||||
def conclude_hypothesis(hyp: dict) -> bool:
|
def conclude_hypothesis(hyp: dict) -> bool:
|
||||||
"""Run comparison, write conclusion doc, close/merge PR. Returns True."""
|
"""Run comparison, write conclusion doc, close/merge PR. Returns True."""
|
||||||
hid = hyp["id"]
|
hid = hyp["id"]
|
||||||
|
|
@ -208,6 +278,14 @@ def conclude_hypothesis(hyp: dict) -> bool:
|
||||||
hyp_metrics = conclusion["hypothesis"]
|
hyp_metrics = conclusion["hypothesis"]
|
||||||
base_metrics = conclusion["baseline"]
|
base_metrics = conclusion["baseline"]
|
||||||
|
|
||||||
|
# Load scanner domain knowledge (may not exist yet — that's fine)
|
||||||
|
scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
|
||||||
|
scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
|
||||||
|
|
||||||
|
# Optional LLM analysis — enriches the conclusion without overriding the decision
|
||||||
|
analysis = llm_analysis(hyp, conclusion, scanner_domain)
|
||||||
|
analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
|
||||||
|
|
||||||
period_start = hyp.get("created_at", TODAY)
|
period_start = hyp.get("created_at", TODAY)
|
||||||
concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
|
concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
|
||||||
concluded_doc.write_text(
|
concluded_doc.write_text(
|
||||||
|
|
@ -227,7 +305,8 @@ def conclude_hypothesis(hyp: dict) -> bool:
|
||||||
f"{hyp_metrics.get('avg_return') or '—'}% | "
|
f"{hyp_metrics.get('avg_return') or '—'}% | "
|
||||||
f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
|
f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
|
||||||
f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
|
f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
|
||||||
f"## Decision\n{conclusion['reason']}\n\n"
|
f"## Decision\n{conclusion['reason']}\n"
|
||||||
|
f"{analysis_section}\n\n"
|
||||||
f"## Action\n"
|
f"## Action\n"
|
||||||
f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
|
f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
|
||||||
)
|
)
|
||||||
|
|
@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool:
|
||||||
# Mark PR ready for review (removes draft status) and post conclusion as a comment.
|
# Mark PR ready for review (removes draft status) and post conclusion as a comment.
|
||||||
# The PR is NOT merged or closed automatically — the user reviews and decides.
|
# The PR is NOT merged or closed automatically — the user reviews and decides.
|
||||||
outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
|
outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
|
||||||
|
analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
|
||||||
comment = (
|
comment = (
|
||||||
f"**Hypothesis concluded: {outcome_emoji}**\n\n"
|
f"**Hypothesis concluded: {outcome_emoji}**\n\n"
|
||||||
f"{conclusion['reason']}\n\n"
|
f"{conclusion['reason']}\n\n"
|
||||||
f"| Metric | Baseline | Experiment |\n"
|
f"| Metric | Baseline | Experiment |\n"
|
||||||
f"|---|---|---|\n"
|
f"|---|---|---|\n"
|
||||||
f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
|
f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
|
||||||
f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n"
|
f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n"
|
||||||
|
f"{analysis_block}\n\n"
|
||||||
f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
|
f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
|
||||||
)
|
)
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue