diff --git a/.claude/commands/backtest-hypothesis.md b/.claude/commands/backtest-hypothesis.md index 3941bb70..b3c0d45f 100644 --- a/.claude/commands/backtest-hypothesis.md +++ b/.claude/commands/backtest-hypothesis.md @@ -35,13 +35,41 @@ without any code change. Examples: If statistical: run the analysis now against `data/recommendations/performance_database.json`. Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary. -Done — no branch needed. + +Then register the hypothesis in `docs/iterations/hypotheses/active.json` as `status: "pending"` +so the runner picks it up on the next cycle and attaches LLM analysis to the report: + +```json +{ + "id": "<scanner>-<slug>", + "scanner": "<scanner>", + "title": "<title>", + "description": "<description>", + "branch": null, + "pr_number": null, + "status": "pending", + "priority": 0, + "expected_impact": "low", + "hypothesis_type": "statistical", + "created_at": "<YYYY-MM-DD>", + "min_days": 0, + "days_elapsed": 0, + "picks_log": [], + "baseline_scanner": "<scanner>", + "conclusion": null +} +``` + +Commit and push the updated `active.json` to `main`. Done — no branch or worktree needed. ## Step 3b: Implementation Path ### 3b-i: Capacity check -Count running hypotheses from `active.json`. If fewer than `max_active` running, proceed. +Count running hypotheses where `hypothesis_type == "implementation"` from `active.json`. +Statistical hypotheses do not consume runner slots and are excluded from this count. + +If fewer than `max_active` implementation hypotheses are running, proceed. If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER paused mid-streak. Inform the user which slot it is queued behind and when it will likely start. 
diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index c360407c..8c229c31 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -409,9 +409,89 @@ def _delta_str(hyp_val, base_val, unit: str) -> str: return f"{sign}{delta:.1f}{unit}" +def conclude_statistical_hypothesis(hyp: dict) -> None: + """ + Conclude a statistical hypothesis immediately using existing performance data. + + Statistical hypotheses don't require worktrees or code changes — they answer + a question against already-collected pick data. This runs synchronously and + writes a markdown report to docs/iterations/hypotheses/concluded/. + """ + hid = hyp["id"] + scanner = hyp["scanner"] + print(f"\n── Statistical hypothesis: {hid} ──", flush=True) + + # Load performance database + picks = [] + if DB_PATH.exists(): + try: + with open(DB_PATH) as f: + db = json.load(f) + picks = [p for p in db if p.get("scanner") == scanner or p.get("strategy_match") == scanner] + except Exception as e: + print(f" Could not read performance database: {e}", flush=True) + + n = len(picks) + print(f" Found {n} picks for scanner '{scanner}'", flush=True) + + # Compute basic stats + scores = [p["final_score"] for p in picks if p.get("final_score") is not None] + avg_score = round(sum(scores) / len(scores), 1) if scores else None + + returns_7d = [p["return_7d"] for p in picks if p.get("return_7d") is not None] + win_rate = round(100 * sum(1 for r in returns_7d if r > 0) / len(returns_7d), 1) if returns_7d else None + avg_return = round(sum(returns_7d) / len(returns_7d), 2) if returns_7d else None + + stats_block = ( + f"- Total picks: {n}\n" + f"- Avg score: {avg_score if avg_score is not None else '—'}\n" + f"- 7d win rate: {win_rate if win_rate is not None else '—'}%\n" + f"- Avg 7d return: {avg_return if avg_return is not None else '—'}%\n" + ) + + # Read scanner domain for LLM context + scanner_domain = "" + domain_file = ROOT / "docs" / "iterations" / 
"scanners" / f"{scanner}.md" + if domain_file.exists(): + scanner_domain = domain_file.read_text()[:3000] + + # LLM analysis — reuse llm_analysis() with a synthetic conclusion dict + conclusion = { + "decision": "statistical", + "reason": hyp.get("description", "Statistical analysis of existing pick data"), + "hypothesis": {"count": n, "win_rate": win_rate, "avg_return": avg_return}, + "baseline": {}, + } + llm_insight = llm_analysis(hyp, conclusion, scanner_domain) + + # Write concluded report + CONCLUDED_DIR.mkdir(parents=True, exist_ok=True) + report_path = CONCLUDED_DIR / f"{hid}.md" + insight_block = f"\n## LLM Analysis\n\n{llm_insight}\n" if llm_insight else "" + report_path.write_text( + f"# Statistical Hypothesis: {hyp.get('title', hid)}\n\n" + f"**ID:** {hid}\n" + f"**Scanner:** {scanner}\n" + f"**Description:** {hyp.get('description', '')}\n" + f"**Concluded:** {TODAY}\n\n" + f"## Data Summary\n\n{stats_block}" + f"{insight_block}" + ) + print(f" Report written to {report_path}", flush=True) + + hyp["status"] = "concluded" + hyp["conclusion"] = "statistical" + hyp["days_elapsed"] = 0 + + def promote_pending(registry: dict) -> None: - """Promote the highest-priority pending hypothesis to running if a slot is open.""" - running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") + """Promote the highest-priority pending implementation hypothesis to running if a slot is open.""" + # Only implementation/forward_test hypotheses count toward max_active. + # Statistical hypotheses are concluded immediately and never occupy runner slots. 
+ running_count = sum( + 1 for h in registry["hypotheses"] + if h["status"] == "running" and h.get("hypothesis_type", "implementation") == "implementation" + ) max_active = registry.get("max_active", 5) if running_count >= max_active: return @@ -435,6 +515,19 @@ def main(): registry = load_registry() filter_id = os.environ.get("FILTER_ID", "").strip() + # Fast-path: conclude all pending statistical hypotheses immediately. + # They answer questions from existing data — no cap, no worktree, no waiting. + statistical_pending = [ + h for h in registry.get("hypotheses", []) + if h["status"] == "pending" and h.get("hypothesis_type") == "statistical" + and (not filter_id or h["id"] == filter_id) + ] + for hyp in statistical_pending: + try: + conclude_statistical_hypothesis(hyp) + except Exception as e: + print(f" Error concluding statistical hypothesis {hyp['id']}: {e}", flush=True) + hypotheses = registry.get("hypotheses", []) running = [ h