feat(hypotheses): uncap statistical hypotheses from max_active limit
Statistical hypotheses now conclude immediately on the next runner cycle without counting toward max_active. Only implementation hypotheses occupy runner slots. Added conclude_statistical_hypothesis() for instant analysis against existing performance data with Gemini LLM enrichment. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
79a58a540c
commit
662fdb5753
|
|
@ -35,13 +35,41 @@ without any code change. Examples:
|
|||
|
||||
If statistical: run the analysis now against `data/recommendations/performance_database.json`.
|
||||
Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary.
|
||||
Done — no branch needed.
|
||||
|
||||
Then register the hypothesis in `docs/iterations/hypotheses/active.json` as `status: "pending"`
|
||||
so the runner picks it up on the next cycle and attaches LLM analysis to the report:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "<scanner>-<slug>",
|
||||
"scanner": "<scanner>",
|
||||
"title": "<title>",
|
||||
"description": "<description>",
|
||||
"branch": null,
|
||||
"pr_number": null,
|
||||
"status": "pending",
|
||||
"priority": 0,
|
||||
"expected_impact": "low",
|
||||
"hypothesis_type": "statistical",
|
||||
"created_at": "<YYYY-MM-DD>",
|
||||
"min_days": 0,
|
||||
"days_elapsed": 0,
|
||||
"picks_log": [],
|
||||
"baseline_scanner": "<scanner>",
|
||||
"conclusion": null
|
||||
}
|
||||
```
|
||||
|
||||
Commit and push the updated `active.json` to `main`. Done — no branch or worktree needed.
|
||||
|
||||
## Step 3b: Implementation Path
|
||||
|
||||
### 3b-i: Capacity check
|
||||
|
||||
Count running hypotheses where `hypothesis_type == "implementation"` from `active.json`.
Statistical hypotheses do not consume runner slots and are excluded from this count.

If fewer than `max_active` implementation hypotheses are running, proceed.
|
||||
If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER
|
||||
paused mid-streak. Inform the user which slot it is queued behind and when it will likely start.
|
||||
|
||||
|
|
|
|||
|
|
@ -409,9 +409,89 @@ def _delta_str(hyp_val, base_val, unit: str) -> str:
|
|||
return f"{sign}{delta:.1f}{unit}"
|
||||
|
||||
|
||||
def conclude_statistical_hypothesis(hyp: dict) -> None:
    """
    Conclude a statistical hypothesis immediately using existing performance data.

    Statistical hypotheses don't require worktrees or code changes — they answer
    a question against already-collected pick data. This runs synchronously and
    writes a markdown report to docs/iterations/hypotheses/concluded/.

    Args:
        hyp: Hypothesis registry entry. Must contain "id" and "scanner";
             "title" and "description" are optional.

    Side effects:
        - Writes a markdown report to CONCLUDED_DIR/<id>.md.
        - Mutates *hyp* in place: status -> "concluded",
          conclusion -> "statistical", days_elapsed -> 0.
    """
    hid = hyp["id"]
    scanner = hyp["scanner"]
    print(f"\n── Statistical hypothesis: {hid} ──", flush=True)

    # Load performance database. Best-effort: a missing or unreadable DB
    # simply yields zero picks rather than aborting the conclusion.
    picks = []
    if DB_PATH.exists():
        try:
            with open(DB_PATH) as f:
                db = json.load(f)
            # Match either the direct scanner tag or a strategy-level match.
            picks = [p for p in db if p.get("scanner") == scanner or p.get("strategy_match") == scanner]
        except Exception as e:
            print(f" Could not read performance database: {e}", flush=True)

    n = len(picks)
    print(f" Found {n} picks for scanner '{scanner}'", flush=True)

    # Compute basic stats; every metric degrades to None when no data exists.
    scores = [p["final_score"] for p in picks if p.get("final_score") is not None]
    avg_score = round(sum(scores) / len(scores), 1) if scores else None

    returns_7d = [p["return_7d"] for p in picks if p.get("return_7d") is not None]
    win_rate = round(100 * sum(1 for r in returns_7d if r > 0) / len(returns_7d), 1) if returns_7d else None
    avg_return = round(sum(returns_7d) / len(returns_7d), 2) if returns_7d else None

    # FIX: append "%" only when a value exists — previously a missing metric
    # rendered as the nonsensical "—%" in the report.
    stats_block = (
        f"- Total picks: {n}\n"
        f"- Avg score: {avg_score if avg_score is not None else '—'}\n"
        f"- 7d win rate: {f'{win_rate}%' if win_rate is not None else '—'}\n"
        f"- Avg 7d return: {f'{avg_return}%' if avg_return is not None else '—'}\n"
    )

    # Read scanner domain file (truncated to 3000 chars) as LLM context.
    scanner_domain = ""
    domain_file = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
    if domain_file.exists():
        scanner_domain = domain_file.read_text()[:3000]

    # LLM analysis — reuse llm_analysis() with a synthetic conclusion dict
    # shaped like the one implementation hypotheses produce.
    conclusion = {
        "decision": "statistical",
        "reason": hyp.get("description", "Statistical analysis of existing pick data"),
        "hypothesis": {"count": n, "win_rate": win_rate, "avg_return": avg_return},
        "baseline": {},
    }
    llm_insight = llm_analysis(hyp, conclusion, scanner_domain)

    # Write concluded report.
    CONCLUDED_DIR.mkdir(parents=True, exist_ok=True)
    report_path = CONCLUDED_DIR / f"{hid}.md"
    insight_block = f"\n## LLM Analysis\n\n{llm_insight}\n" if llm_insight else ""
    report_path.write_text(
        f"# Statistical Hypothesis: {hyp.get('title', hid)}\n\n"
        f"**ID:** {hid}\n"
        f"**Scanner:** {scanner}\n"
        f"**Description:** {hyp.get('description', '')}\n"
        f"**Concluded:** {TODAY}\n\n"
        f"## Data Summary\n\n{stats_block}"
        f"{insight_block}"
    )
    print(f" Report written to {report_path}", flush=True)

    # Mark concluded in the registry entry; caller is responsible for saving.
    hyp["status"] = "concluded"
    hyp["conclusion"] = "statistical"
    hyp["days_elapsed"] = 0
|
||||
|
||||
|
||||
def promote_pending(registry: dict) -> None:
|
||||
"""Promote the highest-priority pending hypothesis to running if a slot is open."""
|
||||
running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running")
|
||||
"""Promote the highest-priority pending implementation hypothesis to running if a slot is open."""
|
||||
# Only implementation/forward_test hypotheses count toward max_active.
|
||||
# Statistical hypotheses are concluded immediately and never occupy runner slots.
|
||||
running_count = sum(
|
||||
1 for h in registry["hypotheses"]
|
||||
if h["status"] == "running" and h.get("hypothesis_type", "implementation") == "implementation"
|
||||
)
|
||||
max_active = registry.get("max_active", 5)
|
||||
if running_count >= max_active:
|
||||
return
|
||||
|
|
@ -435,6 +515,19 @@ def main():
|
|||
registry = load_registry()
|
||||
filter_id = os.environ.get("FILTER_ID", "").strip()
|
||||
|
||||
# Fast-path: conclude all pending statistical hypotheses immediately.
|
||||
# They answer questions from existing data — no cap, no worktree, no waiting.
|
||||
statistical_pending = [
|
||||
h for h in registry.get("hypotheses", [])
|
||||
if h["status"] == "pending" and h.get("hypothesis_type") == "statistical"
|
||||
and (not filter_id or h["id"] == filter_id)
|
||||
]
|
||||
for hyp in statistical_pending:
|
||||
try:
|
||||
conclude_statistical_hypothesis(hyp)
|
||||
except Exception as e:
|
||||
print(f" Error concluding statistical hypothesis {hyp['id']}: {e}", flush=True)
|
||||
|
||||
hypotheses = registry.get("hypotheses", [])
|
||||
running = [
|
||||
h
|
||||
|
|
|
|||
Loading…
Reference in New Issue