From de4ef56c91653653d3919aecbfb59ada79535a37 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah Date: Thu, 9 Apr 2026 23:34:39 -0700 Subject: [PATCH 01/14] docs(spec): hypothesis backtesting system design Co-Authored-By: Claude Sonnet 4.6 --- ...026-04-09-hypothesis-backtesting-design.md | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md diff --git a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md new file mode 100644 index 00000000..6fa943f6 --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md @@ -0,0 +1,196 @@ +# Hypothesis Backtesting System — Design Spec + +## Goal + +Enable systematic, branch-per-hypothesis experimentation for scanner improvements. Each hypothesis runs its modified code daily in isolation, accumulates picks, and auto-concludes with a statistical comparison once enough data exists. Up to 5 experiments run in parallel, prioritized by expected impact, with full UI visibility. + +--- + +## Architecture + +``` +docs/iterations/hypotheses/ + active.json ← source of truth for all experiments + concluded/ + YYYY-MM-DD-.md ← one file per concluded hypothesis + +.claude/commands/ + backtest-hypothesis.md ← /backtest-hypothesis command + +.github/workflows/ + hypothesis-runner.yml ← daily 08:00 UTC, runs all active experiments + +tradingagents/ui/pages/ + hypotheses.py ← new Streamlit dashboard tab +``` + +The `active.json` file lives on `main`. Each hypothesis branch (`hypothesis/-`) contains the code change being tested. The daily runner checks out each branch, runs discovery, commits picks back to that branch, and — once `min_days` have elapsed — concludes the hypothesis and cleans up. 
+ +--- + +## `active.json` Schema + +```json +{ + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-scan-3-expirations", + "scanner": "options_flow", + "title": "Scan 3 expirations instead of 1", + "description": "Hypothesis: scanning up to 3 expirations captures institutional positioning in 30+ DTE contracts, improving signal quality over nearest-expiry-only.", + "branch": "hypothesis/options_flow-scan-3-expirations", + "pr_number": 14, + "status": "running", + "priority": 8, + "expected_impact": "high", + "hypothesis_type": "implementation", + "created_at": "2026-04-09", + "min_days": 14, + "days_elapsed": 3, + "picks_log": ["2026-04-09", "2026-04-10", "2026-04-11"], + "baseline_scanner": "options_flow", + "conclusion": null + } + ] +} +``` + +**Field reference:** + +| Field | Description | +|---|---| +| `id` | `-` — unique, used for branch and file names | +| `status` | `running` / `paused` / `concluded` | +| `priority` | 1–10 (higher = more important); auto-pause lowest when at capacity | +| `hypothesis_type` | `statistical` (answer from existing data) or `implementation` (requires branch + forward testing) | +| `min_days` | Minimum picks days before conclusion analysis runs | +| `picks_log` | Dates when the runner collected picks on this branch | +| `conclusion` | `null` while running; `"accepted"` or `"rejected"` once concluded | + +--- + +## `/backtest-hypothesis` Command + +**Trigger:** `claude /backtest-hypothesis ""` + +**Flow:** + +1. **Classify** the hypothesis as `statistical` or `implementation`. + - Statistical: answerable from existing `performance_database.json` data — no code change needed. + - Implementation: requires a code change and forward-testing period. + +2. **Statistical path:** Run the analysis immediately against existing performance data. Write conclusion to the relevant scanner domain file (`docs/iterations/scanners/.md`). Done — no branch created. + +3. **Implementation path:** + a. Read `active.json`. 
If `running` count < 5, proceed. If at 5, auto-pause the entry with the lowest `priority` (set `status: "paused"`, keep branch alive). + b. Create branch `hypothesis/-` from `main`. + c. Implement the minimal code change on the branch. + d. Open a draft PR: title `hypothesis(): `, body describes the hypothesis, expected impact, and `min_days`. + e. Write new entry to `active.json` on `main` with `status: "running"`. + f. Print summary: branch name, PR number, expected conclusion date. + +**Priority scoring** (set at creation time): + +| Factor | Score contribution | +|---|---| +| Scanner has poor 30d win rate (<40%) | +3 | +| Change is low-complexity (1 file, 1 parameter) | +2 | +| Hypothesis directly addresses a known weak spot in LEARNINGS.md | +2 | +| High daily pick volume from scanner (more data faster) | +1 | +| Evidence from external research (arXiv, Alpha Architect, etc.) | +1 | +| Conflicting evidence or uncertain direction | -2 | + +Max score 9. Claude assigns this score and writes it to `active.json`. + +--- + +## Daily Hypothesis Runner (`hypothesis-runner.yml`) + +Runs at **08:00 UTC daily** (after iterate at 06:00 UTC). + +**Per-hypothesis loop** (for each entry with `status: "running"`): + +``` +1. git checkout hypothesis/<id> +2. Run daily discovery pipeline (same as daily-discovery.yml) +3. Append today's date to picks_log +4. Commit picks update back to hypothesis branch +5. If days_elapsed >= min_days: + a. Run statistical comparison vs baseline scanner (same scanner, main branch picks) + b. Compute: win rate delta, avg return delta, pick volume delta, p-value if N >= 20 + c. Decision rule: + - accepted if win rate delta > +5pp OR avg return delta > +1% (with p < 0.1 if N >= 20) + - rejected otherwise + d. Write concluded doc to docs/iterations/hypotheses/concluded/YYYY-MM-DD-<id>.md + e. Update scanner domain file with finding + f. Set status = "concluded", conclusion = "accepted"/"rejected" in active.json + g. 
If accepted: merge PR into main + If rejected: close PR without merging, delete hypothesis branch + h. Push active.json update to main +``` + +**Capacity:** 5 experiments × ~2 min each = ~10 min max runtime. Workflow timeout: 60 minutes. + +--- + +## Conclusion Document Format + +`docs/iterations/hypotheses/concluded/YYYY-MM-DD-<id>.md`: + +```markdown +# Hypothesis: <title> + +**Scanner:** options_flow +**Branch:** hypothesis/options_flow-scan-3-expirations +**Period:** 2026-04-09 → 2026-04-23 (14 days) +**Outcome:** accepted ✅ / rejected ❌ + +## Hypothesis +<original description> + +## Results + +| Metric | Baseline | Experiment | Delta | +|---|---|---|---| +| 7d win rate | 42% | 53% | +11pp | +| 30d avg return | -2.9% | +0.8% | +3.7% | +| Picks/day | 1.2 | 1.8 | +0.6 | + +## Decision +<1-2 sentences on why accepted/rejected> + +## Action +<what was merged or discarded> +``` + +--- + +## Dashboard Tab (`tradingagents/ui/pages/hypotheses.py`) + +New "Hypotheses" tab in the Streamlit dashboard. + +**Active experiments table:** + +| Hypothesis | Scanner | Status | Days | Picks | Expected Ready | Priority | +|---|---|---|---|---|---|---| +| Scan 3 expirations | options_flow | running | 3/14 | 4 | 2026-04-23 | 8 | +| ITM-only filter | options_flow | paused | 1/14 | 1 | — | 5 | + +**Concluded experiments table:** + +| Hypothesis | Scanner | Outcome | Concluded | Win Rate Delta | +|---|---|---|---|---| +| Premium filter >$25K | options_flow | ✅ merged | 2026-04-01 | +9pp | +| Reddit DD confidence gate | reddit_dd | ❌ rejected | 2026-03-20 | -3pp | + +Both tables read directly from `active.json` and the `concluded/` directory. No separate database. 
+ +--- + +## What Is Not In Scope + +- Hypothesis branches do not interact with each other (no cross-branch comparison) +- No A/B testing within a single discovery run (too complex, not needed) +- No email/Slack notifications (rolling PRs in GitHub are the notification mechanism) +- No manual override of priority scoring (set at creation, editable directly in `active.json`) From 36884966f1973502b90dbdd40051d79e1eff1e93 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Thu, 9 Apr 2026 23:50:55 -0700 Subject: [PATCH 02/14] =?UTF-8?q?docs(spec):=20fix=20hypothesis=20capacity?= =?UTF-8?q?=20=E2=80=94=20running=20experiments=20never=20paused?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pending hypotheses queue by priority and promote when a slot opens, rather than pausing a running experiment mid-streak. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- .../2026-04-09-hypothesis-backtesting-design.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md index 6fa943f6..042bb796 100644 --- a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md +++ b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md @@ -61,8 +61,8 @@ The `active.json` file lives on `main`. 
Each hypothesis branch (`hypothesis/<sca | Field | Description | |---|---| | `id` | `<scanner>-<slug>` — unique, used for branch and file names | -| `status` | `running` / `paused` / `concluded` | -| `priority` | 1–10 (higher = more important); auto-pause lowest when at capacity | +| `status` | `running` / `pending` / `concluded` | +| `priority` | 1–9 (higher = more important); determines queue order for `pending` hypotheses | | `hypothesis_type` | `statistical` (answer from existing data) or `implementation` (requires branch + forward testing) | | `min_days` | Minimum picks days before conclusion analysis runs | | `picks_log` | Dates when the runner collected picks on this branch | @@ -83,12 +83,14 @@ The `active.json` file lives on `main`. Each hypothesis branch (`hypothesis/<sca 2. **Statistical path:** Run the analysis immediately against existing performance data. Write conclusion to the relevant scanner domain file (`docs/iterations/scanners/<scanner>.md`). Done — no branch created. 3. **Implementation path:** - a. Read `active.json`. If `running` count < 5, proceed. If at 5, auto-pause the entry with the lowest `priority` (set `status: "paused"`, keep branch alive). + a. Read `active.json`. If `running` count < 5, start immediately. If all 5 slots are occupied by running experiments, add the new hypothesis as `status: "pending"` — running experiments are never interrupted (pausing mid-experiment breaks the picks streak and invalidates the statistical comparison). b. Create branch `hypothesis/<scanner>-<slug>` from `main`. c. Implement the minimal code change on the branch. d. Open a draft PR: title `hypothesis(<scanner>): <title>`, body describes the hypothesis, expected impact, and `min_days`. - e. Write new entry to `active.json` on `main` with `status: "running"`. - f. Print summary: branch name, PR number, expected conclusion date. + e. Write new entry to `active.json` on `main` with `status: "running"` (or `"pending"` if at capacity). + f. 
Print summary: branch name, PR number, expected start date (if pending), expected conclusion date (if running). + +**Pending → running promotion:** At the end of each daily runner cycle, after any experiments conclude, the runner checks for `pending` entries and promotes the highest-priority one to `running` if a slot opened up. **Priority scoring** (set at creation time): @@ -175,7 +177,7 @@ New "Hypotheses" tab in the Streamlit dashboard. | Hypothesis | Scanner | Status | Days | Picks | Expected Ready | Priority | |---|---|---|---|---|---|---| | Scan 3 expirations | options_flow | running | 3/14 | 4 | 2026-04-23 | 8 | -| ITM-only filter | options_flow | paused | 1/14 | 1 | — | 5 | +| ITM-only filter | options_flow | pending | 0/14 | 0 | waiting for slot | 5 | **Concluded experiments table:** From e0b6e28a3be0ce04674074b9b4dae9c9c1613d7c Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:04:58 -0700 Subject: [PATCH 03/14] docs(plan): hypothesis backtesting implementation plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- .../2026-04-10-hypothesis-backtesting.md | 1493 +++++++++++++++++ 1 file changed, 1493 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md diff --git a/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md b/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md new file mode 100644 index 00000000..711c3cb8 --- /dev/null +++ b/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md @@ -0,0 +1,1493 @@ +# Hypothesis Backtesting System — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Build a branch-per-hypothesis experimentation system that runs scanner code changes daily in isolation, accumulates picks, auto-concludes with a statistical comparison, and surfaces everything in the dashboard. + +**Architecture:** `active.json` is the registry (lives on `main`). Each hypothesis gets a `hypothesis/<scanner>-<slug>` branch with the code change. A daily workflow (08:00 UTC) uses git worktrees to run discovery on each branch, stores picks in `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch, and concludes when `min_days` elapsed. The `/backtest-hypothesis` command classifies, creates branches, and manages the registry. + +**Tech Stack:** Python 3.10, yfinance (`download_history`), GitHub Actions, Streamlit, `gh` CLI, `git worktree` + +--- + +## File Map + +| Path | Action | Purpose | +|---|---|---| +| `docs/iterations/hypotheses/active.json` | Create | Registry of all experiments | +| `docs/iterations/hypotheses/concluded/.gitkeep` | Create | Directory placeholder | +| `scripts/compare_hypothesis.py` | Create | Fetch returns + statistical comparison | +| `.claude/commands/backtest-hypothesis.md` | Create | `/backtest-hypothesis` Claude command | +| `.github/workflows/hypothesis-runner.yml` | Create | Daily 08:00 UTC runner | +| `tradingagents/ui/pages/hypotheses.py` | Create | Dashboard "Hypotheses" tab | +| `tradingagents/ui/pages/__init__.py` | Modify | Register new page | +| `tradingagents/ui/dashboard.py` | Modify | Add "Hypotheses" to nav | + +--- + +## Task 1: Hypothesis Registry Structure + +**Files:** +- Create: `docs/iterations/hypotheses/active.json` +- Create: `docs/iterations/hypotheses/concluded/.gitkeep` + +- [ ] **Step 1: Create the directory and initial `active.json`** + +```bash +mkdir -p docs/iterations/hypotheses/concluded +``` + +Write `docs/iterations/hypotheses/active.json`: + +```json +{ + "max_active": 5, + "hypotheses": [] +} +``` + +- [ ] **Step 2: Create the concluded directory placeholder** 
+ +```bash +touch docs/iterations/hypotheses/concluded/.gitkeep +``` + +- [ ] **Step 3: Verify JSON is valid** + +```bash +python3 -c "import json; json.load(open('docs/iterations/hypotheses/active.json')); print('valid')" +``` + +Expected: `valid` + +- [ ] **Step 4: Commit** + +```bash +git add docs/iterations/hypotheses/ +git commit -m "feat(hypotheses): initialize hypothesis registry" +``` + +--- + +## Task 2: Comparison Script + +**Files:** +- Create: `scripts/compare_hypothesis.py` +- Create: `tests/test_compare_hypothesis.py` + +`★ Insight ─────────────────────────────────────` +The comparison reads picks from the hypothesis branch via `git show <branch>:path` — this avoids checking out the branch just to read a file, keeping the working tree on `main` throughout. +`─────────────────────────────────────────────────` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_compare_hypothesis.py`: + +```python +"""Tests for the hypothesis comparison script.""" +import json +import subprocess +import sys +from datetime import date, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.compare_hypothesis import ( + compute_metrics, + compute_7d_return, + load_baseline_metrics, + make_decision, +) + + +# ── compute_metrics ────────────────────────────────────────────────────────── + +def test_compute_metrics_empty(): + result = compute_metrics([]) + assert result == {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None} + + +def test_compute_metrics_all_wins(): + picks = [ + {"return_7d": 5.0, "win_7d": True}, + {"return_7d": 3.0, "win_7d": True}, + ] + result = compute_metrics(picks) + assert result["win_rate"] == 100.0 + assert result["avg_return"] == 4.0 + assert result["evaluated"] == 2 + + +def test_compute_metrics_mixed(): + picks = [ + {"return_7d": 10.0, "win_7d": True}, + {"return_7d": -5.0, "win_7d": False}, + 
{"return_7d": None, "win_7d": None}, # pending — excluded + ] + result = compute_metrics(picks) + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 2.5 + assert result["evaluated"] == 2 + assert result["count"] == 3 + + +# ── compute_7d_return ──────────────────────────────────────────────────────── + +def test_compute_7d_return_positive(): + mock_df = MagicMock() + mock_df.empty = False + # Simulate DataFrame with Close column: entry=100, exit=110 + mock_df.__len__ = lambda self: 2 + mock_df["Close"].iloc.__getitem__ = MagicMock(side_effect=lambda i: 100.0 if i == 0 else 110.0) + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret == pytest.approx(10.0, rel=0.01) + assert win is True + + +def test_compute_7d_return_empty_data(): + mock_df = MagicMock() + mock_df.empty = True + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret is None + assert win is None + + +# ── load_baseline_metrics ──────────────────────────────────────────────────── + +def test_load_baseline_metrics(tmp_path): + db = { + "recommendations_by_date": { + "2026-03-01": [ + {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True}, + {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False}, + {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True}, + ] + } + } + db_file = tmp_path / "performance_database.json" + db_file.write_text(json.dumps(db)) + + result = load_baseline_metrics("options_flow", str(db_file)) + + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 1.5 + assert result["count"] == 2 + + +def test_load_baseline_metrics_missing_file(tmp_path): + result = load_baseline_metrics("options_flow", str(tmp_path / "missing.json")) + assert result == {"count": 0, "win_rate": None, "avg_return": None} + + +# ── make_decision 
───────────────────────────────────────────────────────────── + +def test_make_decision_accepted_by_win_rate(): + hyp = {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 0.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "win rate" in reason.lower() + + +def test_make_decision_accepted_by_return(): + hyp = {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "return" in reason.lower() + + +def test_make_decision_rejected(): + hyp = {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + + +def test_make_decision_insufficient_data(): + hyp = {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + assert "insufficient" in reason.lower() +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +python -m pytest tests/test_compare_hypothesis.py -v 2>&1 | head -30 +``` + +Expected: `ModuleNotFoundError: No module named 'scripts.compare_hypothesis'` or similar import error — confirms tests are wired correctly. + +- [ ] **Step 3: Write `scripts/compare_hypothesis.py`** + +```python +#!/usr/bin/env python3 +""" +Hypothesis comparison — computes 7d returns for hypothesis picks and +compares them against the baseline scanner in performance_database.json. 
+ +Usage (called by hypothesis-runner.yml after min_days elapsed): + python scripts/compare_hypothesis.py \\ + --hypothesis-id options_flow-scan-3-expirations \\ + --picks-json '{"picks": [...]}' \\ + --scanner options_flow \\ + --db-path data/recommendations/performance_database.json + +Prints a JSON conclusion to stdout: + { + "decision": "accepted", + "reason": "...", + "hypothesis": {"win_rate": 58.0, "avg_return": 1.8, "count": 14, "evaluated": 10}, + "baseline": {"win_rate": 42.0, "avg_return": -0.3, "count": 87} + } +""" + +import argparse +import json +import sys +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional, Tuple + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from tradingagents.dataflows.y_finance import download_history + + +# Minimum evaluated picks required to make a decision +_MIN_EVALUATED = 5 +# Thresholds from spec +_WIN_RATE_DELTA_THRESHOLD = 5.0 # percentage points +_AVG_RETURN_DELTA_THRESHOLD = 1.0 # percent + + +def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]: + """ + Fetch 7-day return for a pick using yfinance. + + Args: + ticker: Stock symbol, e.g. 
"AAPL" + pick_date: Date the pick was made, "YYYY-MM-DD" + + Returns: + (return_pct, is_win) or (None, None) if data unavailable + """ + try: + entry_dt = datetime.strptime(pick_date, "%Y-%m-%d") + exit_dt = entry_dt + timedelta(days=10) # +3 buffer for weekends/holidays + df = download_history( + ticker, + start=entry_dt.strftime("%Y-%m-%d"), + end=exit_dt.strftime("%Y-%m-%d"), + ) + if df.empty or len(df) < 2: + return None, None + + # Use first available close as entry, 7th trading day as exit + close = df["Close"] + entry_price = float(close.iloc[0]) + exit_idx = min(5, len(close) - 1) # ~7 calendar days = ~5 trading days + exit_price = float(close.iloc[exit_idx]) + + if entry_price <= 0: + return None, None + + ret = (exit_price - entry_price) / entry_price * 100 + return round(ret, 4), ret > 0 + + except Exception: + return None, None + + +def enrich_picks_with_returns(picks: list) -> list: + """ + Compute 7d return for each pick that is old enough (>= 7 days) and + doesn't already have return_7d populated. + + Args: + picks: List of pick dicts with at least 'ticker' and 'date' fields + + Returns: + Same list with return_7d and win_7d populated where possible + """ + cutoff = (datetime.utcnow() - timedelta(days=7)).strftime("%Y-%m-%d") + for pick in picks: + if pick.get("return_7d") is not None: + continue # already computed + if pick.get("date", "9999-99-99") > cutoff: + continue # too recent + ret, win = compute_7d_return(pick["ticker"], pick["date"]) + pick["return_7d"] = ret + pick["win_7d"] = win + return picks + + +def compute_metrics(picks: list) -> dict: + """ + Compute win rate and avg return for a list of picks. + + Only picks with non-None return_7d contribute to win_rate and avg_return. 
+ + Returns: + {"count": int, "evaluated": int, "win_rate": float|None, "avg_return": float|None} + """ + evaluated = [p for p in picks if p.get("return_7d") is not None] + if not evaluated: + return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None} + + wins = sum(1 for p in evaluated if p.get("win_7d")) + avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated) + return { + "count": len(picks), + "evaluated": len(evaluated), + "win_rate": round(wins / len(evaluated) * 100, 1), + "avg_return": round(avg_ret, 2), + } + + +def load_baseline_metrics(scanner: str, db_path: str) -> dict: + """ + Load baseline metrics for a scanner from performance_database.json. + + Args: + scanner: Scanner name, e.g. "options_flow" + db_path: Path to performance_database.json + + Returns: + {"count": int, "win_rate": float|None, "avg_return": float|None} + """ + path = Path(db_path) + if not path.exists(): + return {"count": 0, "win_rate": None, "avg_return": None} + + try: + with open(path) as f: + db = json.load(f) + except Exception: + return {"count": 0, "win_rate": None, "avg_return": None} + + picks = [] + for recs in db.get("recommendations_by_date", {}).values(): + for rec in (recs if isinstance(recs, list) else []): + if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None: + picks.append(rec) + + return compute_metrics(picks) + + +def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: + """ + Decide accepted or rejected based on metrics delta. 
+ + Rules: + - Minimum _MIN_EVALUATED evaluated picks required + - accepted if win_rate_delta > _WIN_RATE_DELTA_THRESHOLD (5pp) + OR avg_return_delta > _AVG_RETURN_DELTA_THRESHOLD (1%) + - rejected otherwise + + Returns: + (decision, reason) where decision is "accepted" or "rejected" + """ + evaluated = hypothesis.get("evaluated", 0) + if evaluated < _MIN_EVALUATED: + return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + + hyp_wr = hypothesis.get("win_rate") + hyp_ret = hypothesis.get("avg_return") + base_wr = baseline.get("win_rate") + base_ret = baseline.get("avg_return") + + reasons = [] + + if hyp_wr is not None and base_wr is not None: + delta_wr = hyp_wr - base_wr + if delta_wr > _WIN_RATE_DELTA_THRESHOLD: + reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + + if hyp_ret is not None and base_ret is not None: + delta_ret = hyp_ret - base_ret + if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: + reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + + if reasons: + return "accepted", "; ".join(reasons) + + wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" + + +def main(): + parser = argparse.ArgumentParser(description="Compare hypothesis picks against baseline") + parser.add_argument("--hypothesis-id", required=True) + parser.add_argument("--picks-json", required=True, help="JSON string of picks list") + parser.add_argument("--scanner", required=True, help="Baseline scanner name") + parser.add_argument( + "--db-path", + default="data/recommendations/performance_database.json", + help="Path to performance_database.json", + ) + args = parser.parse_args() + + picks = json.loads(args.picks_json) + 
picks = enrich_picks_with_returns(picks) + + hyp_metrics = compute_metrics(picks) + base_metrics = load_baseline_metrics(args.scanner, args.db_path) + + decision, reason = make_decision(hyp_metrics, base_metrics) + + result = { + "hypothesis_id": args.hypothesis_id, + "decision": decision, + "reason": reason, + "hypothesis": hyp_metrics, + "baseline": base_metrics, + "enriched_picks": picks, + } + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +python -m pytest tests/test_compare_hypothesis.py -v +``` + +Expected: all 10 tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add scripts/compare_hypothesis.py tests/test_compare_hypothesis.py +git commit -m "feat(hypotheses): add comparison + conclusion script" +``` + +--- + +## Task 3: `/backtest-hypothesis` Command + +**Files:** +- Create: `.claude/commands/backtest-hypothesis.md` + +- [ ] **Step 1: Write the command file** + +Create `.claude/commands/backtest-hypothesis.md`: + +````markdown +# /backtest-hypothesis + +Test a hypothesis about a scanner improvement using branch-per-hypothesis isolation. + +**Usage:** `/backtest-hypothesis "<description of the hypothesis>"` + +**Example:** `/backtest-hypothesis "options_flow: scan 3 expirations instead of 1 to capture institutional 30+ DTE positioning"` + +--- + +## Step 1: Read Current Registry + +Read `docs/iterations/hypotheses/active.json`. Note: +- How many hypotheses currently have `status: "running"` +- The `max_active` limit (default 5) +- Any existing `pending` entries + +Also read `docs/iterations/LEARNINGS.md` and the relevant scanner domain file in +`docs/iterations/scanners/` to understand the current baseline. + +## Step 2: Classify the Hypothesis + +Determine whether this is: + +**Statistical** — answerable from existing data in `data/recommendations/performance_database.json` +without any code change. 
Examples: +- "Does high confidence (≥8) predict better 30d returns?" +- "Are options_flow picks that are ITM outperforming OTM ones?" + +**Implementation** — requires a code change and forward-testing period. Examples: +- "Scan 3 expirations instead of 1" +- "Apply a premium filter of $50K instead of $25K" + +## Step 3a: Statistical Path + +If statistical: run the analysis now against `data/recommendations/performance_database.json`. +Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary. +Done — no branch needed. + +## Step 3b: Implementation Path + +### 3b-i: Capacity check + +Count running hypotheses from `active.json`. If fewer than `max_active` running, proceed. +If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER +paused mid-streak. Inform the user which slot it queued behind and when it will likely start. + +### 3b-ii: Score the hypothesis + +Assign a `priority` score (1–9) using these factors: + +| Factor | Score | +|---|---| +| Scanner 30d win rate < 40% | +3 | +| Change touches 1 file, 1 parameter | +2 | +| Directly addresses a weak spot in LEARNINGS.md | +2 | +| Scanner generates ≥2 picks/day (data accrues fast) | +1 | +| Supported by external research (arXiv, Alpha Architect, etc.) | +1 | +| Contradictory evidence or unclear direction | −2 | + +### 3b-iii: Determine min_days + +Set `min_days` based on the scanner's typical picks-per-day rate: +- ≥2 picks/day → 14 days +- 1 pick/day → 21 days +- <1 pick/day → 30 days + +### 3b-iv: Create the branch and implement the code change + +```bash +BRANCH="hypothesis/<scanner>-<slug>" +git checkout -b "$BRANCH" +``` + +Make the minimal code change that implements the hypothesis. Read the scanner file first. +Only change what the hypothesis requires — do not refactor surrounding code. 
+ +```bash +git add tradingagents/ +git commit -m "hypothesis(<scanner>): <title>" +``` + +### 3b-v: Create picks tracking file on the branch + +Create `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch: + +```json +{ + "hypothesis_id": "<id>", + "scanner": "<scanner>", + "picks": [] +} +``` + +```bash +mkdir -p docs/iterations/hypotheses/<id> +# write the file +git add docs/iterations/hypotheses/<id>/picks.json +git commit -m "hypothesis(<scanner>): add picks tracker" +git push -u origin "$BRANCH" +``` + +### 3b-vi: Open a draft PR + +```bash +gh pr create \ + --title "hypothesis(<scanner>): <title>" \ + --body "**Hypothesis:** <description> + +**Expected impact:** <high/medium/low> +**Min days:** <N> +**Priority:** <score>/9 + +*This is an automated hypothesis experiment. It will be auto-concluded after ${MIN_DAYS} days of data.*" \ + --draft \ + --base main +``` + +Note the PR number from the output. + +### 3b-vii: Update active.json on main + +Check out `main`, then update `docs/iterations/hypotheses/active.json` to add the new entry: + +```json +{ + "id": "<scanner>-<slug>", + "scanner": "<scanner>", + "title": "<title>", + "description": "<description>", + "branch": "hypothesis/<scanner>-<slug>", + "pr_number": <N>, + "status": "running", + "priority": <score>, + "expected_impact": "<high|medium|low>", + "hypothesis_type": "implementation", + "created_at": "<YYYY-MM-DD>", + "min_days": <N>, + "days_elapsed": 0, + "picks_log": [], + "baseline_scanner": "<scanner>", + "conclusion": null +} +``` + +```bash +git checkout main +git add docs/iterations/hypotheses/active.json +git commit -m "feat(hypotheses): register hypothesis <id>" +git push origin main +``` + +## Step 4: Print Summary + +Print a confirmation: +- Hypothesis ID and branch name +- Status: running or pending +- Expected conclusion date (created_at + min_days) +- PR link (if running) +- Priority score and why +```` + +- [ ] **Step 2: Verify the file exists and is non-empty** + 
+```bash
+wc -l .claude/commands/backtest-hypothesis.md
+```
+
+Expected: at least 80 lines.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add .claude/commands/backtest-hypothesis.md
+git commit -m "feat(hypotheses): add /backtest-hypothesis command"
+```
+
+---
+
+## Task 4: Hypothesis Runner Workflow
+
+**Files:**
+- Create: `.github/workflows/hypothesis-runner.yml`
+
+- [ ] **Step 1: Write the workflow**
+
+Create `.github/workflows/hypothesis-runner.yml`:
+
+```yaml
+name: Hypothesis Runner
+
+on:
+  schedule:
+    # 8:00 AM UTC daily — after iterate (06:00), before daily-discovery (12:30)
+    - cron: "0 8 * * *"
+  workflow_dispatch:
+    inputs:
+      hypothesis_id:
+        description: "Run a specific hypothesis ID only (blank = all running)"
+        required: false
+        default: ""
+
+env:
+  PYTHON_VERSION: "3.10"
+
+jobs:
+  run-hypotheses:
+    runs-on: ubuntu-latest
+    environment: TradingAgent
+    timeout-minutes: 60
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.GH_TOKEN }}
+
+      - name: Set up git identity
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: pip
+
+      - name: Install dependencies
+        run: pip install --upgrade pip && pip install -e .
+ + - name: Run hypothesis experiments + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + FINNHUB_API_KEY: ${{ secrets.FINNHUB_API_KEY }} + ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }} + FMP_API_KEY: ${{ secrets.FMP_API_KEY }} + REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }} + REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }} + TRADIER_API_KEY: ${{ secrets.TRADIER_API_KEY }} + FILTER_ID: ${{ inputs.hypothesis_id }} + run: | + python scripts/run_hypothesis_runner.py + + - name: Commit active.json updates + run: | + git add docs/iterations/hypotheses/active.json || true + if git diff --cached --quiet; then + echo "No registry changes" + else + git commit -m "chore(hypotheses): update registry $(date -u +%Y-%m-%d)" + git pull --rebase origin main + git push origin main + fi +``` + +- [ ] **Step 2: Write `scripts/run_hypothesis_runner.py`** + +Create `scripts/run_hypothesis_runner.py`: + +```python +#!/usr/bin/env python3 +""" +Hypothesis Runner — orchestrates daily experiment cycles. + +For each running hypothesis in active.json: + 1. Creates a git worktree for the hypothesis branch + 2. Runs the daily discovery pipeline in that worktree + 3. Extracts picks from the discovery result, appends to picks.json + 4. Commits and pushes picks to hypothesis branch + 5. Removes worktree + 6. Updates active.json (days_elapsed, picks_log) + 7. If days_elapsed >= min_days: concludes the hypothesis + +After all hypotheses: promotes highest-priority pending → running if a slot opened. 
+ +Environment variables read: + FILTER_ID — if set, only run the hypothesis with this ID +""" + +import json +import os +import subprocess +import sys +from datetime import datetime, timedelta +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +ACTIVE_JSON = ROOT / "docs/iterations/hypotheses/active.json" +CONCLUDED_DIR = ROOT / "docs/iterations/hypotheses/concluded" +DB_PATH = ROOT / "data/recommendations/performance_database.json" +TODAY = datetime.utcnow().strftime("%Y-%m-%d") + + +def load_registry() -> dict: + with open(ACTIVE_JSON) as f: + return json.load(f) + + +def save_registry(registry: dict) -> None: + with open(ACTIVE_JSON, "w") as f: + json.dump(registry, f, indent=2) + + +def run(cmd: list, cwd: str = None, check: bool = True) -> subprocess.CompletedProcess: + print(f" $ {' '.join(cmd)}", flush=True) + return subprocess.run(cmd, cwd=cwd or str(ROOT), check=check, capture_output=False) + + +def run_capture(cmd: list, cwd: str = None) -> str: + result = subprocess.run(cmd, cwd=cwd or str(ROOT), capture_output=True, text=True) + return result.stdout.strip() + + +def extract_picks(worktree: str, scanner: str) -> list: + """ + Extract picks for the given scanner from the most recent discovery result + in the worktree's results/discovery/<TODAY>/ directory. 
+ """ + results_dir = Path(worktree) / "results" / "discovery" / TODAY + if not results_dir.exists(): + print(f" No discovery results for {TODAY} in worktree", flush=True) + return [] + + picks = [] + for run_dir in sorted(results_dir.iterdir()): + result_file = run_dir / "discovery_result.json" + if not result_file.exists(): + continue + try: + with open(result_file) as f: + data = json.load(f) + for item in data.get("final_ranking", []): + if item.get("strategy_match") == scanner: + picks.append({ + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + }) + except Exception as e: + print(f" Warning: could not read {result_file}: {e}", flush=True) + + return picks + + +def load_picks_from_branch(hypothesis_id: str, branch: str) -> list: + """Load picks.json from the hypothesis branch using git show.""" + picks_path = f"docs/iterations/hypotheses/{hypothesis_id}/picks.json" + result = subprocess.run( + ["git", "show", f"{branch}:{picks_path}"], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + return [] + try: + return json.loads(result.stdout).get("picks", []) + except Exception: + return [] + + +def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, picks: list) -> None: + """Write updated picks.json into the worktree and commit.""" + picks_dir = Path(worktree) / "docs" / "iterations" / "hypotheses" / hypothesis_id + picks_dir.mkdir(parents=True, exist_ok=True) + picks_file = picks_dir / "picks.json" + payload = {"hypothesis_id": hypothesis_id, "scanner": scanner, "picks": picks} + picks_file.write_text(json.dumps(payload, indent=2)) + + run(["git", "add", str(picks_file)], cwd=worktree) + result = subprocess.run( + ["git", "diff", "--cached", "--quiet"], cwd=worktree + ) + if result.returncode != 0: + run( + ["git", "commit", "-m", f"chore(hypotheses): picks {TODAY} for 
{hypothesis_id}"], + cwd=worktree, + ) + + +def run_hypothesis(hyp: dict) -> bool: + """ + Run one hypothesis experiment cycle. Returns True if the experiment concluded. + """ + hid = hyp["id"] + branch = hyp["branch"] + scanner = hyp["scanner"] + worktree = f"/tmp/hyp-{hid}" + + print(f"\n── Hypothesis: {hid} ──", flush=True) + + # 1. Create worktree + run(["git", "fetch", "origin", branch], check=False) + run(["git", "worktree", "add", worktree, branch]) + + try: + # 2. Run discovery in worktree + result = subprocess.run( + [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + cwd=worktree, + check=False, + ) + if result.returncode != 0: + print(f" Discovery failed for {hid}, skipping picks update", flush=True) + else: + # 3. Extract picks + merge with existing + new_picks = extract_picks(worktree, scanner) + existing_picks = load_picks_from_branch(hid, branch) + # Deduplicate by (date, ticker) + seen = {(p["date"], p["ticker"]) for p in existing_picks} + merged = existing_picks + [p for p in new_picks if (p["date"], p["ticker"]) not in seen] + + # 4. Save picks + commit in worktree + save_picks_to_worktree(worktree, hid, scanner, merged) + + # 5. Push hypothesis branch + run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) + + # 6. Update registry fields + if TODAY not in hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) + + # 7. Check conclusion + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) + + finally: + run(["git", "worktree", "remove", "--force", worktree], check=False) + + return False + + +def conclude_hypothesis(hyp: dict) -> bool: + """Run comparison, write conclusion doc, close/merge PR. 
Returns True.""" + hid = hyp["id"] + scanner = hyp["scanner"] + branch = hyp["branch"] + + print(f"\n Concluding {hid}...", flush=True) + + # Load picks from branch + picks = load_picks_from_branch(hid, branch) + if not picks: + print(f" No picks found for {hid}, marking rejected", flush=True) + conclusion = { + "decision": "rejected", + "reason": "No picks were collected during the experiment period", + "hypothesis": {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None}, + "baseline": {"count": 0, "win_rate": None, "avg_return": None}, + } + else: + # Run comparison script + result = subprocess.run( + [ + sys.executable, "scripts/compare_hypothesis.py", + "--hypothesis-id", hid, + "--picks-json", json.dumps(picks), + "--scanner", scanner, + "--db-path", str(DB_PATH), + ], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f" compare_hypothesis.py failed: {result.stderr}", flush=True) + return False + conclusion = json.loads(result.stdout) + + decision = conclusion["decision"] + hyp_metrics = conclusion["hypothesis"] + base_metrics = conclusion["baseline"] + + # Write concluded doc + period_start = hyp.get("created_at", TODAY) + concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md" + concluded_doc.write_text( + f"# Hypothesis: {hyp['title']}\n\n" + f"**Scanner:** {scanner}\n" + f"**Branch:** {branch}\n" + f"**Period:** {period_start} → {TODAY} ({hyp['days_elapsed']} days)\n" + f"**Outcome:** {'accepted ✅' if decision == 'accepted' else 'rejected ❌'}\n\n" + f"## Hypothesis\n{hyp.get('description', hyp['title'])}\n\n" + f"## Results\n\n" + f"| Metric | Baseline | Experiment | Delta |\n" + f"|---|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | " + f"{hyp_metrics.get('win_rate') or '—'}% | " + f"{_delta_str(hyp_metrics.get('win_rate'), base_metrics.get('win_rate'), 'pp')} |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | " + f"{hyp_metrics.get('avg_return') or '—'}% | " 
+ f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n" + f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n" + f"## Decision\n{conclusion['reason']}\n\n" + f"## Action\n" + f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + ) + + run(["git", "add", str(concluded_doc)], check=False) + + # Close or merge PR + pr = hyp.get("pr_number") + if pr: + if decision == "accepted": + subprocess.run( + ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], + cwd=str(ROOT), check=False, + ) + else: + subprocess.run( + ["gh", "pr", "close", str(pr), "--delete-branch"], + cwd=str(ROOT), check=False, + ) + + # Update registry entry + hyp["status"] = "concluded" + hyp["conclusion"] = decision + + print(f" {hid}: {decision} — {conclusion['reason']}", flush=True) + return True + + +def _delta_str(hyp_val, base_val, unit: str) -> str: + if hyp_val is None or base_val is None: + return "—" + delta = hyp_val - base_val + sign = "+" if delta >= 0 else "" + return f"{sign}{delta:.1f}{unit}" + + +def promote_pending(registry: dict) -> None: + """Promote the highest-priority pending hypothesis to running if a slot is open.""" + running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") + max_active = registry.get("max_active", 5) + if running_count >= max_active: + return + + pending = [h for h in registry["hypotheses"] if h["status"] == "pending"] + if not pending: + return + + # Promote highest priority + to_promote = max(pending, key=lambda h: h.get("priority", 0)) + to_promote["status"] = "running" + print(f"\n Promoted pending hypothesis to running: {to_promote['id']}", flush=True) + + +def main(): + registry = load_registry() + filter_id = os.environ.get("FILTER_ID", "").strip() + + hypotheses = registry.get("hypotheses", []) + running = [ + h for h in hypotheses + if h["status"] == "running" and (not filter_id or h["id"] == 
filter_id) + ] + + if not running: + print("No running hypotheses to process.", flush=True) + else: + for hyp in running: + run_hypothesis(hyp) + + promote_pending(registry) + save_registry(registry) + print("\nRegistry updated.", flush=True) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 3: Verify the workflow YAML is valid** + +```bash +python3 -c "import yaml; yaml.safe_load(open('.github/workflows/hypothesis-runner.yml'))" 2>/dev/null \ + || python3 -c " +import re, sys +with open('.github/workflows/hypothesis-runner.yml') as f: + content = f.read() +# Just check the file exists and has the cron line +assert '0 8 * * *' in content, 'missing cron' +print('workflow file looks good') +" +``` + +- [ ] **Step 4: Commit** + +```bash +git add .github/workflows/hypothesis-runner.yml scripts/run_hypothesis_runner.py +git commit -m "feat(hypotheses): add daily hypothesis runner workflow" +``` + +--- + +## Task 5: Dashboard Hypotheses Tab + +**Files:** +- Create: `tradingagents/ui/pages/hypotheses.py` +- Modify: `tradingagents/ui/pages/__init__.py` +- Modify: `tradingagents/ui/dashboard.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_hypotheses_page.py`: + +```python +"""Tests for the hypotheses dashboard page data loading.""" +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + + +from tradingagents.ui.pages.hypotheses import ( + load_active_hypotheses, + load_concluded_hypotheses, + days_until_ready, +) + + +# ── load_active_hypotheses ──────────────────────────────────────────────────── + +def test_load_active_hypotheses(tmp_path): + active = { + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-test", + "title": "Test hypothesis", + "scanner": "options_flow", + "status": "running", + "priority": 7, + "days_elapsed": 5, + "min_days": 14, + "created_at": "2026-04-01", + "picks_log": ["2026-04-01"] * 5, + "conclusion": None, + } + ], + } + f = 
tmp_path / "active.json" + f.write_text(json.dumps(active)) + + result = load_active_hypotheses(str(f)) + assert len(result) == 1 + assert result[0]["id"] == "options_flow-test" + + +def test_load_active_hypotheses_missing_file(tmp_path): + result = load_active_hypotheses(str(tmp_path / "missing.json")) + assert result == [] + + +# ── load_concluded_hypotheses ───────────────────────────────────────────────── + +def test_load_concluded_hypotheses(tmp_path): + doc = tmp_path / "2026-04-10-options_flow-test.md" + doc.write_text( + "# Hypothesis: Test\n\n" + "**Scanner:** options_flow\n" + "**Period:** 2026-03-27 → 2026-04-10 (14 days)\n" + "**Outcome:** accepted ✅\n" + ) + + results = load_concluded_hypotheses(str(tmp_path)) + assert len(results) == 1 + assert results[0]["filename"] == doc.name + assert results[0]["outcome"] == "accepted ✅" + + +def test_load_concluded_hypotheses_empty_dir(tmp_path): + results = load_concluded_hypotheses(str(tmp_path)) + assert results == [] + + +# ── days_until_ready ────────────────────────────────────────────────────────── + +def test_days_until_ready_has_days_left(): + hyp = {"days_elapsed": 5, "min_days": 14} + assert days_until_ready(hyp) == 9 + + +def test_days_until_ready_past_due(): + hyp = {"days_elapsed": 15, "min_days": 14} + assert days_until_ready(hyp) == 0 +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +python -m pytest tests/test_hypotheses_page.py -v 2>&1 | head -20 +``` + +Expected: `ModuleNotFoundError` for `tradingagents.ui.pages.hypotheses`. + +- [ ] **Step 3: Write `tradingagents/ui/pages/hypotheses.py`** + +```python +""" +Hypotheses dashboard page — tracks active and concluded experiments. + +Reads docs/iterations/hypotheses/active.json and the concluded/ directory. +No external API calls; all data is file-based. 
+""" + +import json +import re +from pathlib import Path +from typing import Any, Dict, List + +import streamlit as st + +from tradingagents.ui.theme import COLORS, page_header + +_REPO_ROOT = Path(__file__).parent.parent.parent.parent +_ACTIVE_JSON = _REPO_ROOT / "docs/iterations/hypotheses/active.json" +_CONCLUDED_DIR = _REPO_ROOT / "docs/iterations/hypotheses/concluded" + + +# ── Data loaders ───────────────────────────────────────────────────────────── + + +def load_active_hypotheses(active_path: str = str(_ACTIVE_JSON)) -> List[Dict[str, Any]]: + """Load all hypotheses from active.json. Returns [] if file missing.""" + path = Path(active_path) + if not path.exists(): + return [] + try: + with open(path) as f: + data = json.load(f) + return data.get("hypotheses", []) + except Exception: + return [] + + +def load_concluded_hypotheses(concluded_dir: str = str(_CONCLUDED_DIR)) -> List[Dict[str, Any]]: + """ + Load concluded hypothesis metadata by parsing the markdown files in concluded/. + + Extracts: filename, title, scanner, period, outcome from each .md file. 
+ """ + dir_path = Path(concluded_dir) + if not dir_path.exists(): + return [] + + results = [] + for md_file in sorted(dir_path.glob("*.md"), reverse=True): + if md_file.name == ".gitkeep": + continue + try: + text = md_file.read_text() + title = _extract_md_field(text, r"^# Hypothesis: (.+)$") + scanner = _extract_md_field(text, r"^\*\*Scanner:\*\* (.+)$") + period = _extract_md_field(text, r"^\*\*Period:\*\* (.+)$") + outcome = _extract_md_field(text, r"^\*\*Outcome:\*\* (.+)$") + results.append({ + "filename": md_file.name, + "title": title or md_file.stem, + "scanner": scanner or "—", + "period": period or "—", + "outcome": outcome or "—", + }) + except Exception: + continue + + return results + + +def _extract_md_field(text: str, pattern: str) -> str: + """Extract a field value from a markdown line using regex.""" + match = re.search(pattern, text, re.MULTILINE) + return match.group(1).strip() if match else "" + + +def days_until_ready(hyp: Dict[str, Any]) -> int: + """Return number of days remaining before hypothesis can conclude (min 0).""" + return max(0, hyp.get("min_days", 14) - hyp.get("days_elapsed", 0)) + + +# ── Rendering ───────────────────────────────────────────────────────────────── + + +def render() -> None: + """Render the hypotheses tracking page.""" + st.markdown( + page_header("Hypotheses", "Active experiments & concluded findings"), + unsafe_allow_html=True, + ) + + hypotheses = load_active_hypotheses() + concluded = load_concluded_hypotheses() + + if not hypotheses and not concluded: + st.info( + "No hypotheses yet. Run `/backtest-hypothesis \"<description>\"` to start an experiment." 
+ ) + return + + # ── Active experiments ──────────────────────────────────────────────────── + running = [h for h in hypotheses if h["status"] == "running"] + pending = [h for h in hypotheses if h["status"] == "pending"] + + st.markdown( + f'<div class="section-title">Active Experiments ' + f'<span class="accent">// {len(running)} running, {len(pending)} pending</span></div>', + unsafe_allow_html=True, + ) + + if running or pending: + active_rows = [] + for h in sorted(running + pending, key=lambda x: -x.get("priority", 0)): + days_left = days_until_ready(h) + ready_str = "concluding soon" if days_left == 0 else f"{days_left}d left" + status_color = COLORS["green"] if h["status"] == "running" else COLORS["amber"] + active_rows.append({ + "ID": h["id"], + "Title": h.get("title", "—"), + "Scanner": h.get("scanner", "—"), + "Status": h["status"], + "Progress": f"{h.get('days_elapsed', 0)}/{h.get('min_days', 14)}d", + "Picks": len(h.get("picks_log", [])), + "Ready": ready_str, + "Priority": h.get("priority", "—"), + }) + + import pandas as pd + df = pd.DataFrame(active_rows) + st.dataframe( + df, + width="stretch", + hide_index=True, + column_config={ + "ID": st.column_config.TextColumn(width="medium"), + "Title": st.column_config.TextColumn(width="large"), + "Scanner": st.column_config.TextColumn(width="medium"), + "Status": st.column_config.TextColumn(width="small"), + "Progress": st.column_config.TextColumn(width="small"), + "Picks": st.column_config.NumberColumn(format="%d", width="small"), + "Ready": st.column_config.TextColumn(width="medium"), + "Priority": st.column_config.NumberColumn(format="%d/9", width="small"), + }, + ) + else: + st.info("No active experiments.") + + st.markdown("<div style='height:1.5rem;'></div>", unsafe_allow_html=True) + + # ── Concluded experiments ───────────────────────────────────────────────── + st.markdown( + f'<div class="section-title">Concluded Experiments ' + f'<span class="accent">// {len(concluded)} total</span></div>', + 
unsafe_allow_html=True, + ) + + if concluded: + import pandas as pd + concluded_rows = [] + for c in concluded: + outcome = c["outcome"] + emoji = "✅" if "accepted" in outcome else "❌" + concluded_rows.append({ + "Date": c["filename"][:10], + "Title": c["title"], + "Scanner": c["scanner"], + "Period": c["period"], + "Outcome": emoji, + }) + cdf = pd.DataFrame(concluded_rows) + st.dataframe( + cdf, + width="stretch", + hide_index=True, + column_config={ + "Date": st.column_config.TextColumn(width="small"), + "Title": st.column_config.TextColumn(width="large"), + "Scanner": st.column_config.TextColumn(width="medium"), + "Period": st.column_config.TextColumn(width="medium"), + "Outcome": st.column_config.TextColumn(width="small"), + }, + ) + else: + st.info("No concluded experiments yet.") +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +python -m pytest tests/test_hypotheses_page.py -v +``` + +Expected: all 6 tests pass. + +- [ ] **Step 5: Register the page in `tradingagents/ui/pages/__init__.py`** + +Add after the `settings` import block (around line 38): + +```python +try: + from tradingagents.ui.pages import hypotheses +except Exception as _e: + _logger.error("Failed to import hypotheses page: %s", _e, exc_info=True) + hypotheses = None +``` + +And add `"hypotheses"` to `__all__`: + +```python +__all__ = [ + "home", + "todays_picks", + "portfolio", + "performance", + "settings", + "hypotheses", +] +``` + +- [ ] **Step 6: Add "Hypotheses" to dashboard navigation in `tradingagents/ui/dashboard.py`** + +In `render_sidebar`, change the `options` list: + +```python +page = st.radio( + "Navigation", + options=["Overview", "Signals", "Portfolio", "Performance", "Hypotheses", "Config"], + label_visibility="collapsed", +) +``` + +In `route_page`, add to `page_map`: + +```python +page_map = { + "Overview": pages.home, + "Signals": pages.todays_picks, + "Portfolio": pages.portfolio, + "Performance": pages.performance, + "Hypotheses": pages.hypotheses, + 
"Config": pages.settings, +} +``` + +- [ ] **Step 7: Run the full test suite** + +```bash +python -m pytest tests/test_compare_hypothesis.py tests/test_hypotheses_page.py -v +``` + +Expected: all 16 tests pass. + +- [ ] **Step 8: Commit everything** + +```bash +git add \ + tradingagents/ui/pages/hypotheses.py \ + tradingagents/ui/pages/__init__.py \ + tradingagents/ui/dashboard.py \ + tests/test_hypotheses_page.py +git commit -m "feat(hypotheses): add Hypotheses dashboard tab" +``` + +--- + +## Self-Review + +**Spec coverage check:** +- ✅ `active.json` schema with `status: running/pending/concluded` — Task 1 +- ✅ `/backtest-hypothesis` command: classify, priority scoring, pending queue, branch creation — Task 3 +- ✅ Running experiments never paused — enforced in `run_hypothesis_runner.py` (only `running` entries processed; new ones queue as `pending`) +- ✅ Daily runner: worktree per hypothesis, run discovery, commit picks, conclude — Task 4 +- ✅ Statistical comparison with 5pp / 1% thresholds, minimum 5 evaluated picks — Task 2 +- ✅ Auto-promote pending → running when slot opens — `promote_pending()` in runner +- ✅ Concluded doc written with metrics table — `conclude_hypothesis()` in runner +- ✅ PR merged (accepted) or closed (rejected) automatically — `conclude_hypothesis()` +- ✅ Dashboard tab with active + concluded tables — Task 5 + +**Type/name consistency:** +- `hypothesis_id` / `hid` / `id` field: the dict key is always `"id"`, the local var is `hid`, the argument is `--hypothesis-id` — consistent throughout +- `picks.json` structure: `{"hypothesis_id": ..., "scanner": ..., "picks": [...]}` — used in `save_picks_to_worktree` and `load_picks_from_branch` consistently +- `strategy_match` field used to filter picks in `extract_picks` — matches `discovery_result.json` structure confirmed by inspection From d3065f59f1abafd85abfa3c081790a54865a4a85 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:26:17 
-0700 Subject: [PATCH 04/14] feat(hypotheses): initialize hypothesis registry --- docs/iterations/hypotheses/active.json | 4 ++++ docs/iterations/hypotheses/concluded/.gitkeep | 0 2 files changed, 4 insertions(+) create mode 100644 docs/iterations/hypotheses/active.json create mode 100644 docs/iterations/hypotheses/concluded/.gitkeep diff --git a/docs/iterations/hypotheses/active.json b/docs/iterations/hypotheses/active.json new file mode 100644 index 00000000..6ed3446d --- /dev/null +++ b/docs/iterations/hypotheses/active.json @@ -0,0 +1,4 @@ +{ + "max_active": 5, + "hypotheses": [] +} diff --git a/docs/iterations/hypotheses/concluded/.gitkeep b/docs/iterations/hypotheses/concluded/.gitkeep new file mode 100644 index 00000000..e69de29b From 6c438f87e6cc71d79566a9b83e46a6bacd5289e7 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:29:08 -0700 Subject: [PATCH 05/14] feat(hypotheses): add comparison + conclusion script Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- scripts/__init__.py | 0 scripts/compare_hypothesis.py | 153 +++++++++++++++++++++++++++++++ tests/test_compare_hypothesis.py | 135 +++++++++++++++++++++++++++ 3 files changed, 288 insertions(+) create mode 100644 scripts/__init__.py create mode 100644 scripts/compare_hypothesis.py create mode 100644 tests/test_compare_hypothesis.py diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py new file mode 100644 index 00000000..991f5baf --- /dev/null +++ b/scripts/compare_hypothesis.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Hypothesis comparison — computes 7d returns for hypothesis picks and +compares them against the baseline scanner in performance_database.json. 
+ +Usage (called by hypothesis-runner.yml after min_days elapsed): + python scripts/compare_hypothesis.py \ + --hypothesis-id options_flow-scan-3-expirations \ + --picks-json '[{"date": "2026-04-01", "ticker": "AAPL", ...}]' \ + --scanner options_flow \ + --db-path data/recommendations/performance_database.json + +Prints a JSON conclusion to stdout. +""" + +import argparse +import json +import sys +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional, Tuple + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from tradingagents.dataflows.y_finance import download_history + +_MIN_EVALUATED = 5 +_WIN_RATE_DELTA_THRESHOLD = 5.0 +_AVG_RETURN_DELTA_THRESHOLD = 1.0 + + +def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]: + """Fetch 7-day return for a pick using yfinance. Returns (pct, is_win) or (None, None).""" + try: + entry_dt = datetime.strptime(pick_date, "%Y-%m-%d") + exit_dt = entry_dt + timedelta(days=10) + df = download_history( + ticker, + start=entry_dt.strftime("%Y-%m-%d"), + end=exit_dt.strftime("%Y-%m-%d"), + ) + if df.empty or len(df) < 2: + return None, None + close = df["Close"] + entry_price = float(close.iloc[0]) + exit_idx = min(5, len(close) - 1) + exit_price = float(close.iloc[exit_idx]) + if entry_price <= 0: + return None, None + ret = (exit_price - entry_price) / entry_price * 100 + return round(ret, 4), ret > 0 + except Exception: + return None, None + + +def enrich_picks_with_returns(picks: list) -> list: + """Compute 7d return for each pick >= 7 days old that lacks return_7d.""" + cutoff = (datetime.utcnow() - timedelta(days=7)).strftime("%Y-%m-%d") + for pick in picks: + if pick.get("return_7d") is not None: + continue + if pick.get("date", "9999-99-99") > cutoff: + continue + ret, win = compute_7d_return(pick["ticker"], pick["date"]) + pick["return_7d"] = ret + pick["win_7d"] = win + return picks + + +def compute_metrics(picks: 
list) -> dict: + """Compute win rate and avg return. Only picks with non-None return_7d are evaluated.""" + evaluated = [p for p in picks if p.get("return_7d") is not None] + if not evaluated: + return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None} + wins = sum(1 for p in evaluated if p.get("win_7d")) + avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated) + return { + "count": len(picks), + "evaluated": len(evaluated), + "win_rate": round(wins / len(evaluated) * 100, 1), + "avg_return": round(avg_ret, 2), + } + + +def load_baseline_metrics(scanner: str, db_path: str) -> dict: + """Load baseline metrics for a scanner from performance_database.json.""" + path = Path(db_path) + if not path.exists(): + return {"count": 0, "win_rate": None, "avg_return": None} + try: + with open(path) as f: + db = json.load(f) + except Exception: + return {"count": 0, "win_rate": None, "avg_return": None} + picks = [] + for recs in db.get("recommendations_by_date", {}).values(): + for rec in (recs if isinstance(recs, list) else []): + if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None: + picks.append(rec) + return compute_metrics(picks) + + +def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: + """Decide accepted/rejected. 
Requires _MIN_EVALUATED evaluated picks.""" + evaluated = hypothesis.get("evaluated", 0) + if evaluated < _MIN_EVALUATED: + return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + hyp_wr = hypothesis.get("win_rate") + hyp_ret = hypothesis.get("avg_return") + base_wr = baseline.get("win_rate") + base_ret = baseline.get("avg_return") + reasons = [] + if hyp_wr is not None and base_wr is not None: + delta_wr = hyp_wr - base_wr + if delta_wr > _WIN_RATE_DELTA_THRESHOLD: + reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + if hyp_ret is not None and base_ret is not None: + delta_ret = hyp_ret - base_ret + if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: + reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + if reasons: + return "accepted", "; ".join(reasons) + wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--hypothesis-id", required=True) + parser.add_argument("--picks-json", required=True) + parser.add_argument("--scanner", required=True) + parser.add_argument("--db-path", default="data/recommendations/performance_database.json") + args = parser.parse_args() + picks = json.loads(args.picks_json) + picks = enrich_picks_with_returns(picks) + hyp_metrics = compute_metrics(picks) + base_metrics = load_baseline_metrics(args.scanner, args.db_path) + decision, reason = make_decision(hyp_metrics, base_metrics) + result = { + "hypothesis_id": args.hypothesis_id, + "decision": decision, + "reason": reason, + "hypothesis": hyp_metrics, + "baseline": base_metrics, + "enriched_picks": picks, + } + print(json.dumps(result, indent=2)) + 
+ +if __name__ == "__main__": + main() diff --git a/tests/test_compare_hypothesis.py b/tests/test_compare_hypothesis.py new file mode 100644 index 00000000..2cf41609 --- /dev/null +++ b/tests/test_compare_hypothesis.py @@ -0,0 +1,135 @@ +"""Tests for the hypothesis comparison script.""" +import json +import sys +from datetime import date, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.compare_hypothesis import ( + compute_metrics, + compute_7d_return, + load_baseline_metrics, + make_decision, +) + + +# ── compute_metrics ────────────────────────────────────────────────────────── + +def test_compute_metrics_empty(): + result = compute_metrics([]) + assert result == {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None} + + +def test_compute_metrics_all_wins(): + picks = [ + {"return_7d": 5.0, "win_7d": True}, + {"return_7d": 3.0, "win_7d": True}, + ] + result = compute_metrics(picks) + assert result["win_rate"] == 100.0 + assert result["avg_return"] == 4.0 + assert result["evaluated"] == 2 + + +def test_compute_metrics_mixed(): + picks = [ + {"return_7d": 10.0, "win_7d": True}, + {"return_7d": -5.0, "win_7d": False}, + {"return_7d": None, "win_7d": None}, # pending — excluded + ] + result = compute_metrics(picks) + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 2.5 + assert result["evaluated"] == 2 + assert result["count"] == 3 + + +# ── compute_7d_return ──────────────────────────────────────────────────────── + +def test_compute_7d_return_positive(): + import pandas as pd + + close_data = [100.0, 101.0, 102.0, 103.0, 104.0, 110.0] + mock_df = pd.DataFrame({"Close": close_data}) + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret == pytest.approx(10.0, rel=0.01) + assert win is True + + +def 
test_compute_7d_return_empty_data(): + import pandas as pd + + mock_df = pd.DataFrame() + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret is None + assert win is None + + +# ── load_baseline_metrics ──────────────────────────────────────────────────── + +def test_load_baseline_metrics(tmp_path): + db = { + "recommendations_by_date": { + "2026-03-01": [ + {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True}, + {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False}, + {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True}, + ] + } + } + db_file = tmp_path / "performance_database.json" + db_file.write_text(json.dumps(db)) + + result = load_baseline_metrics("options_flow", str(db_file)) + + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 1.5 + assert result["count"] == 2 + + +def test_load_baseline_metrics_missing_file(tmp_path): + result = load_baseline_metrics("options_flow", str(tmp_path / "missing.json")) + assert result == {"count": 0, "win_rate": None, "avg_return": None} + + +# ── make_decision ───────────────────────────────────────────────────────────── + +def test_make_decision_accepted_by_win_rate(): + hyp = {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 0.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "win rate" in reason.lower() + + +def test_make_decision_accepted_by_return(): + hyp = {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "return" in reason.lower() + + +def test_make_decision_rejected(): + hyp = {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, 
baseline) + assert decision == "rejected" + + +def test_make_decision_insufficient_data(): + hyp = {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + assert "insufficient" in reason.lower() From 2747ccddcd5bd024313e5e1ccf773d0d00838b25 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:29:22 -0700 Subject: [PATCH 06/14] feat(hypotheses): add comparison + conclusion script Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- scripts/compare_hypothesis.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py index 991f5baf..a1cd08bc 100644 --- a/scripts/compare_hypothesis.py +++ b/scripts/compare_hypothesis.py @@ -105,7 +105,10 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: """Decide accepted/rejected. 
Requires _MIN_EVALUATED evaluated picks.""" evaluated = hypothesis.get("evaluated", 0) if evaluated < _MIN_EVALUATED: - return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + return ( + "rejected", + f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})", + ) hyp_wr = hypothesis.get("win_rate") hyp_ret = hypothesis.get("avg_return") base_wr = baseline.get("win_rate") @@ -114,15 +117,23 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: if hyp_wr is not None and base_wr is not None: delta_wr = hyp_wr - base_wr if delta_wr > _WIN_RATE_DELTA_THRESHOLD: - reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + reasons.append( + f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)" + ) if hyp_ret is not None and base_ret is not None: delta_ret = hyp_ret - base_ret if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: - reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + reasons.append( + f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)" + ) if reasons: return "accepted", "; ".join(reasons) - wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" - ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + wr_str = ( + f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ) + ret_str = ( + f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + ) return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" From f8063f3596a5bc274f002e546d9499ce600a8366 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:31:07 -0700 Subject: [PATCH 07/14] fix(hypotheses): use correct 7-trading-day exit index in comparison 
--- scripts/compare_hypothesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py index a1cd08bc..ac6a72aa 100644 --- a/scripts/compare_hypothesis.py +++ b/scripts/compare_hypothesis.py @@ -44,7 +44,7 @@ def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Opt return None, None close = df["Close"] entry_price = float(close.iloc[0]) - exit_idx = min(5, len(close) - 1) + exit_idx = min(6, len(close) - 1) exit_price = float(close.iloc[exit_idx]) if entry_price <= 0: return None, None From 38b9cef41c41cabca78b03fd1b1b20d50e173ed6 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:46:33 -0700 Subject: [PATCH 08/14] feat(hypotheses): add /backtest-hypothesis command --- .claude/commands/backtest-hypothesis.md | 159 ++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 .claude/commands/backtest-hypothesis.md diff --git a/.claude/commands/backtest-hypothesis.md b/.claude/commands/backtest-hypothesis.md new file mode 100644 index 00000000..3941bb70 --- /dev/null +++ b/.claude/commands/backtest-hypothesis.md @@ -0,0 +1,159 @@ +# /backtest-hypothesis + +Test a hypothesis about a scanner improvement using branch-per-hypothesis isolation. + +**Usage:** `/backtest-hypothesis "<description of the hypothesis>"` + +**Example:** `/backtest-hypothesis "options_flow: scan 3 expirations instead of 1 to capture institutional 30+ DTE positioning"` + +--- + +## Step 1: Read Current Registry + +Read `docs/iterations/hypotheses/active.json`. Note: +- How many hypotheses currently have `status: "running"` +- The `max_active` limit (default 5) +- Any existing `pending` entries + +Also read `docs/iterations/LEARNINGS.md` and the relevant scanner domain file in +`docs/iterations/scanners/` to understand the current baseline. 
+ +## Step 2: Classify the Hypothesis + +Determine whether this is: + +**Statistical** — answerable from existing data in `data/recommendations/performance_database.json` +without any code change. Examples: +- "Does high confidence (≥8) predict better 30d returns?" +- "Are options_flow picks that are ITM outperforming OTM ones?" + +**Implementation** — requires a code change and forward-testing period. Examples: +- "Scan 3 expirations instead of 1" +- "Apply a premium filter of $50K instead of $25K" + +## Step 3a: Statistical Path + +If statistical: run the analysis now against `data/recommendations/performance_database.json`. +Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary. +Done — no branch needed. + +## Step 3b: Implementation Path + +### 3b-i: Capacity check + +Count running hypotheses from `active.json`. If fewer than `max_active` running, proceed. +If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER +paused mid-streak. Inform the user which slot it is queued behind and when it will likely start. + +### 3b-ii: Score the hypothesis + +Assign a `priority` score (1–9) using these factors: + +| Factor | Score | +|---|---| +| Scanner 30d win rate < 40% | +3 | +| Change touches 1 file, 1 parameter | +2 | +| Directly addresses a weak spot in LEARNINGS.md | +2 | +| Scanner generates ≥2 picks/day (data accrues fast) | +1 | +| Supported by external research (arXiv, Alpha Architect, etc.) | +1 | +| Contradictory evidence or unclear direction | −2 | + +### 3b-iii: Determine min_days + +Set `min_days` based on the scanner's typical picks-per-day rate: +- ≥2 picks/day → 14 days +- 1 pick/day → 21 days +- <1 pick/day → 30 days + +### 3b-iv: Create the branch and implement the code change + +```bash +BRANCH="hypothesis/<scanner>-<slug>" +git checkout -b "$BRANCH" +``` + +Make the minimal code change that implements the hypothesis. Read the scanner file first. 
+Only change what the hypothesis requires — do not refactor surrounding code.
+
+```bash
+git add tradingagents/
+git commit -m "hypothesis(<scanner>): <title>"
+```
+
+### 3b-v: Create picks tracking file on the branch
+
+Create `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch:
+
+```json
+{
+  "hypothesis_id": "<id>",
+  "scanner": "<scanner>",
+  "picks": []
+}
+```
+
+```bash
+mkdir -p docs/iterations/hypotheses/<id>
+git add docs/iterations/hypotheses/<id>/picks.json
+git commit -m "hypothesis(<scanner>): add picks tracker"
+git push -u origin "$BRANCH"
+```
+
+### 3b-vi: Open a draft PR
+
+```bash
+gh pr create \
+  --title "hypothesis(<scanner>): <title>" \
+  --body "**Hypothesis:** <description>
+
+**Expected impact:** <high/medium/low>
+**Min days:** <N>
+**Priority:** <score>/9
+
+*This is an automated hypothesis experiment. It will be auto-concluded after <min_days> days of data.*" \
+  --draft \
+  --base main
+```
+
+Note the PR number from the output.
+
+### 3b-vii: Update active.json on main
+
+Check out `main`, then update `docs/iterations/hypotheses/active.json` to add the new entry:
+
+```json
+{
+  "id": "<scanner>-<slug>",
+  "scanner": "<scanner>",
+  "title": "<title>",
+  "description": "<description>",
+  "branch": "hypothesis/<scanner>-<slug>",
+  "pr_number": <N>,
+  "status": "running",
+  "priority": <score>,
+  "expected_impact": "<high|medium|low>",
+  "hypothesis_type": "implementation",
+  "created_at": "<YYYY-MM-DD>",
+  "min_days": <N>,
+  "days_elapsed": 0,
+  "picks_log": [],
+  "baseline_scanner": "<scanner>",
+  "conclusion": null
+}
+```
+
+```bash
+git checkout main
+git add docs/iterations/hypotheses/active.json
+git commit -m "feat(hypotheses): register hypothesis <id>"
+git push origin main
+```
+
+## Step 4: Print Summary
+
+Print a confirmation:
+- Hypothesis ID and branch name
+- Status: running or pending
+- Expected conclusion date (created_at + min_days)
+- PR link (if running)
+- Priority score and why
From 
1b782b1cd62f6e454078436cb2cc306fbc197818 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:49:10 -0700 Subject: [PATCH 09/14] feat(hypotheses): add daily hypothesis runner workflow --- .github/workflows/hypothesis-runner.yml | 74 +++++++ scripts/run_hypothesis_runner.py | 283 ++++++++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 .github/workflows/hypothesis-runner.yml create mode 100644 scripts/run_hypothesis_runner.py diff --git a/.github/workflows/hypothesis-runner.yml b/.github/workflows/hypothesis-runner.yml new file mode 100644 index 00000000..0d6cd4bc --- /dev/null +++ b/.github/workflows/hypothesis-runner.yml @@ -0,0 +1,74 @@ +name: Hypothesis Runner + +on: + schedule: + # 8:00 AM UTC daily — runs after iterate (06:00 UTC) + - cron: "0 8 * * *" + workflow_dispatch: + inputs: + hypothesis_id: + description: "Run a specific hypothesis ID only (blank = all running)" + required: false + default: "" + +env: + PYTHON_VERSION: "3.10" + +jobs: + run-hypotheses: + runs-on: ubuntu-latest + environment: TradingAgent + timeout-minutes: 60 + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GH_TOKEN }} + + - name: Set up git identity + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + + - name: Install dependencies + run: pip install --upgrade pip && pip install -e . 
+ + - name: Run hypothesis experiments + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + FINNHUB_API_KEY: ${{ secrets.FINNHUB_API_KEY }} + ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }} + FMP_API_KEY: ${{ secrets.FMP_API_KEY }} + REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }} + REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }} + TRADIER_API_KEY: ${{ secrets.TRADIER_API_KEY }} + FILTER_ID: ${{ inputs.hypothesis_id }} + run: | + python scripts/run_hypothesis_runner.py + + - name: Commit active.json updates + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + git add docs/iterations/hypotheses/active.json docs/iterations/hypotheses/concluded/ || true + if git diff --cached --quiet; then + echo "No registry changes" + else + git commit -m "chore(hypotheses): update registry $(date -u +%Y-%m-%d)" + git pull --rebase origin main + git push origin main + fi diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py new file mode 100644 index 00000000..79d73e04 --- /dev/null +++ b/scripts/run_hypothesis_runner.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +""" +Hypothesis Runner — orchestrates daily experiment cycles. + +For each running hypothesis in active.json: + 1. Creates a git worktree for the hypothesis branch + 2. Runs the daily discovery pipeline in that worktree + 3. Extracts picks from the discovery result, appends to picks.json + 4. Commits and pushes picks to hypothesis branch + 5. Removes worktree + 6. Updates active.json (days_elapsed, picks_log) + 7. If days_elapsed >= min_days: concludes the hypothesis + +After all hypotheses: promotes highest-priority pending → running if a slot opened. 
+ +Environment variables: + FILTER_ID — if set, only run the hypothesis with this ID +""" + +import json +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +ACTIVE_JSON = ROOT / "docs/iterations/hypotheses/active.json" +CONCLUDED_DIR = ROOT / "docs/iterations/hypotheses/concluded" +DB_PATH = ROOT / "data/recommendations/performance_database.json" +TODAY = datetime.utcnow().strftime("%Y-%m-%d") + + +def load_registry() -> dict: + with open(ACTIVE_JSON) as f: + return json.load(f) + + +def save_registry(registry: dict) -> None: + with open(ACTIVE_JSON, "w") as f: + json.dump(registry, f, indent=2) + + +def run(cmd: list, cwd: str = None, check: bool = True) -> subprocess.CompletedProcess: + print(f" $ {' '.join(cmd)}", flush=True) + return subprocess.run(cmd, cwd=cwd or str(ROOT), check=check, capture_output=False) + + +def extract_picks(worktree: str, scanner: str) -> list: + """Extract picks for the given scanner from the most recent discovery result in the worktree.""" + results_dir = Path(worktree) / "results" / "discovery" / TODAY + if not results_dir.exists(): + print(f" No discovery results for {TODAY} in worktree", flush=True) + return [] + picks = [] + for run_dir in sorted(results_dir.iterdir()): + result_file = run_dir / "discovery_result.json" + if not result_file.exists(): + continue + try: + with open(result_file) as f: + data = json.load(f) + for item in data.get("final_ranking", []): + if item.get("strategy_match") == scanner: + picks.append({ + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + }) + except Exception as e: + print(f" Warning: could not read {result_file}: {e}", flush=True) + return picks + + +def load_picks_from_branch(hypothesis_id: str, branch: str) -> list: + """Load 
picks.json from the hypothesis branch using git show.""" + picks_path = f"docs/iterations/hypotheses/{hypothesis_id}/picks.json" + result = subprocess.run( + ["git", "show", f"{branch}:{picks_path}"], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + return [] + try: + return json.loads(result.stdout).get("picks", []) + except Exception: + return [] + + +def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, picks: list) -> None: + """Write updated picks.json into the worktree and commit.""" + picks_dir = Path(worktree) / "docs" / "iterations" / "hypotheses" / hypothesis_id + picks_dir.mkdir(parents=True, exist_ok=True) + picks_file = picks_dir / "picks.json" + payload = {"hypothesis_id": hypothesis_id, "scanner": scanner, "picks": picks} + picks_file.write_text(json.dumps(payload, indent=2)) + run(["git", "add", str(picks_file)], cwd=worktree) + result = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=worktree) + if result.returncode != 0: + run( + ["git", "commit", "-m", f"chore(hypotheses): picks {TODAY} for {hypothesis_id}"], + cwd=worktree, + ) + + +def run_hypothesis(hyp: dict) -> bool: + """Run one hypothesis experiment cycle. 
Returns True if the experiment concluded.""" + hid = hyp["id"] + branch = hyp["branch"] + scanner = hyp["scanner"] + worktree = f"/tmp/hyp-{hid}" + + print(f"\n── Hypothesis: {hid} ──", flush=True) + + run(["git", "fetch", "origin", branch], check=False) + run(["git", "worktree", "add", worktree, branch]) + + try: + result = subprocess.run( + [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + cwd=worktree, + check=False, + ) + if result.returncode != 0: + print(f" Discovery failed for {hid}, skipping picks update", flush=True) + else: + new_picks = extract_picks(worktree, scanner) + existing_picks = load_picks_from_branch(hid, branch) + seen = {(p["date"], p["ticker"]) for p in existing_picks} + merged = existing_picks + [p for p in new_picks if (p["date"], p["ticker"]) not in seen] + save_picks_to_worktree(worktree, hid, scanner, merged) + run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) + + if TODAY not in hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) + + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) + + finally: + run(["git", "worktree", "remove", "--force", worktree], check=False) + + return False + + +def conclude_hypothesis(hyp: dict) -> bool: + """Run comparison, write conclusion doc, close/merge PR. 
Returns True.""" + hid = hyp["id"] + scanner = hyp["scanner"] + branch = hyp["branch"] + + print(f"\n Concluding {hid}...", flush=True) + + picks = load_picks_from_branch(hid, branch) + if not picks: + conclusion = { + "decision": "rejected", + "reason": "No picks were collected during the experiment period", + "hypothesis": {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None}, + "baseline": {"count": 0, "win_rate": None, "avg_return": None}, + } + else: + result = subprocess.run( + [ + sys.executable, "scripts/compare_hypothesis.py", + "--hypothesis-id", hid, + "--picks-json", json.dumps(picks), + "--scanner", scanner, + "--db-path", str(DB_PATH), + ], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f" compare_hypothesis.py failed: {result.stderr}", flush=True) + return False + conclusion = json.loads(result.stdout) + + decision = conclusion["decision"] + hyp_metrics = conclusion["hypothesis"] + base_metrics = conclusion["baseline"] + + period_start = hyp.get("created_at", TODAY) + concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md" + concluded_doc.write_text( + f"# Hypothesis: {hyp['title']}\n\n" + f"**Scanner:** {scanner}\n" + f"**Branch:** {branch}\n" + f"**Period:** {period_start} → {TODAY} ({hyp['days_elapsed']} days)\n" + f"**Outcome:** {'accepted ✅' if decision == 'accepted' else 'rejected ❌'}\n\n" + f"## Hypothesis\n{hyp.get('description', hyp['title'])}\n\n" + f"## Results\n\n" + f"| Metric | Baseline | Experiment | Delta |\n" + f"|---|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | " + f"{hyp_metrics.get('win_rate') or '—'}% | " + f"{_delta_str(hyp_metrics.get('win_rate'), base_metrics.get('win_rate'), 'pp')} |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | " + f"{hyp_metrics.get('avg_return') or '—'}% | " + f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n" + f"| Picks | {base_metrics.get('count', '—')} | 
{hyp_metrics.get('count', '—')} | — |\n\n" + f"## Decision\n{conclusion['reason']}\n\n" + f"## Action\n" + f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + ) + + run(["git", "add", str(concluded_doc)], check=False) + + pr = hyp.get("pr_number") + if pr: + if decision == "accepted": + subprocess.run( + ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], + cwd=str(ROOT), check=False, + ) + else: + subprocess.run( + ["gh", "pr", "close", str(pr), "--delete-branch"], + cwd=str(ROOT), check=False, + ) + + hyp["status"] = "concluded" + hyp["conclusion"] = decision + + print(f" {hid}: {decision} — {conclusion['reason']}", flush=True) + return True + + +def _delta_str(hyp_val, base_val, unit: str) -> str: + if hyp_val is None or base_val is None: + return "—" + delta = hyp_val - base_val + sign = "+" if delta >= 0 else "" + return f"{sign}{delta:.1f}{unit}" + + +def promote_pending(registry: dict) -> None: + """Promote the highest-priority pending hypothesis to running if a slot is open.""" + running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") + max_active = registry.get("max_active", 5) + if running_count >= max_active: + return + pending = [h for h in registry["hypotheses"] if h["status"] == "pending"] + if not pending: + return + to_promote = max(pending, key=lambda h: h.get("priority", 0)) + to_promote["status"] = "running" + print(f"\n Promoted pending hypothesis to running: {to_promote['id']}", flush=True) + + +def main(): + registry = load_registry() + filter_id = os.environ.get("FILTER_ID", "").strip() + + hypotheses = registry.get("hypotheses", []) + running = [ + h for h in hypotheses + if h["status"] == "running" and (not filter_id or h["id"] == filter_id) + ] + + if not running: + print("No running hypotheses to process.", flush=True) + else: + for hyp in running: + run_hypothesis(hyp) + + promote_pending(registry) + save_registry(registry) + print("\nRegistry 
updated.", flush=True) + + +if __name__ == "__main__": + main() From fe5b8886c0b7d86c1ad618f8b5943e8c826b62ac Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:50:37 -0700 Subject: [PATCH 10/14] fix(hypotheses): only count successful discovery days in picks_log --- scripts/run_hypothesis_runner.py | 62 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index 79d73e04..6795d0b7 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -64,15 +64,17 @@ def extract_picks(worktree: str, scanner: str) -> list: data = json.load(f) for item in data.get("final_ranking", []): if item.get("strategy_match") == scanner: - picks.append({ - "date": TODAY, - "ticker": item["ticker"], - "score": item.get("final_score"), - "confidence": item.get("confidence"), - "scanner": scanner, - "return_7d": None, - "win_7d": None, - }) + picks.append( + { + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + } + ) except Exception as e: print(f" Warning: could not read {result_file}: {e}", flush=True) return picks @@ -125,7 +127,13 @@ def run_hypothesis(hyp: dict) -> bool: try: result = subprocess.run( - [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + [ + sys.executable, + "scripts/run_daily_discovery.py", + "--date", + TODAY, + "--no-update-positions", + ], cwd=worktree, check=False, ) @@ -139,12 +147,12 @@ def run_hypothesis(hyp: dict) -> bool: save_picks_to_worktree(worktree, hid, scanner, merged) run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) - if TODAY not in hyp.get("picks_log", []): - hyp.setdefault("picks_log", []).append(TODAY) - hyp["days_elapsed"] = len(hyp["picks_log"]) + if TODAY not in 
hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) - if hyp["days_elapsed"] >= hyp["min_days"]: - return conclude_hypothesis(hyp) + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) finally: run(["git", "worktree", "remove", "--force", worktree], check=False) @@ -171,11 +179,16 @@ def conclude_hypothesis(hyp: dict) -> bool: else: result = subprocess.run( [ - sys.executable, "scripts/compare_hypothesis.py", - "--hypothesis-id", hid, - "--picks-json", json.dumps(picks), - "--scanner", scanner, - "--db-path", str(DB_PATH), + sys.executable, + "scripts/compare_hypothesis.py", + "--hypothesis-id", + hid, + "--picks-json", + json.dumps(picks), + "--scanner", + scanner, + "--db-path", + str(DB_PATH), ], cwd=str(ROOT), capture_output=True, @@ -221,12 +234,14 @@ def conclude_hypothesis(hyp: dict) -> bool: if decision == "accepted": subprocess.run( ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], - cwd=str(ROOT), check=False, + cwd=str(ROOT), + check=False, ) else: subprocess.run( ["gh", "pr", "close", str(pr), "--delete-branch"], - cwd=str(ROOT), check=False, + cwd=str(ROOT), + check=False, ) hyp["status"] = "concluded" @@ -264,7 +279,8 @@ def main(): hypotheses = registry.get("hypotheses", []) running = [ - h for h in hypotheses + h + for h in hypotheses if h["status"] == "running" and (not filter_id or h["id"] == filter_id) ] From 5b87a56f310ea40a9e74c1d47dd4ba02eaf05878 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:52:58 -0700 Subject: [PATCH 11/14] feat(hypotheses): add Hypotheses dashboard tab Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- tests/test_hypotheses_page.py | 73 ++++++++++++ tradingagents/ui/dashboard.py | 3 +- tradingagents/ui/pages/__init__.py | 7 ++ tradingagents/ui/pages/hypotheses.py | 171 +++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 1 deletion(-) 
create mode 100644 tests/test_hypotheses_page.py create mode 100644 tradingagents/ui/pages/hypotheses.py diff --git a/tests/test_hypotheses_page.py b/tests/test_hypotheses_page.py new file mode 100644 index 00000000..196f7cb5 --- /dev/null +++ b/tests/test_hypotheses_page.py @@ -0,0 +1,73 @@ +"""Tests for the hypotheses dashboard page data loading.""" +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from tradingagents.ui.pages.hypotheses import ( + load_active_hypotheses, + load_concluded_hypotheses, + days_until_ready, +) + + +def test_load_active_hypotheses(tmp_path): + active = { + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-test", + "title": "Test hypothesis", + "scanner": "options_flow", + "status": "running", + "priority": 7, + "days_elapsed": 5, + "min_days": 14, + "created_at": "2026-04-01", + "picks_log": ["2026-04-01"] * 5, + "conclusion": None, + } + ], + } + f = tmp_path / "active.json" + f.write_text(json.dumps(active)) + result = load_active_hypotheses(str(f)) + assert len(result) == 1 + assert result[0]["id"] == "options_flow-test" + + +def test_load_active_hypotheses_missing_file(tmp_path): + result = load_active_hypotheses(str(tmp_path / "missing.json")) + assert result == [] + + +def test_load_concluded_hypotheses(tmp_path): + doc = tmp_path / "2026-04-10-options_flow-test.md" + doc.write_text( + "# Hypothesis: Test\n\n" + "**Scanner:** options_flow\n" + "**Period:** 2026-03-27 → 2026-04-10 (14 days)\n" + "**Outcome:** accepted ✅\n" + ) + results = load_concluded_hypotheses(str(tmp_path)) + assert len(results) == 1 + assert results[0]["filename"] == doc.name + assert results[0]["outcome"] == "accepted ✅" + + +def test_load_concluded_hypotheses_empty_dir(tmp_path): + results = load_concluded_hypotheses(str(tmp_path)) + assert results == [] + + +def test_days_until_ready_has_days_left(): + hyp = {"days_elapsed": 5, "min_days": 14} + assert 
days_until_ready(hyp) == 9 + + +def test_days_until_ready_past_due(): + hyp = {"days_elapsed": 15, "min_days": 14} + assert days_until_ready(hyp) == 0 diff --git a/tradingagents/ui/dashboard.py b/tradingagents/ui/dashboard.py index bf6d88ea..2817ac62 100644 --- a/tradingagents/ui/dashboard.py +++ b/tradingagents/ui/dashboard.py @@ -52,7 +52,7 @@ def render_sidebar(): # Navigation page = st.radio( "Navigation", - options=["Overview", "Signals", "Portfolio", "Performance", "Config"], + options=["Overview", "Signals", "Portfolio", "Performance", "Hypotheses", "Config"], label_visibility="collapsed", ) @@ -116,6 +116,7 @@ def route_page(page): "Signals": pages.todays_picks, "Portfolio": pages.portfolio, "Performance": pages.performance, + "Hypotheses": pages.hypotheses, "Config": pages.settings, } module = page_map.get(page) diff --git a/tradingagents/ui/pages/__init__.py b/tradingagents/ui/pages/__init__.py index 22a16b20..da3547e4 100644 --- a/tradingagents/ui/pages/__init__.py +++ b/tradingagents/ui/pages/__init__.py @@ -39,6 +39,12 @@ except Exception as _e: _logger.error("Failed to import settings page: %s", _e, exc_info=True) settings = None +try: + from tradingagents.ui.pages import hypotheses +except Exception as _e: + _logger.error("Failed to import hypotheses page: %s", _e, exc_info=True) + hypotheses = None + __all__ = [ "home", @@ -46,4 +52,5 @@ __all__ = [ "portfolio", "performance", "settings", + "hypotheses", ] diff --git a/tradingagents/ui/pages/hypotheses.py b/tradingagents/ui/pages/hypotheses.py new file mode 100644 index 00000000..3492ccae --- /dev/null +++ b/tradingagents/ui/pages/hypotheses.py @@ -0,0 +1,171 @@ +""" +Hypotheses dashboard page — tracks active and concluded experiments. + +Reads docs/iterations/hypotheses/active.json and the concluded/ directory. +No external API calls; all data is file-based. 
"""
Hypotheses dashboard page — tracks active and concluded experiments.

Reads docs/iterations/hypotheses/active.json and the concluded/ directory.
No external API calls; all data is file-based.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List

import streamlit as st

from tradingagents.ui.theme import COLORS, page_header

# Repository root: this file lives at tradingagents/ui/pages/, four levels deep.
_REPO_ROOT = Path(__file__).parent.parent.parent.parent
_ACTIVE_JSON = _REPO_ROOT / "docs/iterations/hypotheses/active.json"
_CONCLUDED_DIR = _REPO_ROOT / "docs/iterations/hypotheses/concluded"


def load_active_hypotheses(active_path: str = str(_ACTIVE_JSON)) -> List[Dict[str, Any]]:
    """Load all hypotheses from active.json.

    Returns [] when the file is missing or unreadable/corrupt — the
    dashboard must never crash because of a bad registry file.
    """
    path = Path(active_path)
    if not path.exists():
        return []
    try:
        with open(path) as f:
            data = json.load(f)
        return data.get("hypotheses", [])
    except Exception:
        # Corrupt or partially-written JSON: degrade to an empty list.
        return []


def load_concluded_hypotheses(concluded_dir: str = str(_CONCLUDED_DIR)) -> List[Dict[str, Any]]:
    """
    Load concluded hypothesis metadata by parsing markdown files in concluded/.

    Extracts: filename, title, scanner, period, outcome. Files that cannot
    be read or parsed are skipped; missing fields fall back to placeholders.
    Results are sorted newest-first (filenames start with YYYY-MM-DD).
    """
    dir_path = Path(concluded_dir)
    if not dir_path.exists():
        return []
    results = []
    for md_file in sorted(dir_path.glob("*.md"), reverse=True):
        if md_file.name == ".gitkeep":
            continue
        try:
            text = md_file.read_text(encoding="utf-8")
            title = _extract_md_field(text, r"^# Hypothesis: (.+)$")
            scanner = _extract_md_field(text, r"^\*\*Scanner:\*\* (.+)$")
            period = _extract_md_field(text, r"^\*\*Period:\*\* (.+)$")
            outcome = _extract_md_field(text, r"^\*\*Outcome:\*\* (.+)$")
            results.append({
                "filename": md_file.name,
                "title": title or md_file.stem,
                "scanner": scanner or "—",
                "period": period or "—",
                "outcome": outcome or "—",
            })
        except Exception:
            # One unreadable doc must not hide the rest.
            continue
    return results


def _extract_md_field(text: str, pattern: str) -> str:
    """Extract a field value from a markdown line using a MULTILINE regex."""
    match = re.search(pattern, text, re.MULTILINE)
    return match.group(1).strip() if match else ""


def days_until_ready(hyp: Dict[str, Any]) -> int:
    """Return number of days remaining before hypothesis can conclude (min 0)."""
    return max(0, hyp.get("min_days", 14) - hyp.get("days_elapsed", 0))


def render() -> None:
    """Render the hypotheses tracking page: active table + concluded table."""
    st.markdown(
        page_header("Hypotheses", "Active experiments & concluded findings"),
        unsafe_allow_html=True,
    )

    hypotheses = load_active_hypotheses()
    concluded = load_concluded_hypotheses()

    if not hypotheses and not concluded:
        st.info(
            "No hypotheses yet. Run `/backtest-hypothesis \"<description>\"` to start an experiment."
        )
        return

    # pandas is only needed for table rendering; import lazily and once.
    import pandas as pd

    # Use .get so a malformed registry entry (missing "status") is ignored
    # rather than crashing the whole page with a KeyError.
    running = [h for h in hypotheses if h.get("status") == "running"]
    pending = [h for h in hypotheses if h.get("status") == "pending"]

    st.markdown(
        f'<div class="section-title">Active Experiments '
        f'<span class="accent">// {len(running)} running, {len(pending)} pending</span></div>',
        unsafe_allow_html=True,
    )

    if running or pending:
        active_rows = []
        # Highest priority first (priority is 1-10 per the registry spec).
        for h in sorted(running + pending, key=lambda x: -x.get("priority", 0)):
            days_left = days_until_ready(h)
            ready_str = "concluding soon" if days_left == 0 else f"{days_left}d left"
            active_rows.append({
                "ID": h.get("id", "—"),
                "Title": h.get("title", "—"),
                "Scanner": h.get("scanner", "—"),
                "Status": h.get("status", "—"),
                "Progress": f"{h.get('days_elapsed', 0)}/{h.get('min_days', 14)}d",
                "Picks": len(h.get("picks_log", [])),
                "Ready": ready_str,
                "Priority": h.get("priority", "—"),
            })
        df = pd.DataFrame(active_rows)
        st.dataframe(
            df,
            width="stretch",
            hide_index=True,
            column_config={
                "ID": st.column_config.TextColumn(width="medium"),
                "Title": st.column_config.TextColumn(width="large"),
                "Scanner": st.column_config.TextColumn(width="medium"),
                "Status": st.column_config.TextColumn(width="small"),
                "Progress": st.column_config.TextColumn(width="small"),
                "Picks": st.column_config.NumberColumn(format="%d", width="small"),
                "Ready": st.column_config.TextColumn(width="medium"),
                # Priority is on a 1-10 scale (see active.json spec).
                "Priority": st.column_config.NumberColumn(format="%d/10", width="small"),
            },
        )
    else:
        st.info("No active experiments.")

    st.markdown("<div style='height:1.5rem;'></div>", unsafe_allow_html=True)

    st.markdown(
        f'<div class="section-title">Concluded Experiments '
        f'<span class="accent">// {len(concluded)} total</span></div>',
        unsafe_allow_html=True,
    )

    if concluded:
        concluded_rows = []
        for c in concluded:
            outcome = c["outcome"]
            # Tri-state: a missing outcome ("—") must not render as rejected.
            if "accepted" in outcome:
                emoji = "✅"
            elif "rejected" in outcome:
                emoji = "❌"
            else:
                emoji = "—"
            concluded_rows.append({
                # Filenames are YYYY-MM-DD-<id>.md, so the first 10 chars
                # are the conclusion date.
                "Date": c["filename"][:10],
                "Title": c["title"],
                "Scanner": c["scanner"],
                "Period": c["period"],
                "Outcome": emoji,
            })
        cdf = pd.DataFrame(concluded_rows)
        st.dataframe(
            cdf,
            width="stretch",
            hide_index=True,
            column_config={
                "Date": st.column_config.TextColumn(width="small"),
                "Title": st.column_config.TextColumn(width="large"),
                "Scanner": st.column_config.TextColumn(width="medium"),
                "Period": st.column_config.TextColumn(width="medium"),
                "Outcome": st.column_config.TextColumn(width="small"),
            },
        )
    else:
        st.info("No concluded experiments yet.")
timedelta(days=7)).strftime("%Y-%m-%d") + cutoff = (datetime.utcnow() - timedelta(days=14)).strftime("%Y-%m-%d") for pick in picks: if pick.get("return_7d") is not None: continue diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index 6795d0b7..b15c02f6 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -19,6 +19,7 @@ Environment variables: import json import os +import re import subprocess import sys from datetime import datetime @@ -116,6 +117,10 @@ def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, pick def run_hypothesis(hyp: dict) -> bool: """Run one hypothesis experiment cycle. Returns True if the experiment concluded.""" hid = hyp["id"] + # Validate id to prevent path traversal in worktree path + if not re.fullmatch(r"[a-zA-Z0-9_\-]+", hid): + print(f" Skipping hypothesis with invalid id: {hid!r}", flush=True) + return False branch = hyp["branch"] scanner = hyp["scanner"] worktree = f"/tmp/hyp-{hid}" @@ -287,8 +292,12 @@ def main(): if not running: print("No running hypotheses to process.", flush=True) else: + run(["git", "worktree", "prune"], check=False) for hyp in running: - run_hypothesis(hyp) + try: + run_hypothesis(hyp) + except Exception as e: + print(f" Error processing {hyp['id']}: {e}", flush=True) promote_pending(registry) save_registry(registry) From 49175e3b0ad525b25b15a04e5f2918d9496f0f0a Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 10:52:00 -0700 Subject: [PATCH 13/14] feat(hypotheses): post conclusion as PR comment instead of auto-merging --- scripts/run_hypothesis_runner.py | 36 ++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index b15c02f6..38340617 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -229,25 +229,35 @@ def 
conclude_hypothesis(hyp: dict) -> bool: f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n" f"## Decision\n{conclusion['reason']}\n\n" f"## Action\n" - f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n" ) run(["git", "add", str(concluded_doc)], check=False) pr = hyp.get("pr_number") if pr: - if decision == "accepted": - subprocess.run( - ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], - cwd=str(ROOT), - check=False, - ) - else: - subprocess.run( - ["gh", "pr", "close", str(pr), "--delete-branch"], - cwd=str(ROOT), - check=False, - ) + # Mark PR ready for review (removes draft status) and post conclusion as a comment. + # The PR is NOT merged or closed automatically — the user reviews and decides. + outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected" + comment = ( + f"**Hypothesis concluded: {outcome_emoji}**\n\n" + f"{conclusion['reason']}\n\n" + f"| Metric | Baseline | Experiment |\n" + f"|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n" + f"{'Merge this PR to apply the change.' 
def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    Args:
        hyp: Hypothesis record from active.json (id, title, description,
            scanner, created_at, days_elapsed).
        conclusion: Comparison output with "hypothesis", "baseline",
            "decision" and "reason" keys.
        scanner_domain: Contents of the scanner's domain-knowledge markdown
            file, or "" when no such file exists.

    Returns:
        A markdown analysis string to embed in the concluded .md doc and the
        PR comment, or None if ANTHROPIC_API_KEY is not set, the anthropic
        SDK is not installed, or the API call fails.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        # LLM enrichment is strictly optional; no key means no analysis.
        return None

    try:
        import anthropic
    except ImportError:
        print("  anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {hyp.get("title", "")}
**Description:** {hyp.get("description", hyp.get("title", ""))}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {base_metrics.get("win_rate") or "—"}% | {hyp_metrics.get("win_rate") or "—"}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
| Avg 7d return | {base_metrics.get("avg_return") or "—"}% | {hyp_metrics.get("avg_return") or "—"}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", "—"))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))} | — |

## Scanner Domain Knowledge
{scanner_domain}

---

Provide a concise analysis (3–5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        # Explicitly treat an empty content list as "no analysis" rather than
        # letting IndexError fall through to the broad except below.
        if not message.content:
            return None
        return message.content[0].text.strip()
    except Exception as e:
        # Any API failure degrades gracefully — the conclusion still stands.
        print(f"  LLM analysis failed: {e}", flush=True)
        return None
if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n" ) @@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool: # Mark PR ready for review (removes draft status) and post conclusion as a comment. # The PR is NOT merged or closed automatically — the user reviews and decides. outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected" + analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else "" comment = ( f"**Hypothesis concluded: {outcome_emoji}**\n\n" f"{conclusion['reason']}\n\n" f"| Metric | Baseline | Experiment |\n" f"|---|---|---|\n" f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n" - f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n" + f"{analysis_block}\n\n" f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}" ) subprocess.run(