From de4ef56c91653653d3919aecbfb59ada79535a37 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah Date: Thu, 9 Apr 2026 23:34:39 -0700 Subject: [PATCH 01/14] docs(spec): hypothesis backtesting system design Co-Authored-By: Claude Sonnet 4.6 --- ...026-04-09-hypothesis-backtesting-design.md | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md diff --git a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md new file mode 100644 index 00000000..6fa943f6 --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md @@ -0,0 +1,196 @@ +# Hypothesis Backtesting System — Design Spec + +## Goal + +Enable systematic, branch-per-hypothesis experimentation for scanner improvements. Each hypothesis runs its modified code daily in isolation, accumulates picks, and auto-concludes with a statistical comparison once enough data exists. Up to 5 experiments run in parallel, prioritized by expected impact, with full UI visibility. + +--- + +## Architecture + +``` +docs/iterations/hypotheses/ + active.json ← source of truth for all experiments + concluded/ + YYYY-MM-DD-.md ← one file per concluded hypothesis + +.claude/commands/ + backtest-hypothesis.md ← /backtest-hypothesis command + +.github/workflows/ + hypothesis-runner.yml ← daily 08:00 UTC, runs all active experiments + +tradingagents/ui/pages/ + hypotheses.py ← new Streamlit dashboard tab +``` + +The `active.json` file lives on `main`. Each hypothesis branch (`hypothesis/-`) contains the code change being tested. The daily runner checks out each branch, runs discovery, commits picks back to that branch, and — once `min_days` have elapsed — concludes the hypothesis and cleans up. 
+ +--- + +## `active.json` Schema + +```json +{ + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-scan-3-expirations", + "scanner": "options_flow", + "title": "Scan 3 expirations instead of 1", + "description": "Hypothesis: scanning up to 3 expirations captures institutional positioning in 30+ DTE contracts, improving signal quality over nearest-expiry-only.", + "branch": "hypothesis/options_flow-scan-3-expirations", + "pr_number": 14, + "status": "running", + "priority": 8, + "expected_impact": "high", + "hypothesis_type": "implementation", + "created_at": "2026-04-09", + "min_days": 14, + "days_elapsed": 3, + "picks_log": ["2026-04-09", "2026-04-10", "2026-04-11"], + "baseline_scanner": "options_flow", + "conclusion": null + } + ] +} +``` + +**Field reference:** + +| Field | Description | +|---|---| +| `id` | `-` — unique, used for branch and file names | +| `status` | `running` / `paused` / `concluded` | +| `priority` | 1–10 (higher = more important); auto-pause lowest when at capacity | +| `hypothesis_type` | `statistical` (answer from existing data) or `implementation` (requires branch + forward testing) | +| `min_days` | Minimum picks days before conclusion analysis runs | +| `picks_log` | Dates when the runner collected picks on this branch | +| `conclusion` | `null` while running; `"accepted"` or `"rejected"` once concluded | + +--- + +## `/backtest-hypothesis` Command + +**Trigger:** `claude /backtest-hypothesis ""` + +**Flow:** + +1. **Classify** the hypothesis as `statistical` or `implementation`. + - Statistical: answerable from existing `performance_database.json` data — no code change needed. + - Implementation: requires a code change and forward-testing period. + +2. **Statistical path:** Run the analysis immediately against existing performance data. Write conclusion to the relevant scanner domain file (`docs/iterations/scanners/.md`). Done — no branch created. + +3. **Implementation path:** + a. Read `active.json`. 
If `running` count < 5, proceed. If at 5, auto-pause the entry with the lowest `priority` (set `status: "paused"`, keep branch alive). + b. Create branch `hypothesis/-` from `main`. + c. Implement the minimal code change on the branch. + d. Open a draft PR: title `hypothesis(): `, body describes the hypothesis, expected impact, and `min_days`. + e. Write new entry to `active.json` on `main` with `status: "running"`. + f. Print summary: branch name, PR number, expected conclusion date. + +**Priority scoring** (set at creation time): + +| Factor | Score contribution | +|---|---| +| Scanner has poor 30d win rate (<40%) | +3 | +| Change is low-complexity (1 file, 1 parameter) | +2 | +| Hypothesis directly addresses a known weak spot in LEARNINGS.md | +2 | +| High daily pick volume from scanner (more data faster) | +1 | +| Evidence from external research (arXiv, Alpha Architect, etc.) | +1 | +| Conflicting evidence or uncertain direction | -2 | + +Max score 9. Claude assigns this score and writes it to `active.json`. + +--- + +## Daily Hypothesis Runner (`hypothesis-runner.yml`) + +Runs at **08:00 UTC daily** (after iterate at 06:00 UTC). + +**Per-hypothesis loop** (for each entry with `status: "running"`): + +``` +1. git checkout hypothesis/<id> +2. Run daily discovery pipeline (same as daily-discovery.yml) +3. Append today's date to picks_log +4. Commit picks update back to hypothesis branch +5. If days_elapsed >= min_days: + a. Run statistical comparison vs baseline scanner (same scanner, main branch picks) + b. Compute: win rate delta, avg return delta, pick volume delta, p-value if N >= 20 + c. Decision rule: + - accepted if win rate delta > +5pp OR avg return delta > +1% (with p < 0.1 if N >= 20) + - rejected otherwise + d. Write concluded doc to docs/iterations/hypotheses/concluded/YYYY-MM-DD-<id>.md + e. Update scanner domain file with finding + f. Set status = "concluded", conclusion = "accepted"/"rejected" in active.json + g. 
If accepted: merge PR into main + If rejected: close PR without merging, delete hypothesis branch + h. Push active.json update to main +``` + +**Capacity:** 5 experiments × ~2 min each = ~10 min max runtime. Workflow timeout: 60 minutes. + +--- + +## Conclusion Document Format + +`docs/iterations/hypotheses/concluded/YYYY-MM-DD-<id>.md`: + +```markdown +# Hypothesis: <title> + +**Scanner:** options_flow +**Branch:** hypothesis/options_flow-scan-3-expirations +**Period:** 2026-04-09 → 2026-04-23 (14 days) +**Outcome:** accepted ✅ / rejected ❌ + +## Hypothesis +<original description> + +## Results + +| Metric | Baseline | Experiment | Delta | +|---|---|---|---| +| 7d win rate | 42% | 53% | +11pp | +| 30d avg return | -2.9% | +0.8% | +3.7% | +| Picks/day | 1.2 | 1.8 | +0.6 | + +## Decision +<1-2 sentences on why accepted/rejected> + +## Action +<what was merged or discarded> +``` + +--- + +## Dashboard Tab (`tradingagents/ui/pages/hypotheses.py`) + +New "Hypotheses" tab in the Streamlit dashboard. + +**Active experiments table:** + +| Hypothesis | Scanner | Status | Days | Picks | Expected Ready | Priority | +|---|---|---|---|---|---|---| +| Scan 3 expirations | options_flow | running | 3/14 | 4 | 2026-04-23 | 8 | +| ITM-only filter | options_flow | paused | 1/14 | 1 | — | 5 | + +**Concluded experiments table:** + +| Hypothesis | Scanner | Outcome | Concluded | Win Rate Delta | +|---|---|---|---|---| +| Premium filter >$25K | options_flow | ✅ merged | 2026-04-01 | +9pp | +| Reddit DD confidence gate | reddit_dd | ❌ rejected | 2026-03-20 | -3pp | + +Both tables read directly from `active.json` and the `concluded/` directory. No separate database. 
+ +--- + +## What Is Not In Scope + +- Hypothesis branches do not interact with each other (no cross-branch comparison) +- No A/B testing within a single discovery run (too complex, not needed) +- No email/Slack notifications (rolling PRs in GitHub are the notification mechanism) +- No manual override of priority scoring (set at creation, editable directly in `active.json`) From 36884966f1973502b90dbdd40051d79e1eff1e93 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Thu, 9 Apr 2026 23:50:55 -0700 Subject: [PATCH 02/14] =?UTF-8?q?docs(spec):=20fix=20hypothesis=20capacity?= =?UTF-8?q?=20=E2=80=94=20running=20experiments=20never=20paused?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pending hypotheses queue by priority and promote when a slot opens, rather than pausing a running experiment mid-streak. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- .../2026-04-09-hypothesis-backtesting-design.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md index 6fa943f6..042bb796 100644 --- a/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md +++ b/docs/superpowers/specs/2026-04-09-hypothesis-backtesting-design.md @@ -61,8 +61,8 @@ The `active.json` file lives on `main`. 
Each hypothesis branch (`hypothesis/<sca | Field | Description | |---|---| | `id` | `<scanner>-<slug>` — unique, used for branch and file names | -| `status` | `running` / `paused` / `concluded` | -| `priority` | 1–10 (higher = more important); auto-pause lowest when at capacity | +| `status` | `running` / `pending` / `concluded` | +| `priority` | 1–9 (higher = more important); determines queue order for `pending` hypotheses | | `hypothesis_type` | `statistical` (answer from existing data) or `implementation` (requires branch + forward testing) | | `min_days` | Minimum picks days before conclusion analysis runs | | `picks_log` | Dates when the runner collected picks on this branch | @@ -83,12 +83,14 @@ The `active.json` file lives on `main`. Each hypothesis branch (`hypothesis/<sca 2. **Statistical path:** Run the analysis immediately against existing performance data. Write conclusion to the relevant scanner domain file (`docs/iterations/scanners/<scanner>.md`). Done — no branch created. 3. **Implementation path:** - a. Read `active.json`. If `running` count < 5, proceed. If at 5, auto-pause the entry with the lowest `priority` (set `status: "paused"`, keep branch alive). + a. Read `active.json`. If `running` count < 5, start immediately. If all 5 slots are occupied by running experiments, add the new hypothesis as `status: "pending"` — running experiments are never interrupted (pausing mid-experiment breaks the picks streak and invalidates the statistical comparison). b. Create branch `hypothesis/<scanner>-<slug>` from `main`. c. Implement the minimal code change on the branch. d. Open a draft PR: title `hypothesis(<scanner>): <title>`, body describes the hypothesis, expected impact, and `min_days`. - e. Write new entry to `active.json` on `main` with `status: "running"`. - f. Print summary: branch name, PR number, expected conclusion date. + e. Write new entry to `active.json` on `main` with `status: "running"` (or `"pending"` if at capacity). + f. 
Print summary: branch name, PR number, expected start date (if pending), expected conclusion date (if running). + +**Pending → running promotion:** At the end of each daily runner cycle, after any experiments conclude, the runner checks for `pending` entries and promotes the highest-priority one to `running` if a slot opened up. **Priority scoring** (set at creation time): @@ -175,7 +177,7 @@ New "Hypotheses" tab in the Streamlit dashboard. | Hypothesis | Scanner | Status | Days | Picks | Expected Ready | Priority | |---|---|---|---|---|---|---| | Scan 3 expirations | options_flow | running | 3/14 | 4 | 2026-04-23 | 8 | -| ITM-only filter | options_flow | paused | 1/14 | 1 | — | 5 | +| ITM-only filter | options_flow | pending | 0/14 | 0 | waiting for slot | 5 | **Concluded experiments table:** From e0b6e28a3be0ce04674074b9b4dae9c9c1613d7c Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:04:58 -0700 Subject: [PATCH 03/14] docs(plan): hypothesis backtesting implementation plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- .../2026-04-10-hypothesis-backtesting.md | 1493 +++++++++++++++++ 1 file changed, 1493 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md diff --git a/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md b/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md new file mode 100644 index 00000000..711c3cb8 --- /dev/null +++ b/docs/superpowers/plans/2026-04-10-hypothesis-backtesting.md @@ -0,0 +1,1493 @@ +# Hypothesis Backtesting System — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Build a branch-per-hypothesis experimentation system that runs scanner code changes daily in isolation, accumulates picks, auto-concludes with a statistical comparison, and surfaces everything in the dashboard. + +**Architecture:** `active.json` is the registry (lives on `main`). Each hypothesis gets a `hypothesis/<scanner>-<slug>` branch with the code change. A daily workflow (08:00 UTC) uses git worktrees to run discovery on each branch, stores picks in `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch, and concludes when `min_days` elapsed. The `/backtest-hypothesis` command classifies, creates branches, and manages the registry. + +**Tech Stack:** Python 3.10, yfinance (`download_history`), GitHub Actions, Streamlit, `gh` CLI, `git worktree` + +--- + +## File Map + +| Path | Action | Purpose | +|---|---|---| +| `docs/iterations/hypotheses/active.json` | Create | Registry of all experiments | +| `docs/iterations/hypotheses/concluded/.gitkeep` | Create | Directory placeholder | +| `scripts/compare_hypothesis.py` | Create | Fetch returns + statistical comparison | +| `.claude/commands/backtest-hypothesis.md` | Create | `/backtest-hypothesis` Claude command | +| `.github/workflows/hypothesis-runner.yml` | Create | Daily 08:00 UTC runner | +| `tradingagents/ui/pages/hypotheses.py` | Create | Dashboard "Hypotheses" tab | +| `tradingagents/ui/pages/__init__.py` | Modify | Register new page | +| `tradingagents/ui/dashboard.py` | Modify | Add "Hypotheses" to nav | + +--- + +## Task 1: Hypothesis Registry Structure + +**Files:** +- Create: `docs/iterations/hypotheses/active.json` +- Create: `docs/iterations/hypotheses/concluded/.gitkeep` + +- [ ] **Step 1: Create the directory and initial `active.json`** + +```bash +mkdir -p docs/iterations/hypotheses/concluded +``` + +Write `docs/iterations/hypotheses/active.json`: + +```json +{ + "max_active": 5, + "hypotheses": [] +} +``` + +- [ ] **Step 2: Create the concluded directory placeholder** 
+ +```bash +touch docs/iterations/hypotheses/concluded/.gitkeep +``` + +- [ ] **Step 3: Verify JSON is valid** + +```bash +python3 -c "import json; json.load(open('docs/iterations/hypotheses/active.json')); print('valid')" +``` + +Expected: `valid` + +- [ ] **Step 4: Commit** + +```bash +git add docs/iterations/hypotheses/ +git commit -m "feat(hypotheses): initialize hypothesis registry" +``` + +--- + +## Task 2: Comparison Script + +**Files:** +- Create: `scripts/compare_hypothesis.py` +- Create: `tests/test_compare_hypothesis.py` + +`★ Insight ─────────────────────────────────────` +The comparison reads picks from the hypothesis branch via `git show <branch>:path` — this avoids checking out the branch just to read a file, keeping the working tree on `main` throughout. +`─────────────────────────────────────────────────` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_compare_hypothesis.py`: + +```python +"""Tests for the hypothesis comparison script.""" +import json +import subprocess +import sys +from datetime import date, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.compare_hypothesis import ( + compute_metrics, + compute_7d_return, + load_baseline_metrics, + make_decision, +) + + +# ── compute_metrics ────────────────────────────────────────────────────────── + +def test_compute_metrics_empty(): + result = compute_metrics([]) + assert result == {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None} + + +def test_compute_metrics_all_wins(): + picks = [ + {"return_7d": 5.0, "win_7d": True}, + {"return_7d": 3.0, "win_7d": True}, + ] + result = compute_metrics(picks) + assert result["win_rate"] == 100.0 + assert result["avg_return"] == 4.0 + assert result["evaluated"] == 2 + + +def test_compute_metrics_mixed(): + picks = [ + {"return_7d": 10.0, "win_7d": True}, + {"return_7d": -5.0, "win_7d": False}, + 
{"return_7d": None, "win_7d": None}, # pending — excluded + ] + result = compute_metrics(picks) + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 2.5 + assert result["evaluated"] == 2 + assert result["count"] == 3 + + +# ── compute_7d_return ──────────────────────────────────────────────────────── + +def test_compute_7d_return_positive(): + mock_df = MagicMock() + mock_df.empty = False + # Simulate DataFrame with Close column: entry=100, exit=110 + mock_df.__len__ = lambda self: 2 + mock_df["Close"].iloc.__getitem__ = MagicMock(side_effect=lambda i: 100.0 if i == 0 else 110.0) + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret == pytest.approx(10.0, rel=0.01) + assert win is True + + +def test_compute_7d_return_empty_data(): + mock_df = MagicMock() + mock_df.empty = True + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret is None + assert win is None + + +# ── load_baseline_metrics ──────────────────────────────────────────────────── + +def test_load_baseline_metrics(tmp_path): + db = { + "recommendations_by_date": { + "2026-03-01": [ + {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True}, + {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False}, + {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True}, + ] + } + } + db_file = tmp_path / "performance_database.json" + db_file.write_text(json.dumps(db)) + + result = load_baseline_metrics("options_flow", str(db_file)) + + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 1.5 + assert result["count"] == 2 + + +def test_load_baseline_metrics_missing_file(tmp_path): + result = load_baseline_metrics("options_flow", str(tmp_path / "missing.json")) + assert result == {"count": 0, "win_rate": None, "avg_return": None} + + +# ── make_decision 
───────────────────────────────────────────────────────────── + +def test_make_decision_accepted_by_win_rate(): + hyp = {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 0.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "win rate" in reason.lower() + + +def test_make_decision_accepted_by_return(): + hyp = {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "return" in reason.lower() + + +def test_make_decision_rejected(): + hyp = {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + + +def test_make_decision_insufficient_data(): + hyp = {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + assert "insufficient" in reason.lower() +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +python -m pytest tests/test_compare_hypothesis.py -v 2>&1 | head -30 +``` + +Expected: `ModuleNotFoundError: No module named 'scripts.compare_hypothesis'` or similar import error — confirms tests are wired correctly. + +- [ ] **Step 3: Write `scripts/compare_hypothesis.py`** + +```python +#!/usr/bin/env python3 +""" +Hypothesis comparison — computes 7d returns for hypothesis picks and +compares them against the baseline scanner in performance_database.json. 
+ +Usage (called by hypothesis-runner.yml after min_days elapsed): + python scripts/compare_hypothesis.py \\ + --hypothesis-id options_flow-scan-3-expirations \\ + --picks-json '{"picks": [...]}' \\ + --scanner options_flow \\ + --db-path data/recommendations/performance_database.json + +Prints a JSON conclusion to stdout: + { + "decision": "accepted", + "reason": "...", + "hypothesis": {"win_rate": 58.0, "avg_return": 1.8, "count": 14, "evaluated": 10}, + "baseline": {"win_rate": 42.0, "avg_return": -0.3, "count": 87} + } +""" + +import argparse +import json +import sys +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional, Tuple + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from tradingagents.dataflows.y_finance import download_history + + +# Minimum evaluated picks required to make a decision +_MIN_EVALUATED = 5 +# Thresholds from spec +_WIN_RATE_DELTA_THRESHOLD = 5.0 # percentage points +_AVG_RETURN_DELTA_THRESHOLD = 1.0 # percent + + +def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]: + """ + Fetch 7-day return for a pick using yfinance. + + Args: + ticker: Stock symbol, e.g. 
"AAPL" + pick_date: Date the pick was made, "YYYY-MM-DD" + + Returns: + (return_pct, is_win) or (None, None) if data unavailable + """ + try: + entry_dt = datetime.strptime(pick_date, "%Y-%m-%d") + exit_dt = entry_dt + timedelta(days=10) # +3 buffer for weekends/holidays + df = download_history( + ticker, + start=entry_dt.strftime("%Y-%m-%d"), + end=exit_dt.strftime("%Y-%m-%d"), + ) + if df.empty or len(df) < 2: + return None, None + + # Use first available close as entry, 7th trading day as exit + close = df["Close"] + entry_price = float(close.iloc[0]) + exit_idx = min(5, len(close) - 1) # ~7 calendar days = ~5 trading days + exit_price = float(close.iloc[exit_idx]) + + if entry_price <= 0: + return None, None + + ret = (exit_price - entry_price) / entry_price * 100 + return round(ret, 4), ret > 0 + + except Exception: + return None, None + + +def enrich_picks_with_returns(picks: list) -> list: + """ + Compute 7d return for each pick that is old enough (>= 7 days) and + doesn't already have return_7d populated. + + Args: + picks: List of pick dicts with at least 'ticker' and 'date' fields + + Returns: + Same list with return_7d and win_7d populated where possible + """ + cutoff = (datetime.utcnow() - timedelta(days=7)).strftime("%Y-%m-%d") + for pick in picks: + if pick.get("return_7d") is not None: + continue # already computed + if pick.get("date", "9999-99-99") > cutoff: + continue # too recent + ret, win = compute_7d_return(pick["ticker"], pick["date"]) + pick["return_7d"] = ret + pick["win_7d"] = win + return picks + + +def compute_metrics(picks: list) -> dict: + """ + Compute win rate and avg return for a list of picks. + + Only picks with non-None return_7d contribute to win_rate and avg_return. 
+ + Returns: + {"count": int, "evaluated": int, "win_rate": float|None, "avg_return": float|None} + """ + evaluated = [p for p in picks if p.get("return_7d") is not None] + if not evaluated: + return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None} + + wins = sum(1 for p in evaluated if p.get("win_7d")) + avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated) + return { + "count": len(picks), + "evaluated": len(evaluated), + "win_rate": round(wins / len(evaluated) * 100, 1), + "avg_return": round(avg_ret, 2), + } + + +def load_baseline_metrics(scanner: str, db_path: str) -> dict: + """ + Load baseline metrics for a scanner from performance_database.json. + + Args: + scanner: Scanner name, e.g. "options_flow" + db_path: Path to performance_database.json + + Returns: + {"count": int, "win_rate": float|None, "avg_return": float|None} + """ + path = Path(db_path) + if not path.exists(): + return {"count": 0, "win_rate": None, "avg_return": None} + + try: + with open(path) as f: + db = json.load(f) + except Exception: + return {"count": 0, "win_rate": None, "avg_return": None} + + picks = [] + for recs in db.get("recommendations_by_date", {}).values(): + for rec in (recs if isinstance(recs, list) else []): + if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None: + picks.append(rec) + + return compute_metrics(picks) + + +def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: + """ + Decide accepted or rejected based on metrics delta. 
+ + Rules: + - Minimum _MIN_EVALUATED evaluated picks required + - accepted if win_rate_delta > _WIN_RATE_DELTA_THRESHOLD (5pp) + OR avg_return_delta > _AVG_RETURN_DELTA_THRESHOLD (1%) + - rejected otherwise + + Returns: + (decision, reason) where decision is "accepted" or "rejected" + """ + evaluated = hypothesis.get("evaluated", 0) + if evaluated < _MIN_EVALUATED: + return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + + hyp_wr = hypothesis.get("win_rate") + hyp_ret = hypothesis.get("avg_return") + base_wr = baseline.get("win_rate") + base_ret = baseline.get("avg_return") + + reasons = [] + + if hyp_wr is not None and base_wr is not None: + delta_wr = hyp_wr - base_wr + if delta_wr > _WIN_RATE_DELTA_THRESHOLD: + reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + + if hyp_ret is not None and base_ret is not None: + delta_ret = hyp_ret - base_ret + if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: + reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + + if reasons: + return "accepted", "; ".join(reasons) + + wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" + + +def main(): + parser = argparse.ArgumentParser(description="Compare hypothesis picks against baseline") + parser.add_argument("--hypothesis-id", required=True) + parser.add_argument("--picks-json", required=True, help="JSON string of picks list") + parser.add_argument("--scanner", required=True, help="Baseline scanner name") + parser.add_argument( + "--db-path", + default="data/recommendations/performance_database.json", + help="Path to performance_database.json", + ) + args = parser.parse_args() + + picks = json.loads(args.picks_json) + 
picks = enrich_picks_with_returns(picks) + + hyp_metrics = compute_metrics(picks) + base_metrics = load_baseline_metrics(args.scanner, args.db_path) + + decision, reason = make_decision(hyp_metrics, base_metrics) + + result = { + "hypothesis_id": args.hypothesis_id, + "decision": decision, + "reason": reason, + "hypothesis": hyp_metrics, + "baseline": base_metrics, + "enriched_picks": picks, + } + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +python -m pytest tests/test_compare_hypothesis.py -v +``` + +Expected: all 10 tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add scripts/compare_hypothesis.py tests/test_compare_hypothesis.py +git commit -m "feat(hypotheses): add comparison + conclusion script" +``` + +--- + +## Task 3: `/backtest-hypothesis` Command + +**Files:** +- Create: `.claude/commands/backtest-hypothesis.md` + +- [ ] **Step 1: Write the command file** + +Create `.claude/commands/backtest-hypothesis.md`: + +````markdown +# /backtest-hypothesis + +Test a hypothesis about a scanner improvement using branch-per-hypothesis isolation. + +**Usage:** `/backtest-hypothesis "<description of the hypothesis>"` + +**Example:** `/backtest-hypothesis "options_flow: scan 3 expirations instead of 1 to capture institutional 30+ DTE positioning"` + +--- + +## Step 1: Read Current Registry + +Read `docs/iterations/hypotheses/active.json`. Note: +- How many hypotheses currently have `status: "running"` +- The `max_active` limit (default 5) +- Any existing `pending` entries + +Also read `docs/iterations/LEARNINGS.md` and the relevant scanner domain file in +`docs/iterations/scanners/` to understand the current baseline. + +## Step 2: Classify the Hypothesis + +Determine whether this is: + +**Statistical** — answerable from existing data in `data/recommendations/performance_database.json` +without any code change. 
Examples: +- "Does high confidence (≥8) predict better 30d returns?" +- "Are options_flow picks that are ITM outperforming OTM ones?" + +**Implementation** — requires a code change and forward-testing period. Examples: +- "Scan 3 expirations instead of 1" +- "Apply a premium filter of $50K instead of $25K" + +## Step 3a: Statistical Path + +If statistical: run the analysis now against `data/recommendations/performance_database.json`. +Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary. +Done — no branch needed. + +## Step 3b: Implementation Path + +### 3b-i: Capacity check + +Count running hypotheses from `active.json`. If fewer than `max_active` running, proceed. +If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER +paused mid-streak. Inform the user which slot it queued behind and when it will likely start. + +### 3b-ii: Score the hypothesis + +Assign a `priority` score (1–9) using these factors: + +| Factor | Score | +|---|---| +| Scanner 30d win rate < 40% | +3 | +| Change touches 1 file, 1 parameter | +2 | +| Directly addresses a weak spot in LEARNINGS.md | +2 | +| Scanner generates ≥2 picks/day (data accrues fast) | +1 | +| Supported by external research (arXiv, Alpha Architect, etc.) | +1 | +| Contradictory evidence or unclear direction | −2 | + +### 3b-iii: Determine min_days + +Set `min_days` based on the scanner's typical picks-per-day rate: +- ≥2 picks/day → 14 days +- 1 pick/day → 21 days +- <1 pick/day → 30 days + +### 3b-iv: Create the branch and implement the code change + +```bash +BRANCH="hypothesis/<scanner>-<slug>" +git checkout -b "$BRANCH" +``` + +Make the minimal code change that implements the hypothesis. Read the scanner file first. +Only change what the hypothesis requires — do not refactor surrounding code. 
+ +```bash +git add tradingagents/ +git commit -m "hypothesis(<scanner>): <title>" +``` + +### 3b-v: Create picks tracking file on the branch + +Create `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch: + +```json +{ + "hypothesis_id": "<id>", + "scanner": "<scanner>", + "picks": [] +} +``` + +```bash +mkdir -p docs/iterations/hypotheses/<id> +# write the file +git add docs/iterations/hypotheses/<id>/picks.json +git commit -m "hypothesis(<scanner>): add picks tracker" +git push -u origin "$BRANCH" +``` + +### 3b-vi: Open a draft PR + +```bash +gh pr create \ + --title "hypothesis(<scanner>): <title>" \ + --body "**Hypothesis:** <description> + +**Expected impact:** <high/medium/low> +**Min days:** <N> +**Priority:** <score>/9 + +*This is an automated hypothesis experiment. It will be auto-concluded after ${MIN_DAYS} days of data.*" \ + --draft \ + --base main +``` + +Note the PR number from the output. + +### 3b-vii: Update active.json on main + +Check out `main`, then update `docs/iterations/hypotheses/active.json` to add the new entry: + +```json +{ + "id": "<scanner>-<slug>", + "scanner": "<scanner>", + "title": "<title>", + "description": "<description>", + "branch": "hypothesis/<scanner>-<slug>", + "pr_number": <N>, + "status": "running", + "priority": <score>, + "expected_impact": "<high|medium|low>", + "hypothesis_type": "implementation", + "created_at": "<YYYY-MM-DD>", + "min_days": <N>, + "days_elapsed": 0, + "picks_log": [], + "baseline_scanner": "<scanner>", + "conclusion": null +} +``` + +```bash +git checkout main +git add docs/iterations/hypotheses/active.json +git commit -m "feat(hypotheses): register hypothesis <id>" +git push origin main +``` + +## Step 4: Print Summary + +Print a confirmation: +- Hypothesis ID and branch name +- Status: running or pending +- Expected conclusion date (created_at + min_days) +- PR link (if running) +- Priority score and why +```` + +- [ ] **Step 2: Verify the file exists and is non-empty** + 
+```bash
+wc -l .claude/commands/backtest-hypothesis.md
+```
+
+Expected: at least 80 lines.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add .claude/commands/backtest-hypothesis.md
+git commit -m "feat(hypotheses): add /backtest-hypothesis command"
+```
+
+---
+
+## Task 4: Hypothesis Runner Workflow
+
+**Files:**
+- Create: `.github/workflows/hypothesis-runner.yml`
+
+- [ ] **Step 1: Write the workflow**
+
+Create `.github/workflows/hypothesis-runner.yml`:
+
+```yaml
+name: Hypothesis Runner
+
+on:
+  schedule:
+    # 8:00 AM UTC daily — after iterate (06:00), before daily-discovery (12:30)
+    - cron: "0 8 * * *"
+  workflow_dispatch:
+    inputs:
+      hypothesis_id:
+        description: "Run a specific hypothesis ID only (blank = all running)"
+        required: false
+        default: ""
+
+env:
+  PYTHON_VERSION: "3.10"
+
+jobs:
+  run-hypotheses:
+    runs-on: ubuntu-latest
+    environment: TradingAgent
+    timeout-minutes: 60
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.GH_TOKEN }}
+
+      - name: Set up git identity
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: pip
+
+      - name: Install dependencies
+        run: pip install --upgrade pip && pip install -e .
+ + - name: Run hypothesis experiments + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + FINNHUB_API_KEY: ${{ secrets.FINNHUB_API_KEY }} + ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }} + FMP_API_KEY: ${{ secrets.FMP_API_KEY }} + REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }} + REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }} + TRADIER_API_KEY: ${{ secrets.TRADIER_API_KEY }} + FILTER_ID: ${{ inputs.hypothesis_id }} + run: | + python scripts/run_hypothesis_runner.py + + - name: Commit active.json updates + run: | + git add docs/iterations/hypotheses/active.json || true + if git diff --cached --quiet; then + echo "No registry changes" + else + git commit -m "chore(hypotheses): update registry $(date -u +%Y-%m-%d)" + git pull --rebase origin main + git push origin main + fi +``` + +- [ ] **Step 2: Write `scripts/run_hypothesis_runner.py`** + +Create `scripts/run_hypothesis_runner.py`: + +```python +#!/usr/bin/env python3 +""" +Hypothesis Runner — orchestrates daily experiment cycles. + +For each running hypothesis in active.json: + 1. Creates a git worktree for the hypothesis branch + 2. Runs the daily discovery pipeline in that worktree + 3. Extracts picks from the discovery result, appends to picks.json + 4. Commits and pushes picks to hypothesis branch + 5. Removes worktree + 6. Updates active.json (days_elapsed, picks_log) + 7. If days_elapsed >= min_days: concludes the hypothesis + +After all hypotheses: promotes highest-priority pending → running if a slot opened. 
+ +Environment variables read: + FILTER_ID — if set, only run the hypothesis with this ID +""" + +import json +import os +import subprocess +import sys +from datetime import datetime, timedelta +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +ACTIVE_JSON = ROOT / "docs/iterations/hypotheses/active.json" +CONCLUDED_DIR = ROOT / "docs/iterations/hypotheses/concluded" +DB_PATH = ROOT / "data/recommendations/performance_database.json" +TODAY = datetime.utcnow().strftime("%Y-%m-%d") + + +def load_registry() -> dict: + with open(ACTIVE_JSON) as f: + return json.load(f) + + +def save_registry(registry: dict) -> None: + with open(ACTIVE_JSON, "w") as f: + json.dump(registry, f, indent=2) + + +def run(cmd: list, cwd: str = None, check: bool = True) -> subprocess.CompletedProcess: + print(f" $ {' '.join(cmd)}", flush=True) + return subprocess.run(cmd, cwd=cwd or str(ROOT), check=check, capture_output=False) + + +def run_capture(cmd: list, cwd: str = None) -> str: + result = subprocess.run(cmd, cwd=cwd or str(ROOT), capture_output=True, text=True) + return result.stdout.strip() + + +def extract_picks(worktree: str, scanner: str) -> list: + """ + Extract picks for the given scanner from the most recent discovery result + in the worktree's results/discovery/<TODAY>/ directory. 
+ """ + results_dir = Path(worktree) / "results" / "discovery" / TODAY + if not results_dir.exists(): + print(f" No discovery results for {TODAY} in worktree", flush=True) + return [] + + picks = [] + for run_dir in sorted(results_dir.iterdir()): + result_file = run_dir / "discovery_result.json" + if not result_file.exists(): + continue + try: + with open(result_file) as f: + data = json.load(f) + for item in data.get("final_ranking", []): + if item.get("strategy_match") == scanner: + picks.append({ + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + }) + except Exception as e: + print(f" Warning: could not read {result_file}: {e}", flush=True) + + return picks + + +def load_picks_from_branch(hypothesis_id: str, branch: str) -> list: + """Load picks.json from the hypothesis branch using git show.""" + picks_path = f"docs/iterations/hypotheses/{hypothesis_id}/picks.json" + result = subprocess.run( + ["git", "show", f"{branch}:{picks_path}"], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + return [] + try: + return json.loads(result.stdout).get("picks", []) + except Exception: + return [] + + +def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, picks: list) -> None: + """Write updated picks.json into the worktree and commit.""" + picks_dir = Path(worktree) / "docs" / "iterations" / "hypotheses" / hypothesis_id + picks_dir.mkdir(parents=True, exist_ok=True) + picks_file = picks_dir / "picks.json" + payload = {"hypothesis_id": hypothesis_id, "scanner": scanner, "picks": picks} + picks_file.write_text(json.dumps(payload, indent=2)) + + run(["git", "add", str(picks_file)], cwd=worktree) + result = subprocess.run( + ["git", "diff", "--cached", "--quiet"], cwd=worktree + ) + if result.returncode != 0: + run( + ["git", "commit", "-m", f"chore(hypotheses): picks {TODAY} for 
{hypothesis_id}"], + cwd=worktree, + ) + + +def run_hypothesis(hyp: dict) -> bool: + """ + Run one hypothesis experiment cycle. Returns True if the experiment concluded. + """ + hid = hyp["id"] + branch = hyp["branch"] + scanner = hyp["scanner"] + worktree = f"/tmp/hyp-{hid}" + + print(f"\n── Hypothesis: {hid} ──", flush=True) + + # 1. Create worktree + run(["git", "fetch", "origin", branch], check=False) + run(["git", "worktree", "add", worktree, branch]) + + try: + # 2. Run discovery in worktree + result = subprocess.run( + [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + cwd=worktree, + check=False, + ) + if result.returncode != 0: + print(f" Discovery failed for {hid}, skipping picks update", flush=True) + else: + # 3. Extract picks + merge with existing + new_picks = extract_picks(worktree, scanner) + existing_picks = load_picks_from_branch(hid, branch) + # Deduplicate by (date, ticker) + seen = {(p["date"], p["ticker"]) for p in existing_picks} + merged = existing_picks + [p for p in new_picks if (p["date"], p["ticker"]) not in seen] + + # 4. Save picks + commit in worktree + save_picks_to_worktree(worktree, hid, scanner, merged) + + # 5. Push hypothesis branch + run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) + + # 6. Update registry fields + if TODAY not in hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) + + # 7. Check conclusion + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) + + finally: + run(["git", "worktree", "remove", "--force", worktree], check=False) + + return False + + +def conclude_hypothesis(hyp: dict) -> bool: + """Run comparison, write conclusion doc, close/merge PR. 
Returns True.""" + hid = hyp["id"] + scanner = hyp["scanner"] + branch = hyp["branch"] + + print(f"\n Concluding {hid}...", flush=True) + + # Load picks from branch + picks = load_picks_from_branch(hid, branch) + if not picks: + print(f" No picks found for {hid}, marking rejected", flush=True) + conclusion = { + "decision": "rejected", + "reason": "No picks were collected during the experiment period", + "hypothesis": {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None}, + "baseline": {"count": 0, "win_rate": None, "avg_return": None}, + } + else: + # Run comparison script + result = subprocess.run( + [ + sys.executable, "scripts/compare_hypothesis.py", + "--hypothesis-id", hid, + "--picks-json", json.dumps(picks), + "--scanner", scanner, + "--db-path", str(DB_PATH), + ], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f" compare_hypothesis.py failed: {result.stderr}", flush=True) + return False + conclusion = json.loads(result.stdout) + + decision = conclusion["decision"] + hyp_metrics = conclusion["hypothesis"] + base_metrics = conclusion["baseline"] + + # Write concluded doc + period_start = hyp.get("created_at", TODAY) + concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md" + concluded_doc.write_text( + f"# Hypothesis: {hyp['title']}\n\n" + f"**Scanner:** {scanner}\n" + f"**Branch:** {branch}\n" + f"**Period:** {period_start} → {TODAY} ({hyp['days_elapsed']} days)\n" + f"**Outcome:** {'accepted ✅' if decision == 'accepted' else 'rejected ❌'}\n\n" + f"## Hypothesis\n{hyp.get('description', hyp['title'])}\n\n" + f"## Results\n\n" + f"| Metric | Baseline | Experiment | Delta |\n" + f"|---|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | " + f"{hyp_metrics.get('win_rate') or '—'}% | " + f"{_delta_str(hyp_metrics.get('win_rate'), base_metrics.get('win_rate'), 'pp')} |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | " + f"{hyp_metrics.get('avg_return') or '—'}% | " 
+ f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n" + f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n" + f"## Decision\n{conclusion['reason']}\n\n" + f"## Action\n" + f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + ) + + run(["git", "add", str(concluded_doc)], check=False) + + # Close or merge PR + pr = hyp.get("pr_number") + if pr: + if decision == "accepted": + subprocess.run( + ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], + cwd=str(ROOT), check=False, + ) + else: + subprocess.run( + ["gh", "pr", "close", str(pr), "--delete-branch"], + cwd=str(ROOT), check=False, + ) + + # Update registry entry + hyp["status"] = "concluded" + hyp["conclusion"] = decision + + print(f" {hid}: {decision} — {conclusion['reason']}", flush=True) + return True + + +def _delta_str(hyp_val, base_val, unit: str) -> str: + if hyp_val is None or base_val is None: + return "—" + delta = hyp_val - base_val + sign = "+" if delta >= 0 else "" + return f"{sign}{delta:.1f}{unit}" + + +def promote_pending(registry: dict) -> None: + """Promote the highest-priority pending hypothesis to running if a slot is open.""" + running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") + max_active = registry.get("max_active", 5) + if running_count >= max_active: + return + + pending = [h for h in registry["hypotheses"] if h["status"] == "pending"] + if not pending: + return + + # Promote highest priority + to_promote = max(pending, key=lambda h: h.get("priority", 0)) + to_promote["status"] = "running" + print(f"\n Promoted pending hypothesis to running: {to_promote['id']}", flush=True) + + +def main(): + registry = load_registry() + filter_id = os.environ.get("FILTER_ID", "").strip() + + hypotheses = registry.get("hypotheses", []) + running = [ + h for h in hypotheses + if h["status"] == "running" and (not filter_id or h["id"] == 
filter_id) + ] + + if not running: + print("No running hypotheses to process.", flush=True) + else: + for hyp in running: + run_hypothesis(hyp) + + promote_pending(registry) + save_registry(registry) + print("\nRegistry updated.", flush=True) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 3: Verify the workflow YAML is valid** + +```bash +python3 -c "import yaml; yaml.safe_load(open('.github/workflows/hypothesis-runner.yml'))" 2>/dev/null \ + || python3 -c " +import re, sys +with open('.github/workflows/hypothesis-runner.yml') as f: + content = f.read() +# Just check the file exists and has the cron line +assert '0 8 * * *' in content, 'missing cron' +print('workflow file looks good') +" +``` + +- [ ] **Step 4: Commit** + +```bash +git add .github/workflows/hypothesis-runner.yml scripts/run_hypothesis_runner.py +git commit -m "feat(hypotheses): add daily hypothesis runner workflow" +``` + +--- + +## Task 5: Dashboard Hypotheses Tab + +**Files:** +- Create: `tradingagents/ui/pages/hypotheses.py` +- Modify: `tradingagents/ui/pages/__init__.py` +- Modify: `tradingagents/ui/dashboard.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_hypotheses_page.py`: + +```python +"""Tests for the hypotheses dashboard page data loading.""" +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + + +from tradingagents.ui.pages.hypotheses import ( + load_active_hypotheses, + load_concluded_hypotheses, + days_until_ready, +) + + +# ── load_active_hypotheses ──────────────────────────────────────────────────── + +def test_load_active_hypotheses(tmp_path): + active = { + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-test", + "title": "Test hypothesis", + "scanner": "options_flow", + "status": "running", + "priority": 7, + "days_elapsed": 5, + "min_days": 14, + "created_at": "2026-04-01", + "picks_log": ["2026-04-01"] * 5, + "conclusion": None, + } + ], + } + f = 
tmp_path / "active.json" + f.write_text(json.dumps(active)) + + result = load_active_hypotheses(str(f)) + assert len(result) == 1 + assert result[0]["id"] == "options_flow-test" + + +def test_load_active_hypotheses_missing_file(tmp_path): + result = load_active_hypotheses(str(tmp_path / "missing.json")) + assert result == [] + + +# ── load_concluded_hypotheses ───────────────────────────────────────────────── + +def test_load_concluded_hypotheses(tmp_path): + doc = tmp_path / "2026-04-10-options_flow-test.md" + doc.write_text( + "# Hypothesis: Test\n\n" + "**Scanner:** options_flow\n" + "**Period:** 2026-03-27 → 2026-04-10 (14 days)\n" + "**Outcome:** accepted ✅\n" + ) + + results = load_concluded_hypotheses(str(tmp_path)) + assert len(results) == 1 + assert results[0]["filename"] == doc.name + assert results[0]["outcome"] == "accepted ✅" + + +def test_load_concluded_hypotheses_empty_dir(tmp_path): + results = load_concluded_hypotheses(str(tmp_path)) + assert results == [] + + +# ── days_until_ready ────────────────────────────────────────────────────────── + +def test_days_until_ready_has_days_left(): + hyp = {"days_elapsed": 5, "min_days": 14} + assert days_until_ready(hyp) == 9 + + +def test_days_until_ready_past_due(): + hyp = {"days_elapsed": 15, "min_days": 14} + assert days_until_ready(hyp) == 0 +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +python -m pytest tests/test_hypotheses_page.py -v 2>&1 | head -20 +``` + +Expected: `ModuleNotFoundError` for `tradingagents.ui.pages.hypotheses`. + +- [ ] **Step 3: Write `tradingagents/ui/pages/hypotheses.py`** + +```python +""" +Hypotheses dashboard page — tracks active and concluded experiments. + +Reads docs/iterations/hypotheses/active.json and the concluded/ directory. +No external API calls; all data is file-based. 
+""" + +import json +import re +from pathlib import Path +from typing import Any, Dict, List + +import streamlit as st + +from tradingagents.ui.theme import COLORS, page_header + +_REPO_ROOT = Path(__file__).parent.parent.parent.parent +_ACTIVE_JSON = _REPO_ROOT / "docs/iterations/hypotheses/active.json" +_CONCLUDED_DIR = _REPO_ROOT / "docs/iterations/hypotheses/concluded" + + +# ── Data loaders ───────────────────────────────────────────────────────────── + + +def load_active_hypotheses(active_path: str = str(_ACTIVE_JSON)) -> List[Dict[str, Any]]: + """Load all hypotheses from active.json. Returns [] if file missing.""" + path = Path(active_path) + if not path.exists(): + return [] + try: + with open(path) as f: + data = json.load(f) + return data.get("hypotheses", []) + except Exception: + return [] + + +def load_concluded_hypotheses(concluded_dir: str = str(_CONCLUDED_DIR)) -> List[Dict[str, Any]]: + """ + Load concluded hypothesis metadata by parsing the markdown files in concluded/. + + Extracts: filename, title, scanner, period, outcome from each .md file. 
+ """ + dir_path = Path(concluded_dir) + if not dir_path.exists(): + return [] + + results = [] + for md_file in sorted(dir_path.glob("*.md"), reverse=True): + if md_file.name == ".gitkeep": + continue + try: + text = md_file.read_text() + title = _extract_md_field(text, r"^# Hypothesis: (.+)$") + scanner = _extract_md_field(text, r"^\*\*Scanner:\*\* (.+)$") + period = _extract_md_field(text, r"^\*\*Period:\*\* (.+)$") + outcome = _extract_md_field(text, r"^\*\*Outcome:\*\* (.+)$") + results.append({ + "filename": md_file.name, + "title": title or md_file.stem, + "scanner": scanner or "—", + "period": period or "—", + "outcome": outcome or "—", + }) + except Exception: + continue + + return results + + +def _extract_md_field(text: str, pattern: str) -> str: + """Extract a field value from a markdown line using regex.""" + match = re.search(pattern, text, re.MULTILINE) + return match.group(1).strip() if match else "" + + +def days_until_ready(hyp: Dict[str, Any]) -> int: + """Return number of days remaining before hypothesis can conclude (min 0).""" + return max(0, hyp.get("min_days", 14) - hyp.get("days_elapsed", 0)) + + +# ── Rendering ───────────────────────────────────────────────────────────────── + + +def render() -> None: + """Render the hypotheses tracking page.""" + st.markdown( + page_header("Hypotheses", "Active experiments & concluded findings"), + unsafe_allow_html=True, + ) + + hypotheses = load_active_hypotheses() + concluded = load_concluded_hypotheses() + + if not hypotheses and not concluded: + st.info( + "No hypotheses yet. Run `/backtest-hypothesis \"<description>\"` to start an experiment." 
+ ) + return + + # ── Active experiments ──────────────────────────────────────────────────── + running = [h for h in hypotheses if h["status"] == "running"] + pending = [h for h in hypotheses if h["status"] == "pending"] + + st.markdown( + f'<div class="section-title">Active Experiments ' + f'<span class="accent">// {len(running)} running, {len(pending)} pending</span></div>', + unsafe_allow_html=True, + ) + + if running or pending: + active_rows = [] + for h in sorted(running + pending, key=lambda x: -x.get("priority", 0)): + days_left = days_until_ready(h) + ready_str = "concluding soon" if days_left == 0 else f"{days_left}d left" + status_color = COLORS["green"] if h["status"] == "running" else COLORS["amber"] + active_rows.append({ + "ID": h["id"], + "Title": h.get("title", "—"), + "Scanner": h.get("scanner", "—"), + "Status": h["status"], + "Progress": f"{h.get('days_elapsed', 0)}/{h.get('min_days', 14)}d", + "Picks": len(h.get("picks_log", [])), + "Ready": ready_str, + "Priority": h.get("priority", "—"), + }) + + import pandas as pd + df = pd.DataFrame(active_rows) + st.dataframe( + df, + width="stretch", + hide_index=True, + column_config={ + "ID": st.column_config.TextColumn(width="medium"), + "Title": st.column_config.TextColumn(width="large"), + "Scanner": st.column_config.TextColumn(width="medium"), + "Status": st.column_config.TextColumn(width="small"), + "Progress": st.column_config.TextColumn(width="small"), + "Picks": st.column_config.NumberColumn(format="%d", width="small"), + "Ready": st.column_config.TextColumn(width="medium"), + "Priority": st.column_config.NumberColumn(format="%d/9", width="small"), + }, + ) + else: + st.info("No active experiments.") + + st.markdown("<div style='height:1.5rem;'></div>", unsafe_allow_html=True) + + # ── Concluded experiments ───────────────────────────────────────────────── + st.markdown( + f'<div class="section-title">Concluded Experiments ' + f'<span class="accent">// {len(concluded)} total</span></div>', + 
unsafe_allow_html=True, + ) + + if concluded: + import pandas as pd + concluded_rows = [] + for c in concluded: + outcome = c["outcome"] + emoji = "✅" if "accepted" in outcome else "❌" + concluded_rows.append({ + "Date": c["filename"][:10], + "Title": c["title"], + "Scanner": c["scanner"], + "Period": c["period"], + "Outcome": emoji, + }) + cdf = pd.DataFrame(concluded_rows) + st.dataframe( + cdf, + width="stretch", + hide_index=True, + column_config={ + "Date": st.column_config.TextColumn(width="small"), + "Title": st.column_config.TextColumn(width="large"), + "Scanner": st.column_config.TextColumn(width="medium"), + "Period": st.column_config.TextColumn(width="medium"), + "Outcome": st.column_config.TextColumn(width="small"), + }, + ) + else: + st.info("No concluded experiments yet.") +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +python -m pytest tests/test_hypotheses_page.py -v +``` + +Expected: all 6 tests pass. + +- [ ] **Step 5: Register the page in `tradingagents/ui/pages/__init__.py`** + +Add after the `settings` import block (around line 38): + +```python +try: + from tradingagents.ui.pages import hypotheses +except Exception as _e: + _logger.error("Failed to import hypotheses page: %s", _e, exc_info=True) + hypotheses = None +``` + +And add `"hypotheses"` to `__all__`: + +```python +__all__ = [ + "home", + "todays_picks", + "portfolio", + "performance", + "settings", + "hypotheses", +] +``` + +- [ ] **Step 6: Add "Hypotheses" to dashboard navigation in `tradingagents/ui/dashboard.py`** + +In `render_sidebar`, change the `options` list: + +```python +page = st.radio( + "Navigation", + options=["Overview", "Signals", "Portfolio", "Performance", "Hypotheses", "Config"], + label_visibility="collapsed", +) +``` + +In `route_page`, add to `page_map`: + +```python +page_map = { + "Overview": pages.home, + "Signals": pages.todays_picks, + "Portfolio": pages.portfolio, + "Performance": pages.performance, + "Hypotheses": pages.hypotheses, + 
"Config": pages.settings, +} +``` + +- [ ] **Step 7: Run the full test suite** + +```bash +python -m pytest tests/test_compare_hypothesis.py tests/test_hypotheses_page.py -v +``` + +Expected: all 16 tests pass. + +- [ ] **Step 8: Commit everything** + +```bash +git add \ + tradingagents/ui/pages/hypotheses.py \ + tradingagents/ui/pages/__init__.py \ + tradingagents/ui/dashboard.py \ + tests/test_hypotheses_page.py +git commit -m "feat(hypotheses): add Hypotheses dashboard tab" +``` + +--- + +## Self-Review + +**Spec coverage check:** +- ✅ `active.json` schema with `status: running/pending/concluded` — Task 1 +- ✅ `/backtest-hypothesis` command: classify, priority scoring, pending queue, branch creation — Task 3 +- ✅ Running experiments never paused — enforced in `run_hypothesis_runner.py` (only `running` entries processed; new ones queue as `pending`) +- ✅ Daily runner: worktree per hypothesis, run discovery, commit picks, conclude — Task 4 +- ✅ Statistical comparison with 5pp / 1% thresholds, minimum 5 evaluated picks — Task 2 +- ✅ Auto-promote pending → running when slot opens — `promote_pending()` in runner +- ✅ Concluded doc written with metrics table — `conclude_hypothesis()` in runner +- ✅ PR merged (accepted) or closed (rejected) automatically — `conclude_hypothesis()` +- ✅ Dashboard tab with active + concluded tables — Task 5 + +**Type/name consistency:** +- `hypothesis_id` / `hid` / `id` field: the dict key is always `"id"`, the local var is `hid`, the argument is `--hypothesis-id` — consistent throughout +- `picks.json` structure: `{"hypothesis_id": ..., "scanner": ..., "picks": [...]}` — used in `save_picks_to_worktree` and `load_picks_from_branch` consistently +- `strategy_match` field used to filter picks in `extract_picks` — matches `discovery_result.json` structure confirmed by inspection From d3065f59f1abafd85abfa3c081790a54865a4a85 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:26:17 
-0700 Subject: [PATCH 04/14] feat(hypotheses): initialize hypothesis registry --- docs/iterations/hypotheses/active.json | 4 ++++ docs/iterations/hypotheses/concluded/.gitkeep | 0 2 files changed, 4 insertions(+) create mode 100644 docs/iterations/hypotheses/active.json create mode 100644 docs/iterations/hypotheses/concluded/.gitkeep diff --git a/docs/iterations/hypotheses/active.json b/docs/iterations/hypotheses/active.json new file mode 100644 index 00000000..6ed3446d --- /dev/null +++ b/docs/iterations/hypotheses/active.json @@ -0,0 +1,4 @@ +{ + "max_active": 5, + "hypotheses": [] +} diff --git a/docs/iterations/hypotheses/concluded/.gitkeep b/docs/iterations/hypotheses/concluded/.gitkeep new file mode 100644 index 00000000..e69de29b From 6c438f87e6cc71d79566a9b83e46a6bacd5289e7 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:29:08 -0700 Subject: [PATCH 05/14] feat(hypotheses): add comparison + conclusion script Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- scripts/__init__.py | 0 scripts/compare_hypothesis.py | 153 +++++++++++++++++++++++++++++++ tests/test_compare_hypothesis.py | 135 +++++++++++++++++++++++++++ 3 files changed, 288 insertions(+) create mode 100644 scripts/__init__.py create mode 100644 scripts/compare_hypothesis.py create mode 100644 tests/test_compare_hypothesis.py diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py new file mode 100644 index 00000000..991f5baf --- /dev/null +++ b/scripts/compare_hypothesis.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Hypothesis comparison — computes 7d returns for hypothesis picks and +compares them against the baseline scanner in performance_database.json. 
+ +Usage (called by hypothesis-runner.yml after min_days elapsed): + python scripts/compare_hypothesis.py \ + --hypothesis-id options_flow-scan-3-expirations \ + --picks-json '[{"date": "2026-04-01", "ticker": "AAPL", ...}]' \ + --scanner options_flow \ + --db-path data/recommendations/performance_database.json + +Prints a JSON conclusion to stdout. +""" + +import argparse +import json +import sys +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional, Tuple + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from tradingagents.dataflows.y_finance import download_history + +_MIN_EVALUATED = 5 +_WIN_RATE_DELTA_THRESHOLD = 5.0 +_AVG_RETURN_DELTA_THRESHOLD = 1.0 + + +def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]: + """Fetch 7-day return for a pick using yfinance. Returns (pct, is_win) or (None, None).""" + try: + entry_dt = datetime.strptime(pick_date, "%Y-%m-%d") + exit_dt = entry_dt + timedelta(days=10) + df = download_history( + ticker, + start=entry_dt.strftime("%Y-%m-%d"), + end=exit_dt.strftime("%Y-%m-%d"), + ) + if df.empty or len(df) < 2: + return None, None + close = df["Close"] + entry_price = float(close.iloc[0]) + exit_idx = min(5, len(close) - 1) + exit_price = float(close.iloc[exit_idx]) + if entry_price <= 0: + return None, None + ret = (exit_price - entry_price) / entry_price * 100 + return round(ret, 4), ret > 0 + except Exception: + return None, None + + +def enrich_picks_with_returns(picks: list) -> list: + """Compute 7d return for each pick >= 7 days old that lacks return_7d.""" + cutoff = (datetime.utcnow() - timedelta(days=7)).strftime("%Y-%m-%d") + for pick in picks: + if pick.get("return_7d") is not None: + continue + if pick.get("date", "9999-99-99") > cutoff: + continue + ret, win = compute_7d_return(pick["ticker"], pick["date"]) + pick["return_7d"] = ret + pick["win_7d"] = win + return picks + + +def compute_metrics(picks: 
list) -> dict: + """Compute win rate and avg return. Only picks with non-None return_7d are evaluated.""" + evaluated = [p for p in picks if p.get("return_7d") is not None] + if not evaluated: + return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None} + wins = sum(1 for p in evaluated if p.get("win_7d")) + avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated) + return { + "count": len(picks), + "evaluated": len(evaluated), + "win_rate": round(wins / len(evaluated) * 100, 1), + "avg_return": round(avg_ret, 2), + } + + +def load_baseline_metrics(scanner: str, db_path: str) -> dict: + """Load baseline metrics for a scanner from performance_database.json.""" + path = Path(db_path) + if not path.exists(): + return {"count": 0, "win_rate": None, "avg_return": None} + try: + with open(path) as f: + db = json.load(f) + except Exception: + return {"count": 0, "win_rate": None, "avg_return": None} + picks = [] + for recs in db.get("recommendations_by_date", {}).values(): + for rec in (recs if isinstance(recs, list) else []): + if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None: + picks.append(rec) + return compute_metrics(picks) + + +def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: + """Decide accepted/rejected. 
Requires _MIN_EVALUATED evaluated picks.""" + evaluated = hypothesis.get("evaluated", 0) + if evaluated < _MIN_EVALUATED: + return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + hyp_wr = hypothesis.get("win_rate") + hyp_ret = hypothesis.get("avg_return") + base_wr = baseline.get("win_rate") + base_ret = baseline.get("avg_return") + reasons = [] + if hyp_wr is not None and base_wr is not None: + delta_wr = hyp_wr - base_wr + if delta_wr > _WIN_RATE_DELTA_THRESHOLD: + reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + if hyp_ret is not None and base_ret is not None: + delta_ret = hyp_ret - base_ret + if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: + reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + if reasons: + return "accepted", "; ".join(reasons) + wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--hypothesis-id", required=True) + parser.add_argument("--picks-json", required=True) + parser.add_argument("--scanner", required=True) + parser.add_argument("--db-path", default="data/recommendations/performance_database.json") + args = parser.parse_args() + picks = json.loads(args.picks_json) + picks = enrich_picks_with_returns(picks) + hyp_metrics = compute_metrics(picks) + base_metrics = load_baseline_metrics(args.scanner, args.db_path) + decision, reason = make_decision(hyp_metrics, base_metrics) + result = { + "hypothesis_id": args.hypothesis_id, + "decision": decision, + "reason": reason, + "hypothesis": hyp_metrics, + "baseline": base_metrics, + "enriched_picks": picks, + } + print(json.dumps(result, indent=2)) + 
+ +if __name__ == "__main__": + main() diff --git a/tests/test_compare_hypothesis.py b/tests/test_compare_hypothesis.py new file mode 100644 index 00000000..2cf41609 --- /dev/null +++ b/tests/test_compare_hypothesis.py @@ -0,0 +1,135 @@ +"""Tests for the hypothesis comparison script.""" +import json +import sys +from datetime import date, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.compare_hypothesis import ( + compute_metrics, + compute_7d_return, + load_baseline_metrics, + make_decision, +) + + +# ── compute_metrics ────────────────────────────────────────────────────────── + +def test_compute_metrics_empty(): + result = compute_metrics([]) + assert result == {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None} + + +def test_compute_metrics_all_wins(): + picks = [ + {"return_7d": 5.0, "win_7d": True}, + {"return_7d": 3.0, "win_7d": True}, + ] + result = compute_metrics(picks) + assert result["win_rate"] == 100.0 + assert result["avg_return"] == 4.0 + assert result["evaluated"] == 2 + + +def test_compute_metrics_mixed(): + picks = [ + {"return_7d": 10.0, "win_7d": True}, + {"return_7d": -5.0, "win_7d": False}, + {"return_7d": None, "win_7d": None}, # pending — excluded + ] + result = compute_metrics(picks) + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 2.5 + assert result["evaluated"] == 2 + assert result["count"] == 3 + + +# ── compute_7d_return ──────────────────────────────────────────────────────── + +def test_compute_7d_return_positive(): + import pandas as pd + + close_data = [100.0, 101.0, 102.0, 103.0, 104.0, 110.0] + mock_df = pd.DataFrame({"Close": close_data}) + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret == pytest.approx(10.0, rel=0.01) + assert win is True + + +def 
test_compute_7d_return_empty_data(): + import pandas as pd + + mock_df = pd.DataFrame() + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret is None + assert win is None + + +# ── load_baseline_metrics ──────────────────────────────────────────────────── + +def test_load_baseline_metrics(tmp_path): + db = { + "recommendations_by_date": { + "2026-03-01": [ + {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True}, + {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False}, + {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True}, + ] + } + } + db_file = tmp_path / "performance_database.json" + db_file.write_text(json.dumps(db)) + + result = load_baseline_metrics("options_flow", str(db_file)) + + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 1.5 + assert result["count"] == 2 + + +def test_load_baseline_metrics_missing_file(tmp_path): + result = load_baseline_metrics("options_flow", str(tmp_path / "missing.json")) + assert result == {"count": 0, "win_rate": None, "avg_return": None} + + +# ── make_decision ───────────────────────────────────────────────────────────── + +def test_make_decision_accepted_by_win_rate(): + hyp = {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 0.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "win rate" in reason.lower() + + +def test_make_decision_accepted_by_return(): + hyp = {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "return" in reason.lower() + + +def test_make_decision_rejected(): + hyp = {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, 
baseline) + assert decision == "rejected" + + +def test_make_decision_insufficient_data(): + hyp = {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + assert "insufficient" in reason.lower() From 2747ccddcd5bd024313e5e1ccf773d0d00838b25 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:29:22 -0700 Subject: [PATCH 06/14] feat(hypotheses): add comparison + conclusion script Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- scripts/compare_hypothesis.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py index 991f5baf..a1cd08bc 100644 --- a/scripts/compare_hypothesis.py +++ b/scripts/compare_hypothesis.py @@ -105,7 +105,10 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: """Decide accepted/rejected. 
Requires _MIN_EVALUATED evaluated picks.""" evaluated = hypothesis.get("evaluated", 0) if evaluated < _MIN_EVALUATED: - return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + return ( + "rejected", + f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})", + ) hyp_wr = hypothesis.get("win_rate") hyp_ret = hypothesis.get("avg_return") base_wr = baseline.get("win_rate") @@ -114,15 +117,23 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: if hyp_wr is not None and base_wr is not None: delta_wr = hyp_wr - base_wr if delta_wr > _WIN_RATE_DELTA_THRESHOLD: - reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + reasons.append( + f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)" + ) if hyp_ret is not None and base_ret is not None: delta_ret = hyp_ret - base_ret if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: - reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + reasons.append( + f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)" + ) if reasons: return "accepted", "; ".join(reasons) - wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" - ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + wr_str = ( + f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data" + ) + ret_str = ( + f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data" + ) return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" From f8063f3596a5bc274f002e546d9499ce600a8366 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:31:07 -0700 Subject: [PATCH 07/14] fix(hypotheses): use correct 7-trading-day exit index in comparison 
--- scripts/compare_hypothesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py index a1cd08bc..ac6a72aa 100644 --- a/scripts/compare_hypothesis.py +++ b/scripts/compare_hypothesis.py @@ -44,7 +44,7 @@ def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Opt return None, None close = df["Close"] entry_price = float(close.iloc[0]) - exit_idx = min(5, len(close) - 1) + exit_idx = min(6, len(close) - 1) exit_price = float(close.iloc[exit_idx]) if entry_price <= 0: return None, None From 38b9cef41c41cabca78b03fd1b1b20d50e173ed6 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:46:33 -0700 Subject: [PATCH 08/14] feat(hypotheses): add /backtest-hypothesis command --- .claude/commands/backtest-hypothesis.md | 159 ++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 .claude/commands/backtest-hypothesis.md diff --git a/.claude/commands/backtest-hypothesis.md b/.claude/commands/backtest-hypothesis.md new file mode 100644 index 00000000..3941bb70 --- /dev/null +++ b/.claude/commands/backtest-hypothesis.md @@ -0,0 +1,159 @@ +# /backtest-hypothesis + +Test a hypothesis about a scanner improvement using branch-per-hypothesis isolation. + +**Usage:** `/backtest-hypothesis "<description of the hypothesis>"` + +**Example:** `/backtest-hypothesis "options_flow: scan 3 expirations instead of 1 to capture institutional 30+ DTE positioning"` + +--- + +## Step 1: Read Current Registry + +Read `docs/iterations/hypotheses/active.json`. Note: +- How many hypotheses currently have `status: "running"` +- The `max_active` limit (default 5) +- Any existing `pending` entries + +Also read `docs/iterations/LEARNINGS.md` and the relevant scanner domain file in +`docs/iterations/scanners/` to understand the current baseline. 
+ +## Step 2: Classify the Hypothesis + +Determine whether this is: + +**Statistical** — answerable from existing data in `data/recommendations/performance_database.json` +without any code change. Examples: +- "Does high confidence (≥8) predict better 30d returns?" +- "Are options_flow picks that are ITM outperforming OTM ones?" + +**Implementation** — requires a code change and forward-testing period. Examples: +- "Scan 3 expirations instead of 1" +- "Apply a premium filter of $50K instead of $25K" + +## Step 3a: Statistical Path + +If statistical: run the analysis now against `data/recommendations/performance_database.json`. +Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary. +Done — no branch needed. + +## Step 3b: Implementation Path + +### 3b-i: Capacity check + +Count running hypotheses from `active.json`. If fewer than `max_active` running, proceed. +If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER +paused mid-streak. Inform the user which slot it is queued behind and when it will likely start. + +### 3b-ii: Score the hypothesis + +Assign a `priority` score (1–9) using these factors: + +| Factor | Score | +|---|---| +| Scanner 30d win rate < 40% | +3 | +| Change touches 1 file, 1 parameter | +2 | +| Directly addresses a weak spot in LEARNINGS.md | +2 | +| Scanner generates ≥2 picks/day (data accrues fast) | +1 | +| Supported by external research (arXiv, Alpha Architect, etc.) | +1 | +| Contradictory evidence or unclear direction | −2 | + +### 3b-iii: Determine min_days + +Set `min_days` based on the scanner's typical picks-per-day rate: +- ≥2 picks/day → 14 days +- 1 pick/day → 21 days +- <1 pick/day → 30 days + +### 3b-iv: Create the branch and implement the code change + +```bash +BRANCH="hypothesis/<scanner>-<slug>" +git checkout -b "$BRANCH" +``` + +Make the minimal code change that implements the hypothesis. Read the scanner file first. 
+Only change what the hypothesis requires — do not refactor surrounding code.
+
+```bash
+git add tradingagents/
+git commit -m "hypothesis(<scanner>): <title>"
+```
+
+### 3b-v: Create picks tracking file on the branch
+
+Create `docs/iterations/hypotheses/<id>/picks.json` on the hypothesis branch:
+
+```json
+{
+  "hypothesis_id": "<id>",
+  "scanner": "<scanner>",
+  "picks": []
+}
+```
+
+```bash
+mkdir -p docs/iterations/hypotheses/<id>
+git add docs/iterations/hypotheses/<id>/picks.json
+git commit -m "hypothesis(<scanner>): add picks tracker"
+git push -u origin "$BRANCH"
+```
+
+### 3b-vi: Open a draft PR
+
+```bash
+gh pr create \
+  --title "hypothesis(<scanner>): <title>" \
+  --body "**Hypothesis:** <description>
+
+**Expected impact:** <high/medium/low>
+**Min days:** <N>
+**Priority:** <score>/9
+
+*This is an automated hypothesis experiment. It will be auto-concluded after <min_days> days of data.*" \
+  --draft \
+  --base main
+```
+
+Note the PR number from the output.
+
+### 3b-vii: Update active.json on main
+
+Check out `main`, then update `docs/iterations/hypotheses/active.json` to add the new entry:
+
+```json
+{
+  "id": "<scanner>-<slug>",
+  "scanner": "<scanner>",
+  "title": "<title>",
+  "description": "<description>",
+  "branch": "hypothesis/<scanner>-<slug>",
+  "pr_number": <N>,
+  "status": "running",
+  "priority": <score>,
+  "expected_impact": "<high|medium|low>",
+  "hypothesis_type": "implementation",
+  "created_at": "<YYYY-MM-DD>",
+  "min_days": <N>,
+  "days_elapsed": 0,
+  "picks_log": [],
+  "baseline_scanner": "<scanner>",
+  "conclusion": null
+}
+```
+
+```bash
+git checkout main
+git add docs/iterations/hypotheses/active.json
+git commit -m "feat(hypotheses): register hypothesis <id>"
+git push origin main
+```
+
+## Step 4: Print Summary
+
+Print a confirmation:
+- Hypothesis ID and branch name
+- Status: running or pending
+- Expected conclusion date (created_at + min_days)
+- PR link (if running)
+- Priority score and why
From 
1b782b1cd62f6e454078436cb2cc306fbc197818 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:49:10 -0700 Subject: [PATCH 09/14] feat(hypotheses): add daily hypothesis runner workflow --- .github/workflows/hypothesis-runner.yml | 74 +++++++ scripts/run_hypothesis_runner.py | 283 ++++++++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 .github/workflows/hypothesis-runner.yml create mode 100644 scripts/run_hypothesis_runner.py diff --git a/.github/workflows/hypothesis-runner.yml b/.github/workflows/hypothesis-runner.yml new file mode 100644 index 00000000..0d6cd4bc --- /dev/null +++ b/.github/workflows/hypothesis-runner.yml @@ -0,0 +1,74 @@ +name: Hypothesis Runner + +on: + schedule: + # 8:00 AM UTC daily — runs after iterate (06:00 UTC) + - cron: "0 8 * * *" + workflow_dispatch: + inputs: + hypothesis_id: + description: "Run a specific hypothesis ID only (blank = all running)" + required: false + default: "" + +env: + PYTHON_VERSION: "3.10" + +jobs: + run-hypotheses: + runs-on: ubuntu-latest + environment: TradingAgent + timeout-minutes: 60 + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GH_TOKEN }} + + - name: Set up git identity + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + + - name: Install dependencies + run: pip install --upgrade pip && pip install -e . 
+ + - name: Run hypothesis experiments + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + FINNHUB_API_KEY: ${{ secrets.FINNHUB_API_KEY }} + ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }} + FMP_API_KEY: ${{ secrets.FMP_API_KEY }} + REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }} + REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }} + TRADIER_API_KEY: ${{ secrets.TRADIER_API_KEY }} + FILTER_ID: ${{ inputs.hypothesis_id }} + run: | + python scripts/run_hypothesis_runner.py + + - name: Commit active.json updates + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + git add docs/iterations/hypotheses/active.json docs/iterations/hypotheses/concluded/ || true + if git diff --cached --quiet; then + echo "No registry changes" + else + git commit -m "chore(hypotheses): update registry $(date -u +%Y-%m-%d)" + git pull --rebase origin main + git push origin main + fi diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py new file mode 100644 index 00000000..79d73e04 --- /dev/null +++ b/scripts/run_hypothesis_runner.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +""" +Hypothesis Runner — orchestrates daily experiment cycles. + +For each running hypothesis in active.json: + 1. Creates a git worktree for the hypothesis branch + 2. Runs the daily discovery pipeline in that worktree + 3. Extracts picks from the discovery result, appends to picks.json + 4. Commits and pushes picks to hypothesis branch + 5. Removes worktree + 6. Updates active.json (days_elapsed, picks_log) + 7. If days_elapsed >= min_days: concludes the hypothesis + +After all hypotheses: promotes highest-priority pending → running if a slot opened. 
+ +Environment variables: + FILTER_ID — if set, only run the hypothesis with this ID +""" + +import json +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +ACTIVE_JSON = ROOT / "docs/iterations/hypotheses/active.json" +CONCLUDED_DIR = ROOT / "docs/iterations/hypotheses/concluded" +DB_PATH = ROOT / "data/recommendations/performance_database.json" +TODAY = datetime.utcnow().strftime("%Y-%m-%d") + + +def load_registry() -> dict: + with open(ACTIVE_JSON) as f: + return json.load(f) + + +def save_registry(registry: dict) -> None: + with open(ACTIVE_JSON, "w") as f: + json.dump(registry, f, indent=2) + + +def run(cmd: list, cwd: str = None, check: bool = True) -> subprocess.CompletedProcess: + print(f" $ {' '.join(cmd)}", flush=True) + return subprocess.run(cmd, cwd=cwd or str(ROOT), check=check, capture_output=False) + + +def extract_picks(worktree: str, scanner: str) -> list: + """Extract picks for the given scanner from the most recent discovery result in the worktree.""" + results_dir = Path(worktree) / "results" / "discovery" / TODAY + if not results_dir.exists(): + print(f" No discovery results for {TODAY} in worktree", flush=True) + return [] + picks = [] + for run_dir in sorted(results_dir.iterdir()): + result_file = run_dir / "discovery_result.json" + if not result_file.exists(): + continue + try: + with open(result_file) as f: + data = json.load(f) + for item in data.get("final_ranking", []): + if item.get("strategy_match") == scanner: + picks.append({ + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + }) + except Exception as e: + print(f" Warning: could not read {result_file}: {e}", flush=True) + return picks + + +def load_picks_from_branch(hypothesis_id: str, branch: str) -> list: + """Load 
picks.json from the hypothesis branch using git show.""" + picks_path = f"docs/iterations/hypotheses/{hypothesis_id}/picks.json" + result = subprocess.run( + ["git", "show", f"{branch}:{picks_path}"], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + return [] + try: + return json.loads(result.stdout).get("picks", []) + except Exception: + return [] + + +def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, picks: list) -> None: + """Write updated picks.json into the worktree and commit.""" + picks_dir = Path(worktree) / "docs" / "iterations" / "hypotheses" / hypothesis_id + picks_dir.mkdir(parents=True, exist_ok=True) + picks_file = picks_dir / "picks.json" + payload = {"hypothesis_id": hypothesis_id, "scanner": scanner, "picks": picks} + picks_file.write_text(json.dumps(payload, indent=2)) + run(["git", "add", str(picks_file)], cwd=worktree) + result = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=worktree) + if result.returncode != 0: + run( + ["git", "commit", "-m", f"chore(hypotheses): picks {TODAY} for {hypothesis_id}"], + cwd=worktree, + ) + + +def run_hypothesis(hyp: dict) -> bool: + """Run one hypothesis experiment cycle. 
Returns True if the experiment concluded.""" + hid = hyp["id"] + branch = hyp["branch"] + scanner = hyp["scanner"] + worktree = f"/tmp/hyp-{hid}" + + print(f"\n── Hypothesis: {hid} ──", flush=True) + + run(["git", "fetch", "origin", branch], check=False) + run(["git", "worktree", "add", worktree, branch]) + + try: + result = subprocess.run( + [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + cwd=worktree, + check=False, + ) + if result.returncode != 0: + print(f" Discovery failed for {hid}, skipping picks update", flush=True) + else: + new_picks = extract_picks(worktree, scanner) + existing_picks = load_picks_from_branch(hid, branch) + seen = {(p["date"], p["ticker"]) for p in existing_picks} + merged = existing_picks + [p for p in new_picks if (p["date"], p["ticker"]) not in seen] + save_picks_to_worktree(worktree, hid, scanner, merged) + run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) + + if TODAY not in hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) + + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) + + finally: + run(["git", "worktree", "remove", "--force", worktree], check=False) + + return False + + +def conclude_hypothesis(hyp: dict) -> bool: + """Run comparison, write conclusion doc, close/merge PR. 
Returns True.""" + hid = hyp["id"] + scanner = hyp["scanner"] + branch = hyp["branch"] + + print(f"\n Concluding {hid}...", flush=True) + + picks = load_picks_from_branch(hid, branch) + if not picks: + conclusion = { + "decision": "rejected", + "reason": "No picks were collected during the experiment period", + "hypothesis": {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None}, + "baseline": {"count": 0, "win_rate": None, "avg_return": None}, + } + else: + result = subprocess.run( + [ + sys.executable, "scripts/compare_hypothesis.py", + "--hypothesis-id", hid, + "--picks-json", json.dumps(picks), + "--scanner", scanner, + "--db-path", str(DB_PATH), + ], + cwd=str(ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f" compare_hypothesis.py failed: {result.stderr}", flush=True) + return False + conclusion = json.loads(result.stdout) + + decision = conclusion["decision"] + hyp_metrics = conclusion["hypothesis"] + base_metrics = conclusion["baseline"] + + period_start = hyp.get("created_at", TODAY) + concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md" + concluded_doc.write_text( + f"# Hypothesis: {hyp['title']}\n\n" + f"**Scanner:** {scanner}\n" + f"**Branch:** {branch}\n" + f"**Period:** {period_start} → {TODAY} ({hyp['days_elapsed']} days)\n" + f"**Outcome:** {'accepted ✅' if decision == 'accepted' else 'rejected ❌'}\n\n" + f"## Hypothesis\n{hyp.get('description', hyp['title'])}\n\n" + f"## Results\n\n" + f"| Metric | Baseline | Experiment | Delta |\n" + f"|---|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | " + f"{hyp_metrics.get('win_rate') or '—'}% | " + f"{_delta_str(hyp_metrics.get('win_rate'), base_metrics.get('win_rate'), 'pp')} |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | " + f"{hyp_metrics.get('avg_return') or '—'}% | " + f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n" + f"| Picks | {base_metrics.get('count', '—')} | 
{hyp_metrics.get('count', '—')} | — |\n\n" + f"## Decision\n{conclusion['reason']}\n\n" + f"## Action\n" + f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + ) + + run(["git", "add", str(concluded_doc)], check=False) + + pr = hyp.get("pr_number") + if pr: + if decision == "accepted": + subprocess.run( + ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], + cwd=str(ROOT), check=False, + ) + else: + subprocess.run( + ["gh", "pr", "close", str(pr), "--delete-branch"], + cwd=str(ROOT), check=False, + ) + + hyp["status"] = "concluded" + hyp["conclusion"] = decision + + print(f" {hid}: {decision} — {conclusion['reason']}", flush=True) + return True + + +def _delta_str(hyp_val, base_val, unit: str) -> str: + if hyp_val is None or base_val is None: + return "—" + delta = hyp_val - base_val + sign = "+" if delta >= 0 else "" + return f"{sign}{delta:.1f}{unit}" + + +def promote_pending(registry: dict) -> None: + """Promote the highest-priority pending hypothesis to running if a slot is open.""" + running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") + max_active = registry.get("max_active", 5) + if running_count >= max_active: + return + pending = [h for h in registry["hypotheses"] if h["status"] == "pending"] + if not pending: + return + to_promote = max(pending, key=lambda h: h.get("priority", 0)) + to_promote["status"] = "running" + print(f"\n Promoted pending hypothesis to running: {to_promote['id']}", flush=True) + + +def main(): + registry = load_registry() + filter_id = os.environ.get("FILTER_ID", "").strip() + + hypotheses = registry.get("hypotheses", []) + running = [ + h for h in hypotheses + if h["status"] == "running" and (not filter_id or h["id"] == filter_id) + ] + + if not running: + print("No running hypotheses to process.", flush=True) + else: + for hyp in running: + run_hypothesis(hyp) + + promote_pending(registry) + save_registry(registry) + print("\nRegistry 
updated.", flush=True) + + +if __name__ == "__main__": + main() From fe5b8886c0b7d86c1ad618f8b5943e8c826b62ac Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:50:37 -0700 Subject: [PATCH 10/14] fix(hypotheses): only count successful discovery days in picks_log --- scripts/run_hypothesis_runner.py | 62 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index 79d73e04..6795d0b7 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -64,15 +64,17 @@ def extract_picks(worktree: str, scanner: str) -> list: data = json.load(f) for item in data.get("final_ranking", []): if item.get("strategy_match") == scanner: - picks.append({ - "date": TODAY, - "ticker": item["ticker"], - "score": item.get("final_score"), - "confidence": item.get("confidence"), - "scanner": scanner, - "return_7d": None, - "win_7d": None, - }) + picks.append( + { + "date": TODAY, + "ticker": item["ticker"], + "score": item.get("final_score"), + "confidence": item.get("confidence"), + "scanner": scanner, + "return_7d": None, + "win_7d": None, + } + ) except Exception as e: print(f" Warning: could not read {result_file}: {e}", flush=True) return picks @@ -125,7 +127,13 @@ def run_hypothesis(hyp: dict) -> bool: try: result = subprocess.run( - [sys.executable, "scripts/run_daily_discovery.py", "--date", TODAY, "--no-update-positions"], + [ + sys.executable, + "scripts/run_daily_discovery.py", + "--date", + TODAY, + "--no-update-positions", + ], cwd=worktree, check=False, ) @@ -139,12 +147,12 @@ def run_hypothesis(hyp: dict) -> bool: save_picks_to_worktree(worktree, hid, scanner, merged) run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree) - if TODAY not in hyp.get("picks_log", []): - hyp.setdefault("picks_log", []).append(TODAY) - hyp["days_elapsed"] = len(hyp["picks_log"]) + if TODAY not in 
hyp.get("picks_log", []): + hyp.setdefault("picks_log", []).append(TODAY) + hyp["days_elapsed"] = len(hyp["picks_log"]) - if hyp["days_elapsed"] >= hyp["min_days"]: - return conclude_hypothesis(hyp) + if hyp["days_elapsed"] >= hyp["min_days"]: + return conclude_hypothesis(hyp) finally: run(["git", "worktree", "remove", "--force", worktree], check=False) @@ -171,11 +179,16 @@ def conclude_hypothesis(hyp: dict) -> bool: else: result = subprocess.run( [ - sys.executable, "scripts/compare_hypothesis.py", - "--hypothesis-id", hid, - "--picks-json", json.dumps(picks), - "--scanner", scanner, - "--db-path", str(DB_PATH), + sys.executable, + "scripts/compare_hypothesis.py", + "--hypothesis-id", + hid, + "--picks-json", + json.dumps(picks), + "--scanner", + scanner, + "--db-path", + str(DB_PATH), ], cwd=str(ROOT), capture_output=True, @@ -221,12 +234,14 @@ def conclude_hypothesis(hyp: dict) -> bool: if decision == "accepted": subprocess.run( ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], - cwd=str(ROOT), check=False, + cwd=str(ROOT), + check=False, ) else: subprocess.run( ["gh", "pr", "close", str(pr), "--delete-branch"], - cwd=str(ROOT), check=False, + cwd=str(ROOT), + check=False, ) hyp["status"] = "concluded" @@ -264,7 +279,8 @@ def main(): hypotheses = registry.get("hypotheses", []) running = [ - h for h in hypotheses + h + for h in hypotheses if h["status"] == "running" and (not filter_id or h["id"] == filter_id) ] From 5b87a56f310ea40a9e74c1d47dd4ba02eaf05878 Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 09:52:58 -0700 Subject: [PATCH 11/14] feat(hypotheses): add Hypotheses dashboard tab Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- tests/test_hypotheses_page.py | 73 ++++++++++++ tradingagents/ui/dashboard.py | 3 +- tradingagents/ui/pages/__init__.py | 7 ++ tradingagents/ui/pages/hypotheses.py | 171 +++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 1 deletion(-) 
create mode 100644 tests/test_hypotheses_page.py create mode 100644 tradingagents/ui/pages/hypotheses.py diff --git a/tests/test_hypotheses_page.py b/tests/test_hypotheses_page.py new file mode 100644 index 00000000..196f7cb5 --- /dev/null +++ b/tests/test_hypotheses_page.py @@ -0,0 +1,73 @@ +"""Tests for the hypotheses dashboard page data loading.""" +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from tradingagents.ui.pages.hypotheses import ( + load_active_hypotheses, + load_concluded_hypotheses, + days_until_ready, +) + + +def test_load_active_hypotheses(tmp_path): + active = { + "max_active": 5, + "hypotheses": [ + { + "id": "options_flow-test", + "title": "Test hypothesis", + "scanner": "options_flow", + "status": "running", + "priority": 7, + "days_elapsed": 5, + "min_days": 14, + "created_at": "2026-04-01", + "picks_log": ["2026-04-01"] * 5, + "conclusion": None, + } + ], + } + f = tmp_path / "active.json" + f.write_text(json.dumps(active)) + result = load_active_hypotheses(str(f)) + assert len(result) == 1 + assert result[0]["id"] == "options_flow-test" + + +def test_load_active_hypotheses_missing_file(tmp_path): + result = load_active_hypotheses(str(tmp_path / "missing.json")) + assert result == [] + + +def test_load_concluded_hypotheses(tmp_path): + doc = tmp_path / "2026-04-10-options_flow-test.md" + doc.write_text( + "# Hypothesis: Test\n\n" + "**Scanner:** options_flow\n" + "**Period:** 2026-03-27 → 2026-04-10 (14 days)\n" + "**Outcome:** accepted ✅\n" + ) + results = load_concluded_hypotheses(str(tmp_path)) + assert len(results) == 1 + assert results[0]["filename"] == doc.name + assert results[0]["outcome"] == "accepted ✅" + + +def test_load_concluded_hypotheses_empty_dir(tmp_path): + results = load_concluded_hypotheses(str(tmp_path)) + assert results == [] + + +def test_days_until_ready_has_days_left(): + hyp = {"days_elapsed": 5, "min_days": 14} + assert 
days_until_ready(hyp) == 9 + + +def test_days_until_ready_past_due(): + hyp = {"days_elapsed": 15, "min_days": 14} + assert days_until_ready(hyp) == 0 diff --git a/tradingagents/ui/dashboard.py b/tradingagents/ui/dashboard.py index bf6d88ea..2817ac62 100644 --- a/tradingagents/ui/dashboard.py +++ b/tradingagents/ui/dashboard.py @@ -52,7 +52,7 @@ def render_sidebar(): # Navigation page = st.radio( "Navigation", - options=["Overview", "Signals", "Portfolio", "Performance", "Config"], + options=["Overview", "Signals", "Portfolio", "Performance", "Hypotheses", "Config"], label_visibility="collapsed", ) @@ -116,6 +116,7 @@ def route_page(page): "Signals": pages.todays_picks, "Portfolio": pages.portfolio, "Performance": pages.performance, + "Hypotheses": pages.hypotheses, "Config": pages.settings, } module = page_map.get(page) diff --git a/tradingagents/ui/pages/__init__.py b/tradingagents/ui/pages/__init__.py index 22a16b20..da3547e4 100644 --- a/tradingagents/ui/pages/__init__.py +++ b/tradingagents/ui/pages/__init__.py @@ -39,6 +39,12 @@ except Exception as _e: _logger.error("Failed to import settings page: %s", _e, exc_info=True) settings = None +try: + from tradingagents.ui.pages import hypotheses +except Exception as _e: + _logger.error("Failed to import hypotheses page: %s", _e, exc_info=True) + hypotheses = None + __all__ = [ "home", @@ -46,4 +52,5 @@ __all__ = [ "portfolio", "performance", "settings", + "hypotheses", ] diff --git a/tradingagents/ui/pages/hypotheses.py b/tradingagents/ui/pages/hypotheses.py new file mode 100644 index 00000000..3492ccae --- /dev/null +++ b/tradingagents/ui/pages/hypotheses.py @@ -0,0 +1,171 @@ +""" +Hypotheses dashboard page — tracks active and concluded experiments. + +Reads docs/iterations/hypotheses/active.json and the concluded/ directory. +No external API calls; all data is file-based. 
"""
Hypotheses dashboard page — tracks active and concluded experiments.

Reads docs/iterations/hypotheses/active.json and the concluded/ directory.
No external API calls; all data is file-based.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List

import streamlit as st

from tradingagents.ui.theme import COLORS, page_header

# Repository root: this file lives at tradingagents/ui/pages/, four levels deep.
_REPO_ROOT = Path(__file__).parent.parent.parent.parent
_ACTIVE_JSON = _REPO_ROOT / "docs/iterations/hypotheses/active.json"
_CONCLUDED_DIR = _REPO_ROOT / "docs/iterations/hypotheses/concluded"


def load_active_hypotheses(active_path: str = str(_ACTIVE_JSON)) -> List[Dict[str, Any]]:
    """Load all hypotheses from active.json.

    Returns [] when the file is missing or unreadable/corrupt — the
    dashboard must never crash because of a bad registry file.
    """
    path = Path(active_path)
    if not path.exists():
        return []
    try:
        with open(path) as f:
            data = json.load(f)
        return data.get("hypotheses", [])
    except Exception:
        # Corrupt or partially-written JSON: degrade to an empty list.
        return []


def load_concluded_hypotheses(concluded_dir: str = str(_CONCLUDED_DIR)) -> List[Dict[str, Any]]:
    """
    Load concluded hypothesis metadata by parsing markdown files in concluded/.

    Extracts: filename, title, scanner, period, outcome. Files that cannot
    be read or parsed are skipped; missing fields fall back to placeholders.
    Results are sorted newest-first (filenames start with YYYY-MM-DD).
    """
    dir_path = Path(concluded_dir)
    if not dir_path.exists():
        return []
    results = []
    for md_file in sorted(dir_path.glob("*.md"), reverse=True):
        if md_file.name == ".gitkeep":
            continue
        try:
            text = md_file.read_text(encoding="utf-8")
            title = _extract_md_field(text, r"^# Hypothesis: (.+)$")
            scanner = _extract_md_field(text, r"^\*\*Scanner:\*\* (.+)$")
            period = _extract_md_field(text, r"^\*\*Period:\*\* (.+)$")
            outcome = _extract_md_field(text, r"^\*\*Outcome:\*\* (.+)$")
            results.append({
                "filename": md_file.name,
                "title": title or md_file.stem,
                "scanner": scanner or "—",
                "period": period or "—",
                "outcome": outcome or "—",
            })
        except Exception:
            # One unreadable doc must not hide the rest.
            continue
    return results


def _extract_md_field(text: str, pattern: str) -> str:
    """Extract a field value from a markdown line using a MULTILINE regex."""
    match = re.search(pattern, text, re.MULTILINE)
    return match.group(1).strip() if match else ""


def days_until_ready(hyp: Dict[str, Any]) -> int:
    """Return number of days remaining before hypothesis can conclude (min 0)."""
    return max(0, hyp.get("min_days", 14) - hyp.get("days_elapsed", 0))


def render() -> None:
    """Render the hypotheses tracking page: active table + concluded table."""
    st.markdown(
        page_header("Hypotheses", "Active experiments & concluded findings"),
        unsafe_allow_html=True,
    )

    hypotheses = load_active_hypotheses()
    concluded = load_concluded_hypotheses()

    if not hypotheses and not concluded:
        st.info(
            "No hypotheses yet. Run `/backtest-hypothesis \"<description>\"` to start an experiment."
        )
        return

    # pandas is only needed for table rendering; import lazily and once.
    import pandas as pd

    # Use .get so a malformed registry entry (missing "status") is ignored
    # rather than crashing the whole page with a KeyError.
    running = [h for h in hypotheses if h.get("status") == "running"]
    pending = [h for h in hypotheses if h.get("status") == "pending"]

    st.markdown(
        f'<div class="section-title">Active Experiments '
        f'<span class="accent">// {len(running)} running, {len(pending)} pending</span></div>',
        unsafe_allow_html=True,
    )

    if running or pending:
        active_rows = []
        # Highest priority first (priority is 1-10 per the registry spec).
        for h in sorted(running + pending, key=lambda x: -x.get("priority", 0)):
            days_left = days_until_ready(h)
            ready_str = "concluding soon" if days_left == 0 else f"{days_left}d left"
            active_rows.append({
                "ID": h.get("id", "—"),
                "Title": h.get("title", "—"),
                "Scanner": h.get("scanner", "—"),
                "Status": h.get("status", "—"),
                "Progress": f"{h.get('days_elapsed', 0)}/{h.get('min_days', 14)}d",
                "Picks": len(h.get("picks_log", [])),
                "Ready": ready_str,
                "Priority": h.get("priority", "—"),
            })
        df = pd.DataFrame(active_rows)
        st.dataframe(
            df,
            width="stretch",
            hide_index=True,
            column_config={
                "ID": st.column_config.TextColumn(width="medium"),
                "Title": st.column_config.TextColumn(width="large"),
                "Scanner": st.column_config.TextColumn(width="medium"),
                "Status": st.column_config.TextColumn(width="small"),
                "Progress": st.column_config.TextColumn(width="small"),
                "Picks": st.column_config.NumberColumn(format="%d", width="small"),
                "Ready": st.column_config.TextColumn(width="medium"),
                # Priority is on a 1-10 scale (see active.json spec).
                "Priority": st.column_config.NumberColumn(format="%d/10", width="small"),
            },
        )
    else:
        st.info("No active experiments.")

    st.markdown("<div style='height:1.5rem;'></div>", unsafe_allow_html=True)

    st.markdown(
        f'<div class="section-title">Concluded Experiments '
        f'<span class="accent">// {len(concluded)} total</span></div>',
        unsafe_allow_html=True,
    )

    if concluded:
        concluded_rows = []
        for c in concluded:
            outcome = c["outcome"]
            # Tri-state: a missing outcome ("—") must not render as rejected.
            if "accepted" in outcome:
                emoji = "✅"
            elif "rejected" in outcome:
                emoji = "❌"
            else:
                emoji = "—"
            concluded_rows.append({
                # Filenames are YYYY-MM-DD-<id>.md, so the first 10 chars
                # are the conclusion date.
                "Date": c["filename"][:10],
                "Title": c["title"],
                "Scanner": c["scanner"],
                "Period": c["period"],
                "Outcome": emoji,
            })
        cdf = pd.DataFrame(concluded_rows)
        st.dataframe(
            cdf,
            width="stretch",
            hide_index=True,
            column_config={
                "Date": st.column_config.TextColumn(width="small"),
                "Title": st.column_config.TextColumn(width="large"),
                "Scanner": st.column_config.TextColumn(width="medium"),
                "Period": st.column_config.TextColumn(width="medium"),
                "Outcome": st.column_config.TextColumn(width="small"),
            },
        )
    else:
        st.info("No concluded experiments yet.")
timedelta(days=7)).strftime("%Y-%m-%d") + cutoff = (datetime.utcnow() - timedelta(days=14)).strftime("%Y-%m-%d") for pick in picks: if pick.get("return_7d") is not None: continue diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index 6795d0b7..b15c02f6 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -19,6 +19,7 @@ Environment variables: import json import os +import re import subprocess import sys from datetime import datetime @@ -116,6 +117,10 @@ def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, pick def run_hypothesis(hyp: dict) -> bool: """Run one hypothesis experiment cycle. Returns True if the experiment concluded.""" hid = hyp["id"] + # Validate id to prevent path traversal in worktree path + if not re.fullmatch(r"[a-zA-Z0-9_\-]+", hid): + print(f" Skipping hypothesis with invalid id: {hid!r}", flush=True) + return False branch = hyp["branch"] scanner = hyp["scanner"] worktree = f"/tmp/hyp-{hid}" @@ -287,8 +292,12 @@ def main(): if not running: print("No running hypotheses to process.", flush=True) else: + run(["git", "worktree", "prune"], check=False) for hyp in running: - run_hypothesis(hyp) + try: + run_hypothesis(hyp) + except Exception as e: + print(f" Error processing {hyp['id']}: {e}", flush=True) promote_pending(registry) save_registry(registry) From 49175e3b0ad525b25b15a04e5f2918d9496f0f0a Mon Sep 17 00:00:00 2001 From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com> Date: Fri, 10 Apr 2026 10:52:00 -0700 Subject: [PATCH 13/14] feat(hypotheses): post conclusion as PR comment instead of auto-merging --- scripts/run_hypothesis_runner.py | 36 ++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py index b15c02f6..38340617 100644 --- a/scripts/run_hypothesis_runner.py +++ b/scripts/run_hypothesis_runner.py @@ -229,25 +229,35 @@ def 
conclude_hypothesis(hyp: dict) -> bool: f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n" f"## Decision\n{conclusion['reason']}\n\n" f"## Action\n" - f"{'Branch merged into main.' if decision == 'accepted' else 'Branch closed without merging.'}\n" + f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n" ) run(["git", "add", str(concluded_doc)], check=False) pr = hyp.get("pr_number") if pr: - if decision == "accepted": - subprocess.run( - ["gh", "pr", "merge", str(pr), "--squash", "--delete-branch"], - cwd=str(ROOT), - check=False, - ) - else: - subprocess.run( - ["gh", "pr", "close", str(pr), "--delete-branch"], - cwd=str(ROOT), - check=False, - ) + # Mark PR ready for review (removes draft status) and post conclusion as a comment. + # The PR is NOT merged or closed automatically — the user reviews and decides. + outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected" + comment = ( + f"**Hypothesis concluded: {outcome_emoji}**\n\n" + f"{conclusion['reason']}\n\n" + f"| Metric | Baseline | Experiment |\n" + f"|---|---|---|\n" + f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n" + f"{'Merge this PR to apply the change.' 
def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Claude to interpret the experiment results and provide richer context.

    Args:
        hyp: Hypothesis record from active.json (id, title, description,
            scanner, created_at, days_elapsed).
        conclusion: Comparison output with "hypothesis", "baseline",
            "decision" and "reason" keys.
        scanner_domain: Contents of the scanner's domain-knowledge markdown
            file, or "" when no such file exists.

    Returns:
        A markdown analysis string to embed in the concluded .md doc and the
        PR comment, or None if ANTHROPIC_API_KEY is not set, the anthropic
        SDK is not installed, or the API call fails.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        # LLM enrichment is strictly optional; no key means no analysis.
        return None

    try:
        import anthropic
    except ImportError:
        print("  anthropic SDK not installed, skipping LLM analysis", flush=True)
        return None

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {hyp.get("title", "")}
**Description:** {hyp.get("description", hyp.get("title", ""))}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {base_metrics.get("win_rate") or "—"}% | {hyp_metrics.get("win_rate") or "—"}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
| Avg 7d return | {base_metrics.get("avg_return") or "—"}% | {hyp_metrics.get("avg_return") or "—"}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", "—"))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))} | — |

## Scanner Domain Knowledge
{scanner_domain}

---

Provide a concise analysis (3–5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    try:
        client = anthropic.Anthropic(api_key=api_key)
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        # Explicitly treat an empty content list as "no analysis" rather than
        # letting IndexError fall through to the broad except below.
        if not message.content:
            return None
        return message.content[0].text.strip()
    except Exception as e:
        # Any API failure degrades gracefully — the conclusion still stands.
        print(f"  LLM analysis failed: {e}", flush=True)
        return None
if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n" ) @@ -239,13 +318,15 @@ def conclude_hypothesis(hyp: dict) -> bool: # Mark PR ready for review (removes draft status) and post conclusion as a comment. # The PR is NOT merged or closed automatically — the user reviews and decides. outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected" + analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else "" comment = ( f"**Hypothesis concluded: {outcome_emoji}**\n\n" f"{conclusion['reason']}\n\n" f"| Metric | Baseline | Experiment |\n" f"|---|---|---|\n" f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n" - f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n\n" + f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n" + f"{analysis_block}\n\n" f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}" ) subprocess.run(