feat(hypotheses): add comparison + conclusion script
Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d3065f59f1
commit
6c438f87e6
|
|
@ -0,0 +1,153 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hypothesis comparison — computes 7d returns for hypothesis picks and
|
||||
compares them against the baseline scanner in performance_database.json.
|
||||
|
||||
Usage (called by hypothesis-runner.yml after min_days elapsed):
|
||||
python scripts/compare_hypothesis.py \
|
||||
--hypothesis-id options_flow-scan-3-expirations \
|
||||
--picks-json '[{"date": "2026-04-01", "ticker": "AAPL", ...}]' \
|
||||
--scanner options_flow \
|
||||
--db-path data/recommendations/performance_database.json
|
||||
|
||||
Prints a JSON conclusion to stdout.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from tradingagents.dataflows.y_finance import download_history
|
||||
|
||||
_MIN_EVALUATED = 5
|
||||
_WIN_RATE_DELTA_THRESHOLD = 5.0
|
||||
_AVG_RETURN_DELTA_THRESHOLD = 1.0
|
||||
|
||||
|
||||
def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]:
    """Fetch 7-day return for a pick using yfinance. Returns (pct, is_win) or (None, None).

    The exit bar is the 6th trading row after entry (index 5) when available,
    otherwise the last row inside the fetched window. Any fetch/parse failure
    degrades to (None, None) so the pick simply stays "pending".
    """
    try:
        start_dt = datetime.strptime(pick_date, "%Y-%m-%d")
        # Fetch a 10-calendar-day window so weekends/holidays still leave
        # enough trading days to reach the ~7-day exit bar.
        window_end = start_dt + timedelta(days=10)
        history = download_history(
            ticker,
            start=start_dt.strftime("%Y-%m-%d"),
            end=window_end.strftime("%Y-%m-%d"),
        )
        if history.empty or len(history) < 2:
            return None, None
        closes = history["Close"]
        first_price = float(closes.iloc[0])
        if first_price <= 0:
            # Guard against bad data producing a division by zero/nonsense pct.
            return None, None
        exit_row = min(5, len(closes) - 1)
        final_price = float(closes.iloc[exit_row])
        pct_change = (final_price - first_price) / first_price * 100
        return round(pct_change, 4), pct_change > 0
    except Exception:
        # Best-effort: network/data problems must not fail the whole run.
        return None, None
|
||||
|
||||
|
||||
def enrich_picks_with_returns(picks: list) -> list:
    """Compute 7d return for each pick >= 7 days old that lacks return_7d.

    Mutates the pick dicts in place (sets ``return_7d`` and ``win_7d``) and
    returns the same list for convenience. Picks evaluated earlier, or newer
    than 7 days, are left untouched.
    """
    from datetime import timezone  # local import: keeps the fix self-contained

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # produces the identical %Y-%m-%d string.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=7)).strftime("%Y-%m-%d")
    for pick in picks:
        if pick.get("return_7d") is not None:
            continue  # already evaluated — don't refetch
        # A missing date sorts after any real date, so such picks are skipped.
        if pick.get("date", "9999-99-99") > cutoff:
            continue  # too recent: the 7-day window has not elapsed yet
        ret, win = compute_7d_return(pick["ticker"], pick["date"])
        pick["return_7d"] = ret
        pick["win_7d"] = win
    return picks
|
||||
|
||||
|
||||
def compute_metrics(picks: list) -> dict:
    """Compute win rate and avg return. Only picks with non-None return_7d are evaluated."""
    scored = [pick for pick in picks if pick.get("return_7d") is not None]
    total = len(picks)
    if not scored:
        # Nothing evaluated yet — the rates are undefined, not zero.
        return {"count": total, "evaluated": 0, "win_rate": None, "avg_return": None}
    n_scored = len(scored)
    win_count = sum(1 for pick in scored if pick.get("win_7d"))
    mean_return = sum(pick["return_7d"] for pick in scored) / n_scored
    return {
        "count": total,
        "evaluated": n_scored,
        "win_rate": round(win_count / n_scored * 100, 1),
        "avg_return": round(mean_return, 2),
    }
|
||||
|
||||
|
||||
def load_baseline_metrics(scanner: str, db_path: str) -> dict:
    """Load baseline metrics for a scanner from performance_database.json.

    Only recommendations whose ``strategy_match`` equals *scanner* and that
    already carry a ``return_7d`` contribute. A missing or unreadable DB
    yields empty metrics rather than raising.
    """
    db_file = Path(db_path)
    if not db_file.exists():
        return {"count": 0, "win_rate": None, "avg_return": None}
    try:
        db = json.loads(db_file.read_text())
    except Exception:
        # Corrupt/partial JSON is treated the same as an absent database.
        return {"count": 0, "win_rate": None, "avg_return": None}
    matching = [
        rec
        for day_recs in db.get("recommendations_by_date", {}).values()
        if isinstance(day_recs, list)  # tolerate malformed non-list entries
        for rec in day_recs
        if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None
    ]
    return compute_metrics(matching)
|
||||
|
||||
|
||||
def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]:
    """Decide accepted/rejected. Requires _MIN_EVALUATED evaluated picks.

    Accepts when the hypothesis beats the baseline by more than
    _WIN_RATE_DELTA_THRESHOLD win-rate percentage points or more than
    _AVG_RETURN_DELTA_THRESHOLD average-return points; otherwise rejects
    with an explanatory message.
    """
    evaluated = hypothesis.get("evaluated", 0)
    if evaluated < _MIN_EVALUATED:
        return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})"
    hyp_wr = hypothesis.get("win_rate")
    hyp_ret = hypothesis.get("avg_return")
    base_wr = baseline.get("win_rate")
    base_ret = baseline.get("avg_return")
    reasons = []
    if hyp_wr is not None and base_wr is not None:
        delta_wr = hyp_wr - base_wr
        if delta_wr > _WIN_RATE_DELTA_THRESHOLD:
            reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)")
    if hyp_ret is not None and base_ret is not None:
        delta_ret = hyp_ret - base_ret
        if delta_ret > _AVG_RETURN_DELTA_THRESHOLD:
            reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)")
    if reasons:
        return "accepted", "; ".join(reasons)
    # BUG FIX: the previous code formatted base_wr/base_ret with :.1f/:+.2f
    # whenever the hypothesis side was present, raising TypeError when the
    # baseline had no data (win_rate/avg_return == None). Guard both sides.
    if hyp_wr is not None and base_wr is not None:
        wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%"
    elif hyp_wr is not None:
        wr_str = f"{hyp_wr:.1f}% vs baseline n/a"
    else:
        wr_str = "no win rate data"
    if hyp_ret is not None and base_ret is not None:
        ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%"
    elif hyp_ret is not None:
        ret_str = f"{hyp_ret:+.2f}% vs baseline n/a"
    else:
        ret_str = "no return data"
    return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}"
|
||||
|
||||
|
||||
def main():
    """CLI entry point: evaluate hypothesis picks vs the baseline and print JSON."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--hypothesis-id", required=True)
    arg_parser.add_argument("--picks-json", required=True)
    arg_parser.add_argument("--scanner", required=True)
    arg_parser.add_argument("--db-path", default="data/recommendations/performance_database.json")
    opts = arg_parser.parse_args()

    # Fill in 7d returns for any picks old enough to be evaluated.
    enriched = enrich_picks_with_returns(json.loads(opts.picks_json))
    hypothesis_metrics = compute_metrics(enriched)
    baseline_metrics = load_baseline_metrics(opts.scanner, opts.db_path)
    verdict, explanation = make_decision(hypothesis_metrics, baseline_metrics)

    # The workflow consumes this JSON from stdout.
    print(json.dumps(
        {
            "hypothesis_id": opts.hypothesis_id,
            "decision": verdict,
            "reason": explanation,
            "hypothesis": hypothesis_metrics,
            "baseline": baseline_metrics,
            "enriched_picks": enriched,
        },
        indent=2,
    ))


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
"""Tests for the hypothesis comparison script."""
|
||||
import json
|
||||
import sys
|
||||
from datetime import date, timedelta
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.compare_hypothesis import (
|
||||
compute_metrics,
|
||||
compute_7d_return,
|
||||
load_baseline_metrics,
|
||||
make_decision,
|
||||
)
|
||||
|
||||
|
||||
# ── compute_metrics ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_compute_metrics_empty():
    """No picks at all → zero counts and undefined rates."""
    assert compute_metrics([]) == {
        "count": 0,
        "evaluated": 0,
        "win_rate": None,
        "avg_return": None,
    }
|
||||
|
||||
|
||||
def test_compute_metrics_all_wins():
    """Two winning picks → 100% win rate and the mean of their returns."""
    winners = [
        {"return_7d": 5.0, "win_7d": True},
        {"return_7d": 3.0, "win_7d": True},
    ]
    metrics = compute_metrics(winners)
    assert metrics["evaluated"] == 2
    assert metrics["win_rate"] == 100.0
    assert metrics["avg_return"] == 4.0
|
||||
|
||||
|
||||
def test_compute_metrics_mixed():
    """Pending picks (return_7d is None) count toward `count` but not `evaluated`."""
    sample = [
        {"return_7d": 10.0, "win_7d": True},
        {"return_7d": -5.0, "win_7d": False},
        {"return_7d": None, "win_7d": None},  # still pending
    ]
    metrics = compute_metrics(sample)
    assert metrics["count"] == 3
    assert metrics["evaluated"] == 2
    assert metrics["win_rate"] == 50.0
    assert metrics["avg_return"] == 2.5
|
||||
|
||||
|
||||
# ── compute_7d_return ────────────────────────────────────────────────────────
|
||||
|
||||
def test_compute_7d_return_positive():
    """Six bars of history → exit at index 5: +10% from 100 to 110."""
    import pandas as pd

    frame = pd.DataFrame({"Close": [100.0, 101.0, 102.0, 103.0, 104.0, 110.0]})
    with patch("scripts.compare_hypothesis.download_history", return_value=frame):
        pct, is_win = compute_7d_return("AAPL", "2026-03-01")

    assert pct == pytest.approx(10.0, rel=0.01)
    assert is_win is True
|
||||
|
||||
|
||||
def test_compute_7d_return_empty_data():
    """An empty DataFrame from yfinance degrades to (None, None)."""
    import pandas as pd

    with patch("scripts.compare_hypothesis.download_history", return_value=pd.DataFrame()):
        pct, is_win = compute_7d_return("AAPL", "2026-03-01")

    assert pct is None
    assert is_win is None
|
||||
|
||||
|
||||
# ── load_baseline_metrics ────────────────────────────────────────────────────
|
||||
|
||||
def test_load_baseline_metrics(tmp_path):
    """Only records matching the scanner (and carrying a return) are aggregated."""
    records = {
        "recommendations_by_date": {
            "2026-03-01": [
                {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True},
                {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False},
                {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True},
            ]
        }
    }
    db_path = tmp_path / "performance_database.json"
    db_path.write_text(json.dumps(records))

    metrics = load_baseline_metrics("options_flow", str(db_path))

    assert metrics["count"] == 2
    assert metrics["win_rate"] == 50.0
    assert metrics["avg_return"] == 1.5
|
||||
|
||||
|
||||
def test_load_baseline_metrics_missing_file(tmp_path):
    """A nonexistent DB path yields empty metrics instead of raising."""
    metrics = load_baseline_metrics("options_flow", str(tmp_path / "missing.json"))
    assert metrics == {"count": 0, "win_rate": None, "avg_return": None}
|
||||
|
||||
|
||||
# ── make_decision ─────────────────────────────────────────────────────────────
|
||||
|
||||
def test_make_decision_accepted_by_win_rate():
    """A >5pp win-rate edge over the baseline is enough to accept."""
    decision, reason = make_decision(
        {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10},
        {"win_rate": 50.0, "avg_return": 0.5},
    )
    assert decision == "accepted"
    assert "win rate" in reason.lower()
|
||||
|
||||
|
||||
def test_make_decision_accepted_by_return():
    """A >1pt average-return edge accepts even with a near-flat win rate."""
    decision, reason = make_decision(
        {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10},
        {"win_rate": 50.0, "avg_return": 1.5},
    )
    assert decision == "accepted"
    assert "return" in reason.lower()
|
||||
|
||||
|
||||
def test_make_decision_rejected():
    """Under both improvement thresholds → rejected."""
    decision, _reason = make_decision(
        {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10},
        {"win_rate": 50.0, "avg_return": 1.0},
    )
    assert decision == "rejected"
|
||||
|
||||
|
||||
def test_make_decision_insufficient_data():
    """Strong numbers still reject when too few picks have been evaluated."""
    decision, reason = make_decision(
        {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2},
        {"win_rate": 50.0, "avg_return": 1.0},
    )
    assert decision == "rejected"
    assert "insufficient" in reason.lower()
|
||||
Loading…
Reference in New Issue