diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/compare_hypothesis.py b/scripts/compare_hypothesis.py new file mode 100644 index 00000000..991f5baf --- /dev/null +++ b/scripts/compare_hypothesis.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Hypothesis comparison — computes 7d returns for hypothesis picks and +compares them against the baseline scanner in performance_database.json. + +Usage (called by hypothesis-runner.yml after min_days elapsed): + python scripts/compare_hypothesis.py \ + --hypothesis-id options_flow-scan-3-expirations \ + --picks-json '[{"date": "2026-04-01", "ticker": "AAPL", ...}]' \ + --scanner options_flow \ + --db-path data/recommendations/performance_database.json + +Prints a JSON conclusion to stdout. +""" + +import argparse +import json +import sys +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional, Tuple + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from tradingagents.dataflows.y_finance import download_history + +_MIN_EVALUATED = 5 +_WIN_RATE_DELTA_THRESHOLD = 5.0 +_AVG_RETURN_DELTA_THRESHOLD = 1.0 + + +def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]: + """Fetch 7-day return for a pick using yfinance. 
Returns (pct, is_win) or (None, None).""" + try: + entry_dt = datetime.strptime(pick_date, "%Y-%m-%d") + exit_dt = entry_dt + timedelta(days=10) + df = download_history( + ticker, + start=entry_dt.strftime("%Y-%m-%d"), + end=exit_dt.strftime("%Y-%m-%d"), + ) + if df.empty or len(df) < 2: + return None, None + close = df["Close"] + entry_price = float(close.iloc[0]) + exit_idx = min(5, len(close) - 1) + exit_price = float(close.iloc[exit_idx]) + if entry_price <= 0: + return None, None + ret = (exit_price - entry_price) / entry_price * 100 + return round(ret, 4), ret > 0 + except Exception: + return None, None + + +def enrich_picks_with_returns(picks: list) -> list: + """Compute 7d return for each pick >= 7 days old that lacks return_7d.""" + cutoff = (datetime.utcnow() - timedelta(days=7)).strftime("%Y-%m-%d") + for pick in picks: + if pick.get("return_7d") is not None: + continue + if pick.get("date", "9999-99-99") > cutoff: + continue + ret, win = compute_7d_return(pick["ticker"], pick["date"]) + pick["return_7d"] = ret + pick["win_7d"] = win + return picks + + +def compute_metrics(picks: list) -> dict: + """Compute win rate and avg return. 
Only picks with non-None return_7d are evaluated.""" + evaluated = [p for p in picks if p.get("return_7d") is not None] + if not evaluated: + return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None} + wins = sum(1 for p in evaluated if p.get("win_7d")) + avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated) + return { + "count": len(picks), + "evaluated": len(evaluated), + "win_rate": round(wins / len(evaluated) * 100, 1), + "avg_return": round(avg_ret, 2), + } + + +def load_baseline_metrics(scanner: str, db_path: str) -> dict: + """Load baseline metrics for a scanner from performance_database.json.""" + path = Path(db_path) + if not path.exists(): + return {"count": 0, "win_rate": None, "avg_return": None} + try: + with open(path) as f: + db = json.load(f) + except Exception: + return {"count": 0, "win_rate": None, "avg_return": None} + picks = [] + for recs in db.get("recommendations_by_date", {}).values(): + for rec in (recs if isinstance(recs, list) else []): + if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None: + picks.append(rec) + return compute_metrics(picks) + + +def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]: + """Decide accepted/rejected. 
Requires _MIN_EVALUATED evaluated picks.""" + evaluated = hypothesis.get("evaluated", 0) + if evaluated < _MIN_EVALUATED: + return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})" + hyp_wr = hypothesis.get("win_rate") + hyp_ret = hypothesis.get("avg_return") + base_wr = baseline.get("win_rate") + base_ret = baseline.get("avg_return") + reasons = [] + if hyp_wr is not None and base_wr is not None: + delta_wr = hyp_wr - base_wr + if delta_wr > _WIN_RATE_DELTA_THRESHOLD: + reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)") + if hyp_ret is not None and base_ret is not None: + delta_ret = hyp_ret - base_ret + if delta_ret > _AVG_RETURN_DELTA_THRESHOLD: + reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)") + if reasons: + return "accepted", "; ".join(reasons) + wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None and base_wr is not None else "no win rate data" + ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None and base_ret is not None else "no return data" + return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--hypothesis-id", required=True) + parser.add_argument("--picks-json", required=True) + parser.add_argument("--scanner", required=True) + parser.add_argument("--db-path", default="data/recommendations/performance_database.json") + args = parser.parse_args() + picks = json.loads(args.picks_json) + picks = enrich_picks_with_returns(picks) + hyp_metrics = compute_metrics(picks) + base_metrics = load_baseline_metrics(args.scanner, args.db_path) + decision, reason = make_decision(hyp_metrics, base_metrics) + result = { + "hypothesis_id": args.hypothesis_id, + "decision": decision, + "reason": reason, + "hypothesis": hyp_metrics, + "baseline": base_metrics, + "enriched_picks": picks, + } + print(json.dumps(result, indent=2)) + 
+ +if __name__ == "__main__": + main() diff --git a/tests/test_compare_hypothesis.py b/tests/test_compare_hypothesis.py new file mode 100644 index 00000000..2cf41609 --- /dev/null +++ b/tests/test_compare_hypothesis.py @@ -0,0 +1,135 @@ +"""Tests for the hypothesis comparison script.""" +import json +import sys +from datetime import date, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.compare_hypothesis import ( + compute_metrics, + compute_7d_return, + load_baseline_metrics, + make_decision, +) + + +# ── compute_metrics ────────────────────────────────────────────────────────── + +def test_compute_metrics_empty(): + result = compute_metrics([]) + assert result == {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None} + + +def test_compute_metrics_all_wins(): + picks = [ + {"return_7d": 5.0, "win_7d": True}, + {"return_7d": 3.0, "win_7d": True}, + ] + result = compute_metrics(picks) + assert result["win_rate"] == 100.0 + assert result["avg_return"] == 4.0 + assert result["evaluated"] == 2 + + +def test_compute_metrics_mixed(): + picks = [ + {"return_7d": 10.0, "win_7d": True}, + {"return_7d": -5.0, "win_7d": False}, + {"return_7d": None, "win_7d": None}, # pending — excluded + ] + result = compute_metrics(picks) + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 2.5 + assert result["evaluated"] == 2 + assert result["count"] == 3 + + +# ── compute_7d_return ──────────────────────────────────────────────────────── + +def test_compute_7d_return_positive(): + import pandas as pd + + close_data = [100.0, 101.0, 102.0, 103.0, 104.0, 110.0] + mock_df = pd.DataFrame({"Close": close_data}) + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret == pytest.approx(10.0, rel=0.01) + assert win is True + + +def 
test_compute_7d_return_empty_data(): + import pandas as pd + + mock_df = pd.DataFrame() + + with patch("scripts.compare_hypothesis.download_history", return_value=mock_df): + ret, win = compute_7d_return("AAPL", "2026-03-01") + + assert ret is None + assert win is None + + +# ── load_baseline_metrics ──────────────────────────────────────────────────── + +def test_load_baseline_metrics(tmp_path): + db = { + "recommendations_by_date": { + "2026-03-01": [ + {"strategy_match": "options_flow", "return_7d": 5.0, "win_7d": True}, + {"strategy_match": "options_flow", "return_7d": -2.0, "win_7d": False}, + {"strategy_match": "reddit_dd", "return_7d": 3.0, "win_7d": True}, + ] + } + } + db_file = tmp_path / "performance_database.json" + db_file.write_text(json.dumps(db)) + + result = load_baseline_metrics("options_flow", str(db_file)) + + assert result["win_rate"] == 50.0 + assert result["avg_return"] == 1.5 + assert result["count"] == 2 + + +def test_load_baseline_metrics_missing_file(tmp_path): + result = load_baseline_metrics("options_flow", str(tmp_path / "missing.json")) + assert result == {"count": 0, "win_rate": None, "avg_return": None} + + +# ── make_decision ───────────────────────────────────────────────────────────── + +def test_make_decision_accepted_by_win_rate(): + hyp = {"win_rate": 60.0, "avg_return": 0.5, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 0.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "win rate" in reason.lower() + + +def test_make_decision_accepted_by_return(): + hyp = {"win_rate": 52.0, "avg_return": 3.0, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.5} + decision, reason = make_decision(hyp, baseline) + assert decision == "accepted" + assert "return" in reason.lower() + + +def test_make_decision_rejected(): + hyp = {"win_rate": 48.0, "avg_return": 0.2, "evaluated": 10} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, 
baseline) + assert decision == "rejected" + + +def test_make_decision_insufficient_data(): + hyp = {"win_rate": 80.0, "avg_return": 5.0, "evaluated": 2} + baseline = {"win_rate": 50.0, "avg_return": 1.0} + decision, reason = make_decision(hyp, baseline) + assert decision == "rejected" + assert "insufficient" in reason.lower()