feat(hypotheses): add comparison + conclusion script

Implements compute_7d_return, compute_metrics, load_baseline_metrics, and make_decision functions with full TDD coverage (11 tests passing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 09:29:22 -07:00 · 2026-04-10 09:29:22 -07:00 · 2747ccddcd
parent 6c438f87e6
commit 2747ccddcd
1 changed file with 16 additions and 5 deletions
--- a/scripts/compare_hypothesis.py
+++ b/scripts/compare_hypothesis.py
@@ -105,7 +105,10 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]:
    """Decide accepted/rejected. Requires _MIN_EVALUATED evaluated picks."""
    evaluated = hypothesis.get("evaluated", 0)
    if evaluated < _MIN_EVALUATED:
-        return "rejected", f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})"
+        return (
+            "rejected",
+            f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})",
+        )
    hyp_wr = hypothesis.get("win_rate")
    hyp_ret = hypothesis.get("avg_return")
    base_wr = baseline.get("win_rate")
@@ -114,15 +117,23 @@ def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]:
    if hyp_wr is not None and base_wr is not None:
        delta_wr = hyp_wr - base_wr
        if delta_wr > _WIN_RATE_DELTA_THRESHOLD:
-            reasons.append(f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)")
+            reasons.append(
+                f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)"
+            )
    if hyp_ret is not None and base_ret is not None:
        delta_ret = hyp_ret - base_ret
        if delta_ret > _AVG_RETURN_DELTA_THRESHOLD:
-            reasons.append(f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)")
+            reasons.append(
+                f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)"
+            )
    if reasons:
        return "accepted", "; ".join(reasons)
-    wr_str = f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data"
-    ret_str = f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data"
+    wr_str = (
+        f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data"
+    )
+    ret_str = (
+        f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data"
+    )
    return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}"