TradingAgents/scripts/compare_hypothesis.py

#!/usr/bin/env python3
"""
Hypothesis comparison — computes 7d returns for hypothesis picks and
compares them against the baseline scanner in performance_database.json.

Usage (called by hypothesis-runner.yml after min_days elapsed):
    python scripts/compare_hypothesis.py \
        --hypothesis-id options_flow-scan-3-expirations \
        --picks-json '[{"date": "2026-04-01", "ticker": "AAPL", ...}]' \
        --scanner options_flow \
        --db-path data/recommendations/performance_database.json

Prints a JSON conclusion to stdout.
"""

import argparse
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Tuple

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from tradingagents.dataflows.y_finance import download_history

_MIN_EVALUATED = 5
_WIN_RATE_DELTA_THRESHOLD = 5.0
_AVG_RETURN_DELTA_THRESHOLD = 1.0


def compute_7d_return(ticker: str, pick_date: str) -> Tuple[Optional[float], Optional[bool]]:
    """Fetch 7-day return for a pick using yfinance. Returns (pct, is_win) or (None, None)."""
    try:
        entry_dt = datetime.strptime(pick_date, "%Y-%m-%d")
        exit_dt = entry_dt + timedelta(days=10)
        df = download_history(
            ticker,
            start=entry_dt.strftime("%Y-%m-%d"),
            end=exit_dt.strftime("%Y-%m-%d"),
        )
        if df.empty or len(df) < 2:
            return None, None
        close = df["Close"]
        entry_price = float(close.iloc[0])
        exit_idx = min(6, len(close) - 1)
        exit_price = float(close.iloc[exit_idx])
        if entry_price <= 0:
            return None, None
        ret = (exit_price - entry_price) / entry_price * 100
        return round(ret, 4), ret > 0
    except Exception:
        return None, None


def enrich_picks_with_returns(picks: list) -> list:
    """Compute 7d return for each pick >= 7 days old that lacks return_7d."""
    cutoff = (datetime.utcnow() - timedelta(days=14)).strftime("%Y-%m-%d")
    for pick in picks:
        if pick.get("return_7d") is not None:
            continue
        if pick.get("date", "9999-99-99") > cutoff:
            continue
        ret, win = compute_7d_return(pick["ticker"], pick["date"])
        pick["return_7d"] = ret
        pick["win_7d"] = win
    return picks


def compute_metrics(picks: list) -> dict:
    """Compute win rate and avg return. Only picks with non-None return_7d are evaluated."""
    evaluated = [p for p in picks if p.get("return_7d") is not None]
    if not evaluated:
        return {"count": len(picks), "evaluated": 0, "win_rate": None, "avg_return": None}
    wins = sum(1 for p in evaluated if p.get("win_7d"))
    avg_ret = sum(p["return_7d"] for p in evaluated) / len(evaluated)
    return {
        "count": len(picks),
        "evaluated": len(evaluated),
        "win_rate": round(wins / len(evaluated) * 100, 1),
        "avg_return": round(avg_ret, 2),
    }


def load_baseline_metrics(scanner: str, db_path: str) -> dict:
    """Load baseline metrics for a scanner from performance_database.json."""
    path = Path(db_path)
    if not path.exists():
        return {"count": 0, "win_rate": None, "avg_return": None}
    try:
        with open(path) as f:
            db = json.load(f)
    except Exception:
        return {"count": 0, "win_rate": None, "avg_return": None}
    picks = []
    for recs in db.get("recommendations_by_date", {}).values():
        for rec in (recs if isinstance(recs, list) else []):
            if rec.get("strategy_match") == scanner and rec.get("return_7d") is not None:
                picks.append(rec)
    return compute_metrics(picks)


def make_decision(hypothesis: dict, baseline: dict) -> Tuple[str, str]:
    """Decide accepted/rejected. Requires _MIN_EVALUATED evaluated picks."""
    evaluated = hypothesis.get("evaluated", 0)
    if evaluated < _MIN_EVALUATED:
        return (
            "rejected",
            f"Insufficient data: only {evaluated} evaluated picks (need {_MIN_EVALUATED})",
        )
    hyp_wr = hypothesis.get("win_rate")
    hyp_ret = hypothesis.get("avg_return")
    base_wr = baseline.get("win_rate")
    base_ret = baseline.get("avg_return")
    reasons = []
    if hyp_wr is not None and base_wr is not None:
        delta_wr = hyp_wr - base_wr
        if delta_wr > _WIN_RATE_DELTA_THRESHOLD:
            reasons.append(
                f"win rate improved by {delta_wr:+.1f}pp ({base_wr:.1f}% → {hyp_wr:.1f}%)"
            )
    if hyp_ret is not None and base_ret is not None:
        delta_ret = hyp_ret - base_ret
        if delta_ret > _AVG_RETURN_DELTA_THRESHOLD:
            reasons.append(
                f"avg return improved by {delta_ret:+.2f}% ({base_ret:+.2f}% → {hyp_ret:+.2f}%)"
            )
    if reasons:
        return "accepted", "; ".join(reasons)
    wr_str = (
        f"{hyp_wr:.1f}% vs baseline {base_wr:.1f}%" if hyp_wr is not None else "no win rate data"
    )
    ret_str = (
        f"{hyp_ret:+.2f}% vs baseline {base_ret:+.2f}%" if hyp_ret is not None else "no return data"
    )
    return "rejected", f"No significant improvement — win rate: {wr_str}; avg return: {ret_str}"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hypothesis-id", required=True)
    parser.add_argument("--picks-json", required=True)
    parser.add_argument("--scanner", required=True)
    parser.add_argument("--db-path", default="data/recommendations/performance_database.json")
    args = parser.parse_args()
    picks = json.loads(args.picks_json)
    picks = enrich_picks_with_returns(picks)
    hyp_metrics = compute_metrics(picks)
    base_metrics = load_baseline_metrics(args.scanner, args.db_path)
    decision, reason = make_decision(hyp_metrics, base_metrics)
    result = {
        "hypothesis_id": args.hypothesis_id,
        "decision": decision,
        "reason": reason,
        "hypothesis": hyp_metrics,
        "baseline": base_metrics,
        "enriched_picks": picks,
    }
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()