TradingAgents/scripts/run_hypothesis_runner.py

#!/usr/bin/env python3
"""
Hypothesis Runner — orchestrates daily experiment cycles.

For each running hypothesis in active.json:
  1. Creates a git worktree for the hypothesis branch
  2. Runs the daily discovery pipeline in that worktree
  3. Extracts picks from the discovery result, appends to picks.json
  4. Commits and pushes picks to hypothesis branch
  5. Removes worktree
  6. Updates active.json (days_elapsed, picks_log)
  7. If days_elapsed >= min_days: concludes the hypothesis

After all hypotheses: promotes highest-priority pending → running if a slot opened.

Environment variables:
  FILTER_ID — if set, only run the hypothesis with this ID
"""

import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

ACTIVE_JSON = ROOT / "docs/iterations/hypotheses/active.json"
CONCLUDED_DIR = ROOT / "docs/iterations/hypotheses/concluded"
DB_PATH = ROOT / "data/recommendations/performance_database.json"
TODAY = datetime.utcnow().strftime("%Y-%m-%d")


def load_registry() -> dict:
    with open(ACTIVE_JSON) as f:
        return json.load(f)


def save_registry(registry: dict) -> None:
    with open(ACTIVE_JSON, "w") as f:
        json.dump(registry, f, indent=2)


def run(cmd: list, cwd: str = None, check: bool = True) -> subprocess.CompletedProcess:
    print(f"  $ {' '.join(cmd)}", flush=True)
    return subprocess.run(cmd, cwd=cwd or str(ROOT), check=check, capture_output=False)


def extract_picks(worktree: str, scanner: str) -> list:
    """Extract picks for the given scanner from the most recent discovery result in the worktree."""
    results_dir = Path(worktree) / "results" / "discovery" / TODAY
    if not results_dir.exists():
        print(f"    No discovery results for {TODAY} in worktree", flush=True)
        return []
    picks = []
    for run_dir in sorted(results_dir.iterdir()):
        result_file = run_dir / "discovery_result.json"
        if not result_file.exists():
            continue
        try:
            with open(result_file) as f:
                data = json.load(f)
            for item in data.get("final_ranking", []):
                if item.get("strategy_match") == scanner:
                    picks.append(
                        {
                            "date": TODAY,
                            "ticker": item["ticker"],
                            "score": item.get("final_score"),
                            "confidence": item.get("confidence"),
                            "scanner": scanner,
                            "return_7d": None,
                            "win_7d": None,
                        }
                    )
        except Exception as e:
            print(f"    Warning: could not read {result_file}: {e}", flush=True)
    return picks


def load_picks_from_branch(hypothesis_id: str, branch: str) -> list:
    """Load picks.json from the hypothesis branch using git show."""
    picks_path = f"docs/iterations/hypotheses/{hypothesis_id}/picks.json"
    result = subprocess.run(
        ["git", "show", f"{branch}:{picks_path}"],
        cwd=str(ROOT),
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return []
    try:
        return json.loads(result.stdout).get("picks", [])
    except Exception:
        return []


def save_picks_to_worktree(worktree: str, hypothesis_id: str, scanner: str, picks: list) -> None:
    """Write updated picks.json into the worktree and commit."""
    picks_dir = Path(worktree) / "docs" / "iterations" / "hypotheses" / hypothesis_id
    picks_dir.mkdir(parents=True, exist_ok=True)
    picks_file = picks_dir / "picks.json"
    payload = {"hypothesis_id": hypothesis_id, "scanner": scanner, "picks": picks}
    picks_file.write_text(json.dumps(payload, indent=2))
    run(["git", "add", str(picks_file)], cwd=worktree)
    result = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=worktree)
    if result.returncode != 0:
        run(
            ["git", "commit", "-m", f"chore(hypotheses): picks {TODAY} for {hypothesis_id}"],
            cwd=worktree,
        )


def run_hypothesis(hyp: dict) -> bool:
    """Run one hypothesis experiment cycle. Returns True if the experiment concluded."""
    hid = hyp["id"]
    # Validate id to prevent path traversal in worktree path
    if not re.fullmatch(r"[a-zA-Z0-9_\-]+", hid):
        print(f"  Skipping hypothesis with invalid id: {hid!r}", flush=True)
        return False
    branch = hyp["branch"]
    scanner = hyp["scanner"]
    worktree = f"/tmp/hyp-{hid}"

    print(f"\n── Hypothesis: {hid} ──", flush=True)

    run(["git", "fetch", "origin", branch], check=False)
    run(["git", "worktree", "add", worktree, branch])

    # Symlink .env from main repo into worktree so load_dotenv() finds it locally.
    # In CI, secrets are env vars already — the symlink is a no-op there.
    env_src = ROOT / ".env"
    env_dst = Path(worktree) / ".env"
    if env_src.exists() and not env_dst.exists():
        env_dst.symlink_to(env_src)

    try:
        result = subprocess.run(
            [
                sys.executable,
                "scripts/run_daily_discovery.py",
                "--date",
                TODAY,
                "--no-update-positions",
            ],
            cwd=worktree,
            check=False,
        )
        if result.returncode != 0:
            print(f"    Discovery failed for {hid}, skipping picks update", flush=True)
        else:
            new_picks = extract_picks(worktree, scanner)
            existing_picks = load_picks_from_branch(hid, branch)
            seen = {(p["date"], p["ticker"]) for p in existing_picks}
            merged = existing_picks + [p for p in new_picks if (p["date"], p["ticker"]) not in seen]
            save_picks_to_worktree(worktree, hid, scanner, merged)
            run(["git", "push", "origin", f"HEAD:{branch}"], cwd=worktree)

            if TODAY not in hyp.get("picks_log", []):
                hyp.setdefault("picks_log", []).append(TODAY)
            hyp["days_elapsed"] = len(hyp["picks_log"])

            if hyp["days_elapsed"] >= hyp["min_days"]:
                return conclude_hypothesis(hyp)

    finally:
        run(["git", "worktree", "remove", "--force", worktree], check=False)

    return False


def llm_analysis(hyp: dict, conclusion: dict, scanner_domain: str) -> Optional[str]:
    """
    Ask Gemini to interpret the experiment results and provide richer context.

    Returns a markdown string to embed in the PR comment, or None if the API
    call fails or GOOGLE_API_KEY is not set.

    The LLM does NOT override the programmatic decision — it adds nuance:
    sample-size caveats, market-condition context, follow-up hypotheses.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return None

    try:
        from google import genai
    except ImportError:
        print("    google-genai SDK not installed, skipping LLM analysis", flush=True)
        return None

    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]
    decision = conclusion["decision"]

    prompt = f"""You are analyzing the results of a scanner hypothesis experiment for an automated trading discovery system.

## Hypothesis
**ID:** {hyp["id"]}
**Title:** {hyp.get("title", "")}
**Description:** {hyp.get("description", hyp.get("title", ""))}
**Scanner:** {hyp["scanner"]}
**Period:** {hyp.get("created_at")} → {TODAY} ({hyp.get("days_elapsed")} days)

## Statistical Results
**Decision (programmatic):** {decision}
**Reason:** {conclusion["reason"]}

| Metric | Baseline | Experiment | Delta |
|---|---|---|---|
| 7d win rate | {base_metrics.get("win_rate") or "—"}% | {hyp_metrics.get("win_rate") or "—"}% | {_delta_str(hyp_metrics.get("win_rate"), base_metrics.get("win_rate"), "pp")} |
| Avg 7d return | {base_metrics.get("avg_return") or "—"}% | {hyp_metrics.get("avg_return") or "—"}% | {_delta_str(hyp_metrics.get("avg_return"), base_metrics.get("avg_return"), "%")} |
| Picks evaluated | {base_metrics.get("evaluated", base_metrics.get("count", "—"))} | {hyp_metrics.get("evaluated", hyp_metrics.get("count", "—"))} | — |

## Scanner Domain Knowledge
{scanner_domain}

---

Provide a concise analysis (3–5 sentences) covering:
1. Whether the sample size is sufficient to trust the result, or if more data is needed
2. Any caveats about the measurement period (e.g., unusual market conditions)
3. What the numbers suggest about the underlying hypothesis — even if the decision is "rejected", is the direction meaningful?
4. One concrete follow-up hypothesis worth testing next

Be direct. Do not restate the numbers — interpret them. Do not recommend merging or closing the PR."""

    try:
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model="gemini-3-flash-preview",
            contents=prompt,
        )
        return response.text.strip()
    except Exception as e:
        print(f"    LLM analysis failed: {e}", flush=True)
        return None


def _detect_baseline_drift(scanner: str, since: str) -> Optional[str]:
    """
    Check if the scanner's source file changed on main since the experiment started.

    Returns a warning string if drift is detected, None otherwise.

    When main's scanner code changes mid-experiment, the baseline picks in
    performance_database.json start reflecting the new code. The comparison
    becomes confounded: hypothesis vs. original-main for early picks, but
    hypothesis vs. new-main for later picks.
    """
    scanner_file = f"tradingagents/dataflows/discovery/scanners/{scanner}.py"
    result = subprocess.run(
        ["git", "log", "main", f"--since={since}", "--oneline", "--", scanner_file],
        cwd=str(ROOT),
        capture_output=True,
        text=True,
    )
    if result.returncode != 0 or not result.stdout.strip():
        return None

    commits = result.stdout.strip().splitlines()
    latest = commits[0]
    count = len(commits)
    noun = "commit" if count == 1 else "commits"
    warning = (
        f"`{scanner_file}` changed {count} {noun} on main since {since} "
        f"(latest: {latest}). Baseline picks may reflect the updated code — "
        f"interpret the delta with caution."
    )
    print(f"    ⚠️  Baseline drift: {warning}", flush=True)
    return warning


def conclude_hypothesis(hyp: dict) -> bool:
    """Run comparison, write conclusion doc, close/merge PR. Returns True."""
    hid = hyp["id"]
    scanner = hyp["scanner"]
    branch = hyp["branch"]

    print(f"\n  Concluding {hid}...", flush=True)

    picks = load_picks_from_branch(hid, branch)
    if not picks:
        conclusion = {
            "decision": "rejected",
            "reason": "No picks were collected during the experiment period",
            "hypothesis": {"count": 0, "evaluated": 0, "win_rate": None, "avg_return": None},
            "baseline": {"count": 0, "win_rate": None, "avg_return": None},
        }
    else:
        result = subprocess.run(
            [
                sys.executable,
                "scripts/compare_hypothesis.py",
                "--hypothesis-id",
                hid,
                "--picks-json",
                json.dumps(picks),
                "--scanner",
                scanner,
                "--db-path",
                str(DB_PATH),
            ],
            cwd=str(ROOT),
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(f"    compare_hypothesis.py failed: {result.stderr}", flush=True)
            return False
        conclusion = json.loads(result.stdout)

    decision = conclusion["decision"]
    hyp_metrics = conclusion["hypothesis"]
    base_metrics = conclusion["baseline"]

    # Detect if the scanner file changed on main since the experiment started.
    # If it did, the baseline picks (from main's daily runs) may no longer reflect
    # the original code — the comparison could be confounded.
    confound_warning = _detect_baseline_drift(scanner, hyp.get("created_at", TODAY))

    # Load scanner domain knowledge (may not exist yet — that's fine)
    scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
    scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""

    # Optional LLM analysis — enriches the conclusion without overriding the decision
    analysis = llm_analysis(hyp, conclusion, scanner_domain)
    analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""

    confound_section = (
        f"\n\n> ⚠️ **Baseline drift detected:** {confound_warning}" if confound_warning else ""
    )

    period_start = hyp.get("created_at", TODAY)
    concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
    concluded_doc.write_text(
        f"# Hypothesis: {hyp['title']}\n\n"
        f"**Scanner:** {scanner}\n"
        f"**Branch:** {branch}\n"
        f"**Period:** {period_start} → {TODAY} ({hyp['days_elapsed']} days)\n"
        f"**Outcome:** {'accepted ✅' if decision == 'accepted' else 'rejected ❌'}\n\n"
        f"## Hypothesis\n{hyp.get('description', hyp['title'])}\n\n"
        f"## Results\n\n"
        f"| Metric | Baseline | Experiment | Delta |\n"
        f"|---|---|---|---|\n"
        f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | "
        f"{hyp_metrics.get('win_rate') or '—'}% | "
        f"{_delta_str(hyp_metrics.get('win_rate'), base_metrics.get('win_rate'), 'pp')} |\n"
        f"| Avg return | {base_metrics.get('avg_return') or '—'}% | "
        f"{hyp_metrics.get('avg_return') or '—'}% | "
        f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
        f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
        f"## Decision\n{conclusion['reason']}\n"
        f"{confound_section}"
        f"{analysis_section}\n\n"
        f"## Action\n"
        f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
    )

    run(["git", "add", str(concluded_doc)], check=False)

    pr = hyp.get("pr_number")
    if pr:
        # Mark PR ready for review (removes draft status) and post conclusion as a comment.
        # The PR is NOT merged or closed automatically — the user reviews and decides.
        outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
        analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
        confound_block = (
            f"\n\n> ⚠️ **Baseline drift:** {confound_warning}" if confound_warning else ""
        )
        comment = (
            f"**Hypothesis concluded: {outcome_emoji}**\n\n"
            f"{conclusion['reason']}\n\n"
            f"| Metric | Baseline | Experiment |\n"
            f"|---|---|---|\n"
            f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
            f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n"
            f"{confound_block}"
            f"{analysis_block}\n\n"
            f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
        )
        subprocess.run(
            ["gh", "pr", "ready", str(pr)],
            cwd=str(ROOT),
            check=False,
        )
        subprocess.run(
            ["gh", "pr", "comment", str(pr), "--body", comment],
            cwd=str(ROOT),
            check=False,
        )

    hyp["status"] = "concluded"
    hyp["conclusion"] = decision

    print(f"  {hid}: {decision} — {conclusion['reason']}", flush=True)
    return True


def _delta_str(hyp_val, base_val, unit: str) -> str:
    if hyp_val is None or base_val is None:
        return "—"
    delta = hyp_val - base_val
    sign = "+" if delta >= 0 else ""
    return f"{sign}{delta:.1f}{unit}"


def conclude_statistical_hypothesis(hyp: dict) -> None:
    """
    Conclude a statistical hypothesis immediately using existing performance data.

    Statistical hypotheses don't require worktrees or code changes — they answer
    a question against already-collected pick data. This runs synchronously and
    writes a markdown report to docs/iterations/hypotheses/concluded/.
    """
    hid = hyp["id"]
    scanner = hyp["scanner"]
    print(f"\n── Statistical hypothesis: {hid} ──", flush=True)

    # Load performance database
    picks = []
    if DB_PATH.exists():
        try:
            with open(DB_PATH) as f:
                db = json.load(f)
            picks = [
                p for p in db if p.get("scanner") == scanner or p.get("strategy_match") == scanner
            ]
        except Exception as e:
            print(f"    Could not read performance database: {e}", flush=True)

    n = len(picks)
    print(f"    Found {n} picks for scanner '{scanner}'", flush=True)

    # Compute basic stats
    scores = [p["final_score"] for p in picks if p.get("final_score") is not None]
    avg_score = round(sum(scores) / len(scores), 1) if scores else None

    returns_7d = [p["return_7d"] for p in picks if p.get("return_7d") is not None]
    win_rate = (
        round(100 * sum(1 for r in returns_7d if r > 0) / len(returns_7d), 1)
        if returns_7d
        else None
    )
    avg_return = round(sum(returns_7d) / len(returns_7d), 2) if returns_7d else None

    stats_block = (
        f"- Total picks: {n}\n"
        f"- Avg score: {avg_score if avg_score is not None else '—'}\n"
        f"- 7d win rate: {win_rate if win_rate is not None else '—'}%\n"
        f"- Avg 7d return: {avg_return if avg_return is not None else '—'}%\n"
    )

    # Read scanner domain for LLM context
    scanner_domain = ""
    domain_file = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
    if domain_file.exists():
        scanner_domain = domain_file.read_text()[:3000]

    # LLM analysis — reuse llm_analysis() with a synthetic conclusion dict
    conclusion = {
        "decision": "statistical",
        "reason": hyp.get("description", "Statistical analysis of existing pick data"),
        "hypothesis": {"count": n, "win_rate": win_rate, "avg_return": avg_return},
        "baseline": {},
    }
    llm_insight = llm_analysis(hyp, conclusion, scanner_domain)

    # Write concluded report
    CONCLUDED_DIR.mkdir(parents=True, exist_ok=True)
    report_path = CONCLUDED_DIR / f"{hid}.md"
    insight_block = f"\n## LLM Analysis\n\n{llm_insight}\n" if llm_insight else ""
    report_path.write_text(
        f"# Statistical Hypothesis: {hyp.get('title', hid)}\n\n"
        f"**ID:** {hid}\n"
        f"**Scanner:** {scanner}\n"
        f"**Description:** {hyp.get('description', '')}\n"
        f"**Concluded:** {TODAY}\n\n"
        f"## Data Summary\n\n{stats_block}"
        f"{insight_block}"
    )
    print(f"    Report written to {report_path}", flush=True)

    hyp["status"] = "concluded"
    hyp["conclusion"] = "statistical"
    hyp["days_elapsed"] = 0


def promote_pending(registry: dict) -> None:
    """Promote the highest-priority pending implementation hypothesis to running if a slot is open."""
    # Only implementation/forward_test hypotheses count toward max_active.
    # Statistical hypotheses are concluded immediately and never occupy runner slots.
    running_count = sum(
        1
        for h in registry["hypotheses"]
        if h["status"] == "running"
        and h.get("hypothesis_type", "implementation") == "implementation"
    )
    max_active = registry.get("max_active", 5)
    if running_count >= max_active:
        return
    pending = [h for h in registry["hypotheses"] if h["status"] == "pending"]
    if not pending:
        return
    to_promote = max(pending, key=lambda h: h.get("priority", 0))
    to_promote["status"] = "running"
    print(f"\n  Promoted pending hypothesis to running: {to_promote['id']}", flush=True)


def main():
    # Skip weekends — markets are closed, picks would be noise and days_elapsed
    # would count non-trading days toward min_days.
    weekday = datetime.utcnow().weekday()  # 0=Mon … 6=Sun
    if weekday >= 5:
        day_name = "Saturday" if weekday == 5 else "Sunday"
        print(f"Skipping hypothesis runner — today is {day_name} (market closed).", flush=True)
        return

    registry = load_registry()
    filter_id = os.environ.get("FILTER_ID", "").strip()

    # Fast-path: conclude all pending statistical hypotheses immediately.
    # They answer questions from existing data — no cap, no worktree, no waiting.
    statistical_pending = [
        h
        for h in registry.get("hypotheses", [])
        if h["status"] == "pending"
        and h.get("hypothesis_type") == "statistical"
        and (not filter_id or h["id"] == filter_id)
    ]
    for hyp in statistical_pending:
        try:
            conclude_statistical_hypothesis(hyp)
        except Exception as e:
            print(f"  Error concluding statistical hypothesis {hyp['id']}: {e}", flush=True)

    hypotheses = registry.get("hypotheses", [])
    running = [
        h
        for h in hypotheses
        if h["status"] == "running" and (not filter_id or h["id"] == filter_id)
    ]

    if not running:
        print("No running hypotheses to process.", flush=True)
    else:
        run(["git", "worktree", "prune"], check=False)
        for hyp in running:
            try:
                run_hypothesis(hyp)
            except Exception as e:
                print(f"  Error processing {hyp['id']}: {e}", flush=True)

    promote_pending(registry)

    # Prune concluded hypotheses from active.json — they live in concluded/ already.
    before = len(registry["hypotheses"])
    registry["hypotheses"] = [h for h in registry["hypotheses"] if h["status"] != "concluded"]
    pruned = before - len(registry["hypotheses"])
    if pruned:
        print(f"\n  Pruned {pruned} concluded hypothesis/hypotheses from active.json.", flush=True)

    save_registry(registry)
    print("\nRegistry updated.", flush=True)


if __name__ == "__main__":
    main()