From 91311ad69d770a942887767a938142df826dfd46 Mon Sep 17 00:00:00 2001
From: Youssef Aitousarrah <youssef.aitousarrah@gmail.com>
Date: Fri, 10 Apr 2026 15:42:21 -0700
Subject: [PATCH] feat(hypotheses): detect baseline drift when scanner changes
 on main mid-experiment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before concluding a hypothesis, check if the scanner's source file
changed on main since created_at. If it did, the baseline picks in
performance_database.json reflect the updated code for the later part
of the experiment, which can confound the comparison.

When drift is detected, a warning is embedded in:
- the concluded .md doc (blockquote below Decision)
- the PR comment (blockquote in the conclusion body)

The programmatic decision is not overridden — the warning is purely
informational, allowing the reviewer to judge whether the result is
trustworthy.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/run_hypothesis_runner.py | 50 ++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/scripts/run_hypothesis_runner.py b/scripts/run_hypothesis_runner.py
index 383b3eca..f1233fe1 100644
--- a/scripts/run_hypothesis_runner.py
+++ b/scripts/run_hypothesis_runner.py
@@ -242,6 +242,40 @@ Be direct. Do not restate the numbers — interpret them. Do not recommend mergi
         return None
 
 
+def _detect_baseline_drift(scanner: str, since: str) -> Optional[str]:
+    """
+    Check if the scanner's source file changed on main since the experiment started.
+
+    Returns a warning string if drift is detected, None otherwise.
+
+    When main's scanner code changes mid-experiment, the baseline picks in
+    performance_database.json start reflecting the new code. The comparison
+    becomes confounded: hypothesis vs. original-main for early picks, but
+    hypothesis vs. new-main for later picks.
+    """
+    scanner_file = f"tradingagents/dataflows/discovery/scanners/{scanner}.py"
+    result = subprocess.run(
+        ["git", "log", "main", f"--since={since}", "--oneline", "--", scanner_file],
+        cwd=str(ROOT),
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0 or not result.stdout.strip():
+        return None
+
+    commits = result.stdout.strip().splitlines()
+    latest = commits[0]
+    count = len(commits)
+    noun = "commit" if count == 1 else "commits"
+    warning = (
+        f"`{scanner_file}` changed {count} {noun} on main since {since} "
+        f"(latest: {latest}). Baseline picks may reflect the updated code — "
+        f"interpret the delta with caution."
+    )
+    print(f"    ⚠️  Baseline drift: {warning}", flush=True)
+    return warning
+
+
 def conclude_hypothesis(hyp: dict) -> bool:
     """Run comparison, write conclusion doc, close/merge PR. Returns True."""
     hid = hyp["id"]
@@ -285,6 +319,11 @@ def conclude_hypothesis(hyp: dict) -> bool:
     hyp_metrics = conclusion["hypothesis"]
     base_metrics = conclusion["baseline"]
 
+    # Detect if the scanner file changed on main since the experiment started.
+    # If it did, the baseline picks (from main's daily runs) may no longer reflect
+    # the original code — the comparison could be confounded.
+    confound_warning = _detect_baseline_drift(scanner, hyp.get("created_at", TODAY))
+
     # Load scanner domain knowledge (may not exist yet — that's fine)
     scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
     scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
@@ -293,6 +332,12 @@ def conclude_hypothesis(hyp: dict) -> bool:
     analysis = llm_analysis(hyp, conclusion, scanner_domain)
     analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
 
+    confound_section = (
+        f"\n\n> ⚠️ **Baseline drift detected:** {confound_warning}"
+        if confound_warning
+        else ""
+    )
+
     period_start = hyp.get("created_at", TODAY)
     concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
     concluded_doc.write_text(
@@ -313,6 +358,7 @@ def conclude_hypothesis(hyp: dict) -> bool:
         f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
         f"| Picks | {base_metrics.get('count', '—')} | {hyp_metrics.get('count', '—')} | — |\n\n"
         f"## Decision\n{conclusion['reason']}\n"
+        f"{confound_section}"
         f"{analysis_section}\n\n"
         f"## Action\n"
         f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
@@ -326,6 +372,9 @@ def conclude_hypothesis(hyp: dict) -> bool:
         # The PR is NOT merged or closed automatically — the user reviews and decides.
         outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
         analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
+        confound_block = (
+            f"\n\n> ⚠️ **Baseline drift:** {confound_warning}" if confound_warning else ""
+        )
         comment = (
             f"**Hypothesis concluded: {outcome_emoji}**\n\n"
             f"{conclusion['reason']}\n\n"
@@ -333,6 +382,7 @@ def conclude_hypothesis(hyp: dict) -> bool:
             f"|---|---|---|\n"
             f"| 7d win rate | {base_metrics.get('win_rate') or '—'}% | {hyp_metrics.get('win_rate') or '—'}% |\n"
             f"| Avg return | {base_metrics.get('avg_return') or '—'}% | {hyp_metrics.get('avg_return') or '—'}% |\n"
+            f"{confound_block}"
             f"{analysis_block}\n\n"
             f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
         )