feat(hypotheses): detect baseline drift when scanner changes on main mid-experiment

Before concluding a hypothesis, check if the scanner's source file
changed on main since created_at. If it did, the baseline picks in
performance_database.json reflect the updated code for the later part
of the experiment, which can confound the comparison.

When drift is detected, a warning is embedded in:
- the concluded .md doc (blockquote below Decision)
- the PR comment (blockquote in the conclusion body)

The programmatic decision is not overridden — the warning is purely
informational, allowing the reviewer to judge whether the result is
trustworthy.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-10 15:42:21 -07:00
parent 9e1c800f01
commit 91311ad69d
1 changed files with 50 additions and 0 deletions

View File

@ -242,6 +242,40 @@ Be direct. Do not restate the numbers — interpret them. Do not recommend mergi
return None
def _detect_baseline_drift(scanner: str, since: str) -> Optional[str]:
"""
Check if the scanner's source file changed on main since the experiment started.
Returns a warning string if drift is detected, None otherwise.
When main's scanner code changes mid-experiment, the baseline picks in
performance_database.json start reflecting the new code. The comparison
becomes confounded: hypothesis vs. original-main for early picks, but
hypothesis vs. new-main for later picks.
"""
scanner_file = f"tradingagents/dataflows/discovery/scanners/{scanner}.py"
result = subprocess.run(
["git", "log", "main", f"--since={since}", "--oneline", "--", scanner_file],
cwd=str(ROOT),
capture_output=True,
text=True,
)
if result.returncode != 0 or not result.stdout.strip():
return None
commits = result.stdout.strip().splitlines()
latest = commits[0]
count = len(commits)
noun = "commit" if count == 1 else "commits"
warning = (
f"`{scanner_file}` changed {count} {noun} on main since {since} "
f"(latest: {latest}). Baseline picks may reflect the updated code — "
f"interpret the delta with caution."
)
print(f" ⚠️ Baseline drift: {warning}", flush=True)
return warning
def conclude_hypothesis(hyp: dict) -> bool:
"""Run comparison, write conclusion doc, close/merge PR. Returns True."""
hid = hyp["id"]
@ -285,6 +319,11 @@ def conclude_hypothesis(hyp: dict) -> bool:
hyp_metrics = conclusion["hypothesis"]
base_metrics = conclusion["baseline"]
# Detect if the scanner file changed on main since the experiment started.
# If it did, the baseline picks (from main's daily runs) may no longer reflect
# the original code — the comparison could be confounded.
confound_warning = _detect_baseline_drift(scanner, hyp.get("created_at", TODAY))
# Load scanner domain knowledge (may not exist yet — that's fine)
scanner_domain_path = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
scanner_domain = scanner_domain_path.read_text() if scanner_domain_path.exists() else ""
@ -293,6 +332,12 @@ def conclude_hypothesis(hyp: dict) -> bool:
analysis = llm_analysis(hyp, conclusion, scanner_domain)
analysis_section = f"\n\n## Analysis\n{analysis}" if analysis else ""
confound_section = (
f"\n\n> ⚠️ **Baseline drift detected:** {confound_warning}"
if confound_warning
else ""
)
period_start = hyp.get("created_at", TODAY)
concluded_doc = CONCLUDED_DIR / f"{TODAY}-{hid}.md"
concluded_doc.write_text(
@ -313,6 +358,7 @@ def conclude_hypothesis(hyp: dict) -> bool:
f"{_delta_str(hyp_metrics.get('avg_return'), base_metrics.get('avg_return'), '%')} |\n"
f"| Picks | {base_metrics.get('count', '')} | {hyp_metrics.get('count', '')} | — |\n\n"
f"## Decision\n{conclusion['reason']}\n"
f"{confound_section}"
f"{analysis_section}\n\n"
f"## Action\n"
f"{'Ready to merge — awaiting manual review.' if decision == 'accepted' else 'Experiment concluded — awaiting manual review before closing.'}\n"
@ -326,6 +372,9 @@ def conclude_hypothesis(hyp: dict) -> bool:
# The PR is NOT merged or closed automatically — the user reviews and decides.
outcome_emoji = "✅ accepted" if decision == "accepted" else "❌ rejected"
analysis_block = f"\n\n**Analysis**\n{analysis}" if analysis else ""
confound_block = (
f"\n\n> ⚠️ **Baseline drift:** {confound_warning}" if confound_warning else ""
)
comment = (
f"**Hypothesis concluded: {outcome_emoji}**\n\n"
f"{conclusion['reason']}\n\n"
@ -333,6 +382,7 @@ def conclude_hypothesis(hyp: dict) -> bool:
f"|---|---|---|\n"
f"| 7d win rate | {base_metrics.get('win_rate') or ''}% | {hyp_metrics.get('win_rate') or ''}% |\n"
f"| Avg return | {base_metrics.get('avg_return') or ''}% | {hyp_metrics.get('avg_return') or ''}% |\n"
f"{confound_block}"
f"{analysis_block}\n\n"
f"{'Merge this PR to apply the change.' if decision == 'accepted' else 'Close this PR to discard the experiment.'}"
)