feat(hypotheses): uncap statistical hypotheses from max_active limit

Statistical hypotheses now conclude immediately on the next runner cycle
without counting toward max_active. Only implementation hypotheses occupy
runner slots. Added conclude_statistical_hypothesis() for instant analysis
against existing performance data with Gemini LLM enrichment.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Youssef Aitousarrah 2026-04-13 12:39:17 -07:00
parent 79a58a540c
commit 662fdb5753
2 changed files with 125 additions and 4 deletions

View File

@ -35,13 +35,41 @@ without any code change. Examples:
If statistical: run the analysis now against `data/recommendations/performance_database.json`.
Write the finding to the relevant scanner domain file under **Evidence Log**. Print a summary.
Done — no branch needed.
Then register the hypothesis in `docs/iterations/hypotheses/active.json` as `status: "pending"`
so the runner picks it up on the next cycle and attaches LLM analysis to the report:
```json
{
"id": "<scanner>-<slug>",
"scanner": "<scanner>",
"title": "<title>",
"description": "<description>",
"branch": null,
"pr_number": null,
"status": "pending",
"priority": 0,
"expected_impact": "low",
"hypothesis_type": "statistical",
"created_at": "<YYYY-MM-DD>",
"min_days": 0,
"days_elapsed": 0,
"picks_log": [],
"baseline_scanner": "<scanner>",
"conclusion": null
}
```
Commit and push the updated `active.json` to `main`. Done — no branch or worktree needed.
## Step 3b: Implementation Path
### 3b-i: Capacity check
Count running hypotheses where `hypothesis_type == "implementation"` from `active.json`.
Statistical hypotheses do not consume runner slots and are excluded from this count.
If fewer than `max_active` implementation hypotheses are running, proceed.
If at capacity: add the new hypothesis as `status: "pending"` — running experiments are NEVER
paused mid-streak. Inform the user which slot it is queued behind and when it will likely start.

View File

@ -409,9 +409,89 @@ def _delta_str(hyp_val, base_val, unit: str) -> str:
return f"{sign}{delta:.1f}{unit}" return f"{sign}{delta:.1f}{unit}"
def conclude_statistical_hypothesis(hyp: dict) -> None:
    """
    Conclude a statistical hypothesis immediately using existing performance data.

    Statistical hypotheses don't require worktrees or code changes — they answer
    a question against already-collected pick data. Runs synchronously, writes a
    markdown report to docs/iterations/hypotheses/concluded/, and mutates *hyp*
    in place (status/conclusion/days_elapsed); the caller persists the registry.
    """
    hid = hyp["id"]
    scanner = hyp["scanner"]
    print(f"\n── Statistical hypothesis: {hid} ──", flush=True)

    # Pull every pick attributed to this scanner out of the performance database.
    # Best-effort: an unreadable/malformed DB just leaves picks empty.
    picks: list = []
    if DB_PATH.exists():
        try:
            with open(DB_PATH) as fh:
                records = json.load(fh)
            picks = [
                rec for rec in records
                if scanner in (rec.get("scanner"), rec.get("strategy_match"))
            ]
        except Exception as exc:
            print(f" Could not read performance database: {exc}", flush=True)

    total = len(picks)
    print(f" Found {total} picks for scanner '{scanner}'", flush=True)

    # Basic descriptive stats over score and 7-day return; None when no data.
    scores = [rec["final_score"] for rec in picks if rec.get("final_score") is not None]
    rets = [rec["return_7d"] for rec in picks if rec.get("return_7d") is not None]
    avg_score = round(sum(scores) / len(scores), 1) if scores else None
    avg_return = round(sum(rets) / len(rets), 2) if rets else None
    win_rate = None
    if rets:
        win_rate = round(100 * sum(1 for r in rets if r > 0) / len(rets), 1)

    # Missing stats render as empty strings in the report, matching prior output.
    def _fmt(val):
        return "" if val is None else val

    stats_block = (
        f"- Total picks: {total}\n"
        f"- Avg score: {_fmt(avg_score)}\n"
        f"- 7d win rate: {_fmt(win_rate)}%\n"
        f"- Avg 7d return: {_fmt(avg_return)}%\n"
    )

    # Give the LLM the scanner's domain notes for context (truncated to 3000 chars).
    domain_file = ROOT / "docs" / "iterations" / "scanners" / f"{scanner}.md"
    scanner_domain = domain_file.read_text()[:3000] if domain_file.exists() else ""

    # Reuse llm_analysis() by shaping a synthetic conclusion dict for it.
    synthetic_conclusion = {
        "decision": "statistical",
        "reason": hyp.get("description", "Statistical analysis of existing pick data"),
        "hypothesis": {"count": total, "win_rate": win_rate, "avg_return": avg_return},
        "baseline": {},
    }
    llm_insight = llm_analysis(hyp, synthetic_conclusion, scanner_domain)

    # Assemble and persist the concluded report.
    CONCLUDED_DIR.mkdir(parents=True, exist_ok=True)
    report_path = CONCLUDED_DIR / f"{hid}.md"
    sections = [
        f"# Statistical Hypothesis: {hyp.get('title', hid)}\n\n",
        f"**ID:** {hid}\n",
        f"**Scanner:** {scanner}\n",
        f"**Description:** {hyp.get('description', '')}\n",
        f"**Concluded:** {TODAY}\n\n",
        f"## Data Summary\n\n{stats_block}",
    ]
    if llm_insight:
        sections.append(f"\n## LLM Analysis\n\n{llm_insight}\n")
    report_path.write_text("".join(sections))
    print(f" Report written to {report_path}", flush=True)

    # Mark concluded in place; statistical hypotheses never accrue elapsed days.
    hyp["status"] = "concluded"
    hyp["conclusion"] = "statistical"
    hyp["days_elapsed"] = 0
def promote_pending(registry: dict) -> None: def promote_pending(registry: dict) -> None:
"""Promote the highest-priority pending hypothesis to running if a slot is open.""" """Promote the highest-priority pending implementation hypothesis to running if a slot is open."""
running_count = sum(1 for h in registry["hypotheses"] if h["status"] == "running") # Only implementation/forward_test hypotheses count toward max_active.
# Statistical hypotheses are concluded immediately and never occupy runner slots.
running_count = sum(
1 for h in registry["hypotheses"]
if h["status"] == "running" and h.get("hypothesis_type", "implementation") == "implementation"
)
max_active = registry.get("max_active", 5) max_active = registry.get("max_active", 5)
if running_count >= max_active: if running_count >= max_active:
return return
@ -435,6 +515,19 @@ def main():
registry = load_registry() registry = load_registry()
filter_id = os.environ.get("FILTER_ID", "").strip() filter_id = os.environ.get("FILTER_ID", "").strip()
# Fast-path: conclude all pending statistical hypotheses immediately.
# They answer questions from existing data — no cap, no worktree, no waiting.
statistical_pending = [
h for h in registry.get("hypotheses", [])
if h["status"] == "pending" and h.get("hypothesis_type") == "statistical"
and (not filter_id or h["id"] == filter_id)
]
for hyp in statistical_pending:
try:
conclude_statistical_hypothesis(hyp)
except Exception as e:
print(f" Error concluding statistical hypothesis {hyp['id']}: {e}", flush=True)
hypotheses = registry.get("hypotheses", []) hypotheses = registry.get("hypotheses", [])
running = [ running = [
h h