fix(discovery): fix infinite hang when a scanner thread blocks indefinitely
Two issues caused the agent to get stuck after the last log message from a completed scanner (e.g. "✓ reddit_trending: 11 candidates"):

1. `as_completed()` had no global timeout. If a scanner thread blocked in a non-interruptible I/O call, `as_completed()` waited forever, because it only yields a future once that future has finished — so the per-future `future.result(timeout=N)` call was never even reached. Fixed by passing `timeout=global_timeout` to `as_completed()` so that the outer iterator raises TimeoutError after a capped wall-clock budget; the handler then logs which scanners did not complete and the run continues.

2. `SectorRotationScanner` called `get_ticker_info()` (one HTTP request per ticker) in a serial loop for up to 100 tickers from a 592-ticker file, easily exceeding the 30 s per-scanner budget. Fixed by batch-downloading close prices for all tickers in a single `download_history()` call, computing 5-day returns locally, and only calling `get_ticker_info()` for the small subset of laggard tickers (5-day move of at most 2%) that actually need a sector label.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
6a9a190af5
commit
ce2a6ef8fa
|
|
@ -86,21 +86,58 @@ class SectorRotationScanner(BaseScanner):
|
||||||
sector_names = [SECTOR_ETFS.get(etf, etf) for etf in accelerating_sectors]
|
sector_names = [SECTOR_ETFS.get(etf, etf) for etf in accelerating_sectors]
|
||||||
logger.info(f"Accelerating sectors: {', '.join(sector_names)}")
|
logger.info(f"Accelerating sectors: {', '.join(sector_names)}")
|
||||||
|
|
||||||
# Step 2: Find laggard stocks in those sectors
|
# Step 2: Batch-download 5-day close prices for all candidate tickers at once.
|
||||||
|
# This replaces the previous serial get_ticker_info() + download_history() loop
|
||||||
|
# which made up to max_tickers individual HTTP requests and would time out.
|
||||||
tickers = _load_tickers_from_file(self.ticker_file)
|
tickers = _load_tickers_from_file(self.ticker_file)
|
||||||
if not tickers:
|
if not tickers:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
tickers = tickers[: self.max_tickers]
|
tickers = tickers[: self.max_tickers]
|
||||||
|
|
||||||
candidates = []
|
try:
|
||||||
|
batch_hist = download_history(
|
||||||
|
tickers, period="1mo", interval="1d", auto_adjust=True, progress=False
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Batch history download failed: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
if batch_hist is None or batch_hist.empty:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Calculate 5-day return for each ticker from the batch data
|
||||||
|
ticker_returns: Dict[str, float] = {}
|
||||||
for ticker in tickers:
|
for ticker in tickers:
|
||||||
result = self._check_sector_laggard(ticker, accelerating_sectors, get_ticker_info)
|
try:
|
||||||
if result:
|
if isinstance(batch_hist.columns, pd.MultiIndex):
|
||||||
candidates.append(result)
|
if ticker not in batch_hist.columns.get_level_values(1):
|
||||||
|
continue
|
||||||
|
close = batch_hist.xs(ticker, axis=1, level=1)["Close"].dropna()
|
||||||
|
else:
|
||||||
|
close = batch_hist["Close"].dropna()
|
||||||
|
if len(close) < 6:
|
||||||
|
continue
|
||||||
|
ticker_returns[ticker] = (float(close.iloc[-1]) / float(close.iloc[-6]) - 1) * 100
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Step 3: Only call get_ticker_info() for laggard tickers (5d return <= 2%).
|
||||||
|
# This dramatically reduces API calls from max_tickers down to ~20-30% of that count.
|
||||||
|
candidates = []
|
||||||
|
for ticker, ret_5d in ticker_returns.items():
|
||||||
|
if ret_5d > 2.0:
|
||||||
|
continue # Already moved — not a laggard
|
||||||
|
|
||||||
if len(candidates) >= self.limit:
|
if len(candidates) >= self.limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
result = self._check_sector_laggard(ticker, accelerating_sectors, get_ticker_info)
|
||||||
|
if result:
|
||||||
|
# Overwrite the placeholder stock_5d_return with the batch-computed ret_5d
|
||||||
|
result["stock_5d_return"] = round(ret_5d, 2)
|
||||||
|
candidates.append(result)
|
||||||
|
|
||||||
logger.info(f"Sector rotation: {len(candidates)} candidates")
|
logger.info(f"Sector rotation: {len(candidates)} candidates")
|
||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|
@ -148,7 +185,7 @@ class SectorRotationScanner(BaseScanner):
|
||||||
def _check_sector_laggard(
|
def _check_sector_laggard(
|
||||||
self, ticker: str, accelerating_sectors: List[str], get_info_fn
|
self, ticker: str, accelerating_sectors: List[str], get_info_fn
|
||||||
) -> Optional[Dict[str, Any]]:
|
) -> Optional[Dict[str, Any]]:
|
||||||
"""Check if stock is in an accelerating sector but hasn't moved yet."""
|
"""Check if stock is in an accelerating sector (sector lookup only — no price download)."""
|
||||||
try:
|
try:
|
||||||
info = get_info_fn(ticker)
|
info = get_info_fn(ticker)
|
||||||
if not info:
|
if not info:
|
||||||
|
|
@ -163,32 +200,8 @@ class SectorRotationScanner(BaseScanner):
|
||||||
if not sector_etf or sector_etf not in accelerating_sectors:
|
if not sector_etf or sector_etf not in accelerating_sectors:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check if stock is lagging its sector
|
# 5-day return is filled in by the caller (batch-computed)
|
||||||
from tradingagents.dataflows.y_finance import download_history
|
context = f"Sector rotation: {stock_sector} sector accelerating, {ticker} lagging"
|
||||||
|
|
||||||
hist = download_history(
|
|
||||||
ticker, period="1mo", interval="1d", auto_adjust=True, progress=False
|
|
||||||
)
|
|
||||||
if hist is None or hist.empty or len(hist) < 6:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Handle MultiIndex
|
|
||||||
if isinstance(hist.columns, pd.MultiIndex):
|
|
||||||
tickers_in_data = hist.columns.get_level_values(1).unique()
|
|
||||||
target = ticker if ticker in tickers_in_data else tickers_in_data[0]
|
|
||||||
hist = hist.xs(target, level=1, axis=1)
|
|
||||||
|
|
||||||
close = hist["Close"] if "Close" in hist.columns else hist.iloc[:, 0]
|
|
||||||
ret_5d = (float(close.iloc[-1]) / float(close.iloc[-6]) - 1) * 100
|
|
||||||
|
|
||||||
# Stock is a laggard if it moved less than 2% while sector is accelerating
|
|
||||||
if ret_5d > 2.0:
|
|
||||||
return None # Already moved, not a laggard
|
|
||||||
|
|
||||||
context = (
|
|
||||||
f"Sector rotation: {stock_sector} sector accelerating, "
|
|
||||||
f"{ticker} lagging at {ret_5d:+.1f}% (5d)"
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"ticker": ticker,
|
"ticker": ticker,
|
||||||
|
|
@ -198,7 +211,7 @@ class SectorRotationScanner(BaseScanner):
|
||||||
"strategy": self.strategy,
|
"strategy": self.strategy,
|
||||||
"sector": stock_sector,
|
"sector": stock_sector,
|
||||||
"sector_etf": sector_etf,
|
"sector_etf": sector_etf,
|
||||||
"stock_5d_return": round(ret_5d, 2),
|
"stock_5d_return": 0.0, # overwritten by caller
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -452,8 +452,15 @@ class DiscoveryGraph:
|
||||||
|
|
||||||
pipeline_candidates: Dict[str, List[Dict[str, Any]]] = {}
|
pipeline_candidates: Dict[str, List[Dict[str, Any]]] = {}
|
||||||
|
|
||||||
|
# Global wall-clock limit: all scanners must finish within this budget.
|
||||||
|
# Using timeout_seconds as per-scanner budget × number of scanners gives a
|
||||||
|
# reasonable upper bound, capped at 5 minutes so a single slow scanner can
|
||||||
|
# never block the whole run indefinitely.
|
||||||
|
global_timeout = min(timeout_seconds * len(enabled_scanners), 300)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Running {len(enabled_scanners)} scanners concurrently (max {max_workers} workers)..."
|
f"Running {len(enabled_scanners)} scanners concurrently "
|
||||||
|
f"(max {max_workers} workers, global timeout {global_timeout}s)..."
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_scanner(scanner_info: tuple) -> tuple:
|
def run_scanner(scanner_info: tuple) -> tuple:
|
||||||
|
|
@ -483,40 +490,46 @@ class DiscoveryGraph:
|
||||||
for scanner_info in enabled_scanners
|
for scanner_info in enabled_scanners
|
||||||
}
|
}
|
||||||
|
|
||||||
# Collect results as they complete (no global timeout, handle per-scanner)
|
# Collect results as they complete.
|
||||||
|
# The global_timeout passed to as_completed() ensures that if any
|
||||||
|
# scanner thread blocks indefinitely (e.g. waiting on a hung network
|
||||||
|
# call), we raise TimeoutError and continue rather than hanging forever.
|
||||||
completed_count = 0
|
completed_count = 0
|
||||||
for future in as_completed(future_to_scanner):
|
try:
|
||||||
scanner_name = future_to_scanner[future]
|
for future in as_completed(future_to_scanner, timeout=global_timeout):
|
||||||
|
scanner_name = future_to_scanner[future]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get result with per-scanner timeout
|
name, pipeline, candidates, error, scanner_logs = future.result()
|
||||||
name, pipeline, candidates, error, scanner_logs = future.result(
|
|
||||||
timeout=timeout_seconds
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize pipeline list if needed
|
# Initialize pipeline list if needed
|
||||||
if pipeline not in pipeline_candidates:
|
if pipeline not in pipeline_candidates:
|
||||||
pipeline_candidates[pipeline] = []
|
pipeline_candidates[pipeline] = []
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
logger.warning(f"⚠️ {name}: {error}")
|
logger.warning(f"⚠️ {name}: {error}")
|
||||||
else:
|
else:
|
||||||
pipeline_candidates[pipeline].extend(candidates)
|
pipeline_candidates[pipeline].extend(candidates)
|
||||||
logger.info(f"✓ {name}: {len(candidates)} candidates")
|
logger.info(f"✓ {name}: {len(candidates)} candidates")
|
||||||
|
|
||||||
# Thread-safe log merging
|
# Thread-safe log merging
|
||||||
if scanner_logs:
|
if scanner_logs:
|
||||||
with self._tool_logs_lock:
|
with self._tool_logs_lock:
|
||||||
state.setdefault("tool_logs", []).extend(scanner_logs)
|
state.setdefault("tool_logs", []).extend(scanner_logs)
|
||||||
|
|
||||||
except TimeoutError:
|
except Exception as e:
|
||||||
logger.warning(f"⏱️ {scanner_name}: timeout after {timeout_seconds}s")
|
logger.error(f"⚠️ {scanner_name}: unexpected error - {e}", exc_info=True)
|
||||||
|
|
||||||
except Exception as e:
|
finally:
|
||||||
logger.error(f"⚠️ {scanner_name}: unexpected error - {e}", exc_info=True)
|
completed_count += 1
|
||||||
|
|
||||||
finally:
|
except TimeoutError:
|
||||||
completed_count += 1
|
# Identify which scanners did not finish in time
|
||||||
|
stuck = [name for fut, name in future_to_scanner.items() if not fut.done()]
|
||||||
|
logger.warning(
|
||||||
|
f"⏱️ Global scanner timeout ({global_timeout}s) reached. "
|
||||||
|
f"Timed-out scanners: {stuck}. Continuing with {completed_count} completed."
|
||||||
|
)
|
||||||
|
|
||||||
# Log completion stats
|
# Log completion stats
|
||||||
if completed_count < len(enabled_scanners):
|
if completed_count < len(enabled_scanners):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue