# TradingAgents/tradingagents/dataflows/finviz_scraper.py
"""
Finviz + Yahoo Finance Hybrid - Short Interest Discovery
Uses Finviz to discover tickers with high short interest, then Yahoo Finance for exact data
"""
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Annotated
import requests
from bs4 import BeautifulSoup
from tradingagents.dataflows.y_finance import get_ticker_info
from tradingagents.utils.logger import get_logger
logger = get_logger(__name__)
def get_short_interest(
min_short_interest_pct: Annotated[float, "Minimum short interest % of float"] = 10.0,
min_days_to_cover: Annotated[float, "Minimum days to cover ratio"] = 2.0,
top_n: Annotated[int, "Number of top results to return"] = 20,
return_structured: Annotated[bool, "Return dict with raw data instead of markdown"] = False,
):
"""
Discover stocks with high short interest using Finviz + Yahoo Finance.
Strategy: Finviz filters stocks by short interest (discovery),
then Yahoo Finance provides exact short % data.
This is a TRUE DISCOVERY tool - finds stocks we may not know about,
not checking a predefined watchlist.
Args:
min_short_interest_pct: Minimum short interest as % of float
min_days_to_cover: Minimum days to cover ratio
top_n: Number of top results to return
return_structured: If True, returns list of dicts instead of markdown
Returns:
If return_structured=True: list of candidate dicts with ticker, short_interest_pct, signal, etc.
If return_structured=False: Formatted markdown report
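
    Example (illustrative only; results depend on live Finviz/Yahoo data):
        candidates = get_short_interest(min_short_interest_pct=15.0, return_structured=True)
        # -> [{"ticker": "...", "short_interest_pct": 23.4, "days_to_cover": 3.1,
        #      "signal": "high_squeeze_potential", ...}, ...]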
"""
try:
# Step 1: Use Finviz screener to DISCOVER tickers with high short interest
logger.info(
f"Discovering tickers with short interest >{min_short_interest_pct}% from Finviz..."
)
# Determine Finviz filter
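        # Finviz's sh_short_oN filter codes mean "short float over N%"; pick the
        # tightest code at or below the threshold, then verify exact values via Yahoo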
if min_short_interest_pct >= 20:
short_filter = "sh_short_o20"
elif min_short_interest_pct >= 15:
short_filter = "sh_short_o15"
elif min_short_interest_pct >= 10:
short_filter = "sh_short_o10"
else:
short_filter = "sh_short_o5"
# Build Finviz URL (v=152 is simple view)
base_url = f"https://finviz.com/screener.ashx?v=152&f={short_filter}"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/html",
}
discovered_tickers = []
# Scrape first 3 pages (60 stocks)
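        # Finviz paginates 20 rows per page; the r parameter is the 1-based row offset
        # (page 2 -> r=21, page 3 -> r=41)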
for page_num in range(1, 4):
if page_num == 1:
url = base_url
else:
offset = (page_num - 1) * 20 + 1
url = f"{base_url}&r={offset}"
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# Find ticker links in the page
ticker_links = soup.find_all("a", href=re.compile(r"quote\.ashx\?t="))
for link in ticker_links:
ticker = link.get_text(strip=True)
# Validate it's a ticker (1-5 uppercase letters)
if re.match(r"^[A-Z]{1,5}$", ticker) and ticker not in discovered_tickers:
discovered_tickers.append(ticker)
if not discovered_tickers:
if return_structured:
return []
return f"No stocks discovered with short interest >{min_short_interest_pct}% on Finviz."
logger.info(f"Discovered {len(discovered_tickers)} tickers from Finviz")
logger.info("Fetching detailed short interest data from Yahoo Finance...")
# Step 2: Use Yahoo Finance to get EXACT short interest data for discovered tickers
def fetch_short_data(ticker):
try:
info = get_ticker_info(ticker)
            # Yahoo reports short interest as a fraction (e.g., 0.25 = 25%); fall back
            # to % of shares outstanding if % of float is unavailable
            short_pct = info.get("shortPercentOfFloat", info.get("sharesPercentSharesOut", 0))
if short_pct and isinstance(short_pct, (int, float)):
short_pct = short_pct * 100 # Convert to percentage
else:
return None
# Verify it meets criteria (Finviz filter might be outdated)
if short_pct >= min_short_interest_pct:
                # "or" fallbacks also cover keys that exist but hold None
                price = info.get("currentPrice") or info.get("regularMarketPrice") or 0
                market_cap = info.get("marketCap") or 0
                volume = info.get("volume") or info.get("regularMarketVolume") or 0
# Days to cover (short ratio): shares short / avg daily volume
days_to_cover = info.get("shortRatio")
if days_to_cover is None or not isinstance(days_to_cover, (int, float)):
days_to_cover = 0.0
# Apply days-to-cover filter
if days_to_cover < min_days_to_cover:
return None
# Categorize squeeze potential
if short_pct >= 30:
signal = "extreme_squeeze_risk"
elif short_pct >= 20:
signal = "high_squeeze_potential"
elif short_pct >= 15:
signal = "moderate_squeeze_potential"
else:
signal = "low_squeeze_potential"
return {
"ticker": ticker,
"price": price,
"market_cap": market_cap,
"volume": volume,
"short_interest_pct": short_pct,
"days_to_cover": days_to_cover,
"signal": signal,
}
except Exception:
return None
# Fetch data in parallel (faster)
all_candidates = []
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {
executor.submit(fetch_short_data, ticker): ticker for ticker in discovered_tickers
}
for future in as_completed(futures):
result = future.result()
if result:
all_candidates.append(result)
if not all_candidates:
if return_structured:
return []
return f"No stocks with verified short interest >{min_short_interest_pct}% (Finviz found {len(discovered_tickers)} tickers but Yahoo Finance data didn't confirm)."
# Sort by short interest percentage (highest first)
sorted_candidates = sorted(
all_candidates, key=lambda x: x["short_interest_pct"], reverse=True
)[:top_n]
# Return structured data if requested
if return_structured:
return sorted_candidates
# Format output
report = "# Discovered High Short Interest Stocks\n\n"
report += f"**Criteria**: Short Interest >{min_short_interest_pct}%\n"
report += "**Data Source**: Finviz Screener (Web Scraping)\n"
report += f"**Total Discovered**: {len(all_candidates)} stocks\n\n"
report += f"**Top {len(sorted_candidates)} Candidates**:\n\n"
report += "| Ticker | Price | Market Cap | Volume | Short % | Signal |\n"
report += "|--------|-------|------------|--------|---------|--------|\n"
for candidate in sorted_candidates:
market_cap_str = format_market_cap(candidate["market_cap"])
report += f"| {candidate['ticker']} | "
report += f"${candidate['price']:.2f} | "
report += f"{market_cap_str} | "
report += f"{candidate['volume']:,} | "
report += f"{candidate['short_interest_pct']:.1f}% | "
report += f"{candidate['signal']} |\n"
report += "\n\n## Signal Definitions\n\n"
report += "- **extreme_squeeze_risk**: Short interest >30% - Very high squeeze potential\n"
report += "- **high_squeeze_potential**: Short interest 20-30% - High squeeze risk\n"
report += (
"- **moderate_squeeze_potential**: Short interest 15-20% - Moderate squeeze risk\n"
)
report += "- **low_squeeze_potential**: Short interest 10-15% - Lower squeeze risk\n\n"
report += "**Note**: High short interest alone doesn't guarantee a squeeze. Look for positive catalysts.\n"
return report
except requests.exceptions.RequestException as e:
if return_structured:
return []
return f"Error scraping Finviz: {str(e)}"
except Exception as e:
if return_structured:
return []
return f"Unexpected error discovering short interest stocks: {str(e)}"
def parse_market_cap(market_cap_text: str) -> float:
"""Parse market cap from Finviz format (e.g., '1.23B', '456M')."""
if not market_cap_text or market_cap_text == "-":
return 0.0
market_cap_text = market_cap_text.upper().strip()
# Extract number and multiplier
match = re.match(r"([0-9.]+)([BMK])?", market_cap_text)
if not match:
return 0.0
number = float(match.group(1))
multiplier = match.group(2)
if multiplier == "B":
return number * 1_000_000_000
elif multiplier == "M":
return number * 1_000_000
elif multiplier == "K":
return number * 1_000
else:
return number
def format_market_cap(market_cap: float) -> str:
"""Format market cap for display."""
if market_cap >= 1_000_000_000:
return f"${market_cap / 1_000_000_000:.2f}B"
elif market_cap >= 1_000_000:
return f"${market_cap / 1_000_000:.2f}M"
else:
return f"${market_cap:,.0f}"
def get_finviz_short_interest(
min_short_interest_pct: float = 10.0,
min_days_to_cover: float = 2.0,
top_n: int = 20,
) -> str:
"""Alias for get_short_interest to match registry naming convention"""
return get_short_interest(min_short_interest_pct, min_days_to_cover, top_n)
def get_insider_buying_screener(
transaction_type: Annotated[str, "Transaction type: 'buy', 'sell', or 'any'"] = "buy",
lookback_days: Annotated[int, "Days to look back for transactions"] = 7,
min_value: Annotated[int, "Minimum transaction value in dollars"] = 25000,
top_n: Annotated[int, "Number of top results to return"] = 20,
return_structured: Annotated[bool, "Return list of dicts instead of markdown"] = False,
deduplicate: Annotated[bool, "If False, return all transactions without deduplication"] = True,
):
"""
Discover stocks with recent insider buying/selling using OpenInsider.
LEADING INDICATOR: Insiders buying their own stock before price moves.
Results are sorted by transaction value (largest first).
Args:
transaction_type: "buy" for purchases, "sell" for sales
lookback_days: Days to look back (default 7)
min_value: Minimum transaction value in dollars
top_n: Number of top results to return
return_structured: If True, returns list of dicts instead of markdown
Returns:
If return_structured=True: list of transaction dicts
If return_structured=False: Formatted markdown report
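
    Example (illustrative only; results depend on live OpenInsider data):
        buys = get_insider_buying_screener(lookback_days=14, min_value=100_000, return_structured=True)
        # -> [{"ticker": "...", "insider": "...", "title": "CEO", "value_num": 2_500_000.0, ...}, ...]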
"""
try:
filter_desc = "insider buying" if transaction_type == "buy" else "insider selling"
logger.info(f"Discovering tickers with {filter_desc} from OpenInsider...")
        # OpenInsider screener URL parameters:
        # xp=1 / xs=1 select purchase / sale transactions
        # fd=<n> restricts to filings in the last n days
        # vl=<n> is the minimum transaction value in $ thousands (hence min_value // 1000)
if transaction_type == "buy":
url = f"http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd={lookback_days}&fdr=&td=0&tdr=&fdlyl=&fdlyh=&dtefrom=&dteto=&xp=1&vl={min_value // 1000}&vh=&ocl=&och=&session=all&cnt=100&page=1"
else:
url = f"http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd={lookback_days}&fdr=&td=0&tdr=&fdlyl=&fdlyh=&dtefrom=&dteto=&xs=1&vl={min_value // 1000}&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=4&cnt=100&page=1"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/html",
}
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# Find the main data table
table = soup.find("table", class_="tinytable")
        if not table:
            if return_structured:
                return []
            return f"No {filter_desc} data found on OpenInsider."
        tbody = table.find("tbody")
        if not tbody:
            if return_structured:
                return []
            return f"No {filter_desc} data found on OpenInsider."
rows = tbody.find_all("tr")
transactions = []
for row in rows:
cells = row.find_all("td")
            if len(cells) < 13:  # need columns through Value (index 12)
                continue
try:
# OpenInsider columns:
# 0: X (checkbox), 1: Filing Date, 2: Trade Date, 3: Ticker, 4: Company Name
# 5: Insider Name, 6: Title, 7: Trade Type, 8: Price, 9: Qty, 10: Owned, 11: ΔOwn, 12: Value
ticker_cell = cells[3]
ticker_link = ticker_cell.find("a")
ticker = ticker_link.get_text(strip=True) if ticker_link else ""
if not ticker or not re.match(r"^[A-Z]{1,5}$", ticker):
continue
company = cells[4].get_text(strip=True)[:40] if len(cells) > 4 else ""
insider_name = cells[5].get_text(strip=True)[:25] if len(cells) > 5 else ""
title_raw = cells[6].get_text(strip=True) if len(cells) > 6 else ""
# "10%" means 10% beneficial owner - clarify for readability
title = "10% Owner" if title_raw == "10%" else title_raw[:20]
trade_type = cells[7].get_text(strip=True) if len(cells) > 7 else ""
price = cells[8].get_text(strip=True) if len(cells) > 8 else ""
qty = cells[9].get_text(strip=True) if len(cells) > 9 else ""
value_str = cells[12].get_text(strip=True) if len(cells) > 12 else ""
# Filter by transaction type
trade_type_lower = trade_type.lower()
if (
transaction_type == "buy"
and "buy" not in trade_type_lower
and "p -" not in trade_type_lower
):
continue
if (
transaction_type == "sell"
and "sale" not in trade_type_lower
and "s -" not in trade_type_lower
):
continue
# Parse value for sorting
value_num = 0
if value_str:
# Remove $ and + signs, handle K/M suffixes
clean_value = (
value_str.replace("$", "").replace("+", "").replace(",", "").strip()
)
try:
if "M" in clean_value:
value_num = float(clean_value.replace("M", "")) * 1_000_000
elif "K" in clean_value:
value_num = float(clean_value.replace("K", "")) * 1_000
else:
value_num = float(clean_value)
except ValueError:
value_num = 0
transactions.append(
{
"ticker": ticker,
"company": company,
"insider": insider_name,
"title": title,
"trade_type": trade_type,
"price": price,
"qty": qty,
"value_str": value_str,
"value_num": value_num,
}
)
except Exception:
continue
if not transactions:
if return_structured:
return []
return f"No {filter_desc} transactions found in the last {lookback_days} days."
# Sort by value (largest first)
transactions.sort(key=lambda x: x["value_num"], reverse=True)
# Return all transactions without deduplication if requested
if return_structured and not deduplicate:
logger.info(f"Returning all {len(transactions)} {filter_desc} transactions (no dedup)")
return transactions
# Deduplicate by ticker, keeping the largest transaction per ticker
seen_tickers = set()
unique_transactions = []
for t in transactions:
if t["ticker"] not in seen_tickers:
seen_tickers.add(t["ticker"])
unique_transactions.append(t)
if len(unique_transactions) >= top_n:
break
logger.info(
f"Discovered {len(unique_transactions)} tickers with {filter_desc} (sorted by value)"
)
# Return structured data if requested
if return_structured:
return unique_transactions
# Format report
report_lines = [
f"# Insider {'Buying' if transaction_type == 'buy' else 'Selling'} Report",
f"*Top {len(unique_transactions)} stocks by transaction value (last {lookback_days} days)*\n",
"| Ticker | Company | Insider | Title | Value | Price |",
"|--------|---------|---------|-------|-------|-------|",
]
for t in unique_transactions:
report_lines.append(
f"| {t['ticker']} | {t['company']} | {t['insider']} | {t['title']} | {t['value_str']} | {t['price']} |"
)
report_lines.append(
f"\n**Total: {len(unique_transactions)} stocks with significant {filter_desc}**"
)
report_lines.append("*Sorted by transaction value (largest first)*")
return "\n".join(report_lines)
except requests.exceptions.RequestException as e:
if return_structured:
return []
return f"Error fetching insider data from OpenInsider: {e}"
except Exception as e:
if return_structured:
return []
return f"Error processing insider screener: {e}"
def get_finviz_insider_buying(
transaction_type: str = "buy",
lookback_days: int = 7,
min_value: int = 25000,
top_n: int = 20,
return_structured: bool = False,
deduplicate: bool = True,
):
"""Alias for get_insider_buying_screener to match registry naming convention.
Args:
transaction_type: "buy" for purchases, "sell" for sales
lookback_days: Days to look back (default 7)
min_value: Minimum transaction value in dollars
top_n: Number of top results to return
return_structured: If True, returns list of dicts instead of markdown
deduplicate: If False and return_structured=True, returns all transactions
(not deduplicated by ticker). Useful for cluster detection.
"""
return get_insider_buying_screener(
transaction_type=transaction_type,
lookback_days=lookback_days,
min_value=min_value,
top_n=top_n,
return_structured=return_structured,
deduplicate=deduplicate,
)
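

if __name__ == "__main__":
    # Minimal manual smoke test; hits live Finviz/Yahoo/OpenInsider endpoints,
    # so network access is required and results vary run to run.
    print(get_short_interest(min_short_interest_pct=20.0, top_n=5))

    # Illustrative cluster check: with deduplicate=False, several distinct insiders
    # buying the same ticker in the window is a stronger signal than one large buy.
    from collections import Counter

    buys = get_insider_buying_screener(return_structured=True, deduplicate=False)
    clusters = Counter(t["ticker"] for t in buys)
    print({tkr: n for tkr, n in clusters.items() if n >= 2})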