589 lines
24 KiB
Python
589 lines
24 KiB
Python
from datetime import datetime, timedelta
|
|
from typing import Annotated
|
|
|
|
import praw
|
|
|
|
from tradingagents.config import config
|
|
from tradingagents.utils.logger import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
def get_reddit_client():
|
|
"""Initialize and return a PRAW Reddit instance."""
|
|
client_id = config.validate_key("reddit_client_id", "Reddit Client ID")
|
|
client_secret = config.validate_key("reddit_client_secret", "Reddit Client Secret")
|
|
user_agent = config.reddit_user_agent
|
|
|
|
return praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
|
|
|
|
|
|
def get_reddit_news(
|
|
ticker: Annotated[str, "Ticker symbol"] = None,
|
|
start_date: Annotated[str, "Start date in yyyy-mm-dd format"] = None,
|
|
end_date: Annotated[str, "End date in yyyy-mm-dd format"] = None,
|
|
query: Annotated[str, "Search query or ticker symbol"] = None,
|
|
) -> str:
|
|
"""
|
|
Fetch company news/discussion from Reddit with top comments.
|
|
"""
|
|
target_query = query or ticker
|
|
if not target_query:
|
|
raise ValueError("Must provide query or ticker")
|
|
|
|
try:
|
|
reddit = get_reddit_client()
|
|
|
|
start_dt = datetime.strptime(start_date, "%Y-%m-%d")
|
|
end_dt = datetime.strptime(end_date, "%Y-%m-%d")
|
|
# Add one day to end_date to include the full day
|
|
end_dt = end_dt + timedelta(days=1)
|
|
|
|
# Subreddits to search
|
|
subreddits = "stocks+investing+wallstreetbets+stockmarket"
|
|
|
|
# Search queries - try multiple variations
|
|
queries = [
|
|
target_query,
|
|
f"${target_query}", # Common format on WSB
|
|
target_query.lower(),
|
|
]
|
|
|
|
posts = []
|
|
seen_ids = set() # Avoid duplicates
|
|
subreddit = reddit.subreddit(subreddits)
|
|
|
|
# Try multiple search strategies
|
|
for q in queries:
|
|
# Strategy 1: Search by relevance
|
|
for submission in subreddit.search(q, sort="relevance", time_filter="all", limit=50):
|
|
if submission.id in seen_ids:
|
|
continue
|
|
|
|
post_date = datetime.fromtimestamp(submission.created_utc)
|
|
|
|
if start_dt <= post_date <= end_dt:
|
|
seen_ids.add(submission.id)
|
|
|
|
# Fetch top comments for this post
|
|
submission.comment_sort = "top"
|
|
submission.comments.replace_more(limit=0)
|
|
|
|
top_comments = []
|
|
for comment in submission.comments[:5]: # Top 5 comments
|
|
if hasattr(comment, "body") and hasattr(comment, "score"):
|
|
top_comments.append(
|
|
{
|
|
"body": (
|
|
comment.body[:300] + "..."
|
|
if len(comment.body) > 300
|
|
else comment.body
|
|
),
|
|
"score": comment.score,
|
|
"author": (
|
|
str(comment.author) if comment.author else "[deleted]"
|
|
),
|
|
}
|
|
)
|
|
|
|
posts.append(
|
|
{
|
|
"title": submission.title,
|
|
"score": submission.score,
|
|
"num_comments": submission.num_comments,
|
|
"date": post_date.strftime("%Y-%m-%d"),
|
|
"url": submission.url,
|
|
"text": (
|
|
submission.selftext[:500] + "..."
|
|
if len(submission.selftext) > 500
|
|
else submission.selftext
|
|
),
|
|
"subreddit": submission.subreddit.display_name,
|
|
"top_comments": top_comments,
|
|
}
|
|
)
|
|
|
|
# Strategy 2: Search by new (for recent posts)
|
|
for submission in subreddit.search(q, sort="new", time_filter="week", limit=50):
|
|
if submission.id in seen_ids:
|
|
continue
|
|
|
|
post_date = datetime.fromtimestamp(submission.created_utc)
|
|
|
|
if start_dt <= post_date <= end_dt:
|
|
seen_ids.add(submission.id)
|
|
|
|
submission.comment_sort = "top"
|
|
submission.comments.replace_more(limit=0)
|
|
|
|
top_comments = []
|
|
for comment in submission.comments[:5]:
|
|
if hasattr(comment, "body") and hasattr(comment, "score"):
|
|
top_comments.append(
|
|
{
|
|
"body": (
|
|
comment.body[:300] + "..."
|
|
if len(comment.body) > 300
|
|
else comment.body
|
|
),
|
|
"score": comment.score,
|
|
"author": (
|
|
str(comment.author) if comment.author else "[deleted]"
|
|
),
|
|
}
|
|
)
|
|
|
|
posts.append(
|
|
{
|
|
"title": submission.title,
|
|
"score": submission.score,
|
|
"num_comments": submission.num_comments,
|
|
"date": post_date.strftime("%Y-%m-%d"),
|
|
"url": submission.url,
|
|
"text": (
|
|
submission.selftext[:500] + "..."
|
|
if len(submission.selftext) > 500
|
|
else submission.selftext
|
|
),
|
|
"subreddit": submission.subreddit.display_name,
|
|
"top_comments": top_comments,
|
|
}
|
|
)
|
|
|
|
if not posts:
|
|
return f"No Reddit posts found for {target_query} between {start_date} and {end_date}."
|
|
|
|
# Format output
|
|
report = f"## Reddit Discussions for {target_query} ({start_date} to {end_date})\n\n"
|
|
report += f"**Total Posts Found:** {len(posts)}\n\n"
|
|
|
|
# Sort by score (popularity)
|
|
posts.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
# Detailed view of top posts
|
|
report += "### Top Posts with Community Reactions\n\n"
|
|
for i, post in enumerate(posts[:10], 1): # Top 10 posts
|
|
report += f"#### {i}. [{post['subreddit']}] {post['title']}\n"
|
|
report += f"**Score:** {post['score']} | **Comments:** {post['num_comments']} | **Date:** {post['date']}\n\n"
|
|
|
|
if post["text"]:
|
|
report += f"**Post Content:**\n{post['text']}\n\n"
|
|
|
|
if post["top_comments"]:
|
|
report += f"**Top Community Reactions ({len(post['top_comments'])} comments):**\n"
|
|
for j, comment in enumerate(post["top_comments"], 1):
|
|
report += f"{j}. *[{comment['score']} upvotes]* u/{comment['author']}: {comment['body']}\n"
|
|
report += "\n"
|
|
|
|
report += f"**Link:** {post['url']}\n\n"
|
|
report += "---\n\n"
|
|
|
|
# Summary statistics
|
|
total_engagement = sum(p["score"] + p["num_comments"] for p in posts)
|
|
avg_score = sum(p["score"] for p in posts) / len(posts) if posts else 0
|
|
|
|
report += "### Summary Statistics\n"
|
|
report += f"- **Total Posts:** {len(posts)}\n"
|
|
report += f"- **Average Score:** {avg_score:.1f}\n"
|
|
report += f"- **Total Engagement:** {total_engagement:,} (upvotes + comments)\n"
|
|
report += (
|
|
f"- **Most Active Subreddit:** {max(posts, key=lambda x: x['score'])['subreddit']}\n"
|
|
)
|
|
|
|
return report
|
|
|
|
except Exception as e:
|
|
return f"Error fetching Reddit news: {str(e)}"
|
|
|
|
|
|
def get_reddit_global_news(
|
|
curr_date: Annotated[str, "Current date in yyyy-mm-dd format"] = None,
|
|
date: Annotated[str, "Date in yyyy-mm-dd format"] = None,
|
|
look_back_days: Annotated[int, "Number of days to look back"] = 7,
|
|
limit: Annotated[int, "Maximum number of articles to return"] = 5,
|
|
) -> str:
|
|
"""
|
|
Fetch global news from Reddit.
|
|
"""
|
|
target_date = date or curr_date
|
|
if not target_date:
|
|
raise ValueError("Must provide date")
|
|
|
|
try:
|
|
reddit = get_reddit_client()
|
|
|
|
curr_dt = datetime.strptime(target_date, "%Y-%m-%d")
|
|
start_dt = curr_dt - timedelta(days=look_back_days)
|
|
|
|
# Subreddits for global news
|
|
subreddits = "financenews+finance+economics+stockmarket"
|
|
|
|
posts = []
|
|
subreddit = reddit.subreddit(subreddits)
|
|
|
|
# For global news, we just want top posts from the period
|
|
# We can use 'top' with time_filter, but 'week' is a fixed window.
|
|
# Better to iterate top of 'week' and filter by date.
|
|
|
|
for submission in subreddit.top(time_filter="week", limit=50):
|
|
post_date = datetime.fromtimestamp(submission.created_utc)
|
|
|
|
if start_dt <= post_date <= curr_dt + timedelta(days=1):
|
|
posts.append(
|
|
{
|
|
"title": submission.title,
|
|
"score": submission.score,
|
|
"date": post_date.strftime("%Y-%m-%d"),
|
|
"subreddit": submission.subreddit.display_name,
|
|
}
|
|
)
|
|
|
|
if not posts:
|
|
return f"No global news found on Reddit for the past {look_back_days} days."
|
|
|
|
# Format output
|
|
report = f"## Global News from Reddit (Last {look_back_days} days)\n\n"
|
|
|
|
posts.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
for post in posts[:limit]:
|
|
report += f"### [{post['subreddit']}] {post['title']} (Score: {post['score']})\n"
|
|
report += f"**Date:** {post['date']}\n\n"
|
|
|
|
return report
|
|
|
|
except Exception as e:
|
|
return f"Error fetching global Reddit news: {str(e)}"
|
|
|
|
|
|
def get_reddit_trending_tickers(
|
|
limit: Annotated[int, "Number of posts to retrieve"] = 10,
|
|
look_back_days: Annotated[int, "Number of days to look back"] = 3,
|
|
) -> str:
|
|
"""
|
|
Fetch trending discussions from Reddit (r/wallstreetbets, r/stocks, r/investing)
|
|
to be analyzed for trending tickers.
|
|
"""
|
|
try:
|
|
reddit = get_reddit_client()
|
|
|
|
# Subreddits to scan
|
|
subreddits = "wallstreetbets+stocks+investing+stockmarket"
|
|
subreddit = reddit.subreddit(subreddits)
|
|
|
|
posts = []
|
|
|
|
# Scan hot posts
|
|
for submission in subreddit.hot(limit=limit * 2): # Fetch more to filter by date
|
|
# Check date
|
|
post_date = datetime.fromtimestamp(submission.created_utc)
|
|
if (datetime.now() - post_date).days > look_back_days:
|
|
continue
|
|
|
|
# Fetch top comments
|
|
submission.comment_sort = "top"
|
|
submission.comments.replace_more(limit=0)
|
|
|
|
top_comments = []
|
|
for comment in submission.comments[:3]:
|
|
if hasattr(comment, "body"):
|
|
top_comments.append(f"- {comment.body[:200]}...")
|
|
|
|
posts.append(
|
|
{
|
|
"title": submission.title,
|
|
"score": submission.score,
|
|
"subreddit": submission.subreddit.display_name,
|
|
"text": (
|
|
submission.selftext[:500] + "..."
|
|
if len(submission.selftext) > 500
|
|
else submission.selftext
|
|
),
|
|
"comments": top_comments,
|
|
}
|
|
)
|
|
|
|
if len(posts) >= limit:
|
|
break
|
|
|
|
if not posts:
|
|
return "No trending discussions found."
|
|
|
|
# Format report for LLM
|
|
report = "## Trending Reddit Discussions\n\n"
|
|
for i, post in enumerate(posts, 1):
|
|
report += f"### {i}. [{post['subreddit']}] {post['title']} (Score: {post['score']})\n"
|
|
if post["text"]:
|
|
report += f"**Content:** {post['text']}\n"
|
|
if post["comments"]:
|
|
report += "**Top Comments:**\n" + "\n".join(post["comments"]) + "\n"
|
|
report += "\n---\n"
|
|
|
|
return report
|
|
|
|
except Exception as e:
|
|
return f"Error fetching trending tickers: {str(e)}"
|
|
|
|
|
|
def get_reddit_discussions(
|
|
symbol: Annotated[str, "Ticker symbol"],
|
|
from_date: Annotated[str, "Start date in yyyy-mm-dd format"],
|
|
to_date: Annotated[str, "End date in yyyy-mm-dd format"],
|
|
) -> str:
|
|
"""
|
|
Wrapper for get_reddit_news to match get_reddit_discussions registry signature.
|
|
"""
|
|
return get_reddit_news(ticker=symbol, start_date=from_date, end_date=to_date)
|
|
|
|
|
|
def get_reddit_undiscovered_dd(
|
|
lookback_hours: Annotated[int, "Hours to look back"] = 72,
|
|
scan_limit: Annotated[int, "Number of new posts to scan"] = 100,
|
|
top_n: Annotated[int, "Number of top DD posts to return"] = 10,
|
|
num_comments: Annotated[int, "Number of top comments to include"] = 10,
|
|
llm_evaluator=None, # Will be passed from discovery graph
|
|
as_list: bool = False,
|
|
) -> str | list:
|
|
"""
|
|
Find high-quality undiscovered DD using LLM evaluation.
|
|
|
|
LEADING INDICATOR: Deep research before it goes viral.
|
|
|
|
Strategy:
|
|
1. Scan NEW posts (not hot) from quality subreddits
|
|
2. Send ALL to LLM for quality evaluation (parallel)
|
|
3. LLM filters for: quality analysis, sound thesis, novel insights
|
|
4. Return top-scoring DD posts
|
|
|
|
Args:
|
|
lookback_hours: How far back to scan
|
|
scan_limit: Number of posts to scan
|
|
top_n: Number of top DD to return
|
|
llm_evaluator: LLM instance for evaluation
|
|
|
|
Returns:
|
|
Report of high-quality undiscovered DD
|
|
"""
|
|
try:
|
|
reddit = get_reddit_client()
|
|
|
|
subreddits = "stocks+investing+StockMarket+wallstreetbets+Superstonk+pennystocks"
|
|
subreddit = reddit.subreddit(subreddits)
|
|
cutoff_time = datetime.now() - timedelta(hours=lookback_hours)
|
|
|
|
# Collect ALL recent posts (minimal filtering)
|
|
candidate_posts = []
|
|
|
|
for submission in subreddit.new(limit=scan_limit):
|
|
post_date = datetime.fromtimestamp(submission.created_utc)
|
|
|
|
if post_date < cutoff_time:
|
|
continue
|
|
|
|
# Only filter: has text content
|
|
if not submission.selftext or len(submission.selftext) < 200:
|
|
continue
|
|
|
|
top_comments = []
|
|
if llm_evaluator:
|
|
# Get top comments for community validation
|
|
submission.comment_sort = "top"
|
|
submission.comments.replace_more(limit=0)
|
|
for comment in submission.comments[:num_comments]:
|
|
if hasattr(comment, "body") and hasattr(comment, "score"):
|
|
top_comments.append(
|
|
{
|
|
"body": comment.body[:1000], # Include more of each comment
|
|
"score": comment.score,
|
|
}
|
|
)
|
|
|
|
candidate_posts.append(
|
|
{
|
|
"title": submission.title,
|
|
"author": str(submission.author) if submission.author else "[deleted]",
|
|
"score": submission.score,
|
|
"num_comments": submission.num_comments,
|
|
"subreddit": submission.subreddit.display_name,
|
|
"flair": submission.link_flair_text or "None",
|
|
"date": post_date.strftime("%Y-%m-%d %H:%M"),
|
|
"url": f"https://reddit.com{submission.permalink}",
|
|
"text": submission.selftext[:1500], # First 1500 chars for LLM
|
|
"full_length": len(submission.selftext),
|
|
"hours_ago": int((datetime.now() - post_date).total_seconds() / 3600),
|
|
"top_comments": top_comments,
|
|
}
|
|
)
|
|
|
|
if not candidate_posts:
|
|
return f"# Undiscovered DD\n\nNo posts found in last {lookback_hours}h."
|
|
|
|
logger.info(f"Scanning {len(candidate_posts)} Reddit posts with LLM...")
|
|
|
|
# LLM evaluation (parallel)
|
|
if llm_evaluator:
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from typing import List
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
# Define structured output schema
|
|
class DDEvaluation(BaseModel):
|
|
score: int = Field(description="Quality score 0-100")
|
|
reason: str = Field(description="Brief reasoning for the score")
|
|
tickers: List[str] = Field(
|
|
default_factory=list,
|
|
description="List of stock ticker symbols mentioned (empty list if none)",
|
|
)
|
|
|
|
# Configure LLM for Reddit content (adjust safety settings if using Gemini)
|
|
try:
|
|
# Check if using Google Gemini and configure safety settings
|
|
if (
|
|
hasattr(llm_evaluator, "model_name")
|
|
and "gemini" in llm_evaluator.model_name.lower()
|
|
):
|
|
from langchain_google_genai import HarmBlockThreshold, HarmCategory
|
|
|
|
# More permissive safety settings for financial content analysis
|
|
llm_evaluator.safety_settings = {
|
|
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
|
|
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
|
|
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
|
|
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
|
|
}
|
|
logger.info(
|
|
"⚙️ Configured Gemini with permissive safety settings for financial content"
|
|
)
|
|
except Exception as e:
|
|
logger.warning(f"Could not configure safety settings: {e}")
|
|
|
|
# Create structured LLM
|
|
structured_llm = llm_evaluator.with_structured_output(DDEvaluation)
|
|
|
|
def evaluate_post(post):
|
|
try:
|
|
# Build prompt with comments if available
|
|
comments_section = ""
|
|
if post.get("top_comments") and len(post["top_comments"]) > 0:
|
|
comments_section = "\n\nTop Community Comments (for validation):\n"
|
|
for i, comment in enumerate(post["top_comments"], 1):
|
|
comments_section += (
|
|
f"{i}. [{comment['score']} upvotes] {comment['body']}\n"
|
|
)
|
|
|
|
prompt = f"""Evaluate this Reddit post for investment Due Diligence quality.
|
|
|
|
Title: {post['title']}
|
|
Subreddit: r/{post['subreddit']}
|
|
Upvotes: {post['score']} | Comments: {post['num_comments']}
|
|
|
|
Content:
|
|
{post['text']}{comments_section}
|
|
|
|
Score 0-100 based on:
|
|
- Quality analysis (financial data, metrics, industry research)
|
|
- Sound thesis (logical, not just hype/speculation)
|
|
- Novel insights (unique perspective vs rehashing news)
|
|
- Risk awareness (mentions downsides, realistic)
|
|
- Actionable (identifies specific ticker/opportunity)
|
|
- Community validation (do top comments support or debunk the thesis?)
|
|
|
|
Extract all stock ticker symbols mentioned in the post or comments."""
|
|
|
|
result = structured_llm.invoke(prompt)
|
|
|
|
# Handle None result (Gemini blocked content despite safety settings)
|
|
if result is None:
|
|
logger.warning(
|
|
f"⚠️ Content blocked for '{post['title'][:50]}...' - Skipping"
|
|
)
|
|
post["quality_score"] = 0
|
|
post["quality_reason"] = (
|
|
"Content blocked by LLM safety filter. "
|
|
"Consider using OpenAI/Anthropic for Reddit content."
|
|
)
|
|
post["tickers"] = []
|
|
return post
|
|
|
|
# Extract values from structured response
|
|
post["quality_score"] = result.score
|
|
post["quality_reason"] = result.reason
|
|
post["tickers"] = result.tickers # Now a list
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error evaluating '{post['title'][:50]}': {str(e)}")
|
|
post["quality_score"] = 0
|
|
post["quality_reason"] = f"Error: {str(e)}"
|
|
post["tickers"] = []
|
|
|
|
return post
|
|
|
|
# Parallel evaluation
|
|
logger.info(f"Scanning {len(candidate_posts)} Reddit posts with LLM...")
|
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
|
futures = [executor.submit(evaluate_post, post) for post in candidate_posts]
|
|
evaluated = [f.result() for f in as_completed(futures)]
|
|
|
|
# Filter quality threshold (55+ = decent DD)
|
|
quality_dd = [p for p in evaluated if p["quality_score"] >= 55]
|
|
quality_dd.sort(key=lambda x: x["quality_score"], reverse=True)
|
|
|
|
# Debug: show score distribution
|
|
all_scores = [p["quality_score"] for p in evaluated if p["quality_score"] > 0]
|
|
if all_scores:
|
|
avg_score = sum(all_scores) / len(all_scores)
|
|
max_score = max(all_scores)
|
|
logger.info(
|
|
f"Score distribution: avg={avg_score:.1f}, max={max_score}, quality_posts={len(quality_dd)}"
|
|
)
|
|
|
|
top_dd = quality_dd[:top_n]
|
|
|
|
else:
|
|
# No LLM - sort by length + engagement
|
|
candidate_posts.sort(key=lambda x: x["full_length"] + (x["score"] * 10), reverse=True)
|
|
top_dd = candidate_posts[:top_n]
|
|
|
|
if as_list:
|
|
if not llm_evaluator:
|
|
import re
|
|
|
|
ticker_pattern = r"\$([A-Z]{2,5})\b|^([A-Z]{2,5})\s"
|
|
for post in top_dd:
|
|
matches = re.findall(ticker_pattern, post["title"] + " " + post["text"])
|
|
tickers = list(set([t[0] or t[1] for t in matches if t[0] or t[1]]))
|
|
post["ticker"] = tickers[0] if tickers else ""
|
|
post["quality_score"] = 75 # default to Medium priority
|
|
return top_dd
|
|
|
|
if not top_dd:
|
|
return f"# Undiscovered DD\n\nNo high-quality DD found (scanned {len(candidate_posts)} posts)."
|
|
|
|
# Build report
|
|
report = "# 💎 Undiscovered DD (LLM-Filtered Quality)\n\n"
|
|
report += f"**Scanned:** {len(candidate_posts)} posts\n"
|
|
report += f"**High Quality:** {len(top_dd)} DD posts (score ≥60)\n\n"
|
|
|
|
for i, post in enumerate(top_dd, 1):
|
|
report += f"## {i}. {post['title']}\n\n"
|
|
|
|
if "quality_score" in post:
|
|
report += f"**Quality:** {post['quality_score']}/100 - {post['quality_reason']}\n"
|
|
if post.get("tickers") and len(post["tickers"]) > 0:
|
|
tickers_str = ", ".join([f"${t}" for t in post["tickers"]])
|
|
report += f"**Tickers:** {tickers_str}\n"
|
|
|
|
report += f"**r/{post['subreddit']}** | {post['hours_ago']}h ago | "
|
|
report += f"{post['score']} ⬆ {post['num_comments']} 💬\n\n"
|
|
|
|
report += f"{post['text'][:600]}...\n\n"
|
|
report += f"[Read Full DD]({post['url']})\n\n---\n\n"
|
|
|
|
return report
|
|
|
|
except Exception as e:
|
|
import traceback
|
|
|
|
return f"# Undiscovered DD\n\nError: {str(e)}\n{traceback.format_exc()}"
|