{ "requirements": { "entities": { "SocialPost": "Core domain entity for Reddit posts with sentiment and engagement data", "SocialMediaPostEntity": "New SQLAlchemy entity for PostgreSQL storage with vector embeddings" }, "data_persistence": { "migration_required": "File-based JSON storage to PostgreSQL + TimescaleDB + pgvectorscale", "schema": "social_media_posts table with vector embeddings, sentiment fields, and TimescaleDB optimization", "deduplication": "Reddit post_id unique constraint prevents duplicates" }, "api_needed": { "external_apis": [ "PRAW (Python Reddit API Wrapper) for Reddit data collection", "OpenRouter API for LLM sentiment analysis and embeddings" ], "internal_apis": [ "AgentToolkit methods: get_reddit_news, get_reddit_stock_info", "SocialMediaService orchestration methods", "SocialRepository PostgreSQL operations" ] }, "components": { "reddit_client": "Complete PRAW implementation (currently empty stub)", "repository": "PostgreSQL migration from file storage", "service": "Business logic with LLM integration", "agent_toolkit": "RAG methods for AI agents", "dagster_pipeline": "Scheduled daily collection" }, "domains": { "primary": "socialmedia (complete greenfield implementation)", "integration": "Follows news domain patterns for consistency" }, "business_rules": [ "Daily collection from financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis)", "OpenRouter LLM sentiment analysis with structured scoring", "Vector embeddings for semantic similarity search", "Post deduplication by Reddit post_id", "90-day data retention policy", "Rate limiting compliance with Reddit API", "Best effort processing for API failures" ] }, "technical_needs": { "domain_model": { "entities": { "SocialPost": { "purpose": "Domain entity managing business rules and data transformations", "responsibilities": [ "fromRequest() - Create from Reddit API response", "toRecord() - Transform for PostgreSQL storage", "toResponse() - Format for agent consumption", 
"validate() - Business rule validation", "calculateSentiment() - Derived sentiment scoring", "extractTickers() - Ticker symbol detection" ], "fields": [ "post_id: str (Reddit unique ID)", "title: str", "content: str", "author: str", "subreddit: str", "created_utc: datetime", "upvotes: int", "downvotes: int", "comments_count: int", "url: str", "sentiment_score: float", "sentiment_label: str", "tickers: List[str]", "embedding: Optional[List[float]]" ] }, "SocialMediaPostEntity": { "purpose": "SQLAlchemy entity for PostgreSQL persistence", "table": "social_media_posts", "hypertable": "TimescaleDB partitioned by created_utc", "indexes": [ "post_id (unique)", "subreddit, created_utc", "tickers (GIN array)", "embedding (pgvectorscale HNSW)" ] } } }, "persistence": { "database_type": "PostgreSQL + TimescaleDB + pgvectorscale", "schema_design": { "table": "social_media_posts", "columns": [ "id: UUID PRIMARY KEY", "post_id: VARCHAR(50) UNIQUE NOT NULL", "title: TEXT", "content: TEXT", "author: VARCHAR(100)", "subreddit: VARCHAR(50)", "created_utc: TIMESTAMPTZ (hypertable partition key)", "upvotes: INTEGER", "downvotes: INTEGER", "comments_count: INTEGER", "url: TEXT", "sentiment_score: FLOAT", "sentiment_label: VARCHAR(20)", "tickers: TEXT[] (array)", "embedding: VECTOR(1536) (pgvectorscale)", "inserted_at: TIMESTAMPTZ DEFAULT NOW()", "updated_at: TIMESTAMPTZ DEFAULT NOW()" ], "constraints": [ "UNIQUE(post_id)", "CHECK(sentiment_score BETWEEN -1 AND 1)" ] }, "access_patterns": [ "Ticker-based queries: SELECT * WHERE 'AAPL' = ANY(tickers)", "Time-range filtering: SELECT * WHERE created_utc BETWEEN ? AND ?", "Vector similarity: SELECT * ORDER BY embedding <=> ? 
LIMIT 10", "Sentiment aggregations: SELECT AVG(sentiment_score) GROUP BY subreddit" ], "data_volume": "~400+ posts daily, 90-day retention = ~36K posts max" }, "router": { "type": "AgentToolkit Integration (No HTTP Router)", "methods": [ "get_reddit_news(ticker: str, days: int) -> List[SocialPost]", "get_reddit_stock_info(ticker: str) -> Dict", "search_similar_posts(query: str, limit: int) -> List[SocialPost]", "get_subreddit_sentiment(subreddit: str, ticker: str) -> SentimentSummary" ], "dependencies": [ "SocialMediaService for business orchestration", "Entity transformations: SocialPost.toResponse()" ] }, "events": { "domain_events": [ "SocialPostCollected: Published when new posts are scraped", "SentimentAnalyzed: Published after LLM sentiment analysis", "EmbeddingGenerated: Published after vector embedding creation" ], "integration_events": [ "MarketDataRequested: Subscribe to ticker validation events", "TradingDecisionMade: Consume for social sentiment correlation" ] }, "dependencies": { "external_services": [ "Reddit API (PRAW): Post collection and metadata", "OpenRouter API: Sentiment analysis and embeddings", "PostgreSQL: Data persistence and queries", "TimescaleDB: Time-series optimization", "pgvectorscale: Vector similarity search" ], "internal_services": [ "None (greenfield implementation)" ], "required_by": [ "AI agents: Social sentiment context for trading decisions", "Multi-agent workflows: RAG-powered social media analysis", "Risk management: Social sentiment risk factors" ], "component_order": [ "1. SocialMediaPostEntity (database schema)", "2. SocialPost (domain entity with transformations)", "3. RedditClient (PRAW implementation)", "4. SocialRepository (PostgreSQL operations)", "5. SocialMediaService (business orchestration + LLM)", "6. AgentToolkit methods (RAG integration)", "7. 
Dagster pipeline (scheduled collection)" ] } }, "design": { "architecture_overview": { "pattern": "Event-driven microservice with layered internal architecture", "data_flow": "Dagster Pipeline → RedditClient → SocialMediaService → SocialRepository → PostgreSQL + pgvectorscale", "agent_flow": "AgentToolkit → SocialMediaService → SocialRepository → Vector Similarity Search + Sentiment Aggregation", "key_principles": [ "Leverage news domain patterns for consistency", "OpenRouter unified LLM provider", "Best-effort processing for API failures", "Vector-enhanced semantic search", "Rate limiting compliance with Reddit API", "Complete greenfield implementation from empty stubs" ] }, "domain_model": { "SentimentScore": { "purpose": "Structured sentiment analysis result from OpenRouter LLM", "fields": { "sentiment": "Literal['positive', 'negative', 'neutral']", "confidence": "float (0.0-1.0)", "reasoning": "str (brief explanation)" }, "validation": [ "confidence >= 0.5 for reliable sentiment", "reasoning must be non-empty" ] }, "SocialPost": { "purpose": "Core domain entity with business rules and transformations", "base_fields": { "post_id": "str (Reddit unique ID, e.g., 't3_abc123')", "title": "str", "content": "Optional[str] (selftext for text posts)", "author": "str", "subreddit": "str", "created_utc": "datetime", "upvotes": "int (score)", "downvotes": "int (calculated from score + upvote_ratio)", "comments_count": "int (num_comments)", "url": "str (permalink or external URL)" }, "enhanced_fields": { "sentiment_score": "Optional[SentimentScore]", "tickers": "List[str] (extracted ticker symbols)", "title_embedding": "Optional[List[float]] (1536 dimensions)", "content_embedding": "Optional[List[float]] (1536 dimensions)" }, "methods": { "from_praw_submission": "Create from PRAW Submission object", "to_entity": "Transform to SocialMediaPostEntity for database storage", "from_entity": "Create from database entity", "validate": "Business rule validation", "extract_tickers": 
"Extract stock symbols from title and content", "has_reliable_sentiment": "Check if sentiment confidence >= 0.5", "to_response": "Format for agent consumption" }, "validation_rules": [ "post_id must match Reddit format (starts with 't3_')", "title cannot be empty", "created_utc cannot be in future", "sentiment_score confidence must be 0.0-1.0", "embeddings must be 1536 dimensions if present", "subreddit must be in allowed financial subreddits" ] }, "SocialJobConfig": { "purpose": "Configuration for scheduled Reddit collection", "fields": { "subreddits": "List[str] (financial subreddits to monitor)", "schedule_times": "List[str] (cron expressions for collection)", "sentiment_model": "str (OpenRouter model for sentiment)", "embedding_model": "str (OpenRouter model for embeddings)", "max_posts_per_subreddit": "int (limit per collection run)", "lookback_hours": "int (how far back to collect)", "min_score": "int (minimum upvotes threshold)", "rate_limit_delay": "float (seconds between API calls)" }, "defaults": { "subreddits": "['wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis']", "schedule_times": "['0 6 * * *', '0 18 * * *']", "sentiment_model": "anthropic/claude-3.5-haiku", "embedding_model": "text-embedding-3-large", "max_posts_per_subreddit": 50, "lookback_hours": 12, "min_score": 10, "rate_limit_delay": 1.0 } } }, "data_persistence": { "database_schema": { "table_definition": "CREATE TABLE social_media_posts (\n id UUID PRIMARY KEY DEFAULT uuid7(),\n post_id VARCHAR(50) UNIQUE NOT NULL,\n title TEXT NOT NULL,\n content TEXT,\n author VARCHAR(100) NOT NULL,\n subreddit VARCHAR(50) NOT NULL,\n created_utc TIMESTAMPTZ NOT NULL,\n upvotes INTEGER NOT NULL DEFAULT 0,\n downvotes INTEGER NOT NULL DEFAULT 0,\n comments_count INTEGER NOT NULL DEFAULT 0,\n url TEXT NOT NULL,\n sentiment_score JSONB,\n sentiment_label VARCHAR(20),\n tickers TEXT[] DEFAULT '{}',\n title_embedding VECTOR(1536),\n content_embedding VECTOR(1536),\n inserted_at TIMESTAMPTZ DEFAULT 
NOW(),\n  updated_at TIMESTAMPTZ DEFAULT NOW()\n);", "hypertable": "SELECT create_hypertable('social_media_posts', 'created_utc', chunk_time_interval => INTERVAL '1 day');", "indexes": [ "CREATE UNIQUE INDEX idx_social_posts_post_id ON social_media_posts (post_id);", "CREATE INDEX idx_social_posts_subreddit_time ON social_media_posts (subreddit, created_utc DESC);", "CREATE INDEX idx_social_posts_tickers_gin ON social_media_posts USING GIN (tickers);", "CREATE INDEX idx_social_posts_title_embedding ON social_media_posts USING hnsw (title_embedding vector_cosine_ops);", "CREATE INDEX idx_social_posts_content_embedding ON social_media_posts USING hnsw (content_embedding vector_cosine_ops);", "CREATE INDEX idx_social_posts_sentiment ON social_media_posts (((sentiment_score->>'sentiment'))) WHERE sentiment_score IS NOT NULL;" ], "constraints": [ "ALTER TABLE social_media_posts ADD CONSTRAINT chk_sentiment_score CHECK (sentiment_score IS NULL OR ((sentiment_score->>'confidence')::float BETWEEN 0 AND 1));", "ALTER TABLE social_media_posts ADD CONSTRAINT chk_created_utc CHECK (created_utc <= NOW());" ] }, "repository_methods": { "find_by_ticker": "async def find_by_ticker(self, ticker: str, days: int = 30, limit: int = 50) -> List[SocialPost]", "find_by_subreddit": "async def find_by_subreddit(self, subreddit: str, hours: int = 24, limit: int = 100) -> List[SocialPost]", "find_similar_posts": "async def find_similar_posts(self, query_embedding: List[float], ticker: Optional[str] = None, limit: int = 10) -> List[SocialPost]", "get_sentiment_summary": "async def get_sentiment_summary(self, ticker: str, subreddit: Optional[str] = None, hours: int = 24) -> Dict[str, Any]", "upsert_batch": "async def upsert_batch(self, posts: List[SocialPost]) -> List[SocialPost]", "cleanup_old_posts": "async def cleanup_old_posts(self, days: int = 90) -> int" }, "query_optimizations": [ "TimescaleDB hypertables for time-based partitioning", "pgvectorscale HNSW indexes for fast vector 
similarity", "GIN indexes for ticker array queries", "Composite indexes for common access patterns", "Materialized views for sentiment aggregations" ] }, "api_specification": { "reddit_client": { "class": "RedditClient", "purpose": "PRAW wrapper with rate limiting and error handling", "configuration": { "client_id": "Reddit app client ID", "client_secret": "Reddit app client secret", "user_agent": "TradingAgents/1.0 by /u/tradingagents", "rate_limit": "1 request per second", "timeout": "30 seconds per request" }, "methods": { "fetch_subreddit_posts": "async def fetch_subreddit_posts(self, subreddit: str, limit: int = 50, time_filter: str = 'day') -> List[Dict[str, Any]]", "search_posts": "async def search_posts(self, query: str, subreddit: Optional[str] = None, limit: int = 25) -> List[Dict[str, Any]]", "get_post_details": "async def get_post_details(self, post_id: str) -> Optional[Dict[str, Any]]" }, "error_handling": [ "Rate limit exceeded: Exponential backoff", "Authentication errors: Log and continue with next subreddit", "Network timeouts: Retry up to 3 times", "Invalid subreddit: Skip and log warning" ] }, "openrouter_client": { "reuse": "Leverage existing OpenRouterClient from news domain", "enhancements": [ "Social media specific prompts for sentiment analysis", "Batch processing for Reddit post embeddings", "Optimized token usage for short social media text" ], "sentiment_prompt": "Analyze this Reddit post about stocks/finance. Consider the informal language, memes, and community context. 
Respond with JSON: {\"sentiment\": \"positive|negative|neutral\", \"confidence\": 0.0-1.0, \"reasoning\": \"brief explanation\"}" } }, "components": { "RedditClient": { "layer": "External API Integration", "responsibilities": [ "Authenticate with Reddit API using PRAW", "Fetch posts from financial subreddits", "Handle rate limiting and API errors", "Transform PRAW responses to standard format" ], "dependencies": [ "PRAW library", "Reddit API credentials", "Async HTTP client (httpx)" ], "error_handling": "Best-effort with graceful degradation" }, "SocialRepository": { "layer": "Data Access", "responsibilities": [ "PostgreSQL + TimescaleDB operations", "Vector similarity searches using pgvectorscale", "Batch upsert operations for performance", "Sentiment aggregation queries" ], "dependencies": [ "AsyncSession (SQLAlchemy)", "SocialMediaPostEntity", "Vector similarity functions" ], "performance_targets": [ "Batch upsert: <5s for 1000 posts", "Vector similarity: <1s for top 10 results", "Ticker queries: <100ms for 30-day range" ] }, "SocialMediaService": { "layer": "Business Logic", "responsibilities": [ "Orchestrate Reddit data collection", "Coordinate LLM sentiment analysis", "Generate vector embeddings", "Apply business rules and validation" ], "methods": { "collect_subreddit_posts": "async def collect_subreddit_posts(self, config: SocialJobConfig) -> int", "update_post_sentiment": "async def update_post_sentiment(self, posts: List[SocialPost]) -> List[SocialPost]", "generate_embeddings": "async def generate_embeddings(self, posts: List[SocialPost]) -> List[SocialPost]", "find_trending_tickers": "async def find_trending_tickers(self, hours: int = 24) -> List[Dict[str, Any]]" }, "integration_patterns": [ "OpenRouter for sentiment and embeddings", "Repository for data persistence", "Event publishing for domain events" ] }, "AgentToolkit": { "layer": "Agent Integration", "responsibilities": [ "Provide RAG methods for AI agents", "Format social data for agent 
consumption", "Semantic search for relevant posts", "Sentiment aggregation and analysis" ], "methods": { "get_reddit_sentiment": "async def get_reddit_sentiment(self, ticker: str, days: int = 7) -> Dict[str, Any]", "search_social_posts": "async def search_social_posts(self, query: str, ticker: Optional[str] = None) -> List[Dict[str, Any]]", "get_trending_discussions": "async def get_trending_discussions(self, ticker: str) -> List[Dict[str, Any]]", "get_subreddit_analysis": "async def get_subreddit_analysis(self, subreddit: str, ticker: str) -> Dict[str, Any]" }, "response_format": [ "Structured JSON with post content, metadata, and sentiment", "Data quality indicators", "Source attribution and confidence scores" ] } }, "events": { "domain_events": { "SocialPostCollected": { "trigger": "New Reddit post successfully stored", "payload": { "post_id": "str", "subreddit": "str", "tickers": "List[str]", "created_utc": "datetime", "collection_timestamp": "datetime" } }, "SentimentAnalyzed": { "trigger": "LLM sentiment analysis completed", "payload": { "post_id": "str", "sentiment": "str", "confidence": "float", "processing_time": "float" } }, "EmbeddingGenerated": { "trigger": "Vector embedding created and stored", "payload": { "post_id": "str", "embedding_type": "str (title|content)", "dimensions": "int", "model_used": "str" } } }, "integration_events": { "MarketDataRequested": { "purpose": "Validate ticker symbols against market data", "consumption": "Subscribe to ensure social posts reference valid tickers" }, "TradingDecisionRequested": { "purpose": "Provide social sentiment context for trading decisions", "consumption": "Publish social sentiment summaries when trading decisions are being made" } } }, "dependencies": { "external_dependencies": { "Reddit API": { "library": "PRAW (Python Reddit API Wrapper)", "authentication": "OAuth2 with client credentials", "rate_limits": "60 requests per minute per OAuth client", "required_credentials": ["client_id", "client_secret", 
"user_agent"] }, "OpenRouter API": { "reuse": "Existing OpenRouterClient from news domain", "models": { "sentiment": "anthropic/claude-3.5-haiku", "embeddings": "text-embedding-3-large" }, "cost_optimization": "Batch requests and token-efficient prompts" }, "PostgreSQL Stack": { "database": "PostgreSQL 16+", "extensions": ["TimescaleDB", "pgvector", "pgvectorscale", "uuid-ossp"], "connection": "AsyncSession with asyncpg driver" } }, "internal_dependencies": { "news_domain": "Reference implementation patterns for consistency", "config_management": "TradingAgentsConfig for unified configuration", "database_manager": "Shared DatabaseManager and session handling" }, "implementation_order": [ "1. Database migration: Create social_media_posts table with TimescaleDB and vector support", "2. SocialMediaPostEntity: SQLAlchemy entity with proper field mappings", "3. SocialPost: Domain entity with validation and transformation methods", "4. RedditClient: PRAW integration with rate limiting and error handling", "5. SocialRepository: Database operations with vector similarity search", "6. SocialMediaService: Business logic orchestration with LLM integration", "7. AgentToolkit integration: RAG methods for AI agent consumption", "8. 
Dagster pipeline: Scheduled collection and processing" ] }, "implementation_guidance": { "database_setup": { "migration_script": [ "Create social_media_posts table with all columns", "Add TimescaleDB hypertable partitioning on created_utc", "Create all indexes including vector similarity indexes", "Add constraints for data validation", "Set up retention policy for 90-day data cleanup" ], "seed_data": "Optional test data with sample Reddit posts for development" }, "reddit_integration": { "praw_setup": [ "Create Reddit app at https://www.reddit.com/prefs/apps/", "Configure OAuth2 credentials in environment variables", "Implement rate limiting to respect API limits", "Handle subreddit access and content filtering" ], "data_collection_strategy": [ "Focus on financial subreddits: wallstreetbets, investing, stocks, SecurityAnalysis", "Collect hot/trending posts twice daily (6 AM, 6 PM UTC)", "Filter by minimum score threshold (10+ upvotes)", "Extract ticker symbols from post titles and content", "Deduplicate by Reddit post_id" ] }, "llm_integration": { "sentiment_analysis": [ "Use OpenRouter with anthropic/claude-3.5-haiku for cost efficiency", "Social media-specific prompts accounting for informal language and memes", "Structured JSON output with sentiment, confidence, and reasoning", "Best-effort processing: store posts even if sentiment analysis fails" ], "embeddings": [ "Use text-embedding-3-large with the dimensions=1536 request parameter (its default output is 3072 dimensions) so vectors match the VECTOR(1536) schema", "Batch process for efficiency", "Generate embeddings for both title and content when available", "Store NULL for failed embedding generation" ] }, "testing_strategy": { "unit_tests": [ "Entity validation and transformation methods", "Reddit client with mocked PRAW responses", "Repository operations with test database", "Service orchestration with mocked dependencies" ], "integration_tests": [ "End-to-end collection pipeline", "Vector similarity search with real pgvectorscale", "LLM integration with pytest-vcr cassettes", "Dagster pipeline 
execution" ], "performance_tests": [ "Vector similarity query performance (<1s for top 10)", "Batch upsert performance (<5s for 1000 posts)", "Memory usage during large collection runs" ] }, "monitoring_and_observability": { "metrics": [ "Posts collected per subreddit per day", "Sentiment analysis success rate", "Embedding generation success rate", "Vector similarity query performance", "Reddit API rate limit utilization" ], "logging": [ "Collection job start/completion with statistics", "API errors and retry attempts", "Data quality issues and validation failures", "Performance metrics for optimization" ], "alerts": [ "Collection job failures", "Reddit API authentication issues", "High error rates in LLM processing", "Database connection problems" ] } } } }