TradingAgents/docs/specs/socialmedia/design.json

567 lines
25 KiB
JSON

{
"requirements": {
"entities": {
"SocialPost": "Core domain entity for Reddit posts with sentiment and engagement data",
"SocialMediaPostEntity": "New SQLAlchemy entity for PostgreSQL storage with vector embeddings"
},
"data_persistence": {
"migration_required": "File-based JSON storage to PostgreSQL + TimescaleDB + pgvectorscale",
"schema": "social_media_posts table with vector embeddings, sentiment fields, and TimescaleDB optimization",
"deduplication": "Reddit post_id unique constraint prevents duplicates"
},
"api_needed": {
"external_apis": [
"PRAW (Python Reddit API Wrapper) for Reddit data collection",
"OpenRouter API for LLM sentiment analysis and embeddings"
],
"internal_apis": [
"AgentToolkit methods: get_reddit_news, get_reddit_stock_info",
"SocialMediaService orchestration methods",
"SocialRepository PostgreSQL operations"
]
},
"components": {
"reddit_client": "Complete PRAW implementation (currently empty stub)",
"repository": "PostgreSQL migration from file storage",
"service": "Business logic with LLM integration",
"agent_toolkit": "RAG methods for AI agents",
"dagster_pipeline": "Scheduled daily collection"
},
"domains": {
"primary": "socialmedia (complete greenfield implementation)",
"integration": "Follows news domain patterns for consistency"
},
"business_rules": [
"Daily collection from financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis)",
"OpenRouter LLM sentiment analysis with structured scoring",
"Vector embeddings for semantic similarity search",
"Post deduplication by Reddit post_id",
"90-day data retention policy",
"Rate limiting compliance with Reddit API",
"Best effort processing for API failures"
]
},
"technical_needs": {
"domain_model": {
"entities": {
"SocialPost": {
"purpose": "Domain entity managing business rules and data transformations",
"responsibilities": [
"fromRequest() - Create from Reddit API response",
"toRecord() - Transform for PostgreSQL storage",
"toResponse() - Format for agent consumption",
"validate() - Business rule validation",
"calculateSentiment() - Derived sentiment scoring",
"extractTickers() - Ticker symbol detection"
],
"fields": [
"post_id: str (Reddit unique ID)",
"title: str",
"content: str",
"author: str",
"subreddit: str",
"created_utc: datetime",
"upvotes: int",
"downvotes: int",
"comments_count: int",
"url: str",
"sentiment_score: float",
"sentiment_label: str",
"tickers: List[str]",
"embedding: Optional[List[float]]"
]
},
"SocialMediaPostEntity": {
"purpose": "SQLAlchemy entity for PostgreSQL persistence",
"table": "social_media_posts",
"hypertable": "TimescaleDB partitioned by created_utc",
"indexes": [
"post_id (unique)",
"subreddit, created_utc",
"tickers (GIN array)",
"embedding (pgvectorscale HNSW)"
]
}
}
},
"persistence": {
"database_type": "PostgreSQL + TimescaleDB + pgvectorscale",
"schema_design": {
"table": "social_media_posts",
"columns": [
"id: UUID PRIMARY KEY",
"post_id: VARCHAR(50) UNIQUE NOT NULL",
"title: TEXT",
"content: TEXT",
"author: VARCHAR(100)",
"subreddit: VARCHAR(50)",
"created_utc: TIMESTAMPTZ (hypertable partition key)",
"upvotes: INTEGER",
"downvotes: INTEGER",
"comments_count: INTEGER",
"url: TEXT",
"sentiment_score: FLOAT",
"sentiment_label: VARCHAR(20)",
"tickers: TEXT[] (array)",
"embedding: VECTOR(1536) (pgvectorscale)",
"inserted_at: TIMESTAMPTZ DEFAULT NOW()",
"updated_at: TIMESTAMPTZ DEFAULT NOW()"
],
"constraints": [
"UNIQUE(post_id)",
"CHECK(sentiment_score BETWEEN -1 AND 1)"
]
},
"access_patterns": [
"Ticker-based queries: SELECT * WHERE 'AAPL' = ANY(tickers)",
"Time-range filtering: SELECT * WHERE created_utc BETWEEN ? AND ?",
"Vector similarity: SELECT * ORDER BY embedding <=> ? LIMIT 10",
"Sentiment aggregations: SELECT AVG(sentiment_score) GROUP BY subreddit"
],
"data_volume": "~400+ posts daily, 90-day retention = ~36K posts max"
},
"router": {
"type": "AgentToolkit Integration (No HTTP Router)",
"methods": [
"get_reddit_news(ticker: str, days: int) -> List[SocialPost]",
"get_reddit_stock_info(ticker: str) -> Dict",
"search_similar_posts(query: str, limit: int) -> List[SocialPost]",
"get_subreddit_sentiment(subreddit: str, ticker: str) -> SentimentSummary"
],
"dependencies": [
"SocialMediaService for business orchestration",
"Entity transformations: SocialPost.toResponse()"
]
},
"events": {
"domain_events": [
"SocialPostCollected: Published when new posts are scraped",
"SentimentAnalyzed: Published after LLM sentiment analysis",
"EmbeddingGenerated: Published after vector embedding creation"
],
"integration_events": [
"MarketDataRequested: Subscribe to ticker validation events",
"TradingDecisionMade: Consume for social sentiment correlation"
]
},
"dependencies": {
"external_services": [
"Reddit API (PRAW): Post collection and metadata",
"OpenRouter API: Sentiment analysis and embeddings",
"PostgreSQL: Data persistence and queries",
"TimescaleDB: Time-series optimization",
"pgvectorscale: Vector similarity search"
],
"internal_services": [
"None (greenfield implementation)"
],
"required_by": [
"AI agents: Social sentiment context for trading decisions",
"Multi-agent workflows: RAG-powered social media analysis",
"Risk management: Social sentiment risk factors"
],
"component_order": [
"1. SocialMediaPostEntity (database schema)",
"2. SocialPost (domain entity with transformations)",
"3. RedditClient (PRAW implementation)",
"4. SocialRepository (PostgreSQL operations)",
"5. SocialMediaService (business orchestration + LLM)",
"6. AgentToolkit methods (RAG integration)",
"7. Dagster pipeline (scheduled collection)"
]
}
},
"design": {
"architecture_overview": {
"pattern": "Event-driven microservice with layered internal architecture",
"data_flow": "Dagster Pipeline → RedditClient → SocialMediaService → SocialRepository → PostgreSQL + pgvectorscale",
"agent_flow": "AgentToolkit → SocialMediaService → SocialRepository → Vector Similarity Search + Sentiment Aggregation",
"key_principles": [
"Leverage news domain patterns for consistency",
"OpenRouter unified LLM provider",
"Best-effort processing for API failures",
"Vector-enhanced semantic search",
"Rate limiting compliance with Reddit API",
"Complete greenfield implementation from empty stubs"
]
},
"domain_model": {
"SentimentScore": {
"purpose": "Structured sentiment analysis result from OpenRouter LLM",
"fields": {
"sentiment": "Literal['positive', 'negative', 'neutral']",
"confidence": "float (0.0-1.0)",
"reasoning": "str (brief explanation)"
},
"validation": [
"confidence >= 0.5 for reliable sentiment",
"reasoning must be non-empty"
]
},
"SocialPost": {
"purpose": "Core domain entity with business rules and transformations",
"base_fields": {
"post_id": "str (Reddit unique ID, e.g., 't3_abc123')",
"title": "str",
"content": "Optional[str] (selftext for text posts)",
"author": "str",
"subreddit": "str",
"created_utc": "datetime",
"upvotes": "int (score)",
"downvotes": "int (calculated from score + upvote_ratio)",
"comments_count": "int (num_comments)",
"url": "str (permalink or external URL)"
},
"enhanced_fields": {
"sentiment_score": "Optional[SentimentScore]",
"tickers": "List[str] (extracted ticker symbols)",
"title_embedding": "Optional[List[float]] (1536 dimensions)",
"content_embedding": "Optional[List[float]] (1536 dimensions)"
},
"methods": {
"from_praw_submission": "Create from PRAW Submission object",
"to_entity": "Transform to SocialMediaPostEntity for database storage",
"from_entity": "Create from database entity",
"validate": "Business rule validation",
"extract_tickers": "Extract stock symbols from title and content",
"has_reliable_sentiment": "Check if sentiment confidence >= 0.5",
"to_response": "Format for agent consumption"
},
"validation_rules": [
"post_id must match Reddit format (starts with 't3_')",
"title cannot be empty",
"created_utc cannot be in future",
"sentiment_score confidence must be 0.0-1.0",
"embeddings must be 1536 dimensions if present",
"subreddit must be in allowed financial subreddits"
]
},
"SocialJobConfig": {
"purpose": "Configuration for scheduled Reddit collection",
"fields": {
"subreddits": "List[str] (financial subreddits to monitor)",
"schedule_times": "List[str] (cron expressions for collection)",
"sentiment_model": "str (OpenRouter model for sentiment)",
"embedding_model": "str (OpenRouter model for embeddings)",
"max_posts_per_subreddit": "int (limit per collection run)",
"lookback_hours": "int (how far back to collect)",
"min_score": "int (minimum upvotes threshold)",
"rate_limit_delay": "float (seconds between API calls)"
},
"defaults": {
"subreddits": "['wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis']",
"schedule_times": "['0 6 * * *', '0 18 * * *']",
"sentiment_model": "anthropic/claude-3.5-haiku",
"embedding_model": "text-embedding-3-large",
"max_posts_per_subreddit": 50,
"lookback_hours": 12,
"min_score": 10,
"rate_limit_delay": 1.0
}
}
},
"data_persistence": {
"database_schema": {
"table_definition": "CREATE TABLE social_media_posts (\n id UUID PRIMARY KEY DEFAULT uuid7(),\n post_id VARCHAR(50) UNIQUE NOT NULL,\n title TEXT NOT NULL,\n content TEXT,\n author VARCHAR(100) NOT NULL,\n subreddit VARCHAR(50) NOT NULL,\n created_utc TIMESTAMPTZ NOT NULL,\n upvotes INTEGER NOT NULL DEFAULT 0,\n downvotes INTEGER NOT NULL DEFAULT 0,\n comments_count INTEGER NOT NULL DEFAULT 0,\n url TEXT NOT NULL,\n sentiment_score JSONB,\n sentiment_label VARCHAR(20),\n tickers TEXT[] DEFAULT '{}',\n title_embedding VECTOR(1536),\n content_embedding VECTOR(1536),\n inserted_at TIMESTAMPTZ DEFAULT NOW(),\n updated_at TIMESTAMPTZ DEFAULT NOW()\n);",
"hypertable": "SELECT create_hypertable('social_media_posts', 'created_utc', chunk_time_interval => INTERVAL '1 day');",
"indexes": [
"CREATE UNIQUE INDEX idx_social_posts_post_id ON social_media_posts (post_id);",
"CREATE INDEX idx_social_posts_subreddit_time ON social_media_posts (subreddit, created_utc DESC);",
"CREATE INDEX idx_social_posts_tickers_gin ON social_media_posts USING GIN (tickers);",
"CREATE INDEX idx_social_posts_title_embedding ON social_media_posts USING vectors (title_embedding vector_cosine_ops);",
"CREATE INDEX idx_social_posts_content_embedding ON social_media_posts USING vectors (content_embedding vector_cosine_ops);",
"CREATE INDEX idx_social_posts_sentiment ON social_media_posts (((sentiment_score->>'sentiment'))) WHERE sentiment_score IS NOT NULL;"
],
"constraints": [
"ALTER TABLE social_media_posts ADD CONSTRAINT chk_sentiment_score CHECK (sentiment_score IS NULL OR ((sentiment_score->>'confidence')::float BETWEEN 0 AND 1));",
"ALTER TABLE social_media_posts ADD CONSTRAINT chk_created_utc CHECK (created_utc <= NOW());"
]
},
"repository_methods": {
"find_by_ticker": "async def find_by_ticker(self, ticker: str, days: int = 30, limit: int = 50) -> List[SocialPost]",
"find_by_subreddit": "async def find_by_subreddit(self, subreddit: str, hours: int = 24, limit: int = 100) -> List[SocialPost]",
"find_similar_posts": "async def find_similar_posts(self, query_embedding: List[float], ticker: Optional[str] = None, limit: int = 10) -> List[SocialPost]",
"get_sentiment_summary": "async def get_sentiment_summary(self, ticker: str, subreddit: Optional[str] = None, hours: int = 24) -> Dict[str, Any]",
"upsert_batch": "async def upsert_batch(self, posts: List[SocialPost]) -> List[SocialPost]",
"cleanup_old_posts": "async def cleanup_old_posts(self, days: int = 90) -> int"
},
"query_optimizations": [
"TimescaleDB hypertables for time-based partitioning",
"pgvectorscale HNSW indexes for fast vector similarity",
"GIN indexes for ticker array queries",
"Composite indexes for common access patterns",
"Materialized views for sentiment aggregations"
]
},
"api_specification": {
"reddit_client": {
"class": "RedditClient",
"purpose": "PRAW wrapper with rate limiting and error handling",
"configuration": {
"client_id": "Reddit app client ID",
"client_secret": "Reddit app client secret",
"user_agent": "TradingAgents/1.0 by /u/tradingagents",
"rate_limit": "1 request per second",
"timeout": "30 seconds per request"
},
"methods": {
"fetch_subreddit_posts": "async def fetch_subreddit_posts(self, subreddit: str, limit: int = 50, time_filter: str = 'day') -> List[Dict[str, Any]]",
"search_posts": "async def search_posts(self, query: str, subreddit: Optional[str] = None, limit: int = 25) -> List[Dict[str, Any]]",
"get_post_details": "async def get_post_details(self, post_id: str) -> Optional[Dict[str, Any]]"
},
"error_handling": [
"Rate limit exceeded: Exponential backoff",
"Authentication errors: Log and continue with next subreddit",
"Network timeouts: Retry up to 3 times",
"Invalid subreddit: Skip and log warning"
]
},
"openrouter_client": {
"reuse": "Leverage existing OpenRouterClient from news domain",
"enhancements": [
"Social media specific prompts for sentiment analysis",
"Batch processing for Reddit post embeddings",
"Optimized token usage for short social media text"
],
"sentiment_prompt": "Analyze this Reddit post about stocks/finance. Consider the informal language, memes, and community context. Respond with JSON: {\"sentiment\": \"positive|negative|neutral\", \"confidence\": 0.0-1.0, \"reasoning\": \"brief explanation\"}"
}
},
"components": {
"RedditClient": {
"layer": "External API Integration",
"responsibilities": [
"Authenticate with Reddit API using PRAW",
"Fetch posts from financial subreddits",
"Handle rate limiting and API errors",
"Transform PRAW responses to standard format"
],
"dependencies": [
"PRAW library",
"Reddit API credentials",
"Async HTTP client (httpx)"
],
"error_handling": "Best-effort with graceful degradation"
},
"SocialRepository": {
"layer": "Data Access",
"responsibilities": [
"PostgreSQL + TimescaleDB operations",
"Vector similarity searches using pgvectorscale",
"Batch upsert operations for performance",
"Sentiment aggregation queries"
],
"dependencies": [
"AsyncSession (SQLAlchemy)",
"SocialMediaPostEntity",
"Vector similarity functions"
],
"performance_targets": [
"Batch upsert: <5s for 1000 posts",
"Vector similarity: <1s for top 10 results",
"Ticker queries: <100ms for 30-day range"
]
},
"SocialMediaService": {
"layer": "Business Logic",
"responsibilities": [
"Orchestrate Reddit data collection",
"Coordinate LLM sentiment analysis",
"Generate vector embeddings",
"Apply business rules and validation"
],
"methods": {
"collect_subreddit_posts": "async def collect_subreddit_posts(self, config: SocialJobConfig) -> int",
"update_post_sentiment": "async def update_post_sentiment(self, posts: List[SocialPost]) -> List[SocialPost]",
"generate_embeddings": "async def generate_embeddings(self, posts: List[SocialPost]) -> List[SocialPost]",
"find_trending_tickers": "async def find_trending_tickers(self, hours: int = 24) -> List[Dict[str, Any]]"
},
"integration_patterns": [
"OpenRouter for sentiment and embeddings",
"Repository for data persistence",
"Event publishing for domain events"
]
},
"AgentToolkit": {
"layer": "Agent Integration",
"responsibilities": [
"Provide RAG methods for AI agents",
"Format social data for agent consumption",
"Semantic search for relevant posts",
"Sentiment aggregation and analysis"
],
"methods": {
"get_reddit_sentiment": "async def get_reddit_sentiment(self, ticker: str, days: int = 7) -> Dict[str, Any]",
"search_social_posts": "async def search_social_posts(self, query: str, ticker: Optional[str] = None) -> List[Dict[str, Any]]",
"get_trending_discussions": "async def get_trending_discussions(self, ticker: str) -> List[Dict[str, Any]]",
"get_subreddit_analysis": "async def get_subreddit_analysis(self, subreddit: str, ticker: str) -> Dict[str, Any]"
],
"response_format": [
"Structured JSON with post content, metadata, and sentiment",
"Data quality indicators",
"Source attribution and confidence scores"
]
}
},
"events": {
"domain_events": {
"SocialPostCollected": {
"trigger": "New Reddit post successfully stored",
"payload": {
"post_id": "str",
"subreddit": "str",
"tickers": "List[str]",
"created_utc": "datetime",
"collection_timestamp": "datetime"
}
},
"SentimentAnalyzed": {
"trigger": "LLM sentiment analysis completed",
"payload": {
"post_id": "str",
"sentiment": "str",
"confidence": "float",
"processing_time": "float"
}
},
"EmbeddingGenerated": {
"trigger": "Vector embedding created and stored",
"payload": {
"post_id": "str",
"embedding_type": "str (title|content)",
"dimensions": "int",
"model_used": "str"
}
}
},
"integration_events": {
"MarketDataRequested": {
"purpose": "Validate ticker symbols against market data",
"consumption": "Subscribe to ensure social posts reference valid tickers"
},
"TradingDecisionRequested": {
"purpose": "Provide social sentiment context for trading decisions",
"consumption": "Publish social sentiment summaries when trading decisions are being made"
}
}
},
"dependencies": {
"external_dependencies": {
"Reddit API": {
"library": "PRAW (Python Reddit API Wrapper)",
"authentication": "OAuth2 with client credentials",
"rate_limits": "60 requests per minute per OAuth client",
"required_credentials": ["client_id", "client_secret", "user_agent"]
},
"OpenRouter API": {
"reuse": "Existing OpenRouterClient from news domain",
"models": {
"sentiment": "anthropic/claude-3.5-haiku",
"embeddings": "text-embedding-3-large"
},
"cost_optimization": "Batch requests and token-efficient prompts"
},
"PostgreSQL Stack": {
"database": "PostgreSQL 16+",
"extensions": ["TimescaleDB", "pgvectorscale", "uuid-ossp"],
"connection": "AsyncSession with asyncpg driver"
}
},
"internal_dependencies": {
"news_domain": "Reference implementation patterns for consistency",
"config_management": "TradingAgentsConfig for unified configuration",
"database_manager": "Shared DatabaseManager and session handling"
},
"implementation_order": [
"1. Database migration: Create social_media_posts table with TimescaleDB and vector support",
"2. SocialMediaPostEntity: SQLAlchemy entity with proper field mappings",
"3. SocialPost: Domain entity with validation and transformation methods",
"4. RedditClient: PRAW integration with rate limiting and error handling",
"5. SocialRepository: Database operations with vector similarity search",
"6. SocialMediaService: Business logic orchestration with LLM integration",
"7. AgentToolkit integration: RAG methods for AI agent consumption",
"8. Dagster pipeline: Scheduled collection and processing"
]
},
"implementation_guidance": {
"database_setup": {
"migration_script": [
"Create social_media_posts table with all columns",
"Add TimescaleDB hypertable partitioning on created_utc",
"Create all indexes including vector similarity indexes",
"Add constraints for data validation",
"Set up retention policy for 90-day data cleanup"
],
"seed_data": "Optional test data with sample Reddit posts for development"
},
"reddit_integration": {
"praw_setup": [
"Create Reddit app at https://www.reddit.com/prefs/apps/",
"Configure OAuth2 credentials in environment variables",
"Implement rate limiting to respect API limits",
"Handle subreddit access and content filtering"
],
"data_collection_strategy": [
"Focus on financial subreddits: wallstreetbets, investing, stocks, SecurityAnalysis",
"Collect hot/trending posts twice daily (6 AM, 6 PM UTC)",
"Filter by minimum score threshold (10+ upvotes)",
"Extract ticker symbols from post titles and content",
"Deduplicate by Reddit post_id"
]
},
"llm_integration": {
"sentiment_analysis": [
"Use OpenRouter with anthropic/claude-3.5-haiku for cost efficiency",
"Social media-specific prompts accounting for informal language and memes",
"Structured JSON output with sentiment, confidence, and reasoning",
"Best-effort processing: store posts even if sentiment analysis fails"
],
"embeddings": [
"Use text-embedding-3-large for 1536-dimension vectors",
"Batch process for efficiency",
"Generate embeddings for both title and content when available",
"Store NULL for failed embedding generation"
]
},
"testing_strategy": {
"unit_tests": [
"Entity validation and transformation methods",
"Reddit client with mocked PRAW responses",
"Repository operations with test database",
"Service orchestration with mocked dependencies"
],
"integration_tests": [
"End-to-end collection pipeline",
"Vector similarity search with real pgvectorscale",
"LLM integration with pytest-vcr cassettes",
"Dagster pipeline execution"
],
"performance_tests": [
"Vector similarity query performance (<1s for top 10)",
"Batch upsert performance (<5s for 1000 posts)",
"Memory usage during large collection runs"
]
},
"monitoring_and_observability": {
"metrics": [
"Posts collected per subreddit per day",
"Sentiment analysis success rate",
"Embedding generation success rate",
"Vector similarity query performance",
"Reddit API rate limit utilization"
],
"logging": [
"Collection job start/completion with statistics",
"API errors and retry attempts",
"Data quality issues and validation failures",
"Performance metrics for optimization"
],
"alerts": [
"Collection job failures",
"Reddit API authentication issues",
"High error rates in LLM processing",
"Database connection problems"
]
}
}
}
}