From c20771bf2050533df22570afd6492721e8afa221 Mon Sep 17 00:00:00 2001 From: "Martin C. Richards" Date: Tue, 11 Nov 2025 22:28:54 +0100 Subject: [PATCH] docs: Adopt Spec-Driven Development framework Establish complete Spec-Driven Development documentation structure to enable AI-assisted implementation with product context, feature specs, and architectural standards. Documentation: - Add product docs (product.md, roadmap.md) for business context - Add feature specs for marketdata, news, and socialmedia domains - Add technical standards (practices.md, security.md, style.md, tech.md) - Update README with SDD workflow and PostgreSQL architecture Restructure: - Move Docker files to docker/db/ for cleaner organization - Move docker-compose.yml to project root - Remove deprecated configs (litellm.yml, package.json, setup.py) - Update tests for pytest-vcr integration This establishes the foundation for /spec:* workflow commands and structured AI-agent collaboration. --- .mise.toml | 1 - LICENSE | 3 +- README.md | 340 +- .../docker-compose.yml => docker-compose.yml | 4 +- docker/{ => db}/Dockerfile | 0 docker/{ => db}/seed.sql | 0 docs/product/product.md | 150 + docs/product/roadmap.md | 206 ++ docs/specs/MarketData/context.json | 66 + docs/specs/MarketData/design.json | 52 + docs/specs/MarketData/design.md | 1362 ++++++++ docs/specs/MarketData/requirements.json | 6 + docs/specs/MarketData/spec-lite.md | 98 + docs/specs/MarketData/spec.json | 95 + docs/specs/MarketData/spec.md | 352 +++ docs/specs/news/context.json | 47 + docs/specs/news/design.json | 127 + docs/specs/news/design.md | 946 ++++++ docs/specs/news/requirements.json | 6 + docs/specs/news/spec-lite.md | 80 + docs/specs/news/spec.json | 68 + docs/specs/news/spec.md | 334 ++ docs/specs/news/status.md | 336 ++ docs/specs/news/tasks.md | 1039 +++++++ docs/specs/socialmedia/context.json | 70 + docs/specs/socialmedia/design.json | 567 ++++ docs/specs/socialmedia/design.md | 834 +++++ docs/specs/socialmedia/requirements.json | 6 + docs/specs/socialmedia/spec-lite.md | 105 + docs/specs/socialmedia/spec.json | 90 + docs/specs/socialmedia/spec.md | 740 +++++ docs/specs/socialmedia/status.md | 184 ++ docs/specs/socialmedia/tasks.md | 2729 +++++++++++++++++ docs/standards/practices.md | 649 ++++ docs/standards/security.md | 837 +++++ docs/standards/style.md | 715 +++++ docs/standards/tech.md | 543 ++++ litellm.yml | 17 - package-lock.json | 6 - package.json | 1 - prd/news_service.md | 1019 ------ setup.py | 43 - test_typecheck.sh | 4 - .../news/test_article_scraper_client.py | 25 +- tests/domains/news/test_google_news_client.py | 25 +- 45 files changed, 13591 insertions(+), 1336 deletions(-) rename docker/docker-compose.yml => docker-compose.yml (90%) rename docker/{ => db}/Dockerfile (100%) rename docker/{ => db}/seed.sql (100%) create mode 100644 docs/product/product.md create mode 100644 docs/product/roadmap.md create mode 100644 docs/specs/MarketData/context.json create mode 100644 docs/specs/MarketData/design.json create mode 100644 docs/specs/MarketData/design.md create mode 100644 docs/specs/MarketData/requirements.json create mode 100644 docs/specs/MarketData/spec-lite.md create mode 100644 docs/specs/MarketData/spec.json create mode 100644 docs/specs/MarketData/spec.md create mode 100644 docs/specs/news/context.json create mode 100644 docs/specs/news/design.json create mode 100644 docs/specs/news/design.md create mode 100644 docs/specs/news/requirements.json create mode 100644 docs/specs/news/spec-lite.md create mode 100644 
docs/specs/news/spec.json create mode 100644 docs/specs/news/spec.md create mode 100644 docs/specs/news/status.md create mode 100644 docs/specs/news/tasks.md create mode 100644 docs/specs/socialmedia/context.json create mode 100644 docs/specs/socialmedia/design.json create mode 100644 docs/specs/socialmedia/design.md create mode 100644 docs/specs/socialmedia/requirements.json create mode 100644 docs/specs/socialmedia/spec-lite.md create mode 100644 docs/specs/socialmedia/spec.json create mode 100644 docs/specs/socialmedia/spec.md create mode 100644 docs/specs/socialmedia/status.md create mode 100644 docs/specs/socialmedia/tasks.md create mode 100644 docs/standards/practices.md create mode 100644 docs/standards/security.md create mode 100644 docs/standards/style.md create mode 100644 docs/standards/tech.md delete mode 100644 litellm.yml delete mode 100644 package-lock.json delete mode 100644 package.json delete mode 100644 prd/news_service.md delete mode 100644 setup.py delete mode 100644 test_typecheck.sh diff --git a/.mise.toml b/.mise.toml index 5e5252de..e06c5804 100644 --- a/.mise.toml +++ b/.mise.toml @@ -2,7 +2,6 @@ python = "3.13" uv = "latest" ruff = "latest" -docker = "latest" [env] _.file = ".env" diff --git a/LICENSE b/LICENSE index 261eeb9e..de716f20 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,8 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2025 Martin C. Richards + Copyright 2025 Tauric Research Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 97bd02c8..ba97cc01 100644 --- a/README.md +++ b/README.md @@ -1,53 +1,46 @@ -
-<div align="center"> <!-- logo banner and badge images (HTML stripped) -->
-  arXiv | Discord | WeChat | X Follow
-  Community
-  Deutsch | Español | français | 日本語 | 한국어 | Português | Русский | 中文
-</div>
+# TradingAgents Project Overview
+
+## Spec-Driven Development Integration
+
+TradingAgents integrates with the Spec-Driven Development workflow to accelerate feature development while maintaining architectural consistency. This project uses the specialized agent system described in your global CLAUDE.md for structured specifications and AI-assisted implementation. + +### Project Context for AI Agents + +**Product Definition**: Multi-agent LLM financial trading framework that mirrors real-world trading firm dynamics for research-based market analysis and trading decisions. + +**Target Users**: Single developer/researcher focused on personal trading research and data infrastructure development. + +**Core Architecture**: Domain-driven design with three domains (marketdata, news, socialmedia), PostgreSQL + TimescaleDB + pgvectorscale data stack, RAG-powered multi-agent collaboration through LangGraph workflows. + +**Key Constraints**: Research-only framework (not production trading), OpenRouter as sole LLM provider, 85%+ test coverage requirement, TDD with pytest. + +### Documentation Structure + +- **Product Docs**: `/Users/martinrichards/code/TradingAgents/docs/product/` - Business context and roadmap +- **Feature Specs**: `/Users/martinrichards/code/TradingAgents/docs/spec/` - Implementation specifications +- **Standards**: `/Users/martinrichards/code/TradingAgents/docs/standards/` - Technical architecture and practices + +### Agent Context for Implementation + +When implementing features, AI agents should reference: +- `docs/product/product.md` for business context and user requirements +- `docs/standards/tech.md` for architectural patterns and technical standards +- `docs/standards/practices.md` for TDD workflow and development practices +- `docs/standards/style.md` for code style and naming conventions + +Apply the layered architecture pattern: **Router → Service → Repository → Entity → Database** consistently across all domains. --- -# TradingAgents: Multi-Agents LLM Financial Trading Framework +# TradingAgents: Multi-Agents LLM Financial Trading Framework -> 🎉 **TradingAgents** officially released! We have received numerous inquiries about the work, and we would like to express our thanks for the enthusiasm in our community. +> **Personal Fork Notice**: This is a personal fork of the original TradingAgents framework by TauricResearch, originally licensed under Apache 2.0. This fork focuses on individual research and development with significant architectural changes including PostgreSQL + TimescaleDB + pgvectorscale data infrastructure and RAG-powered agents. > -> So we decided to fully open-source the framework. Looking forward to building impactful projects with you! +> **Original Work**: [TauricResearch/TradingAgents](https://github.com/TauricResearch/TradingAgents) - [arXiv:2412.20138](https://arxiv.org/abs/2412.20138) - +--- -
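The layered architecture pattern above can be made concrete with a minimal sketch. This is illustrative only: the names (`ArticleEntity`, `InMemoryArticleRepository`, `ArticleService`, `recent_headlines_route`) are hypothetical stand-ins, not the project's actual classes, and the repository is in-memory purely for brevity.

```python
from dataclasses import dataclass
from datetime import datetime


@dataclass
class ArticleEntity:
    """Entity layer: mirrors one row of a hypothetical news_articles table."""
    id: int
    symbol: str
    headline: str
    published_at: datetime


class InMemoryArticleRepository:
    """Repository layer: owns all persistence access (in-memory stand-in here)."""

    def __init__(self, rows: list[ArticleEntity]) -> None:
        self._rows = rows

    async def latest_for_symbol(self, symbol: str, limit: int = 10) -> list[ArticleEntity]:
        matching = [r for r in self._rows if r.symbol == symbol]
        return sorted(matching, key=lambda r: r.published_at, reverse=True)[:limit]


class ArticleService:
    """Service layer: business logic only -- no SQL, no HTTP."""

    def __init__(self, repository: InMemoryArticleRepository) -> None:
        self._repository = repository

    async def recent_headlines(self, symbol: str) -> list[str]:
        articles = await self._repository.latest_for_symbol(symbol)
        return [a.headline for a in articles]


async def recent_headlines_route(service: ArticleService, symbol: str) -> list[str]:
    """Router layer: translates an external request into a service call."""
    return await service.recent_headlines(symbol)
```

Each layer depends only on the layer directly below it, which keeps SQL out of services and business logic out of routers and toolkit methods.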
-
-🚀 [TradingAgents](#tradingagents-framework) | ⚡ [Installation & CLI](#installation-and-cli) | 🎬 [Demo](https://www.youtube.com/watch?v=90gr5lwjIho) | 📦 [Package Usage](#tradingagents-package) | 📚 [API Docs](./docs/api-reference.md) | 🔧 [Troubleshooting](./docs/troubleshooting.md) | 👥 [Agent Dev](./docs/agent-development.md) | 🤝 [Contributing](#contributing) | 📄 [Citation](#citation)
-
-<!-- Star History chart (image HTML stripped) -->
+🚀 [TradingAgents](#tradingagents-framework) | ⚡ [Installation & CLI](#installation-and-cli) | 📦 [Package Usage](#tradingagents-package) | 📚 [API Docs](./docs/api-reference.md) | 🔧 [Troubleshooting](./docs/troubleshooting.md) | 👥 [Agent Dev](./docs/agent-development.md) | 📄 [Citation](#citation) ## TradingAgents Framework @@ -57,7 +50,7 @@ TradingAgents is a multi-agent trading framework that mirrors the dynamics of re
al-world trading firms.

-> TradingAgents framework is designed for research purposes. Trading performance may vary based on many factors, including the chosen backbone language models, model temperature, trading periods, the quality of data, and other non-deterministic factors. [It is not intended as financial, investment, or trading advice.](https://tauric.ai/disclaimer/) +> TradingAgents framework is designed for research purposes. Trading performance may vary based on many factors, including the chosen backbone language models, model temperature, trading periods, the quality of data, and other non-deterministic factors. It is not intended as financial, investment, or trading advice. Our framework decomposes complex trading tasks into specialized roles. This ensures the system achieves a robust, scalable approach to market analysis and decision-making. @@ -99,63 +92,79 @@ Our framework decomposes complex trading tasks into specialized roles. This ensu Clone TradingAgents: ```bash -git clone https://github.com/TauricResearch/TradingAgents.git +git clone https://github.com/martinrichards23/TradingAgents.git cd TradingAgents ``` -Create a virtual environment in any of your favorite environment managers: +Install development tools (mise manages Python, uv, and other tools): ```bash -conda create -n tradingagents python=3.13 -conda activate tradingagents +# Install mise if not already installed +curl https://mise.run | sh + +# Install project tools and dependencies +mise install # Installs Python, uv, ruff, pyright +mise run install # Installs project dependencies with uv ``` -Install dependencies: +Alternative manual setup: ```bash -pip install -r requirements.txt +# Create virtual environment with uv +uv venv +source .venv/bin/activate # or .venv\Scripts\activate on Windows + +# Install dependencies +uv sync +``` + +### Database Setup + +This fork uses PostgreSQL with TimescaleDB and pgvectorscale extensions: + +```bash +# Using Docker Compose (recommended) +docker-compose up -d + +# Or install PostgreSQL with extensions manually +# See docs/setup-database.md for detailed instructions ``` ### Required APIs -You will also need the FinnHub API for financial data. All of our code is implemented with the free tier. +OpenRouter API (unified LLM provider): +```bash +export OPENROUTER_API_KEY=$YOUR_OPENROUTER_API_KEY +``` + +FinnHub API for financial data (optional): ```bash export FINNHUB_API_KEY=$YOUR_FINNHUB_API_KEY ``` -You will need the OpenAI API for all the agents. +Database connection: ```bash -export OPENAI_API_KEY=$YOUR_OPENAI_API_KEY +export DATABASE_URL="postgresql://user:pass@localhost:5432/tradingagents" ``` ### CLI Usage -You can also try out the CLI directly by running: +Run the CLI directly: ```bash -python -m cli.main +mise run dev # or python -m cli.main ``` -You will see a screen where you can select your desired tickers, date, LLMs, research depth, etc.

-An interface will appear showing results as they load, letting you track the agent's progress as it runs.
-
-<!-- CLI screenshots (image HTML stripped) -->
- ## Quick Start Get up and running with TradingAgents in 3 simple steps: ### Step 1: Set API Keys ```bash -export OPENAI_API_KEY="your_openai_api_key" +export OPENROUTER_API_KEY="your_openrouter_api_key" export FINNHUB_API_KEY="your_finnhub_api_key" # Optional for financial data +export DATABASE_URL="postgresql://user:pass@localhost:5432/tradingagents" ``` ### Step 2: Run Your First Analysis @@ -179,18 +188,21 @@ The analysis returns: - **Decision**: `BUY`, `SELL`, or `HOLD` - **Result**: Detailed analysis from all agents including market data, news sentiment, and risk assessment -**Next Steps**: Explore the [CLI interface](#cli-usage), check out [usage examples](#multi-llm-provider-examples), or dive into the [API documentation](./docs/api-reference.md). +**Next Steps**: Explore the [CLI interface](#cli-usage), check out [usage examples](#openrouter-configuration), or dive into the [API documentation](./docs/api-reference.md). ## TradingAgents Package ### Implementation Details -We built TradingAgents with LangGraph to ensure flexibility and modularity. We utilize `o1-preview` and `gpt-4o` as our deep thinking and fast thinking LLMs for our experiments. However, for testing purposes, we recommend you use `o4-mini` and `gpt-4.1-mini` to save on costs as our framework makes **lots of** API calls. +This fork is built with: +- **LangGraph** for agent orchestration +- **PostgreSQL + TimescaleDB + pgvectorscale** for data storage and vector search +- **OpenRouter** as the unified LLM provider +- **RAG** for context-aware agent decision making +- **Dagster** for data collection orchestration ### Python Usage -To use TradingAgents inside your code, you can import the `tradingagents` module and initialize a `TradingAgentsGraph()` object. The `.propagate()` function will return a decision. You can run `main.py`, here's also a quick example: - ```python from tradingagents.graph.trading_graph import TradingAgentsGraph from tradingagents.config import TradingAgentsConfig @@ -198,88 +210,64 @@ from tradingagents.config import TradingAgentsConfig config = TradingAgentsConfig.from_env() ta = TradingAgentsGraph(debug=True, config=config) -# forward propagate +# Forward propagate _, decision = ta.propagate("NVDA", "2024-05-10") print(decision) ``` -You can also adjust the default configuration to set your own choice of LLMs, debate rounds, etc. +### Custom Configuration ```python -from tradingagents.graph.trading_graph import TradingAgentsGraph from tradingagents.config import TradingAgentsConfig # Create a custom config config = TradingAgentsConfig( - deep_think_llm="gpt-4.1-nano", # Use a different model - quick_think_llm="gpt-4.1-nano", # Use a different model - max_debate_rounds=3, # Increase debate rounds - online_tools=True # Use online tools or cached data + llm_provider="openrouter", + deep_think_llm="anthropic/claude-3.5-sonnet", + quick_think_llm="anthropic/claude-3.5-haiku", + max_debate_rounds=3, + use_rag=True, # Enable RAG-powered agents + database_url="postgresql://user:pass@localhost:5432/tradingagents" ) -# Initialize with custom config ta = TradingAgentsGraph(debug=True, config=config) - -# forward propagate _, decision = ta.propagate("NVDA", "2024-05-10") print(decision) ``` -> For `online_tools`, we recommend enabling them for experimentation, as they provide access to real-time data. The agents' offline tools rely on cached data from our **Tauric TradingDB**, a curated dataset we use for backtesting. 
We're currently in the process of refining this dataset, and we plan to release it soon alongside our upcoming projects. Stay tuned! - -You can view the full list of configurations in `tradingagents/config.py`. - -### Complete Environment Variables Reference +### Environment Variables Reference | Variable | Description | Default | Example | |----------|-------------|---------|---------| -| `LLM_PROVIDER` | LLM provider to use | `openai` | `anthropic` | -| `DEEP_THINK_LLM` | Model for complex analysis | `o4-mini` | `claude-3-5-sonnet-latest` | -| `QUICK_THINK_LLM` | Model for fast responses | `gpt-4o-mini` | `gpt-4o-mini` | -| `BACKEND_URL` | API endpoint | `https://api.openai.com/v1` | `https://api.anthropic.com` | +| `LLM_PROVIDER` | LLM provider to use | `openrouter` | `openrouter` | +| `OPENROUTER_API_KEY` | OpenRouter API key | Required | `sk-or-...` | +| `DEEP_THINK_LLM` | Model for complex analysis | `anthropic/claude-3.5-sonnet` | `openai/gpt-4` | +| `QUICK_THINK_LLM` | Model for fast responses | `anthropic/claude-3.5-haiku` | `openai/gpt-4o-mini` | | `MAX_DEBATE_ROUNDS` | Investment debate rounds | `1` | `3` | | `MAX_RISK_DISCUSS_ROUNDS` | Risk discussion rounds | `1` | `2` | -| `ONLINE_TOOLS` | Use live APIs vs cached data | `true` | `false` | +| `USE_RAG` | Enable RAG for agents | `true` | `false` | +| `DATABASE_URL` | PostgreSQL connection string | Required | `postgresql://...` | | `DEFAULT_LOOKBACK_DAYS` | Historical data range | `30` | `60` | | `TRADINGAGENTS_RESULTS_DIR` | Output directory | `./results` | `./my_results` | -| `TRADINGAGENTS_DATA_DIR` | Data storage directory | System default | `./data` | -### Multi-LLM Provider Examples +### OpenRouter Configuration + +This fork exclusively uses OpenRouter for unified LLM access: -**Using Anthropic Claude:** ```python -from tradingagents.graph.trading_graph import TradingAgentsGraph -from tradingagents.config import TradingAgentsConfig - config = TradingAgentsConfig( - llm_provider="anthropic", - deep_think_llm="claude-3-5-sonnet-latest", - quick_think_llm="claude-3-haiku-latest", + llm_provider="openrouter", + deep_think_llm="anthropic/claude-3.5-sonnet", + quick_think_llm="openai/gpt-4o-mini", max_debate_rounds=2 ) - -ta = TradingAgentsGraph(debug=True, config=config) -_, decision = ta.propagate("TSLA", "2024-01-15") ``` -**Using Google Gemini:** -```python -config = TradingAgentsConfig( - llm_provider="google", - deep_think_llm="gemini-1.5-pro", - quick_think_llm="gemini-1.5-flash" -) -``` - -See [docs/api-reference.md](./docs/api-reference.md) for complete API documentation. - ## Development Guide -This section provides comprehensive development guidance for contributors working on the TradingAgents codebase. - ### Common Development Commands -This project uses [mise](https://mise.jdx.dev/) for tool and task management. All development tasks are managed through mise. +This project uses [mise](https://mise.jdx.dev/) for tool and task management: #### Essential Commands - **CLI Application**: `mise run dev` - Interactive CLI for running trading analysis @@ -289,9 +277,10 @@ This project uses [mise](https://mise.jdx.dev/) for tool and task management. 
Al - **Type checking**: `mise run typecheck` - Run pyright type checker - **Run all tests**: `mise run test` - Run tests with pytest -#### Initial Setup -- **Install tools**: `mise install` - Install Python, uv, ruff, pyright -- **Install dependencies**: `mise run install` - Install project dependencies with uv +#### Database Commands +- **Start database**: `docker-compose up -d` +- **Run migrations**: `mise run migrate` +- **Seed test data**: `mise run seed` ### Testing Principles @@ -306,55 +295,15 @@ tests/ │ └── news/ │ ├── __init__.py │ ├── test_news_service.py # Mock repo + clients -│ ├── test_news_repository.py # Docker test DB +│ ├── test_news_repository.py # PostgreSQL test DB │ └── test_google_news_client.py # pytest-vcr ``` -#### Mocking Strategy by Layer -- **Services**: Mock Repository + Clients, test real transformations -- **Repositories**: Real persistence (temp files/Docker), no mocks -- **Clients**: Real HTTP with pytest-vcr cassettes - #### Quality Standards - **85% coverage** minimum - **< 100ms** per unit test - **Mock boundaries, test behavior** -### Configuration - -The TradingAgents framework uses a centralized `TradingAgentsConfig` class for all configuration management. - -#### Core Configuration Options - -**LLM Settings**: -- `llm_provider`: OpenAI, Anthropic, Google, Ollama, or OpenRouter (default: "openai") -- `deep_think_llm`: Model for complex reasoning tasks (default: "o4-mini") -- `quick_think_llm`: Model for fast responses (default: "gpt-4o-mini") - -**Debate Parameters**: -- `max_debate_rounds`: Maximum rounds in investment debates (default: 1) -- `max_risk_discuss_rounds`: Maximum rounds in risk discussions (default: 1) - -**Data Management**: -- `online_tools`: Enable/disable live API calls vs cached data (default: True) -- `default_lookback_days`: Historical data range for analysis (default: 30) - -#### Required API Keys - -```bash -# For OpenAI (default) -export OPENAI_API_KEY="your_openai_api_key" - -# For Anthropic Claude -export ANTHROPIC_API_KEY="your_anthropic_api_key" - -# For Google Gemini -export GOOGLE_API_KEY="your_google_api_key" - -# For financial data (optional) -export FINNHUB_API_KEY="your_finnhub_api_key" -``` - ## Architecture Overview ### Multi-Agent Trading System @@ -367,74 +316,69 @@ TradingAgents uses specialized LLM agents that work together in a trading firm s #### 1. Domain-Driven Architecture Three main domains with clean separation: - **Financial Data** (`tradingagents/domains/marketdata/`): Market prices, technical analysis, fundamentals -- **News** (`tradingagents/domains/news/`): News articles and sentiment analysis +- **News** (`tradingagents/domains/news/`): News articles and sentiment analysis (95% complete) - **Social Media** (`tradingagents/domains/socialmedia/`): Social sentiment from Reddit/Twitter -#### 2. Repository-First Data Strategy -- Services read from local repositories (cached data) -- Separate update operations fetch fresh data from APIs -- Smart caching with gap detection and deduplication +#### 2. PostgreSQL + TimescaleDB + pgvectorscale Stack +- **PostgreSQL**: Primary database for structured data +- **TimescaleDB**: Time-series optimization for market data +- **pgvectorscale**: Vector storage for RAG and semantic search +- **Automated migrations**: Database schema versioning -#### 3. 
Agent Integration (Anti-Corruption Layer) -- `AgentToolkit` mediates between agents and domain services -- Converts rich domain models to structured JSON for LLM consumption -- Handles parameter validation and error recovery +#### 3. RAG-Powered Agent Integration +- `AgentToolkit` with RAG capabilities for contextual decision making +- Vector search for relevant historical data and patterns +- Semantic similarity matching for comparable market conditions +- Context-aware analysis based on historical performance + +#### 4. Dagster Data Orchestration +- Daily/twice-daily data collection pipelines +- Automated data quality checks and validation +- Gap detection and backfill capabilities +- Monitoring and alerting for data pipeline health ### Key Design Patterns -1. **Debate-Driven Decisions**: Bull/bear researchers debate before trading -2. **Memory-Augmented Learning**: ChromaDB stores past decisions for context +1. **RAG-Enhanced Decisions**: Agents use vector similarity search for context +2. **Time-Series Optimized**: TimescaleDB for efficient market data queries 3. **Quality-Aware Data**: All contexts include data quality metadata -4. **Structured Outputs**: Pydantic models replace error-prone string parsing +4. **Structured Outputs**: Pydantic models with database persistence ### File Structure ``` tradingagents/ -├── agents/ # Agent implementations +├── agents/ # Agent implementations with RAG capabilities │ └── libs/ # AgentToolkit and utilities ├── domains/ # Domain-specific services │ ├── marketdata/ # Financial data domain -│ ├── news/ # News domain +│ ├── news/ # News domain (95% complete) │ └── socialmedia/ # Social media domain ├── graph/ # LangGraph workflow orchestration +├── data/ # Dagster pipelines and data management └── config.py # Configuration management ``` ### Performance Optimization -**Caching Strategy:** -- Repository-first data access minimizes API calls -- Smart caching with automatic invalidation -- Gap detection for missing data ranges +**Database Strategy:** +- TimescaleDB hypertables for efficient time-series queries +- pgvectorscale for fast vector similarity search +- Materialized views for common aggregations **Model Selection:** +- OpenRouter unified interface reduces API complexity - `quick_think_llm` for data retrieval and formatting - `deep_think_llm` for complex analysis and decisions -**Cost Optimization:** -```python -config = TradingAgentsConfig( - deep_think_llm="gpt-4o-mini", # Lower cost - max_debate_rounds=1, # Fewer debates - online_tools=False, # Use cached data - default_lookback_days=30 # Limit data range -) -``` - ## Need Help? -- **Detailed Architecture**: `docs/architecture.md` - **API Documentation**: `docs/api-reference.md` - **Troubleshooting**: `docs/troubleshooting.md` - **Agent Development**: `docs/agent-development.md` -## Contributing - -We welcome contributions from the community! Whether it's fixing a bug, improving documentation, or suggesting a new feature, your input helps make this project better. If you are interested in this line of research, please consider joining our open-source financial AI research community [Tauric Research](https://tauric.ai/). 
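To make the database strategy above concrete, here is a hedged query sketch using asyncpg directly. The table and column names (`market_data`, `close_price`, `technical_pattern_embedding`) follow the schema sketched in `docs/specs/MarketData/design.md`, and the connection string matches the local docker-compose defaults; treat all of them as illustrative assumptions. `time_bucket` and `last` are TimescaleDB functions, and `<->` is the pgvector distance operator used by pgvectorscale indexes.

```python
import asyncio

import asyncpg


async def main() -> None:
    # Placeholder DSN for the local docker-compose database.
    conn = await asyncpg.connect(
        "postgresql://postgres:tradingagents@localhost:5432/tradingagents"
    )

    # Time-series rollup on a TimescaleDB hypertable: daily closing prices.
    daily_closes = await conn.fetch(
        """
        SELECT time_bucket('1 day', timestamp) AS day,
               last(close_price, timestamp)    AS close
        FROM market_data
        WHERE symbol = $1 AND timestamp > now() - interval '30 days'
        GROUP BY day
        ORDER BY day
        """,
        "NVDA",
    )

    # Vector similarity lookup: nearest historical technical patterns.
    # An all-zeros query embedding is used purely as a placeholder.
    query_embedding = "[" + ",".join(["0"] * 384) + "]"
    similar_days = await conn.fetch(
        """
        SELECT symbol, timestamp
        FROM market_data
        ORDER BY technical_pattern_embedding <-> $1::vector
        LIMIT 5
        """,
        query_embedding,
    )

    print(len(daily_closes), len(similar_days))
    await conn.close()


asyncio.run(main())
```

In practice the repository layer would own these queries; running raw SQL here just shows how the hypertable rollup and the embedding search compose in one round trip each.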
- ## Citation -Please reference our work if you find *TradingAgents* provides you with some help :) +Please reference the original work if you find *TradingAgents* provides you with some help: ``` @misc{xiao2025tradingagentsmultiagentsllmfinancial, @@ -448,12 +392,6 @@ Please reference our work if you find *TradingAgents* provides you with some hel } ``` -# important-instruction-reminders -Do what has been asked; nothing more, nothing less. -NEVER create files unless they're absolutely necessary for achieving your goal. -ALWAYS prefer editing an existing file to creating a new one. -NEVER proactively create documentation files (*.md) or README files. Only create documentation files if explicitly requested by the User. +## License - - IMPORTANT: this context may or may not be relevant to your tasks. You should not respond to this context unless it is highly relevant to your task. -- remember what we learnt about testing? \ No newline at end of file +This personal fork maintains the Apache 2.0 license from the original TauricResearch/TradingAgents project. \ No newline at end of file diff --git a/docker/docker-compose.yml b/docker-compose.yml similarity index 90% rename from docker/docker-compose.yml rename to docker-compose.yml index c33c9dba..375f1d0b 100644 --- a/docker/docker-compose.yml +++ b/docker-compose.yml @@ -1,10 +1,10 @@ services: timescaledb: - build: . + build: ./db container_name: tradingagents_timescaledb environment: - POSTGRES_PASSWORD: postgres POSTGRES_USER: postgres + POSTGRES_PASSWORD: tradingagents POSTGRES_DB: tradingagents ports: - "5432:5432" diff --git a/docker/Dockerfile b/docker/db/Dockerfile similarity index 100% rename from docker/Dockerfile rename to docker/db/Dockerfile diff --git a/docker/seed.sql b/docker/db/seed.sql similarity index 100% rename from docker/seed.sql rename to docker/db/seed.sql diff --git a/docs/product/product.md b/docs/product/product.md new file mode 100644 index 00000000..dcb7e821 --- /dev/null +++ b/docs/product/product.md @@ -0,0 +1,150 @@ +# TradingAgents Product Definition + +## Product Overview + +**TradingAgents** is a personal fork of the multi-agent LLM financial trading framework designed for individual trading research and data infrastructure development. This fork focuses on PostgreSQL + TimescaleDB + pgvectorscale architecture with RAG-powered agents for enhanced decision making through historical context and pattern recognition. 
+ +## Target User + +### Primary User +- **Single Developer/Researcher**: Individual focused on personal trading research, strategy development, and building robust data infrastructure for financial analysis + +### Use Cases +- **Personal Trading Research**: Developing and testing proprietary trading strategies with AI-powered analysis +- **Data Infrastructure Development**: Building scalable time-series and vector search capabilities for financial data +- **RAG Implementation**: Experimenting with retrieval-augmented generation for context-aware trading decisions +- **Academic Research**: Individual research projects exploring AI applications in financial markets + +## Core Value Proposition + +This personal fork transforms the original TradingAgents framework into a focused research and development platform that: +- **Enables Personal Research**: Provides a complete data infrastructure for individual trading research and strategy development +- **Implements Modern Architecture**: PostgreSQL + TimescaleDB + pgvectorscale stack for efficient time-series and vector operations +- **Supports RAG-Powered Decisions**: Agents leverage historical context through vector similarity search for informed decisions +- **Streamlines Data Collection**: Automated daily/twice-daily data pipelines with Dagster orchestration +- **Unifies LLM Access**: Single OpenRouter integration for consistent model access across all agents + +## Key Features + +### Enhanced Data Architecture +- **PostgreSQL Foundation**: Robust relational database for structured financial data +- **TimescaleDB Integration**: Optimized time-series storage and querying for market data +- **pgvectorscale Extension**: High-performance vector search for RAG and similarity matching +- **Automated Migrations**: Database schema versioning and management + +### RAG-Powered Multi-Agent System +- **Context-Aware Analysis**: Agents use vector similarity search to find relevant historical patterns +- **Enhanced Decision Making**: Retrieval-augmented generation provides historical context for trading decisions +- **Pattern Recognition**: Semantic similarity matching for comparable market conditions +- **Learning from History**: Agents reference past decisions and outcomes for improved analysis + +### Automated Data Collection +- **Dagster Orchestration**: Daily/twice-daily data collection pipelines with monitoring and alerting +- **Quality Assurance**: Automated data validation, gap detection, and backfill capabilities +- **Domain Coverage**: Comprehensive data collection for news (95% complete), market data, and social media domains +- **Scalable Processing**: Efficient batch processing with dependency management + +### Unified LLM Provider +- **OpenRouter Integration**: Single provider for all model access, reducing API complexity +- **Cost Optimization**: Strategic model selection with clear separation between analysis and data processing models +- **Model Flexibility**: Easy switching between different models through OpenRouter's unified interface + +## Business Context + +### Research Focus Areas +- **Individual Strategy Development**: Personal trading algorithm research and backtesting +- **Data Infrastructure**: Building scalable financial data storage and retrieval systems +- **AI/ML in Finance**: Experimenting with RAG, vector search, and multi-agent systems +- **Time-Series Analysis**: Advanced market data analysis with TimescaleDB optimization + +### Technical Advantages +- **Modern Data Stack**: PostgreSQL + TimescaleDB + pgvectorscale 
provides production-grade data infrastructure +- **RAG Implementation**: Real-world application of retrieval-augmented generation in financial decision making +- **Comprehensive Testing**: Maintains 85%+ test coverage with pragmatic TDD approach +- **Scalable Architecture**: Domain-driven design supports extensibility and maintainability + +### Development Metrics +- **Code Quality**: 85%+ test coverage, comprehensive type checking, automated formatting +- **Data Pipeline Health**: Automated monitoring and alerting for data collection processes +- **Performance**: Optimized queries with TimescaleDB, fast vector search with pgvectorscale +- **Maintainability**: Clean architecture patterns, comprehensive documentation + +## Technical Constraints + +### Requirements +- **Database**: PostgreSQL with TimescaleDB and pgvectorscale extensions +- **Python Environment**: Python 3.13+ with comprehensive dependency management +- **API Access**: OpenRouter API key for LLM access, optional FinnHub for real-time data +- **Infrastructure**: Docker Compose for local development, Dagster for data orchestration + +### Architectural Decisions +- **Single Developer Focus**: Optimized for individual use rather than multi-user collaboration +- **PostgreSQL-First**: All data persistence through PostgreSQL with appropriate extensions +- **OpenRouter Exclusive**: Unified LLM provider reduces complexity and improves consistency +- **Domain Completion**: Sequential domain development (news 95% → marketdata → socialmedia) + +## Project Scope + +### Current Implementation Status +- **News Domain**: 95% complete with comprehensive article scraping and sentiment analysis +- **Core Infrastructure**: PostgreSQL + TimescaleDB + pgvectorscale foundation established +- **Agent Framework**: RAG-powered agents with vector search capabilities +- **Data Pipelines**: Dagster orchestration for automated data collection + +### Included Features +- Complete PostgreSQL-based data architecture with time-series and vector extensions +- RAG-enhanced multi-agent analysis framework with historical context +- Automated data collection pipelines with Dagster orchestration +- OpenRouter integration for unified LLM access +- Comprehensive test suite with domain-specific testing strategies +- CLI interface for interactive analysis and debugging + +### Excluded Features +- Multi-user collaboration features +- Real money trading capabilities +- Production-grade risk management for live trading +- Multiple database backend support +- Legacy LLM provider integrations (focus on OpenRouter only) + +## Development Phases + +### Phase 1: News Domain Completion (Current - 95% Complete) +- Finalize news article scraping and processing +- Complete sentiment analysis pipeline +- Optimize news data storage and retrieval +- Implement comprehensive testing for news domain + +### Phase 2: Market Data Domain + PostgreSQL Migration +- Complete market data collection and processing +- Implement TimescaleDB optimizations for price data +- Add technical analysis calculations +- Migrate all data persistence to PostgreSQL + +### Phase 3: Social Media Domain +- Implement Reddit and Twitter data collection +- Add social sentiment analysis +- Complete the three-domain architecture +- Optimize cross-domain data relationships + +### Phase 4: Dagster Pipeline Implementation +- Daily/twice-daily data collection automation +- Comprehensive monitoring and alerting +- Data quality validation and gap detection +- Performance optimization and scaling + +### Phase 5: RAG 
Enhancement and OpenRouter Migration +- Complete RAG implementation for all agents +- Migrate to OpenRouter as sole LLM provider +- Optimize vector search performance +- Implement advanced pattern recognition + +## Success Criteria + +This personal fork is successful when it provides: +- **Robust Data Infrastructure**: PostgreSQL + TimescaleDB + pgvectorscale handling all financial data efficiently +- **Intelligent Decision Making**: RAG-powered agents making context-aware trading recommendations +- **Reliable Data Collection**: Automated pipelines collecting high-quality data consistently +- **Research Capability**: Complete platform for individual trading strategy research and development +- **Maintainable Codebase**: 85%+ test coverage with clear architecture and comprehensive documentation + +The fork serves as both a practical trading research platform and a demonstration of modern data architecture patterns applied to financial AI systems. \ No newline at end of file diff --git a/docs/product/roadmap.md b/docs/product/roadmap.md new file mode 100644 index 00000000..ca80edd4 --- /dev/null +++ b/docs/product/roadmap.md @@ -0,0 +1,206 @@ +# TradingAgents Personal Fork Roadmap + +## Overview + +This roadmap outlines the technical development path for the personal fork of TradingAgents, focusing on building a robust data infrastructure with PostgreSQL + TimescaleDB + pgvectorscale, implementing RAG-powered agents, and establishing automated data collection pipelines with Dagster. + +## Current Status: Phase 1 - News Domain (95% Complete) + +The foundation has been established with core domain architecture, comprehensive testing framework, and the news domain nearly complete. + +### Completed Infrastructure +- **Domain Architecture**: Clean separation of news, marketdata, and socialmedia domains +- **Testing Framework**: Pragmatic TDD with 85%+ coverage, pytest-vcr for HTTP mocking +- **Repository Pattern**: Efficient data caching and management system +- **News Domain**: Article scraping, sentiment analysis, and storage (95% complete) +- **Basic Agent System**: Multi-agent trading analysis framework with LangGraph + +## Development Phases + +### Phase 1: News Domain Completion (Current - 95% Complete) +**Timeline**: 2-3 weeks +**Status**: 🔄 In Progress + +#### Remaining Work +- **News Processing Pipeline**: Complete article content processing and deduplication +- **Sentiment Analysis Optimization**: Fine-tune sentiment scoring algorithms +- **News Repository**: Finalize PostgreSQL integration for news storage +- **Testing Coverage**: Achieve 85%+ test coverage for news domain +- **Performance Optimization**: Optimize news retrieval and search performance + +#### Success Criteria +- ✅ All news APIs integrated and tested +- ✅ Sentiment analysis producing consistent scores +- ✅ News data properly stored in PostgreSQL +- ✅ Comprehensive test suite covering edge cases +- ✅ News domain ready for RAG integration + +### Phase 2: Market Data Domain + PostgreSQL Migration (Next Priority) +**Timeline**: 4-6 weeks +**Status**: 📋 Planned + +#### Core Objectives +- **TimescaleDB Integration**: Implement hypertables for efficient time-series storage +- **Market Data Collection**: Complete price, volume, and technical indicator collection +- **PostgreSQL Migration**: Move all data persistence from file-based to PostgreSQL +- **Technical Analysis**: Implement MACD, RSI, and other technical indicators +- **Database Schema**: Design optimized schema for market data with proper indexing + +#### Key 
Deliverables +- Market data repository with TimescaleDB optimization +- Real-time and historical price data collection +- Technical analysis calculation engine +- Migration scripts for moving existing data +- Performance benchmarks for time-series queries + +#### Success Criteria +- ✅ Market data efficiently stored in TimescaleDB hypertables +- ✅ Sub-100ms queries for common market data retrievals +- ✅ All technical indicators calculating accurately +- ✅ Complete migration from file-based storage +- ✅ Market data domain ready for agent integration + +### Phase 3: Social Media Domain (Following Phase 2) +**Timeline**: 3-4 weeks +**Status**: 📋 Planned + +#### Core Objectives +- **Reddit Integration**: Implement Reddit API for financial subreddits +- **Twitter/X Integration**: Add social sentiment from Twitter feeds +- **Social Sentiment Analysis**: Aggregate sentiment scoring across platforms +- **Cross-Domain Relations**: Link social sentiment to market data and news +- **pgvectorscale Preparation**: Prepare social data for vector search + +#### Key Deliverables +- Reddit and Twitter data collection clients +- Social sentiment aggregation algorithms +- Social media data repository with PostgreSQL storage +- Cross-domain correlation analysis tools +- Foundation for RAG implementation + +#### Success Criteria +- ✅ Social media data collected from multiple sources +- ✅ Sentiment scores integrated with market events +- ✅ Cross-domain relationships established in database +- ✅ Social media domain ready for RAG enhancement +- ✅ Three-domain architecture complete + +### Phase 4: Dagster Data Collection Orchestration +**Timeline**: 3-4 weeks +**Status**: 📋 Planned + +#### Core Objectives +- **Pipeline Architecture**: Design daily/twice-daily data collection workflows +- **Data Quality Monitoring**: Implement validation and gap detection +- **Automated Backfill**: Handle missing data and API failures gracefully +- **Performance Monitoring**: Track pipeline health and data freshness +- **Alerting System**: Notify on pipeline failures or data quality issues + +#### Key Deliverables +- Dagster asset definitions for all data domains +- Automated data quality checks and validation +- Gap detection and backfill capabilities +- Monitoring dashboard for pipeline health +- Comprehensive logging and error handling + +#### Success Criteria +- ✅ Fully automated data collection running daily +- ✅ Data quality monitoring with automated alerts +- ✅ Zero-downtime pipeline updates and maintenance +- ✅ Historical data gaps automatically detected and filled +- ✅ Pipeline performance metrics tracked and optimized + +### Phase 5: RAG Implementation + OpenRouter Migration +**Timeline**: 4-5 weeks +**Status**: 📋 Planned + +#### Core Objectives +- **pgvectorscale Integration**: Implement vector storage for historical patterns +- **RAG Agent Enhancement**: Agents use similarity search for context +- **OpenRouter Migration**: Complete migration to unified LLM provider +- **Historical Context**: Agents reference past decisions and market conditions +- **Pattern Recognition**: Semantic similarity for comparable market scenarios + +#### Key Deliverables +- pgvectorscale extension configured and optimized +- Vector embeddings for all historical data +- RAG-enhanced agent decision making +- OpenRouter integration replacing all LLM providers +- Similarity search for historical pattern matching + +#### Success Criteria +- ✅ All agents using RAG for contextual decisions +- ✅ Vector search performing sub-50ms similarity queries +- ✅ 
OpenRouter as sole LLM provider across all agents +- ✅ Agents demonstrating improved decision accuracy +- ✅ Historical pattern matching enhancing trading analysis + +## Technical Milestones + +### Database Architecture +- **Month 1**: Complete PostgreSQL foundation with news domain +- **Month 2**: TimescaleDB hypertables optimized for market data +- **Month 3**: pgvectorscale configured for RAG implementation +- **Month 4**: Full database optimization and performance tuning + +### Agent Capabilities +- **Month 1**: Basic multi-agent framework operational +- **Month 2**: Agents using PostgreSQL for all data access +- **Month 3**: Cross-domain agent collaboration established +- **Month 4**: RAG-powered agents with historical context + +### Data Pipeline Maturity +- **Month 1**: Manual data collection with basic automation +- **Month 2**: Automated collection for market data +- **Month 3**: Full three-domain automated collection +- **Month 4**: Production-grade pipeline with monitoring and alerting + +## Success Metrics + +### Technical Excellence +- **Test Coverage**: Maintain 85%+ across all domains +- **Query Performance**: < 100ms for common database operations +- **Pipeline Reliability**: 99%+ uptime for data collection +- **Data Quality**: < 0.1% missing data points across all domains + +### Feature Completeness +- **Domain Coverage**: 100% implementation across news, marketdata, socialmedia +- **Agent Capabilities**: RAG-enhanced decision making operational +- **Data Infrastructure**: Complete PostgreSQL + TimescaleDB + pgvectorscale stack +- **Automation**: Fully automated data collection and processing + +### Development Velocity +- **Code Quality**: Consistent formatting, type checking, and documentation +- **Testing Strategy**: Comprehensive test suite with domain-specific approaches +- **Architecture Consistency**: Clean domain separation and layered architecture +- **Performance Optimization**: Regular profiling and optimization cycles + +## Risk Management + +### Technical Risks +- **Database Performance**: Mitigate with proper indexing and query optimization +- **API Rate Limits**: Implement intelligent backoff and caching strategies +- **Data Quality**: Establish comprehensive validation and monitoring +- **Vector Search Performance**: Optimize pgvectorscale configuration and queries + +### Development Risks +- **Scope Creep**: Maintain focus on sequential domain completion +- **Technical Debt**: Regular refactoring and code quality maintenance +- **Testing Coverage**: Continuous integration with coverage enforcement +- **Documentation**: Maintain comprehensive documentation throughout development + +## Long-Term Vision (6+ Months) + +### Advanced Capabilities +- **Strategy Backtesting**: Historical strategy validation with complete data +- **Real-Time Analysis**: Live market analysis with sub-second agent responses +- **Advanced RAG**: Multi-modal RAG with charts, documents, and audio data +- **Performance Analytics**: Comprehensive analysis of agent decision accuracy + +### Research Applications +- **Academic Research**: Platform for publishing trading AI research +- **Strategy Development**: Complete environment for developing proprietary strategies +- **Data Science**: Advanced analytics and machine learning on financial data +- **Educational Use**: Comprehensive learning platform for financial AI + +This roadmap prioritizes building a solid data foundation before enhancing agent capabilities, ensuring each phase delivers measurable value while maintaining high code 
quality and comprehensive testing. \ No newline at end of file diff --git a/docs/specs/MarketData/context.json b/docs/specs/MarketData/context.json new file mode 100644 index 00000000..241d202d --- /dev/null +++ b/docs/specs/MarketData/context.json @@ -0,0 +1,66 @@ +{ + "product_vision": "Multi-agent LLM financial trading framework with PostgreSQL + TimescaleDB + pgvectorscale architecture for research-based market analysis and trading decisions", + "existing_features": [ + "marketdata_domain_85_complete_file_based", + "yfinance_client_fully_implemented", + "finnhub_client_with_insider_data", + "talib_technical_analysis_integration", + "postgresql_timescaledb_foundation", + "agent_toolkit_rag_ready", + "news_domain_postgresql_patterns", + "database_manager_async_operations" + ], + "architecture": { + "layer_pattern": "Router → Service → Repository → Entity → Database", + "database": "PostgreSQL + TimescaleDB + pgvectorscale with asyncpg driver", + "llm_provider": "OpenRouter unified interface", + "agent_orchestration": "LangGraph workflows with RAG-enhanced AgentToolkit", + "data_pipeline": "Dagster planned for daily market data collection", + "domain_structure": "news (95% PostgreSQL), marketdata (85% file-based), socialmedia (planned)", + "testing_strategy": "Pragmatic TDD: services (mocked), repositories (real PostgreSQL), clients (pytest-vcr)" + }, + "marketdata_implementation_status": { + "current_components": { + "MarketDataService": "Technical analysis with 20 TA-Lib indicators, trading style presets", + "MarketDataRepository": "CSV-based storage - NEEDS PostgreSQL migration", + "YFinanceClient": "Historical OHLC, company info, financials - fully implemented", + "FinnhubClient": "Insider transactions, sentiment, company profiles - fully implemented", + "FundamentalDataService": "Balance sheet, income statement, cash flow analysis", + "InsiderDataService": "SEC insider transaction and sentiment analysis" + }, + "current_limitations": { + "storage": "CSV files in ./data/market_data/ - not scalable", + "query_performance": "File-based lookups instead of indexed database queries", + "concurrency": "No concurrent access support", + "vector_embeddings": "No RAG capabilities for historical pattern matching" + }, + "migration_needed": [ + "PostgreSQL entities for OHLC, fundamental, and insider data", + "TimescaleDB hypertables for time-series optimization", + "Vector embeddings for technical analysis RAG", + "Async repository operations matching news domain patterns", + "Batch data ingestion for daily collection" + ] + }, + "reference_patterns": { + "news_domain_success": { + "NewsRepository": "Async PostgreSQL with vector embeddings and batch operations", + "NewsArticleEntity": "SQLAlchemy model with UUID v7, TimescaleDB optimization", + "database_patterns": "Connection pooling, async sessions, proper error handling", + "testing_approach": "Real PostgreSQL for repositories, pytest-vcr for API clients" + }, + "agent_integration": "AgentToolkit expects PostgreSQL-backed services for RAG capabilities" + }, + "technical_dependencies": { + "external": [ + "yfinance for daily OHLC data (already implemented)", + "FinnHub API for insider and fundamental data (already implemented)", + "PostgreSQL with TimescaleDB and pgvectorscale extensions (ready)" + ], + "internal": [ + "DatabaseManager for async PostgreSQL connections (established)", + "News domain PostgreSQL patterns for consistency (available)", + "AgentToolkit integration for RAG-powered market analysis (ready)" + ] + } +} \ No newline at 
end of file diff --git a/docs/specs/MarketData/design.json b/docs/specs/MarketData/design.json new file mode 100644 index 00000000..82e870b0 --- /dev/null +++ b/docs/specs/MarketData/design.json @@ -0,0 +1,52 @@ +{ + "requirements": { + "entities": { + "MarketDataEntity": "SQLAlchemy entity for OHLC price data with TimescaleDB optimization and vector embeddings", + "FundamentalDataEntity": "Financial statement data (balance sheet, income statement, cash flow) with PostgreSQL storage", + "InsiderDataEntity": "SEC insider transaction records with sentiment analysis and PostgreSQL persistence", + "TechnicalIndicatorEntity": "Calculated TA-Lib indicator values with vector embeddings for RAG analysis" + }, + "data_persistence": { + "migration_scope": "CSV file storage to PostgreSQL + TimescaleDB + pgvectorscale", + "current_storage": "./data/market_data/ CSV files with 85% complete functionality", + "target_storage": "PostgreSQL with TimescaleDB hypertables and pgvectorscale vector storage", + "performance_goal": "10x improvement with sub-100ms query times", + "data_volume": "10 years OHLC, 5 years fundamentals, 3 years insider data for 500+ tickers" + }, + "api_needed": { + "preservation_requirement": "100% API compatibility with existing services", + "existing_apis": [ + "MarketDataService with 20 TA-Lib technical indicators and trading style presets", + "FundamentalDataService for balance sheet, income statement, cash flow analysis", + "InsiderDataService for SEC transaction data and sentiment scoring" + ], + "external_apis": [ + "YFinanceClient (fully implemented) for daily OHLC data", + "FinnhubClient (fully implemented) for insider transactions and fundamental data" + ] + }, + "components": { + "repository_migration": "MarketDataRepository from CSV to async PostgreSQL operations", + "entity_models": "SQLAlchemy entities with TimescaleDB and pgvectorscale integration", + "service_preservation": "API-compatible service layer with PostgreSQL backend", + "vector_embeddings": "RAG enhancement for historical pattern matching", + "dagster_integration": "Daily data collection pipeline automation" + }, + "domains": { + "primary": "MarketData (PostgreSQL migration from 85% complete CSV system)", + "integration": "Follows news domain PostgreSQL patterns for architectural consistency" + }, + "business_rules": [ + "Preserve 100% API compatibility with existing MarketDataService, FundamentalDataService, InsiderDataService", + "Daily automated collection from yfinance (OHLC) and FinnHub (insider + fundamentals)", + "TimescaleDB hypertables for market_data, fundamental_data, insider_data tables", + "Vector embeddings for technical analysis patterns using pgvectorscale", + "Sub-100ms query performance for common market data operations", + "Sub-200ms RAG queries for historical pattern matching", + "Data retention: 10 years OHLC, 5 years fundamentals, 3 years insider data", + "FinnHub API rate limiting compliance with backoff strategies", + "Comprehensive audit logging and ACID transaction support", + "Concurrent agent access with PostgreSQL async operations" + ] + } +} \ No newline at end of file diff --git a/docs/specs/MarketData/design.md b/docs/specs/MarketData/design.md new file mode 100644 index 00000000..7828187c --- /dev/null +++ b/docs/specs/MarketData/design.md @@ -0,0 +1,1362 @@ +# MarketData Domain: PostgreSQL Migration Technical Design + +## Project Overview + +**Project Type**: Migration Project (85% complete CSV → PostgreSQL + TimescaleDB + pgvectorscale) +**Business Impact**: 10x performance 
improvement with sub-100ms query times and RAG capabilities +**API Compatibility**: 100% preservation of existing MarketDataService, FundamentalDataService, InsiderDataService APIs +**Data Volume**: 10 years OHLC, 5 years fundamentals, 3 years insider data for 500+ tickers + +## Architecture Overview + +### Current State (85% Complete) +``` +CSV File Storage (./data/market_data/) +├── OHLC data in CSV files +├── Fundamental data in CSV files +├── Insider transaction data in CSV files +└── Manual file-based operations +``` + +### Target Architecture +``` +PostgreSQL + TimescaleDB + pgvectorscale +├── TimescaleDB hypertables for time-series optimization +├── pgvectorscale for RAG vector embeddings +├── Async PostgreSQL operations for concurrent agent access +├── Dagster automation for daily data collection +└── 100% API-compatible service layer +``` + +### Component Relationships +``` +External APIs (YFinance + FinnHub) → Dagster Pipeline → PostgreSQL Storage → Repository Layer → Service Layer → Agents + ↓ + pgvectorscale (RAG) +``` + +## Domain Model + +### MarketDataEntity + +**Purpose**: OHLC price data with TimescaleDB optimization and vector embeddings + +```python +from sqlalchemy import Column, String, DateTime, Numeric, Integer +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy_utils import TSVectorType +from pgvectorscale import Vector + +class MarketDataEntity(Base): + __tablename__ = 'market_data' + __table_args__ = { + 'timescaledb': { + 'time_column_name': 'timestamp', + 'chunk_time_interval': '1 day' + } + } + + id = Column(Integer, primary_key=True) + symbol = Column(String(10), nullable=False, index=True) + timestamp = Column(DateTime, nullable=False, index=True) + open_price = Column(Numeric(10, 2), nullable=False) + high_price = Column(Numeric(10, 2), nullable=False) + low_price = Column(Numeric(10, 2), nullable=False) + close_price = Column(Numeric(10, 2), nullable=False) + volume = Column(Integer, nullable=False) + adjusted_close = Column(Numeric(10, 2), nullable=False) + + # Vector embeddings for RAG + technical_pattern_embedding = Column(Vector(384)) # Technical analysis patterns + price_movement_embedding = Column(Vector(384)) # Price movement patterns + + # Business rules + @classmethod + def from_csv_record(cls, csv_data: dict) -> 'MarketDataEntity': + """Transform CSV data to entity""" + return cls( + symbol=csv_data['symbol'], + timestamp=pd.to_datetime(csv_data['timestamp']), + open_price=csv_data['open'], + high_price=csv_data['high'], + low_price=csv_data['low'], + close_price=csv_data['close'], + volume=csv_data['volume'], + adjusted_close=csv_data['adj_close'] + ) + + def to_service_response(self) -> dict: + """Transform entity to service API format""" + return { + 'symbol': self.symbol, + 'timestamp': self.timestamp.isoformat(), + 'open': float(self.open_price), + 'high': float(self.high_price), + 'low': float(self.low_price), + 'close': float(self.close_price), + 'volume': self.volume, + 'adj_close': float(self.adjusted_close) + } + + def validate(self) -> bool: + """Validate business rules""" + return ( + self.high_price >= self.low_price and + self.high_price >= self.open_price and + self.high_price >= self.close_price and + self.low_price <= self.open_price and + self.low_price <= self.close_price and + self.volume >= 0 + ) +``` + +### FundamentalDataEntity + +**Purpose**: Financial statement data with PostgreSQL storage + +```python +class FundamentalDataEntity(Base): + __tablename__ = 'fundamental_data' + + id = Column(Integer, 
primary_key=True) + symbol = Column(String(10), nullable=False, index=True) + report_date = Column(DateTime, nullable=False, index=True) + period_type = Column(String(10), nullable=False) # Q, Y + + # Balance Sheet + total_assets = Column(Numeric(15, 2)) + total_liabilities = Column(Numeric(15, 2)) + shareholders_equity = Column(Numeric(15, 2)) + + # Income Statement + total_revenue = Column(Numeric(15, 2)) + net_income = Column(Numeric(15, 2)) + earnings_per_share = Column(Numeric(8, 4)) + + # Cash Flow + operating_cash_flow = Column(Numeric(15, 2)) + capital_expenditures = Column(Numeric(15, 2)) + free_cash_flow = Column(Numeric(15, 2)) + + # Ratios (calculated) + pe_ratio = Column(Numeric(8, 2)) + pb_ratio = Column(Numeric(8, 2)) + roe = Column(Numeric(8, 4)) + roa = Column(Numeric(8, 4)) + debt_to_equity = Column(Numeric(8, 4)) + + # Vector embeddings for RAG + financial_health_embedding = Column(Vector(384)) + + @classmethod + def from_finnhub_response(cls, finnhub_data: dict) -> 'FundamentalDataEntity': + """Transform FinnHub API response to entity""" + return cls( + symbol=finnhub_data['symbol'], + report_date=pd.to_datetime(finnhub_data['reportedDate']), + period_type=finnhub_data['period'], + total_assets=finnhub_data.get('totalAssets'), + total_revenue=finnhub_data.get('totalRevenue'), + # ... map all fields + ) + + def calculate_ratios(self, current_price: float): + """Calculate financial ratios""" + if self.earnings_per_share and self.earnings_per_share > 0: + self.pe_ratio = current_price / self.earnings_per_share + + if self.shareholders_equity and self.shareholders_equity > 0: + self.pb_ratio = current_price / (self.shareholders_equity / 1_000_000) # Book value per share + + if self.shareholders_equity and self.net_income: + self.roe = self.net_income / self.shareholders_equity + + if self.total_assets and self.net_income: + self.roa = self.net_income / self.total_assets +``` + +### InsiderDataEntity + +**Purpose**: SEC insider transaction records with sentiment analysis + +```python +class InsiderDataEntity(Base): + __tablename__ = 'insider_data' + + id = Column(Integer, primary_key=True) + symbol = Column(String(10), nullable=False, index=True) + transaction_date = Column(DateTime, nullable=False, index=True) + + # Insider information + insider_name = Column(String(200), nullable=False) + insider_position = Column(String(100)) + + # Transaction details + transaction_type = Column(String(20), nullable=False) # Buy, Sell + shares_traded = Column(Integer, nullable=False) + transaction_price = Column(Numeric(10, 2)) + shares_owned_after = Column(Integer) + + # Derived fields + transaction_value = Column(Numeric(15, 2)) # shares * price + sentiment_score = Column(Numeric(3, 2)) # -1 to 1 + + # Vector embeddings for RAG + transaction_pattern_embedding = Column(Vector(384)) + + @classmethod + def from_finnhub_response(cls, finnhub_data: dict) -> 'InsiderDataEntity': + """Transform FinnHub insider data to entity""" + entity = cls( + symbol=finnhub_data['symbol'], + transaction_date=pd.to_datetime(finnhub_data['transactionDate']), + insider_name=finnhub_data['personName'], + insider_position=finnhub_data.get('position'), + transaction_type='Buy' if finnhub_data['change'] > 0 else 'Sell', + shares_traded=abs(finnhub_data['change']), + shares_owned_after=finnhub_data['currentShares'] + ) + entity.calculate_sentiment() + return entity + + def calculate_sentiment(self): + """Calculate sentiment score based on transaction type and insider position""" + base_score = 0.7 if 
self.transaction_type == 'Buy' else -0.7 + + # Adjust based on position + if self.insider_position and 'ceo' in self.insider_position.lower(): + base_score *= 1.2 + elif self.insider_position and 'cfo' in self.insider_position.lower(): + base_score *= 1.1 + + self.sentiment_score = max(-1.0, min(1.0, base_score)) +``` + +### TechnicalIndicatorEntity + +**Purpose**: Calculated TA-Lib indicator values with vector embeddings + +```python +class TechnicalIndicatorEntity(Base): + __tablename__ = 'technical_indicators' + + id = Column(Integer, primary_key=True) + symbol = Column(String(10), nullable=False, index=True) + timestamp = Column(DateTime, nullable=False, index=True) + + # Moving Averages + sma_20 = Column(Numeric(10, 2)) + sma_50 = Column(Numeric(10, 2)) + ema_12 = Column(Numeric(10, 2)) + ema_26 = Column(Numeric(10, 2)) + + # Momentum Indicators + rsi_14 = Column(Numeric(5, 2)) + macd = Column(Numeric(10, 4)) + macd_signal = Column(Numeric(10, 4)) + macd_histogram = Column(Numeric(10, 4)) + + # Volatility Indicators + bollinger_upper = Column(Numeric(10, 2)) + bollinger_lower = Column(Numeric(10, 2)) + atr_14 = Column(Numeric(10, 4)) + + # Volume Indicators + obv = Column(Numeric(15, 0)) + volume_sma_20 = Column(Numeric(15, 0)) + + # Pattern Recognition (0-100 scores) + pattern_doji = Column(Integer) + pattern_hammer = Column(Integer) + pattern_engulfing = Column(Integer) + + # Vector embeddings for RAG pattern matching + indicator_pattern_embedding = Column(Vector(384)) + + @classmethod + def calculate_from_ohlc(cls, symbol: str, ohlc_data: pd.DataFrame) -> List['TechnicalIndicatorEntity']: + """Calculate all technical indicators from OHLC data""" + import talib + + indicators = [] + + # Calculate all indicators + sma_20 = talib.SMA(ohlc_data['close'], timeperiod=20) + rsi_14 = talib.RSI(ohlc_data['close'], timeperiod=14) + macd, macd_signal, macd_hist = talib.MACD(ohlc_data['close']) + # ... calculate all indicators + + for i, timestamp in enumerate(ohlc_data.index): + if pd.notna(sma_20.iloc[i]): # Only create records with valid data + indicators.append(cls( + symbol=symbol, + timestamp=timestamp, + sma_20=sma_20.iloc[i], + rsi_14=rsi_14.iloc[i], + macd=macd.iloc[i], + macd_signal=macd_signal.iloc[i], + macd_histogram=macd_hist.iloc[i] + # ... 
set all calculated values
+                ))
+
+        return indicators
+```
+
+## Database Design
+
+### PostgreSQL + TimescaleDB + pgvectorscale Schema
+
+```sql
+-- Enable extensions
+CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;
+CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;
+
+-- Market Data (TimescaleDB hypertable)
+CREATE TABLE market_data (
+    id SERIAL,
+    symbol VARCHAR(10) NOT NULL,
+    timestamp TIMESTAMPTZ NOT NULL,
+    open_price DECIMAL(10,2) NOT NULL,
+    high_price DECIMAL(10,2) NOT NULL,
+    low_price DECIMAL(10,2) NOT NULL,
+    close_price DECIMAL(10,2) NOT NULL,
+    volume BIGINT NOT NULL,
+    adjusted_close DECIMAL(10,2) NOT NULL,
+    technical_pattern_embedding vector(384),
+    price_movement_embedding vector(384),
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+    -- Hypertables require the partitioning column in every unique constraint
+    PRIMARY KEY (id, timestamp),
+    -- Dedupe key used by the daily upsert pipeline
+    UNIQUE (symbol, timestamp)
+);
+
+-- Convert to TimescaleDB hypertable
+SELECT create_hypertable('market_data', 'timestamp', chunk_time_interval => INTERVAL '1 day');
+
+-- Indexes for performance
+CREATE INDEX idx_market_data_symbol_time ON market_data (symbol, timestamp DESC);
+CREATE INDEX idx_market_data_symbol ON market_data (symbol);
+
+-- Vector indexes for RAG
+CREATE INDEX idx_market_data_technical_embedding
+    ON market_data USING diskann (technical_pattern_embedding);
+CREATE INDEX idx_market_data_price_embedding
+    ON market_data USING diskann (price_movement_embedding);
+
+-- Fundamental Data
+CREATE TABLE fundamental_data (
+    id SERIAL PRIMARY KEY,
+    symbol VARCHAR(10) NOT NULL,
+    report_date TIMESTAMPTZ NOT NULL,
+    period_type VARCHAR(10) NOT NULL,
+
+    -- Balance Sheet
+    total_assets DECIMAL(15,2),
+    total_liabilities DECIMAL(15,2),
+    shareholders_equity DECIMAL(15,2),
+
+    -- Income Statement
+    total_revenue DECIMAL(15,2),
+    net_income DECIMAL(15,2),
+    earnings_per_share DECIMAL(8,4),
+
+    -- Cash Flow
+    operating_cash_flow DECIMAL(15,2),
+    capital_expenditures DECIMAL(15,2),
+    free_cash_flow DECIMAL(15,2),
+
+    -- Ratios
+    pe_ratio DECIMAL(8,2),
+    pb_ratio DECIMAL(8,2),
+    roe DECIMAL(8,4),
+    roa DECIMAL(8,4),
+    debt_to_equity DECIMAL(8,4),
+
+    -- RAG embedding
+    financial_health_embedding vector(384),
+
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+
+    UNIQUE(symbol, report_date, period_type)
+);
+
+CREATE INDEX idx_fundamental_symbol_date ON fundamental_data (symbol, report_date DESC);
+CREATE INDEX idx_fundamental_embedding ON fundamental_data USING diskann (financial_health_embedding);
+
+-- Insider Data
+CREATE TABLE insider_data (
+    id SERIAL PRIMARY KEY,
+    symbol VARCHAR(10) NOT NULL,
+    transaction_date TIMESTAMPTZ NOT NULL,
+    insider_name VARCHAR(200) NOT NULL,
+    insider_position VARCHAR(100),
+    transaction_type VARCHAR(20) NOT NULL,
+    shares_traded INTEGER NOT NULL,
+    transaction_price DECIMAL(10,2),
+    shares_owned_after INTEGER,
+    transaction_value DECIMAL(15,2),
+    sentiment_score DECIMAL(3,2),
+    transaction_pattern_embedding vector(384),
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX idx_insider_symbol_date ON insider_data (symbol, transaction_date DESC);
+CREATE INDEX idx_insider_embedding ON insider_data USING diskann (transaction_pattern_embedding);
+
+-- Technical Indicators (TimescaleDB hypertable)
+CREATE TABLE technical_indicators (
+    id SERIAL,
+    symbol VARCHAR(10) NOT NULL,
+    timestamp TIMESTAMPTZ NOT NULL,
+
+    -- Moving Averages
+    sma_20 DECIMAL(10,2),
+    sma_50 DECIMAL(10,2),
+    ema_12 DECIMAL(10,2),
+    ema_26 DECIMAL(10,2),
+
+    -- Momentum
+    rsi_14 DECIMAL(5,2),
+    macd DECIMAL(10,4),
macd_signal DECIMAL(10,4),
+    macd_histogram DECIMAL(10,4),
+
+    -- Volatility
+    bollinger_upper DECIMAL(10,2),
+    bollinger_lower DECIMAL(10,2),
+    atr_14 DECIMAL(10,4),
+
+    -- Volume
+    obv DECIMAL(15,0),
+    volume_sma_20 DECIMAL(15,0),
+
+    -- Patterns
+    pattern_doji INTEGER,
+    pattern_hammer INTEGER,
+    pattern_engulfing INTEGER,
+
+    -- RAG embedding
+    indicator_pattern_embedding vector(384),
+
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+    -- Hypertables require the partitioning column in the primary key
+    PRIMARY KEY (id, timestamp)
+);
+
+SELECT create_hypertable('technical_indicators', 'timestamp', chunk_time_interval => INTERVAL '1 day');
+
+CREATE INDEX idx_technical_symbol_time ON technical_indicators (symbol, timestamp DESC);
+CREATE INDEX idx_technical_embedding ON technical_indicators USING diskann (indicator_pattern_embedding);
+```
+
+### Migration Strategy Scripts
+
+```python
+# migrations/001_create_market_data_tables.py
+
+from alembic import op
+import sqlalchemy as sa
+from pgvector.sqlalchemy import Vector
+
+def upgrade():
+    # Create market_data table
+    op.create_table('market_data',
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('symbol', sa.String(10), nullable=False),
+        sa.Column('timestamp', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('open_price', sa.Numeric(10, 2), nullable=False),
+        sa.Column('high_price', sa.Numeric(10, 2), nullable=False),
+        sa.Column('low_price', sa.Numeric(10, 2), nullable=False),
+        sa.Column('close_price', sa.Numeric(10, 2), nullable=False),
+        sa.Column('volume', sa.BigInteger(), nullable=False),
+        sa.Column('adjusted_close', sa.Numeric(10, 2), nullable=False),
+        sa.Column('technical_pattern_embedding', Vector(384), nullable=True),
+        sa.Column('price_movement_embedding', Vector(384), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=True),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=True),
+        # The time column must be part of the primary key on a hypertable
+        sa.PrimaryKeyConstraint('id', 'timestamp'),
+        sa.UniqueConstraint('symbol', 'timestamp')
+    )
+
+    # Convert to hypertable
+    op.execute("SELECT create_hypertable('market_data', 'timestamp', chunk_time_interval => INTERVAL '1 day');")
+
+    # Create indexes
+    op.create_index('idx_market_data_symbol_time', 'market_data', ['symbol', 'timestamp'])
+    op.create_index('idx_market_data_symbol', 'market_data', ['symbol'])
+
+def downgrade():
+    op.drop_table('market_data')
+```
+
+## API Preservation
+
+### 100% Compatible Service Layer
+
+**MarketDataService**: Preserve all existing methods with PostgreSQL backend
+
+```python
+from typing import List, Dict, Any, Optional
+import pandas as pd
+from datetime import datetime, timedelta
+
+class MarketDataService:
+    """API-compatible service with PostgreSQL backend"""
+
+    def __init__(self, repository: MarketDataRepository):
+        self.repository = repository
+
+    async def get_ohlc_data(self, symbol: str, start_date: str, end_date: str) -> pd.DataFrame:
+        """Get OHLC data - 100% API compatible"""
+        entities = await self.repository.get_ohlc_data(symbol, start_date, end_date)
+
+        # Transform to same DataFrame format as CSV version
+        return pd.DataFrame([
+            {
+                'timestamp': entity.timestamp,
+                'open': float(entity.open_price),
+                'high': float(entity.high_price),
+                'low': float(entity.low_price),
+                'close': float(entity.close_price),
+                'volume': entity.volume,
+                'adj_close': float(entity.adjusted_close)
+            }
+            for entity in entities
+        ]).set_index('timestamp')
+
+    async def get_technical_indicators(self, symbol: str, start_date: str, end_date: str) -> Dict[str, List[Optional[float]]]:
"""Get all technical indicators - 100% API compatible""" + indicators = await self.repository.get_technical_indicators(symbol, start_date, end_date) + + return { + 'sma_20': [float(ind.sma_20) if ind.sma_20 else None for ind in indicators], + 'rsi_14': [float(ind.rsi_14) if ind.rsi_14 else None for ind in indicators], + 'macd': [float(ind.macd) if ind.macd else None for ind in indicators], + 'macd_signal': [float(ind.macd_signal) if ind.macd_signal else None for ind in indicators], + # ... all indicators + } + + async def get_trading_style_preset(self, style: str, symbol: str, lookback_days: int = 30) -> Dict[str, Any]: + """Get trading style analysis - 100% API compatible""" + end_date = datetime.now() + start_date = end_date - timedelta(days=lookback_days) + + ohlc_data = await self.get_ohlc_data(symbol, start_date.isoformat(), end_date.isoformat()) + indicators = await self.get_technical_indicators(symbol, start_date.isoformat(), end_date.isoformat()) + + if style == 'momentum': + return await self._analyze_momentum(ohlc_data, indicators) + elif style == 'mean_reversion': + return await self._analyze_mean_reversion(ohlc_data, indicators) + elif style == 'breakout': + return await self._analyze_breakout(ohlc_data, indicators) + # ... all trading styles + + async def _analyze_momentum(self, ohlc_data: pd.DataFrame, indicators: Dict) -> Dict[str, Any]: + """Momentum analysis with RAG enhancement""" + latest_rsi = indicators['rsi_14'][-1] if indicators['rsi_14'] else 50 + latest_macd = indicators['macd'][-1] if indicators['macd'] else 0 + + # RAG: Find similar momentum patterns + similar_patterns = await self.repository.find_similar_momentum_patterns( + latest_rsi, latest_macd, limit=10 + ) + + return { + 'signal': 'BUY' if latest_rsi > 70 and latest_macd > 0 else 'HOLD', + 'confidence': 0.85, + 'indicators': { + 'rsi': latest_rsi, + 'macd': latest_macd + }, + 'similar_patterns': [p.to_dict() for p in similar_patterns], + 'rag_enhanced': True + } +``` + +**FundamentalDataService**: Complete API preservation + +```python +class FundamentalDataService: + """API-compatible fundamental analysis with PostgreSQL backend""" + + def __init__(self, repository: FundamentalDataRepository): + self.repository = repository + + async def get_financial_ratios(self, symbol: str, period_type: str = 'Q') -> Dict[str, float]: + """Get latest financial ratios - 100% API compatible""" + latest_data = await self.repository.get_latest_fundamental_data(symbol, period_type) + + if not latest_data: + return {} + + return { + 'pe_ratio': float(latest_data.pe_ratio) if latest_data.pe_ratio else None, + 'pb_ratio': float(latest_data.pb_ratio) if latest_data.pb_ratio else None, + 'roe': float(latest_data.roe) if latest_data.roe else None, + 'roa': float(latest_data.roa) if latest_data.roa else None, + 'debt_to_equity': float(latest_data.debt_to_equity) if latest_data.debt_to_equity else None + } + + async def analyze_financial_health(self, symbol: str) -> Dict[str, Any]: + """Financial health analysis with RAG - 100% API compatible""" + latest_data = await self.repository.get_latest_fundamental_data(symbol) + historical_data = await self.repository.get_fundamental_history(symbol, quarters=8) + + # RAG: Find companies with similar financial profiles + similar_companies = await self.repository.find_similar_financial_profiles( + latest_data.financial_health_embedding, limit=10 + ) + + return { + 'health_score': self._calculate_health_score(latest_data, historical_data), + 'trend_analysis': 
self._analyze_trends(historical_data), + 'peer_comparison': [comp.to_dict() for comp in similar_companies], + 'rag_enhanced': True + } +``` + +**InsiderDataService**: Complete API preservation + +```python +class InsiderDataService: + """API-compatible insider analysis with PostgreSQL backend""" + + def __init__(self, repository: InsiderDataRepository): + self.repository = repository + + async def get_recent_insider_activity(self, symbol: str, days: int = 90) -> List[Dict[str, Any]]: + """Get recent insider transactions - 100% API compatible""" + end_date = datetime.now() + start_date = end_date - timedelta(days=days) + + transactions = await self.repository.get_insider_transactions(symbol, start_date, end_date) + + return [ + { + 'insider_name': trans.insider_name, + 'position': trans.insider_position, + 'transaction_date': trans.transaction_date.isoformat(), + 'transaction_type': trans.transaction_type, + 'shares_traded': trans.shares_traded, + 'transaction_price': float(trans.transaction_price) if trans.transaction_price else None, + 'transaction_value': float(trans.transaction_value) if trans.transaction_value else None, + 'sentiment_score': float(trans.sentiment_score) if trans.sentiment_score else None + } + for trans in transactions + ] + + async def analyze_insider_sentiment(self, symbol: str, days: int = 180) -> Dict[str, Any]: + """Insider sentiment analysis with RAG - 100% API compatible""" + transactions = await self.get_recent_insider_activity(symbol, days) + + # RAG: Find similar insider activity patterns + similar_patterns = await self.repository.find_similar_insider_patterns( + symbol, days, limit=10 + ) + + buy_volume = sum(t['shares_traded'] for t in transactions if t['transaction_type'] == 'Buy') + sell_volume = sum(t['shares_traded'] for t in transactions if t['transaction_type'] == 'Sell') + + net_sentiment = buy_volume - sell_volume + + return { + 'net_sentiment': net_sentiment, + 'buy_transactions': len([t for t in transactions if t['transaction_type'] == 'Buy']), + 'sell_transactions': len([t for t in transactions if t['transaction_type'] == 'Sell']), + 'average_sentiment_score': sum(t['sentiment_score'] for t in transactions if t['sentiment_score']) / len(transactions) if transactions else 0, + 'similar_patterns': [p.to_dict() for p in similar_patterns], + 'rag_enhanced': True + } +``` + +## Component Architecture + +### Repository Migration Pattern + +**AsyncRepository with PostgreSQL Operations** + +```python +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker +from sqlalchemy import select, and_, desc +from typing import List, Optional +from datetime import datetime + +class MarketDataRepository: + """Async PostgreSQL repository with RAG capabilities""" + + def __init__(self, session_factory: async_sessionmaker[AsyncSession]): + self.session_factory = session_factory + + async def get_ohlc_data(self, symbol: str, start_date: str, end_date: str) -> List[MarketDataEntity]: + """Get OHLC data with sub-100ms performance""" + async with self.session_factory() as session: + stmt = select(MarketDataEntity).where( + and_( + MarketDataEntity.symbol == symbol, + MarketDataEntity.timestamp >= datetime.fromisoformat(start_date), + MarketDataEntity.timestamp <= datetime.fromisoformat(end_date) + ) + ).order_by(MarketDataEntity.timestamp) + + result = await session.execute(stmt) + return result.scalars().all() + + async def save_ohlc_batch(self, entities: List[MarketDataEntity]) -> None: + """Batch insert with conflict resolution""" + async with 
self.session_factory() as session:
+            if not entities:
+                return
+
+            # Resolve conflicts on re-ingestion: skip rows already present
+            # for the (symbol, timestamp) dedupe key defined in the schema
+            from sqlalchemy.dialects.postgresql import insert as pg_insert
+
+            stmt = pg_insert(MarketDataEntity).values([
+                {
+                    'symbol': e.symbol, 'timestamp': e.timestamp,
+                    'open_price': e.open_price, 'high_price': e.high_price,
+                    'low_price': e.low_price, 'close_price': e.close_price,
+                    'volume': e.volume, 'adjusted_close': e.adjusted_close,
+                }
+                for e in entities
+            ]).on_conflict_do_nothing(index_elements=['symbol', 'timestamp'])
+            await session.execute(stmt)
+            await session.commit()
+
+    async def find_similar_momentum_patterns(self, rsi: float, macd: float, limit: int = 10) -> List[TechnicalIndicatorEntity]:
+        """RAG: Find similar technical patterns using vector similarity"""
+        target_embedding = self._encode_momentum_pattern(rsi, macd)
+
+        async with self.session_factory() as session:
+            # Using pgvectorscale cosine similarity
+            stmt = select(TechnicalIndicatorEntity).order_by(
+                TechnicalIndicatorEntity.indicator_pattern_embedding.cosine_distance(target_embedding)
+            ).limit(limit)
+
+            result = await session.execute(stmt)
+            return result.scalars().all()
+
+    def _encode_momentum_pattern(self, rsi: float, macd: float) -> List[float]:
+        """Encode momentum indicators to vector for similarity search"""
+        from sentence_transformers import SentenceTransformer
+
+        # NOTE: cache this model at instance or module level in production;
+        # reloading it on every call is expensive
+        model = SentenceTransformer('all-MiniLM-L6-v2')
+        pattern_text = f"RSI: {rsi:.2f}, MACD: {macd:.4f}, momentum pattern"
+        return model.encode(pattern_text).tolist()
+```
+
+### Migration Data Processing
+
+**4-Phase Migration Strategy**
+
+```python
+import glob
+from datetime import datetime
+from typing import Dict
+
+import pandas as pd
+
+class MarketDataMigrator:
+    """Migrate from CSV files to PostgreSQL with data validation"""
+
+    def __init__(self, csv_data_path: str, repository: MarketDataRepository):
+        self.csv_data_path = csv_data_path
+        self.repository = repository
+
+    async def migrate_all_data(self) -> Dict[str, int]:
+        """Execute 4-phase migration strategy"""
+        results = {}
+
+        # Phase 1: Market Data (OHLC)
+        results['market_data'] = await self._migrate_market_data()
+
+        # Phase 2: Fundamental Data
+        results['fundamental_data'] = await self._migrate_fundamental_data()
+
+        # Phase 3: Insider Data
+        results['insider_data'] = await self._migrate_insider_data()
+
+        # Phase 4: Calculate Technical Indicators
+        results['technical_indicators'] = await self._calculate_technical_indicators()
+
+        return results
+
+    async def _migrate_market_data(self) -> int:
+        """Migrate OHLC data from CSV files"""
+        csv_files = glob.glob(f"{self.csv_data_path}/market_data/*.csv")
+        total_records = 0
+
+        for csv_file in csv_files:
+            symbol = self._extract_symbol_from_filename(csv_file)
+            df = pd.read_csv(csv_file)
+
+            # Transform to entities
+            entities = []
+            for _, row in df.iterrows():
+                entity = MarketDataEntity.from_csv_record({
+                    'symbol': symbol,
+                    'timestamp': row['Date'],
+                    'open': row['Open'],
+                    'high': row['High'],
+                    'low': row['Low'],
+                    'close': row['Close'],
+                    'volume': row['Volume'],
+                    'adj_close': row['Adj Close']
+                })
+
+                if entity.validate():
+                    entities.append(entity)
+
+            # Batch insert
+            await self.repository.save_ohlc_batch(entities)
+            total_records += len(entities)
+
+            print(f"Migrated {len(entities)} records for {symbol}")
+
+        return total_records
+
+    async def _calculate_technical_indicators(self) -> int:
+        """Calculate and store technical indicators for all symbols"""
+        symbols = await self.repository.get_all_symbols()
+        total_indicators = 0
+
+        for symbol in symbols:
+            # Get OHLC data
+            ohlc_data = await self.repository.get_ohlc_data(
+                symbol, "2020-01-01", datetime.now().isoformat()
+            )
+
+            # Convert to DataFrame
+            df = pd.DataFrame([{
+                'timestamp': entity.timestamp,
+                'close': float(entity.close_price),
+                'high': float(entity.high_price),
+                'low': float(entity.low_price),
+                'volume': entity.volume
+            } for entity in ohlc_data])
+            df.set_index('timestamp', inplace=True)
+
+            # Calculate indicators
+            indicators = TechnicalIndicatorEntity.calculate_from_ohlc(symbol, df)
+
+            # Generate embeddings
+            for indicator in
indicators:
+                indicator.indicator_pattern_embedding = self._generate_indicator_embedding(indicator)
+
+            # Save indicators
+            await self.repository.save_technical_indicators(indicators)
+            total_indicators += len(indicators)
+
+            print(f"Calculated {len(indicators)} indicators for {symbol}")
+
+        return total_indicators
+```
+
+## RAG Integration
+
+### Vector Embeddings for Historical Pattern Matching
+
+**Embedding Generation Strategy**
+
+```python
+from typing import List
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+class MarketDataEmbeddingService:
+    """Generate vector embeddings for RAG pattern matching"""
+
+    def __init__(self):
+        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+
+    def generate_technical_pattern_embedding(self, indicator_data: TechnicalIndicatorEntity) -> List[float]:
+        """Generate embedding for technical analysis patterns"""
+        pattern_description = f"""
+        Technical Analysis Pattern:
+        RSI: {indicator_data.rsi_14:.2f}
+        MACD: {indicator_data.macd:.4f}, Signal: {indicator_data.macd_signal:.4f}
+        SMA 20: {indicator_data.sma_20:.2f}, SMA 50: {indicator_data.sma_50:.2f}
+        Bollinger Bands: Upper {indicator_data.bollinger_upper:.2f}, Lower {indicator_data.bollinger_lower:.2f}
+        Volume: Above average {indicator_data.volume_sma_20 > 0}
+        Pattern: {'Bullish' if indicator_data.rsi_14 > 50 and indicator_data.macd > indicator_data.macd_signal else 'Bearish'}
+        """
+
+        return self.model.encode(pattern_description.strip()).tolist()
+
+    def generate_price_movement_embedding(self, market_data: MarketDataEntity) -> List[float]:
+        """Generate embedding for price movement patterns"""
+        price_change = (market_data.close_price - market_data.open_price) / market_data.open_price * 100
+        volatility = (market_data.high_price - market_data.low_price) / market_data.open_price * 100
+
+        movement_description = f"""
+        Price Movement Pattern:
+        Price Change: {price_change:.2f}%
+        Intraday Volatility: {volatility:.2f}%
+        Volume Profile: {'High' if market_data.volume > 1000000 else 'Normal'}
+        Price Level: {'Above' if market_data.close_price > market_data.open_price else 'Below'} opening
+        Candle Type: {'Green' if market_data.close_price >= market_data.open_price else 'Red'}
+        """
+
+        return self.model.encode(movement_description.strip()).tolist()
+
+    def generate_financial_health_embedding(self, fundamental_data: FundamentalDataEntity) -> List[float]:
+        """Generate embedding for financial health patterns"""
+        # Format optional ratios outside the f-string so missing values render as 'N/A'
+        pe = f"{fundamental_data.pe_ratio:.2f}" if fundamental_data.pe_ratio else "N/A"
+        roe = f"{fundamental_data.roe * 100:.2f}%" if fundamental_data.roe else "N/A"
+        debt_to_equity = f"{fundamental_data.debt_to_equity:.2f}" if fundamental_data.debt_to_equity else "N/A"
+
+        health_description = f"""
+        Financial Health Profile:
+        PE Ratio: {pe}
+        ROE: {roe}
+        Debt to Equity: {debt_to_equity}
+        Revenue Growth: {'Positive' if fundamental_data.total_revenue and fundamental_data.total_revenue > 0 else 'Unknown'}
+        Profitability: {'Profitable' if fundamental_data.net_income and fundamental_data.net_income > 0 else 'Loss'}
+        Financial Strength: {'Strong' if fundamental_data.pe_ratio and fundamental_data.pe_ratio < 20 else 'Average'}
+        """
+
+        return self.model.encode(health_description.strip()).tolist()
+```
+
+**RAG Query Service**
+
+```python
+from datetime import datetime
+from typing import Any, Dict
+
+import numpy as np
+
+class MarketDataRAGService:
+    """RAG-powered market analysis with vector similarity search"""
+
+    def __init__(self, repository: MarketDataRepository):
+        self.repository = repository
+        self.embedding_service = MarketDataEmbeddingService()
+
+    async def find_similar_market_conditions(
+        self,
+        symbol: str,
+        current_date:
datetime,
+        similarity_threshold: float = 0.8,
+        limit: int = 10
+    ) -> Dict[str, Any]:
+        """Find historically similar market conditions for decision support"""
+
+        # Get current market state
+        current_ohlc = await self.repository.get_latest_ohlc(symbol, current_date)
+        current_indicators = await self.repository.get_latest_indicators(symbol, current_date)
+        current_fundamentals = await self.repository.get_latest_fundamentals(symbol)
+
+        # Generate query embeddings
+        current_pattern_embedding = self.embedding_service.generate_technical_pattern_embedding(current_indicators)
+
+        # Find similar patterns
+        similar_patterns = await self.repository.find_similar_patterns(
+            current_pattern_embedding,
+            similarity_threshold,
+            limit
+        )
+
+        # Analyze outcomes of similar patterns
+        pattern_outcomes = []
+        for pattern in similar_patterns:
+            # Get price movement 5-10 days after similar pattern
+            future_price = await self.repository.get_price_after_date(
+                pattern.symbol,
+                pattern.timestamp,
+                days=7
+            )
+
+            if future_price:
+                price_change = (future_price.close_price - pattern.close_price) / pattern.close_price
+                pattern_outcomes.append({
+                    'symbol': pattern.symbol,
+                    'date': pattern.timestamp,
+                    'similarity_score': pattern.similarity_score,
+                    'future_return': float(price_change),
+                    'pattern_type': self._classify_pattern(pattern)
+                })
+
+        return {
+            'current_symbol': symbol,
+            'analysis_date': current_date,
+            'similar_patterns_found': len(similar_patterns),
+            'historical_outcomes': pattern_outcomes,
+            'average_return': np.mean([p['future_return'] for p in pattern_outcomes]) if pattern_outcomes else 0,
+            'success_rate': len([p for p in pattern_outcomes if p['future_return'] > 0]) / len(pattern_outcomes) if pattern_outcomes else 0,
+            'confidence_score': self._calculate_confidence(pattern_outcomes)
+        }
+
+    async def get_peer_analysis(self, symbol: str) -> Dict[str, Any]:
+        """Find peer companies with similar financial profiles"""
+        fundamentals = await self.repository.get_latest_fundamentals(symbol)
+
+        if not fundamentals:
+            return {'error': 'No fundamental data available'}
+
+        # Find companies with similar financial health embeddings
+        similar_companies = await self.repository.find_similar_financial_profiles(
+            fundamentals.financial_health_embedding,
+            exclude_symbol=symbol,
+            limit=10
+        )
+
+        # Get recent performance comparison
+        peer_performance = []
+        for peer in similar_companies:
+            recent_performance = await self.repository.get_recent_performance(peer.symbol, days=30)
+            peer_performance.append({
+                'symbol': peer.symbol,
+                'similarity_score': peer.similarity_score,
+                'recent_return': recent_performance.get('return', 0),
+                'volatility': recent_performance.get('volatility', 0),
+                'pe_ratio': float(peer.pe_ratio) if peer.pe_ratio else None
+            })
+
+        # Compare the target against its peers (not against the last peer in the loop)
+        target_performance = await self.repository.get_recent_performance(symbol, days=30)
+        peer_average_return = np.mean([p['recent_return'] for p in peer_performance]) if peer_performance else 0
+
+        return {
+            'target_symbol': symbol,
+            'peer_companies': peer_performance,
+            'peer_average_return': peer_average_return,
+            'relative_performance': 'Above average' if target_performance.get('return', 0) > peer_average_return else 'Below average'
+        }
+```
+
+## Migration Strategy
+
+### 4-Phase Migration Approach
+
+**Phase 1: Database Infrastructure Setup**
+```bash
+# 1. Set up PostgreSQL with extensions
+docker-compose up -d postgres
+
+# 2. Run database migrations
+alembic upgrade head
+
+# 3.
Verify TimescaleDB and pgvectorscale extensions +psql $DATABASE_URL -c "SELECT * FROM pg_extension WHERE extname IN ('timescaledb', 'vectorscale');" +``` + +**Phase 2: Data Migration with Validation** +```python +# Migration script: migrate_market_data.py + +import asyncio +from tradingagents.domains.marketdata.migration.migrator import MarketDataMigrator +from tradingagents.domains.marketdata.repository import MarketDataRepository + +async def main(): + # Initialize components + repository = MarketDataRepository(session_factory) + migrator = MarketDataMigrator("./data/market_data", repository) + + # Execute migration + print("Starting MarketData migration...") + results = await migrator.migrate_all_data() + + print(f"Migration completed:") + print(f" Market Data: {results['market_data']} records") + print(f" Fundamental Data: {results['fundamental_data']} records") + print(f" Insider Data: {results['insider_data']} records") + print(f" Technical Indicators: {results['technical_indicators']} records") + + # Validate migration + validation_results = await migrator.validate_migration() + print(f"Validation: {validation_results}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Phase 3: Service Layer Migration** +```python +# Switch services to PostgreSQL repositories +# Update dependency injection in service factory + +class ServiceFactory: + def __init__(self, session_factory: async_sessionmaker[AsyncSession]): + self.session_factory = session_factory + + def create_market_data_service(self) -> MarketDataService: + repository = MarketDataRepository(self.session_factory) + return MarketDataService(repository) + + def create_fundamental_data_service(self) -> FundamentalDataService: + repository = FundamentalDataRepository(self.session_factory) + return FundamentalDataService(repository) + + def create_insider_data_service(self) -> InsiderDataService: + repository = InsiderDataRepository(self.session_factory) + return InsiderDataService(repository) +``` + +**Phase 4: RAG Enhancement and Testing** +```python +# Generate embeddings for existing data +async def enhance_with_rag(): + embedding_service = MarketDataEmbeddingService() + + # Generate embeddings for all technical indicators + indicators = await repository.get_all_technical_indicators() + + for indicator in indicators: + embedding = embedding_service.generate_technical_pattern_embedding(indicator) + await repository.update_indicator_embedding(indicator.id, embedding) + + print(f"Generated embeddings for {len(indicators)} technical indicators") + + # Test RAG functionality + rag_service = MarketDataRAGService(repository) + test_results = await rag_service.find_similar_market_conditions("AAPL", datetime.now()) + + print(f"RAG test completed: {test_results}") +``` + +## Testing Strategy + +### Migration Testing + +**Data Integrity Tests** +```python +class TestMarketDataMigration: + """Validate PostgreSQL migration maintains data integrity""" + + async def test_ohlc_data_accuracy(self): + """Verify OHLC data matches CSV files exactly""" + # Load original CSV data + original_df = pd.read_csv("./data/market_data/AAPL.csv") + + # Get PostgreSQL data + postgres_entities = await self.repository.get_ohlc_data("AAPL", "2020-01-01", "2024-01-01") + postgres_df = pd.DataFrame([entity.to_dict() for entity in postgres_entities]) + + # Compare datasets + assert len(original_df) == len(postgres_df) + assert np.allclose(original_df['Close'].values, postgres_df['close_price'].values) + assert np.allclose(original_df['Volume'].values, 
postgres_df['volume'].values) + + async def test_performance_improvement(self): + """Verify 10x performance improvement""" + import time + + # Test PostgreSQL query time + start = time.time() + postgres_data = await self.repository.get_ohlc_data("AAPL", "2023-01-01", "2024-01-01") + postgres_time = time.time() - start + + # Historical CSV time (from benchmarks) + csv_baseline_time = 0.500 # 500ms baseline + + assert postgres_time < 0.100 # Sub-100ms requirement + assert postgres_time < csv_baseline_time / 10 # 10x improvement + + async def test_rag_functionality(self): + """Verify RAG vector similarity search works""" + rag_service = MarketDataRAGService(self.repository) + + # Test pattern matching + results = await rag_service.find_similar_market_conditions("AAPL", datetime(2023, 6, 1)) + + assert len(results['historical_outcomes']) > 0 + assert 'confidence_score' in results + assert results['average_return'] is not None +``` + +**API Compatibility Tests** +```python +class TestAPICompatibility: + """Ensure 100% API compatibility after migration""" + + async def test_market_data_service_api(self): + """Verify MarketDataService API unchanged""" + service = MarketDataService(self.repository) + + # Test all existing methods + ohlc_data = await service.get_ohlc_data("AAPL", "2023-01-01", "2023-12-31") + assert isinstance(ohlc_data, pd.DataFrame) + assert list(ohlc_data.columns) == ['open', 'high', 'low', 'close', 'volume', 'adj_close'] + + indicators = await service.get_technical_indicators("AAPL", "2023-01-01", "2023-12-31") + assert isinstance(indicators, dict) + assert 'sma_20' in indicators + assert 'rsi_14' in indicators + + momentum_analysis = await service.get_trading_style_preset("momentum", "AAPL") + assert 'signal' in momentum_analysis + assert 'confidence' in momentum_analysis + + async def test_fundamental_service_api(self): + """Verify FundamentalDataService API unchanged""" + service = FundamentalDataService(self.repository) + + ratios = await service.get_financial_ratios("AAPL") + assert isinstance(ratios, dict) + assert 'pe_ratio' in ratios + + health = await service.analyze_financial_health("AAPL") + assert 'health_score' in health + assert 'trend_analysis' in health +``` + +**Performance Validation** +```python +class TestPerformanceRequirements: + """Validate performance requirements are met""" + + async def test_sub_100ms_queries(self): + """Verify sub-100ms query performance""" + import time + + queries = [ + lambda: self.repository.get_ohlc_data("AAPL", "2023-12-01", "2023-12-31"), + lambda: self.repository.get_technical_indicators("AAPL", "2023-12-01", "2023-12-31"), + lambda: self.repository.get_latest_fundamentals("AAPL") + ] + + for query in queries: + start = time.time() + await query() + elapsed = time.time() - start + + assert elapsed < 0.100, f"Query took {elapsed:.3f}s, exceeds 100ms requirement" + + async def test_rag_query_performance(self): + """Verify sub-200ms RAG query performance""" + rag_service = MarketDataRAGService(self.repository) + + start = time.time() + results = await rag_service.find_similar_market_conditions("AAPL", datetime.now()) + elapsed = time.time() - start + + assert elapsed < 0.200, f"RAG query took {elapsed:.3f}s, exceeds 200ms requirement" + + async def test_concurrent_access(self): + """Verify concurrent agent access performance""" + import asyncio + + async def concurrent_query(symbol: str): + return await self.repository.get_ohlc_data(symbol, "2023-12-01", "2023-12-31") + + # Simulate 10 concurrent agents + tasks = 
[concurrent_query(f"SYMBOL_{i}") for i in range(10)]
+
+        import time
+        start = time.time()
+        results = await asyncio.gather(*tasks)
+        elapsed = time.time() - start
+
+        assert elapsed < 1.0, f"Concurrent queries took {elapsed:.3f}s, too slow for agent workload"
+        assert len(results) == 10
+```
+
+## Implementation Guidance
+
+### Step-by-Step Implementation
+
+**Week 1: Database Setup and Schema Migration**
+1. Set up PostgreSQL with TimescaleDB and pgvectorscale extensions
+2. Create database schemas with proper indexing
+3. Run Alembic migrations to create all tables
+4. Test hypertable creation and vector index performance
+
+**Week 2: Entity Models and Repository Layer**
+1. Implement all SQLAlchemy entity models with business logic
+2. Create async repository classes with PostgreSQL operations
+3. Implement batch operations for high-performance data loading
+4. Add vector similarity search capabilities
+
+**Week 3: Data Migration and Validation**
+1. Build CSV-to-PostgreSQL migration scripts
+2. Migrate all historical data with integrity validation
+3. Generate vector embeddings for all existing data
+4. Validate data accuracy and performance benchmarks
+
+**Week 4: Service Layer and API Preservation**
+1. Update service layer to use PostgreSQL repositories
+2. Ensure 100% API compatibility with existing interfaces
+3. Implement RAG-enhanced analysis features
+4. Complete integration testing and performance validation
+
+### Code Organization
+
+```
+tradingagents/domains/marketdata/
+├── entities/
+│   ├── __init__.py
+│   ├── market_data_entity.py
+│   ├── fundamental_data_entity.py
+│   ├── insider_data_entity.py
+│   └── technical_indicator_entity.py
+├── repositories/
+│   ├── __init__.py
+│   ├── market_data_repository.py
+│   ├── fundamental_data_repository.py
+│   └── insider_data_repository.py
+├── services/
+│   ├── __init__.py
+│   ├── market_data_service.py
+│   ├── fundamental_data_service.py
+│   ├── insider_data_service.py
+│   └── embedding_service.py
+├── migration/
+│   ├── __init__.py
+│   ├── migrator.py
+│   └── validation.py
+├── rag/
+│   ├── __init__.py
+│   ├── rag_service.py
+│   └── pattern_matcher.py
+└── clients/
+    ├── __init__.py
+    ├── yfinance_client.py  # Already implemented
+    └── finnhub_client.py   # Already implemented
+```
+
+### Configuration Updates
+
+**Database Configuration**
+```python
+# config/database.py
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
+
+class DatabaseConfig:
+    def __init__(self, database_url: str):
+        # Async engines use AsyncAdaptedQueuePool by default;
+        # the sync QueuePool is rejected by create_async_engine
+        self.engine = create_async_engine(
+            database_url,
+            pool_size=20,
+            max_overflow=30,
+            pool_pre_ping=True,
+            echo=False  # Set True for SQL debugging
+        )
+        self.session_factory = async_sessionmaker(
+            bind=self.engine,
+            expire_on_commit=False,
+            autoflush=True
+        )
+
+    async def health_check(self) -> bool:
+        """Verify database connectivity and extensions"""
+        async with self.session_factory() as session:
+            result = await session.execute(
+                text("SELECT extname FROM pg_extension WHERE extname IN ('timescaledb', 'vectorscale')")
+            )
+            extensions = result.fetchall()
+            return len(extensions) == 2
+```
+
+### Monitoring and Observability
+
+**Performance Monitoring**
+```python
+from datetime import datetime, timedelta
+from typing import Any, Dict
+
+import numpy as np
+
+class MarketDataMetrics:
+    """Monitor PostgreSQL migration performance"""
+
+    def __init__(self):
+        self.query_times = []
+        self.rag_query_times = []
+
+    async def track_query_performance(self, operation: str, execution_time: float):
+        """Track query performance metrics"""
+        self.query_times.append({
'operation': operation, + 'time': execution_time, + 'timestamp': datetime.now() + }) + + # Alert if performance degrades + if execution_time > 0.100: + print(f"WARNING: {operation} took {execution_time:.3f}s, exceeds 100ms SLA") + + def generate_performance_report(self) -> Dict[str, Any]: + """Generate performance analytics report""" + recent_queries = [q for q in self.query_times if q['timestamp'] > datetime.now() - timedelta(hours=1)] + + return { + 'total_queries': len(recent_queries), + 'average_query_time': np.mean([q['time'] for q in recent_queries]), + 'p95_query_time': np.percentile([q['time'] for q in recent_queries], 95), + 'sla_violations': len([q for q in recent_queries if q['time'] > 0.100]), + 'performance_trend': 'stable' # Could implement trending analysis + } +``` + +This technical design provides a comprehensive migration strategy from the 85% complete CSV-based system to a high-performance PostgreSQL + TimescaleDB + pgvectorscale architecture while preserving 100% API compatibility and adding powerful RAG capabilities for historical pattern matching. + +The design emphasizes: +- **Performance**: Sub-100ms queries with TimescaleDB optimization +- **Compatibility**: Zero API changes for existing services +- **Intelligence**: RAG-enhanced analysis with vector similarity search +- **Scalability**: Async PostgreSQL operations for concurrent agent access +- **Quality**: Comprehensive testing strategy for migration validation + +Implementation should follow the 4-phase approach with weekly milestones to ensure smooth migration and immediate performance benefits. diff --git a/docs/specs/MarketData/requirements.json b/docs/specs/MarketData/requirements.json new file mode 100644 index 00000000..169c8693 --- /dev/null +++ b/docs/specs/MarketData/requirements.json @@ -0,0 +1,6 @@ +{ + "raw_user_story": "As a dagster job and AI Agent I want to collect daily OHLC data from yfinance for all my tickers, insider data from FinnHub, and fundamental data from FinnHub so that agents have comprehensive market data for trading decisions", + "raw_criteria": "Daily OHLC data from yfinance for configured tickers, insider trading data from FinnHub API, fundamental data from FinnHub API, all data stored in PostgreSQL with TimescaleDB optimization, agents can query market data for analysis", + "raw_rules": "Daily automated collection, FinnHub API rate limiting compliance, data quality validation, TimescaleDB for time-series optimization", + "raw_scope": "Included: Daily OHLC from yfinance, insider data from FinnHub, fundamental data from FinnHub, PostgreSQL storage, agent integration. Excluded: Real-time data streaming, other data providers beyond yfinance and FinnHub." 
+} \ No newline at end of file diff --git a/docs/specs/MarketData/spec-lite.md b/docs/specs/MarketData/spec-lite.md new file mode 100644 index 00000000..974a1806 --- /dev/null +++ b/docs/specs/MarketData/spec-lite.md @@ -0,0 +1,98 @@ +# MarketData Domain - PostgreSQL Migration (Lite Spec) + +## Migration Overview + +**Project**: 85% complete MarketData domain → PostgreSQL + TimescaleDB + pgvectorscale +**Objective**: 10x performance + RAG capabilities while preserving 100% API compatibility +**Pattern**: Follow news domain PostgreSQL implementation for architectural consistency + +## Key Requirements + +### Performance Targets +- Sub-100ms market data queries (10x improvement from CSV) +- Sub-200ms RAG vector similarity search +- Support 500+ tickers with concurrent agent access + +### API Preservation (Critical) +- **MarketDataService**: All existing methods preserved +- **FundamentalDataService**: Complete compatibility maintained +- **InsiderDataService**: Zero breaking changes +- **20 TA-Lib indicators**: Full functionality preserved + +### Data Sources & Collection +- **yfinance**: Daily OHLC data via Dagster pipelines +- **FinnHub**: Insider transactions + fundamental data +- **TimescaleDB hypertables**: market_data, fundamental_data, insider_data +- **Vector storage**: pgvectorscale for RAG pattern matching + +## Technical Implementation + +### Database Schema (TimescaleDB) +```sql +-- Hypertables for time-series optimization +market_data (symbol, date, ohlc, volume) - 10 year retention +fundamental_data (symbol, report_date, metrics) - 5 year retention +insider_data (symbol, transaction_date, person, shares) - 3 year retention +technical_indicators (symbol, date, values, pattern_embedding) - RAG support +``` + +### Entity Models +- **MarketDataEntity**: OHLC + validation + database conversion +- **FundamentalDataEntity**: Financial statement data +- **InsiderDataEntity**: SEC transaction records +- **TechnicalIndicatorEntity**: Calculated values + vector embeddings + +### Repository Pattern (Async PostgreSQL) +```python +class MarketDataRepository: + async def get_ohlc_data(symbol, start, end) -> List[MarketDataEntity] + async def bulk_upsert_market_data(entities) -> int # Dagster ingestion + async def find_similar_patterns(embedding, limit) -> List[Dict] # RAG +``` + +### Service Layer (100% Compatible) +```python +class MarketDataService: + async def get_stock_data(symbol, period) -> pd.DataFrame # Preserved API + async def calculate_technical_indicators(symbol, indicators) -> Dict # 20 TA-Lib + async def get_trading_style_preset(style) -> Dict # Existing presets +``` + +## Migration Strategy + +### Phase 1: Entities & Schema +1. Create SQLAlchemy entities following news domain patterns +2. Setup TimescaleDB hypertables with proper indexing +3. Configure pgvectorscale for vector embeddings + +### Phase 2: Repository Migration +1. Implement async PostgreSQL repositories (mirror NewsRepository pattern) +2. Create data migration scripts (CSV → PostgreSQL) +3. Add vector embedding generation for RAG + +### Phase 3: Service Preservation +1. Update services to use PostgreSQL repositories +2. Maintain exact API signatures and return types +3. Add RAG-enhanced pattern analysis capabilities + +### Phase 4: Integration & Testing +1. Real PostgreSQL tests for repositories +2. Preserve pytest-vcr for YFinanceClient/FinnhubClient +3. 
Validate 100% API compatibility with existing agents + +## Ready Dependencies +- YFinanceClient + FinnhubClient (fully implemented) +- PostgreSQL + TimescaleDB + pgvectorscale (established) +- DatabaseManager async operations (available) +- News domain patterns for consistency (reference implementation) + +## Success Metrics +- **Performance**: 10x query improvement, sub-100ms operations +- **Compatibility**: Zero API breaking changes, seamless agent migration +- **Scalability**: 500+ concurrent tickers, efficient bulk ingestion +- **Quality**: 85%+ test coverage, comprehensive validation + +## Implementation Approach +**Follow news domain patterns** → Create entities → Migrate repositories → Preserve service APIs → Enhance with vector RAG → Integrate Dagster pipelines + +This migration provides the high-performance, RAG-enabled market data foundation essential for sophisticated multi-agent trading analysis while maintaining complete backward compatibility. \ No newline at end of file diff --git a/docs/specs/MarketData/spec.json b/docs/specs/MarketData/spec.json new file mode 100644 index 00000000..f2161721 --- /dev/null +++ b/docs/specs/MarketData/spec.json @@ -0,0 +1,95 @@ +{ + "feature": "MarketData", + "user_story": "As a Dagster pipeline and AI Agent, I want to collect daily OHLC data from yfinance, insider data from FinnHub, and fundamental data from FinnHub with PostgreSQL + TimescaleDB storage, so that agents have high-performance, RAG-enhanced market data access for comprehensive trading analysis", + "acceptance_criteria": [ + "GIVEN the MarketData domain migration WHEN PostgreSQL + TimescaleDB integration is complete THEN all existing MarketDataService APIs remain 100% compatible with 10x performance improvement", + "GIVEN daily market data collection WHEN Dagster pipelines execute THEN OHLC data from yfinance and insider/fundamental data from FinnHub are stored in TimescaleDB hypertables", + "GIVEN historical market data queries WHEN AI agents request technical analysis THEN responses are delivered within 100ms using TimescaleDB time-series optimization", + "GIVEN technical analysis requests WHEN agents query indicators THEN all 20 existing TA-Lib indicators are preserved with PostgreSQL-backed data access", + "GIVEN RAG-powered analysis WHEN agents search for historical patterns THEN vector similarity search using pgvectorscale returns relevant market conditions within 200ms", + "GIVEN concurrent agent operations WHEN multiple agents access market data THEN PostgreSQL async operations support concurrent reads without file system limitations", + "GIVEN data quality requirements WHEN market data is collected THEN comprehensive validation, audit trails, and error handling maintain data integrity with PostgreSQL ACID transactions" + ], + "business_rules": [ + "Preserve 100% API compatibility with existing MarketDataService for seamless migration", + "Daily automated collection from yfinance (OHLC) and FinnHub (insider + fundamentals) via Dagster pipelines", + "FinnHub API rate limiting compliance with proper backoff strategies", + "TimescaleDB hypertables for market_data, fundamental_data, and insider_data tables", + "Vector embeddings generation for technical analysis patterns using pgvectorscale", + "Data retention policy: 10 years for OHLC, 5 years for fundamentals, 3 years for insider data", + "Sub-100ms query performance for common market data operations", + "Comprehensive audit logging for all data collection and agent queries", + "Graceful degradation when external APIs are 
unavailable" + ], + "scope": { + "included": [ + "PostgreSQL + TimescaleDB + pgvectorscale migration from CSV storage", + "Preserve all existing YFinanceClient and FinnhubClient integrations", + "Maintain complete MarketDataService, FundamentalDataService, InsiderDataService APIs", + "Async PostgreSQL repository operations following news domain patterns", + "Vector embeddings for RAG-powered historical pattern matching", + "TimescaleDB hypertables for time-series optimization", + "Batch data ingestion pipeline for daily Dagster collection", + "Comprehensive testing with real PostgreSQL database", + "Agent integration enhancement with RAG capabilities" + ], + "excluded": [ + "Real-time data streaming (daily batch collection only)", + "Additional data providers beyond yfinance and FinnHub", + "New technical indicators beyond existing 20 TA-Lib indicators", + "Custom financial calculations beyond current scope", + "Multi-database support (PostgreSQL only)", + "GraphQL or REST API endpoints (agent integration only)" + ] + }, + "current_implementation_status": "85% complete with file-based CSV storage - migration project to PostgreSQL", + "existing_components": [ + "MarketDataService with 20 TA-Lib technical indicators and trading style presets", + "YFinanceClient fully implemented for OHLC, company info, and financials", + "FinnhubClient with structured models for insider transactions and sentiment", + "FundamentalDataService for balance sheet, income statement, cash flow analysis", + "InsiderDataService for SEC transaction data and sentiment scoring", + "MarketDataRepository with CSV storage - MIGRATION TARGET", + "AgentToolkit integration ready for PostgreSQL-backed RAG enhancement", + "Comprehensive testing suite with pytest-vcr for API clients" + ], + "migration_components": [ + "MarketDataEntity SQLAlchemy models for PostgreSQL storage", + "FundamentalDataEntity for financial statement data", + "InsiderDataEntity for SEC transaction records", + "TechnicalIndicatorEntity for calculated indicator values", + "Async PostgreSQL repository operations matching news domain patterns", + "TimescaleDB hypertable setup for time-series optimization", + "Vector embedding generation for technical analysis RAG", + "Data migration scripts from CSV to PostgreSQL" + ], + "aligns_with": "Multi-agent trading framework vision - provides high-performance market data foundation for sophisticated agent analysis with RAG-powered historical context", + "dependencies": [ + "Existing YFinanceClient and FinnhubClient implementations (ready)", + "PostgreSQL + TimescaleDB + pgvectorscale database infrastructure (established)", + "News domain PostgreSQL patterns for migration consistency (available)", + "DatabaseManager async operations and connection management (ready)", + "OpenRouter configuration for vector embeddings generation (available)", + "Dagster orchestration framework for daily data collection (planned)" + ], + "technical_details": { + "architecture_pattern": "Router → Service → Repository → Entity → Database (preserving existing service interfaces)", + "database_integration": "PostgreSQL + TimescaleDB + pgvectorscale migration from CSV storage", + "performance_optimization": "TimescaleDB hypertables, proper indexing, connection pooling, async operations", + "vector_storage": "pgvectorscale for RAG-powered historical pattern matching in technical analysis", + "api_preservation": "100% compatibility with existing MarketDataService, FundamentalDataService, InsiderDataService APIs", + "testing_strategy": "Real 
PostgreSQL for repository tests, preserved pytest-vcr for API clients, service compatibility testing" + }, + "implementation_approach": "PostgreSQL migration project following news domain patterns: create entities → migrate repositories → preserve service APIs → enhance with vector RAG → integrate Dagster pipelines", + "reference_implementations": { + "news_domain_patterns": "Follow NewsRepository, NewsArticleEntity, DatabaseManager async patterns for consistency", + "database_migration": "Use established TimescaleDB hypertable and pgvectorscale vector storage patterns", + "testing_approach": "Apply news domain testing strategy: real PostgreSQL for repositories, VCR for API clients" + }, + "success_criteria": { + "performance": "10x query performance improvement, sub-100ms market data operations, sub-200ms RAG queries", + "compatibility": "100% existing API preservation, seamless migration without agent disruption", + "scalability": "Support 500+ tickers with concurrent agent access, efficient bulk data ingestion", + "quality": "85%+ test coverage maintained, comprehensive data validation and audit trails" + } +} \ No newline at end of file diff --git a/docs/specs/MarketData/spec.md b/docs/specs/MarketData/spec.md new file mode 100644 index 00000000..5c8dc29e --- /dev/null +++ b/docs/specs/MarketData/spec.md @@ -0,0 +1,352 @@ +# MarketData Domain - PostgreSQL Migration Specification + +## Feature Overview + +**Feature**: MarketData Domain PostgreSQL Migration +**Status**: Migration project (85% complete → PostgreSQL integration) +**Priority**: High (foundational infrastructure for AI agents) + +This specification defines the migration of the MarketData domain from CSV-based storage to PostgreSQL + TimescaleDB + pgvectorscale integration, while preserving 100% API compatibility and delivering 10x performance improvements for AI agent operations. + +## User Stories + +### Primary User Story +> As a Dagster pipeline and AI Agent, I want to collect daily OHLC data from yfinance, insider data from FinnHub, and fundamental data from FinnHub with PostgreSQL + TimescaleDB storage, so that agents have high-performance, RAG-enhanced market data access for comprehensive trading analysis. 
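+
+Dagster is named throughout this spec as the daily collection trigger but never shown. The sketch below illustrates, under stated assumptions, what that orchestration could look like; the asset, job, and ticker list are illustrative placeholders, and the real pipeline would call the repository's bulk upsert rather than print:
+
+```python
+from dagster import Definitions, ScheduleDefinition, asset, define_asset_job
+
+TICKERS = ["AAPL", "MSFT"]  # Placeholder; real tickers would come from configuration
+
+@asset
+def daily_market_data() -> None:
+    """Pull OHLC from yfinance and insider/fundamental data from FinnHub,
+    then bulk-upsert into the TimescaleDB hypertables."""
+    for ticker in TICKERS:
+        # Stand-in for the real ingestion helper (hypothetical)
+        print(f"collecting market data for {ticker}")
+
+daily_job = define_asset_job("daily_market_data_job", selection=[daily_market_data])
+
+defs = Definitions(
+    assets=[daily_market_data],
+    schedules=[
+        # Run after U.S. market close on weekdays
+        ScheduleDefinition(job=daily_job, cron_schedule="0 22 * * 1-5"),
+    ],
+)
+```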
+ +### Supporting User Stories + +**Agent Performance** +- As an AI Agent, I want market data queries to complete in under 100ms, so that real-time trading analysis is efficient +- As a Technical Analyst Agent, I want vector similarity search for historical patterns, so that pattern-based trading decisions are context-aware + +**Data Pipeline Reliability** +- As a Dagster pipeline, I want atomic data ingestion with PostgreSQL ACID transactions, so that data integrity is guaranteed during bulk operations +- As a Risk Management Agent, I want comprehensive audit trails for all market data access, so that trading decisions are fully traceable + +## Acceptance Criteria + +### Migration Compatibility +- **AC1**: GIVEN the MarketData domain migration WHEN PostgreSQL + TimescaleDB integration is complete THEN all existing MarketDataService APIs remain 100% compatible with 10x performance improvement + +### Data Collection Pipeline +- **AC2**: GIVEN daily market data collection WHEN Dagster pipelines execute THEN OHLC data from yfinance and insider/fundamental data from FinnHub are stored in TimescaleDB hypertables + +### Performance Requirements +- **AC3**: GIVEN historical market data queries WHEN AI agents request technical analysis THEN responses are delivered within 100ms using TimescaleDB time-series optimization +- **AC4**: GIVEN technical analysis requests WHEN agents query indicators THEN all 20 existing TA-Lib indicators are preserved with PostgreSQL-backed data access + +### RAG Integration +- **AC5**: GIVEN RAG-powered analysis WHEN agents search for historical patterns THEN vector similarity search using pgvectorscale returns relevant market conditions within 200ms + +### Scalability +- **AC6**: GIVEN concurrent agent operations WHEN multiple agents access market data THEN PostgreSQL async operations support concurrent reads without file system limitations + +### Data Quality +- **AC7**: GIVEN data quality requirements WHEN market data is collected THEN comprehensive validation, audit trails, and error handling maintain data integrity with PostgreSQL ACID transactions + +## Business Rules + +### API Preservation +- **BR1**: Preserve 100% API compatibility with existing MarketDataService for seamless migration +- **BR2**: Maintain all existing method signatures in FundamentalDataService and InsiderDataService + +### Data Collection Standards +- **BR3**: Daily automated collection from yfinance (OHLC) and FinnHub (insider + fundamentals) via Dagster pipelines +- **BR4**: FinnHub API rate limiting compliance with proper backoff strategies +- **BR5**: Graceful degradation when external APIs are unavailable + +### Database Architecture +- **BR6**: TimescaleDB hypertables for market_data, fundamental_data, and insider_data tables +- **BR7**: Vector embeddings generation for technical analysis patterns using pgvectorscale + +### Performance Standards +- **BR8**: Sub-100ms query performance for common market data operations +- **BR9**: Data retention policy: 10 years for OHLC, 5 years for fundamentals, 3 years for insider data + +### Audit and Compliance +- **BR10**: Comprehensive audit logging for all data collection and agent queries + +## Technical Implementation Details + +### Architecture Pattern +**Router → Service → Repository → Entity → Database** + +The migration preserves the existing service interfaces while upgrading the underlying data persistence layer. 
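+
+One way to read this pattern is as a structural contract between the service and persistence layers. A minimal sketch (type names are illustrative, and the protocol returns plain dicts rather than the real entities) of how the backend can be swapped without touching any agent-facing signature:
+
+```python
+from typing import List, Protocol
+
+class OHLCRepository(Protocol):
+    """Structural contract both the legacy CSV repository and the new
+    PostgreSQL repository can satisfy."""
+
+    async def get_ohlc_data(
+        self, symbol: str, start_date: str, end_date: str
+    ) -> List[dict]: ...
+
+class MarketDataService:
+    """Skeleton: depends only on the contract, so the persistence backend
+    changes during migration while every caller keeps the same API."""
+
+    def __init__(self, repository: OHLCRepository) -> None:
+        self.repository = repository
+```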
+
+### Database Schema Design
+
+#### TimescaleDB Hypertables
+
+```sql
+-- Market Data (OHLC)
+CREATE TABLE market_data (
+    id SERIAL,
+    symbol VARCHAR(10) NOT NULL,
+    date TIMESTAMPTZ NOT NULL,
+    open DECIMAL(12,4),
+    high DECIMAL(12,4),
+    low DECIMAL(12,4),
+    close DECIMAL(12,4),
+    adj_close DECIMAL(12,4),
+    volume BIGINT,
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+    -- Hypertable primary keys must include the partitioning column
+    PRIMARY KEY (id, date)
+);
+
+SELECT create_hypertable('market_data', 'date', chunk_time_interval => INTERVAL '1 month');
+
+-- Fundamental Data
+CREATE TABLE fundamental_data (
+    id SERIAL,
+    symbol VARCHAR(10) NOT NULL,
+    report_date TIMESTAMPTZ NOT NULL,
+    period_type VARCHAR(20),  -- annual, quarterly
+    metric_name VARCHAR(100),
+    metric_value DECIMAL(20,4),
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    PRIMARY KEY (id, report_date)
+);
+
+SELECT create_hypertable('fundamental_data', 'report_date', chunk_time_interval => INTERVAL '3 months');
+
+-- Insider Data
+CREATE TABLE insider_data (
+    id SERIAL,
+    symbol VARCHAR(10) NOT NULL,
+    transaction_date TIMESTAMPTZ NOT NULL,
+    person_name VARCHAR(200),
+    position VARCHAR(100),
+    transaction_type VARCHAR(50),
+    shares BIGINT,
+    price DECIMAL(12,4),
+    value DECIMAL(20,4),
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    PRIMARY KEY (id, transaction_date)
+);
+
+SELECT create_hypertable('insider_data', 'transaction_date', chunk_time_interval => INTERVAL '1 month');
+```
+
+#### Vector Storage for RAG
+
+```sql
+-- Technical Indicators with Vector Embeddings
+CREATE TABLE technical_indicators (
+    id SERIAL PRIMARY KEY,
+    symbol VARCHAR(10) NOT NULL,
+    date TIMESTAMPTZ NOT NULL,
+    indicator_name VARCHAR(50),
+    indicator_value DECIMAL(12,6),
+    pattern_embedding vector(384),  -- OpenRouter embeddings
+    created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX ON technical_indicators USING hnsw (pattern_embedding vector_cosine_ops);
+```
+
+### SQLAlchemy Entity Models
+
+```python
+# MarketDataEntity
+@dataclass
+class MarketDataEntity:
+    symbol: str
+    date: datetime
+    open: Optional[Decimal] = None
+    high: Optional[Decimal] = None
+    low: Optional[Decimal] = None
+    close: Optional[Decimal] = None
+    adj_close: Optional[Decimal] = None
+    volume: Optional[int] = None
+    id: Optional[int] = None
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    @classmethod
+    def from_yfinance_data(cls, symbol: str, row: pd.Series) -> "MarketDataEntity":
+        """Convert yfinance data to entity"""
+
+    def to_database_record(self) -> dict:
+        """Convert entity to database record"""
+
+    def validate(self) -> None:
+        """Validate entity data integrity"""
+```
+
+### Repository Migration
+
+```python
+class MarketDataRepository:
+    """PostgreSQL + TimescaleDB repository with async operations"""
+
+    def __init__(self, database_manager: DatabaseManager):
+        self.db = database_manager
+
+    async def get_ohlc_data(
+        self,
+        symbol: str,
+        start_date: datetime,
+        end_date: datetime
+    ) -> List[MarketDataEntity]:
+        """Retrieve OHLC data with TimescaleDB optimization"""
+        query = """
+            SELECT * FROM market_data
+            WHERE symbol = $1 AND date BETWEEN $2 AND $3
+            ORDER BY date DESC
+        """
+        rows = await self.db.fetch(query, symbol, start_date, end_date)
+        return [MarketDataEntity.from_database_record(row) for row in rows]
+
+    async def bulk_upsert_market_data(
+        self,
+        entities: List[MarketDataEntity]
+    ) -> int:
+        """Atomic bulk upsert for Dagster pipelines"""
+
+    async def find_similar_patterns(
+        self,
+        pattern_embedding: List[float],
+        limit: int = 10
+    ) -> List[Dict]:
+        """RAG-powered pattern matching using
pgvectorscale""" + query = """ + SELECT symbol, date, indicator_name, indicator_value, + pattern_embedding <=> $1 as similarity + FROM technical_indicators + ORDER BY pattern_embedding <=> $1 + LIMIT $2 + """ + return await self.db.fetch(query, pattern_embedding, limit) +``` + +### Service Compatibility Layer + +```python +class MarketDataService: + """Preserved API with PostgreSQL backend""" + + def __init__(self, repository: MarketDataRepository, yfinance_client: YFinanceClient): + self.repository = repository + self.yfinance_client = yfinance_client + + async def get_stock_data(self, symbol: str, period: str = "1y") -> pd.DataFrame: + """100% compatible with existing API signature""" + # Implementation using PostgreSQL repository + + async def calculate_technical_indicators( + self, + symbol: str, + indicators: List[str] + ) -> Dict[str, np.ndarray]: + """Preserve all 20 TA-Lib indicators with PostgreSQL data""" + + async def get_trading_style_preset(self, style: str) -> Dict: + """Preserved trading style presets with enhanced performance""" +``` + +### Vector RAG Enhancement + +```python +class MarketDataRAGService: + """RAG-powered market analysis enhancement""" + + async def find_historical_patterns( + self, + current_indicators: Dict[str, float], + lookback_days: int = 30 + ) -> List[Dict]: + """Vector similarity search for historical patterns""" + + async def generate_pattern_embedding( + self, + indicator_values: Dict[str, float] + ) -> List[float]: + """Generate embeddings using OpenRouter for pattern matching""" +``` + +## Migration Components + +### Phase 1: Database Schema & Entities +1. **SQLAlchemy Entity Models** + - MarketDataEntity for OHLC data + - FundamentalDataEntity for financial statements + - InsiderDataEntity for SEC transactions + - TechnicalIndicatorEntity for calculated values + +2. **TimescaleDB Setup** + - Hypertable creation for time-series optimization + - Proper indexing strategy + - Vector extension configuration + +### Phase 2: Repository Migration +1. **Async PostgreSQL Operations** + - Follow news domain patterns for consistency + - Connection pooling and transaction management + - Error handling and retry logic + +2. **Data Migration Scripts** + - CSV to PostgreSQL data transfer + - Data validation and integrity checks + - Performance optimization + +### Phase 3: Service Preservation +1. **API Compatibility** + - Maintain all existing method signatures + - Preserve return types and data formats + - Performance optimization through PostgreSQL + +2. **Vector RAG Integration** + - Pattern embedding generation + - Similarity search capabilities + - Historical context enhancement + +### Phase 4: Testing & Integration +1. **Comprehensive Testing** + - Real PostgreSQL database for repository tests + - Preserved pytest-vcr for API clients + - Service compatibility validation + +2. 
**Agent Integration** + - AgentToolkit RAG capabilities + - Performance benchmarking + - Concurrent access testing + +## Dependencies + +### Ready Dependencies +- **YFinanceClient and FinnhubClient**: Fully implemented and tested +- **PostgreSQL + TimescaleDB + pgvectorscale**: Database infrastructure established +- **News domain PostgreSQL patterns**: Migration templates available +- **DatabaseManager**: Async operations and connection management ready +- **OpenRouter configuration**: Vector embeddings generation available + +### Planned Dependencies +- **Dagster orchestration**: Framework for daily data collection pipelines + +## Success Criteria + +### Performance Metrics +- **10x query performance improvement** over CSV-based storage +- **Sub-100ms market data operations** for common agent queries +- **Sub-200ms RAG queries** for vector similarity search +- **Support for 500+ tickers** with concurrent agent access + +### Compatibility Standards +- **100% existing API preservation** without breaking changes +- **Seamless migration** without agent disruption +- **Efficient bulk data ingestion** for Dagster pipelines + +### Quality Assurance +- **85%+ test coverage maintained** across all components +- **Comprehensive data validation** and audit trails +- **PostgreSQL ACID transactions** for data integrity + +## Architecture Alignment + +This migration aligns with the multi-agent trading framework vision by providing: + +1. **High-performance market data foundation** for sophisticated agent analysis +2. **RAG-powered historical context** for pattern-based trading decisions +3. **Scalable concurrent access** supporting multiple agents simultaneously +4. **Comprehensive audit trails** for regulatory compliance and risk management +5. **Time-series optimization** for efficient technical analysis operations + +The migration follows established news domain patterns to ensure architectural consistency across the entire TradingAgents framework. 
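+
+As a concrete illustration of the Phase 2 data migration scripts, a minimal CSV-to-PostgreSQL backfill sketch follows. The file layout, column names, and the asyncpg-style `executemany` on the database handle are assumptions based on the entities above, not the final script:
+
+```python
+import csv
+from datetime import datetime
+from pathlib import Path
+
+CSV_DIR = Path("data/market_data")  # hypothetical legacy CSV location
+
+INSERT_SQL = """
+INSERT INTO market_data (symbol, date, open, high, low, close, adj_close, volume)
+VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+ON CONFLICT DO NOTHING
+"""
+
+async def backfill(db) -> int:
+    """Load legacy per-symbol CSV files into the market_data hypertable."""
+    total = 0
+    for csv_path in sorted(CSV_DIR.glob("*.csv")):
+        symbol = csv_path.stem.upper()
+        with csv_path.open() as fh:
+            rows = [
+                (
+                    symbol,
+                    datetime.fromisoformat(row["Date"]),
+                    float(row["Open"]), float(row["High"]),
+                    float(row["Low"]), float(row["Close"]),
+                    float(row["Adj Close"]), int(float(row["Volume"])),
+                )
+                for row in csv.DictReader(fh)
+            ]
+        # One batch per file keeps a bad symbol from aborting the whole run,
+        # matching the per-ticker validation step described above.
+        await db.executemany(INSERT_SQL, rows)
+        total += len(rows)
+    return total
+```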
\ No newline at end of file diff --git a/docs/specs/news/context.json b/docs/specs/news/context.json new file mode 100644 index 00000000..139f1d7d --- /dev/null +++ b/docs/specs/news/context.json @@ -0,0 +1,47 @@ +{ + "product_vision": "Multi-agent LLM financial trading framework that mirrors real-world trading firm dynamics for research-based market analysis and trading decisions", + "existing_features": [ + "news_domain_95_complete", + "google_news_client", + "article_scraper_client", + "news_repository_with_embeddings", + "postgresql_timescaledb_stack", + "agent_toolkit_rag_integration", + "openrouter_llm_provider" + ], + "architecture": { + "layer_pattern": "Router → Service → Repository → Entity → Database", + "database": "PostgreSQL + TimescaleDB + pgvectorscale", + "llm_provider": "OpenRouter unified interface", + "agent_orchestration": "LangGraph workflows", + "data_pipeline": "Dagster (planned, not implemented)", + "domain_structure": "news (95% complete), marketdata (planned), socialmedia (planned)", + "testing_strategy": "Domain-specific: mocks for services, real DB for repositories, pytest-vcr for HTTP" + }, + "news_implementation_status": { + "core_components": { + "NewsService": "Business logic with company/global news context", + "NewsRepository": "Async PostgreSQL with batch upsert, vector embeddings", + "GoogleNewsClient": "RSS feed client for live data", + "ArticleScraperClient": "newspaper4k with paywall detection" + }, + "data_models": { + "NewsArticle": "Domain dataclass with validation", + "NewsArticleEntity": "SQLAlchemy model with 1536-dim vector embeddings" + }, + "key_features": [ + "URL-based deduplication", + "Vector embeddings for similarity", + "Paywall detection and fallback", + "Comprehensive test coverage with pytest-vcr" + ] + }, + "dagster_status": "Planned but not implemented - documentation references exist but no pipeline code", + "technical_patterns": { + "async_operations": "All repository methods async with session management", + "batch_operations": "upsert_batch for performance", + "error_handling": "Graceful degradation with logging", + "vector_search": "Semantic similarity for RAG", + "connection_management": "DatabaseManager with asyncpg and pooling" + } +} \ No newline at end of file diff --git a/docs/specs/news/design.json b/docs/specs/news/design.json new file mode 100644 index 00000000..cdf87ad9 --- /dev/null +++ b/docs/specs/news/design.json @@ -0,0 +1,127 @@ +{ + "requirements": { + "entities": { + "NewsArticle": "Existing domain entity, enhance with structured sentiment and vector embedding support", + "NewsJobConfig": "New configuration entity for scheduled job parameters (tickers, schedule, model settings)" + }, + "data_persistence": { + "news_articles_table": "Existing table with vector embedding columns, enhance sentiment_score JSONB column", + "vector_indexes": "pgvectorscale indexes for title_embedding and content_embedding (1536 dimensions)", + "data_flows": [ + "APScheduler → NewsService.update_company_news() → NewsRepository.upsert_batch()", + "ArticleData → OpenRouter API → structured sentiment → NewsArticle entity", + "Article content → OpenRouter embeddings API → pgvectorscale storage" + ] + }, + "api_needed": { + "external_apis": [ + "OpenRouter for LLM sentiment analysis using quick_think_llm", + "OpenRouter for embeddings using text-embedding models", + "Existing GoogleNewsClient and ArticleScraperClient" + ], + "internal_apis": [ + "Enhanced NewsService.update_company_news() method", + "New 
NewsRepository.find_similar_articles() for semantic search",
+        "New ScheduledNewsCollector job orchestration class"
+      ]
+    },
+    "components": {
+      "scheduler": "APScheduler integration for daily news collection",
+      "sentiment_analyzer": "OpenRouter LLM client for structured sentiment analysis",
+      "embedding_generator": "OpenRouter embeddings client for vector generation",
+      "job_orchestrator": "ScheduledNewsCollector class for job coordination"
+    },
+    "domains": {
+      "primary": "news (completing final 5%)",
+      "integration": "Leverages existing Router → Service → Repository → Entity → Database pattern"
+    },
+    "business_rules": [
+      "Best-effort sentiment analysis - LLM failures don't block article storage",
+      "URL-based deduplication using existing NewsRepository patterns",
+      "Paywall resilience via existing ArticleScraperClient graceful degradation",
+      "Date filtering: articles within last 30 days only",
+      "Sentiment confidence threshold: 0.5 minimum for reliable scores",
+      "Content length limits: 8000 chars for embedding generation",
+      "Embedding generation: Both title and content vectors required"
+    ]
+  },
+  "technical_needs": {
+    "domain_model": {
+      "entities": {
+        "NewsArticle": {
+          "status": "exists_needs_enhancement",
+          "enhancements": [
+            "Structured sentiment JSON format: {sentiment: positive|negative|neutral, confidence: 0.0-1.0, reasoning: string}",
+            "Vector embedding support for title and content (1536 dimensions)",
+            "Enhanced validation for sentiment confidence thresholds"
+          ]
+        },
+        "NewsJobConfig": {
+          "status": "new_entity",
+          "fields": ["tickers: list[str]", "schedule_hour: int", "sentiment_model: str", "embedding_model: str", "max_articles_per_ticker: int"],
+          "validation": "Schedule hour 0-23, max articles 5-100 range (default 20)"
+        }
+      },
+      "services": {
+        "NewsService": {
+          "status": "exists_needs_enhancement",
+          "enhancements": [
+            "Integrate LLM sentiment analysis in update methods",
+            "Add vector embedding generation pipeline",
+            "Enhanced error handling for LLM and embedding failures"
+          ]
+        },
+        "ScheduledNewsCollector": {
+          "status": "new_service",
+          "responsibilities": [
+            "Orchestrate daily news collection jobs",
+            "Manage job configuration and scheduling",
+            "Monitor job execution and handle failures",
+            "Integrate with existing NewsService methods"
+          ]
+        }
+      }
+    },
+    "persistence": {
+      "database": "PostgreSQL + TimescaleDB + pgvectorscale",
+      "schema_updates": {
+        "news_articles": {
+          "existing_columns": "headline, url, source, published_date, summary, entities, sentiment_score, author, category, title_embedding, content_embedding",
+          "modifications": [
+            "Enhance sentiment_score JSONB to support structured format",
+            "Add vector similarity indexes for title_embedding and content_embedding",
+            "Add composite index on (symbol, published_date) for News Analyst queries"
+          ]
+        }
+      },
+      "access_patterns": [
+        "Time-based queries: articles for ticker in date range",
+        "Semantic similarity: find similar articles using vector search",
+        "Sentiment filtering: articles by sentiment type and confidence",
+        "Batch operations: efficient upsert of daily collection results"
+      ]
+    },
+    "router": {
+      "status": "not_needed",
+      "reason": "News Analysts access via AgentToolkit anti-corruption layer, no direct REST API required"
+    },
+    "events": {
+      "status": "not_applicable",
+      "reason": "Scheduled batch processing, no real-time event requirements"
+    },
+    "dependencies": {
+      "external": [
+        "OpenRouter API (existing TradingAgentsConfig integration)",
+        "OpenRouter embeddings models (existing TradingAgentsConfig integration)",
+        "APScheduler (new dependency for job scheduling)"
+      ],
+      "internal": [
+        "Existing NewsService (95% complete)",
+        "Existing NewsRepository with async PostgreSQL patterns",
+        "Existing GoogleNewsClient and ArticleScraperClient",
+        "DatabaseManager for connection management",
+        "TradingAgentsConfig for LLM and API configuration"
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/docs/specs/news/design.md b/docs/specs/news/design.md
new file mode 100644
index 00000000..bfcd8b7f
--- /dev/null
+++ b/docs/specs/news/design.md
@@ -0,0 +1,946 @@
+# News Domain Technical Design
+
+## Overview
+
+This document details the technical design for completing the final 5% of the News domain implementation. The existing infrastructure is 95% complete, with Google News collection, article scraping, and basic storage implemented. The remaining work focuses on **scheduled execution**, **LLM-powered sentiment analysis**, and **vector embeddings** using OpenRouter as the unified LLM provider.
+
+## Architecture Overview
+
+### Component Relationships
+
+```mermaid
+graph TD
+    A[APScheduler] --> B[ScheduledNewsCollector]
+    B --> C[NewsService]
+    C --> D[GoogleNewsClient]
+    C --> E[ArticleScraperClient]
+    C --> F[OpenRouter LLM Client]
+    C --> G[OpenRouter Embeddings Client]
+    C --> H[NewsRepository]
+    H --> I[PostgreSQL + TimescaleDB + pgvectorscale]
+
+    J[News Analysts] --> K[AgentToolkit]
+    K --> C
+    K --> H
+```
+
+### Data Flow Architecture
+
+1. **Scheduled Collection Flow**
+   ```
+   APScheduler → ScheduledNewsCollector → NewsService.update_company_news()
+   → GoogleNewsClient → ArticleScraperClient → OpenRouter (sentiment + embeddings)
+   → NewsRepository.upsert_batch() → PostgreSQL
+   ```
+
+2. **Agent Query Flow**
+   ```
+   News Analyst → AgentToolkit → NewsService.find_relevant_articles()
+   → NewsRepository (semantic search) → pgvectorscale vector similarity
+   ```
+
+### Key Design Principles
+
+- **Leverage Existing 95%**: Build on proven GoogleNewsClient and ArticleScraperClient infrastructure
+- **OpenRouter Unified**: Single API for both sentiment analysis and embeddings
+- **Best-Effort Processing**: LLM failures don't block article storage
+- **Vector-Enhanced Search**: Semantic similarity for News Analysts
+- **Fault-Tolerant Scheduling**: Robust error handling and monitoring
+
+## Domain Model
+
+### Enhanced NewsArticle Entity
+
+The existing `NewsArticle` entity requires enhancements for structured sentiment and vector support:
+
+```python
+from typing import Optional, Dict, Any, List, Literal
+from pydantic import BaseModel, Field, validator
+import datetime
+
+class SentimentScore(BaseModel):
+    """Structured sentiment analysis result"""
+    sentiment: Literal["positive", "negative", "neutral"]
+    confidence: float = Field(ge=0.0, le=1.0)
+    reasoning: str
+
+    # Note: confidence below 0.5 is permitted but treated as unreliable
+    # (see NewsArticle.has_reliable_sentiment and the best-effort fallback
+    # in OpenRouterClient.analyze_sentiment, which returns confidence=0.3).
+
+class NewsArticle(BaseModel):
+    """Enhanced NewsArticle entity with sentiment and vector support"""
+    # Existing fields (95% complete)
+    headline: str
+    url: str = Field(..., regex=r'^https?://')
+    source: str
+    published_date: datetime.datetime
+    summary: Optional[str] = None
+    entities: List[str] = Field(default_factory=list)
+    author: Optional[str] = None
+    category: Optional[str] = None
+
+    # Enhanced fields (final 5%)
+    sentiment_score: Optional[SentimentScore] = None
+    title_embedding: Optional[List[float]] = Field(None, min_items=1536, max_items=1536)
+    content_embedding: Optional[List[float]] = Field(None, min_items=1536, max_items=1536)
+
+    # Metadata
+    created_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
+    updated_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
+
+    @validator('content_embedding', 'title_embedding')
+    def validate_embeddings(cls, v):
+        if v and len(v) != 1536:
+            raise ValueError("Embeddings must be 1536 dimensions for OpenRouter compatibility")
+        return v
+
+    def has_reliable_sentiment(self) -> bool:
+        """Check if sentiment analysis is reliable (confidence >= 0.5)"""
+        return bool(self.sentiment_score and self.sentiment_score.confidence >= 0.5)
+
+    def to_record(self) -> Dict[str, Any]:
+        """Convert to database record format"""
+        record = self.dict()
+        # Convert sentiment to JSONB format
+        if self.sentiment_score:
+            record['sentiment_score'] = self.sentiment_score.dict()
+        return record
+
+    @classmethod
+    def from_record(cls, record: Dict[str, Any]) -> 'NewsArticle':
+        """Create entity from database record"""
+        if record.get('sentiment_score'):
+            record['sentiment_score'] = SentimentScore(**record['sentiment_score'])
+        return cls(**record)
+```
+
+### New NewsJobConfig Entity
+
+Configuration entity for scheduled news collection:
+
+```python
+from pydantic import BaseModel, Field, validator
+from typing import List
+
+class NewsJobConfig(BaseModel):
+    """Configuration for scheduled news collection jobs"""
+    tickers: List[str] = Field(..., min_items=1, max_items=50)
+    schedule_hour: int = Field(..., ge=0, le=23)
+    sentiment_model: str = Field(default="anthropic/claude-3.5-haiku")
+    embedding_model: str = Field(default="text-embedding-3-small")  # 1536-dim, matches vector(1536) columns
+    max_articles_per_ticker: int = Field(default=20, ge=5, le=100)
+    lookback_days: int = Field(default=7, ge=1, le=30)
+
+    @validator('tickers')
+    def validate_tickers(cls, v):
+        # Ensure uppercase stock symbols
+        return [ticker.upper().strip() for ticker in v]
+
+    @validator('sentiment_model')
+    def validate_sentiment_model(cls, v):
+        # Ensure OpenRouter model format
+        if '/' not in v:
+            raise ValueError("Model must be in OpenRouter format (provider/model)")
+        return v
+
+    def to_cron_expression(self) -> str:
+        """Convert to cron expression for APScheduler"""
+        return f"0 {self.schedule_hour} * * *"  # Daily at specified hour
+```
+
+## Database Design
+
+### Schema Enhancements
+
+The existing `news_articles` table requires minimal modifications to support the final 5%:
+
+```sql
+-- Existing table structure (95% complete)
+CREATE TABLE IF NOT EXISTS news_articles (
+    id SERIAL PRIMARY KEY,
+    headline TEXT NOT NULL,
+    url TEXT UNIQUE NOT NULL,
+    source TEXT NOT NULL,
+    published_date TIMESTAMPTZ NOT NULL,
+    summary TEXT,
+    entities TEXT[] DEFAULT '{}',
+    sentiment_score JSONB,            -- Enhanced for structured format
+    author TEXT,
+    category TEXT,
+    title_embedding vector(1536),     -- New: pgvectorscale vector type
+    content_embedding vector(1536),   -- New: pgvectorscale vector type
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- New indexes for final 5% performance
+-- GIN index serves the entities @> containment filters used by News Analysts;
+-- the btree index supports the published_date ordering in those queries.
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_news_articles_entities
+    ON news_articles USING gin (entities);
+
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_news_articles_published_date
+    ON news_articles (published_date DESC);
+
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_news_articles_title_embedding
+    ON news_articles USING hnsw (title_embedding vector_cosine_ops);
+
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_news_articles_content_embedding
+    ON news_articles USING hnsw (content_embedding vector_cosine_ops);
+
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_news_articles_sentiment
+    ON news_articles ((sentiment_score->>'sentiment'))
+    WHERE sentiment_score IS NOT NULL;
+```
+
+### Query Patterns
+
+**Time-based News Queries (News Analysts)**
+```sql
+-- Optimized for Agent queries: recent news for specific ticker
+SELECT headline, summary, sentiment_score, published_date
+FROM news_articles
+WHERE entities @> ARRAY[$1::text]
+  AND published_date >= NOW() - INTERVAL '30 days'
+ORDER BY published_date DESC
+LIMIT 20;
+```
+
+**Semantic Similarity Queries (Vector Search)**
+```sql
+-- Find similar articles using pgvectorscale
+SELECT headline, url, summary,
+       1 - (title_embedding <=> $1::vector) AS similarity_score
+FROM news_articles
+WHERE entities @> ARRAY[$2::text]
+  AND title_embedding IS NOT NULL
+ORDER BY title_embedding <=> $1::vector
+LIMIT 10;
+```
+
+**Batch Upsert Operations (Daily Collection)**
+```sql
+-- Efficient upsert for daily news collection
+INSERT INTO news_articles (headline, url, source, published_date, summary, entities, sentiment_score, title_embedding, content_embedding)
+VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+ON CONFLICT (url) DO UPDATE SET
+    headline = EXCLUDED.headline,
+    summary = EXCLUDED.summary,
+    entities = EXCLUDED.entities,
+    sentiment_score = EXCLUDED.sentiment_score,
+    title_embedding = EXCLUDED.title_embedding,
+    content_embedding = EXCLUDED.content_embedding,
+    updated_at = NOW();
+```
+
+## API Integration
+
+### OpenRouter Unified Client
+
+Single OpenRouter integration for both sentiment analysis and embeddings:
+
+```python
+from typing import List, Optional, Dict, Any
+import httpx
+from tradingagents.config import TradingAgentsConfig
+
+class OpenRouterClient:
+    """Unified OpenRouter client for sentiment analysis and embeddings"""
+
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        self.base_url = "https://openrouter.ai/api/v1"
+        self.headers = {
+            "Authorization": f"Bearer {config.openrouter_api_key}",
+            "Content-Type": "application/json"
+        }
+
+    async def analyze_sentiment(self, text: str, model: Optional[str] = None) -> SentimentScore:
+        """Generate structured sentiment analysis using LLM"""
+        model = model or self.config.quick_think_llm
+
+        # Truncate article text to respect prompt token limits
+        prompt = f"""Analyze the sentiment of this news article text and respond with ONLY a JSON object:
+
+Article: {text[:2000]}
+
+Required JSON format:
+{{
+    "sentiment": "positive|negative|neutral",
+    "confidence": 0.0-1.0,
+    "reasoning": "brief explanation"
+}}"""
+
+        payload = {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.1,  # Low temperature for consistent structured output
+            "max_tokens": 200
+        }
+
+        async with httpx.AsyncClient() as client:
+            try:
+                response = await client.post(
+                    f"{self.base_url}/chat/completions",
+                    headers=self.headers,
+                    json=payload,
+                    timeout=30.0
+                )
+                response.raise_for_status()
+
+                result = response.json()
+                content = result["choices"][0]["message"]["content"].strip()
+
+                # Parse JSON response
+                import json
+                sentiment_data = json.loads(content)
+                return SentimentScore(**sentiment_data)
+
+            except Exception as e:
+                # Best-effort: return neutral sentiment on failure
+                return SentimentScore(
+                    sentiment="neutral",
+                    confidence=0.3,  # Below reliability threshold
+                    reasoning=f"Analysis failed: {str(e)[:100]}"
+                )
+
+    async def generate_embeddings(self, texts: List[str], model: Optional[str] = None) -> List[Optional[List[float]]]:
+        """Generate embeddings for multiple texts; None entries mark failures"""
+        model = model or "text-embedding-3-small"
+
+        # Truncate texts to avoid token limits
+        truncated_texts = [text[:8000] for text in texts]
+
+        payload = {
+            "model": model,
+            "input": truncated_texts
+        }
+
+        async with httpx.AsyncClient() as client:
+            try:
+                response = await client.post(
+                    f"{self.base_url}/embeddings",
+                    headers=self.headers,
+                    json=payload,
+                    timeout=60.0
+                )
+                response.raise_for_status()
+
+                result = response.json()
+                return [item["embedding"] for item in result["data"]]
+
+            except Exception as e:
+                # Return None embeddings on failure (stored as NULL in DB)
+                return [None] * len(texts)
+```
+
+### Enhanced NewsService Integration
+
+Update existing NewsService to integrate LLM capabilities:
+
+```python
+class NewsService:
+    """Enhanced NewsService with LLM sentiment and embeddings (final 5%)"""
+
+    def __init__(self,
+                 repository: NewsRepository,
+                 google_client: GoogleNewsClient,
+                 scraper_client: ArticleScraperClient,
+                 openrouter_client: OpenRouterClient):
+        self.repository = repository
+        self.google_client = google_client
+        self.scraper_client = scraper_client
+        self.openrouter_client = openrouter_client
+
+    async def update_company_news(self,
+                                  symbol: str,
+                                  lookback_days: int = 7,
+                                  max_articles: int = 20,
+                                  include_sentiment: bool = True,
+                                  include_embeddings: bool = True) -> List[NewsArticle]:
+        """Enhanced method with LLM sentiment analysis and embeddings"""
+
+        # Step 1: Use existing 95% infrastructure for collection
+        cutoff_date = datetime.datetime.now() - datetime.timedelta(days=lookback_days)
+
+        # Fetch from Google News (existing)
+        google_results = await self.google_client.fetch_company_news(symbol, max_articles)
+
+        articles = []
+        for result in google_results:
+            if result.published_date < cutoff_date:
+                continue
+
+            # Scrape full content (existing)
+            scraped_content = await self.scraper_client.scrape_article(result.url)
+
+            # Create base article (existing pattern)
+            article = NewsArticle(
+                headline=result.title,
+                url=result.url,
+                source=result.source,
+                published_date=result.published_date,
+                summary=scraped_content.summary if scraped_content else result.description,
+                entities=[symbol],
+                author=scraped_content.author if scraped_content else None
+            )
+
+            # Step 2: NEW - Add LLM sentiment analysis
+            if include_sentiment and scraped_content and scraped_content.content:
+                article.sentiment_score = await self.openrouter_client.analyze_sentiment(
+                    scraped_content.content
+                )
+
+            articles.append(article)
+
+        # Step 3: NEW - Batch generate embeddings
+        if include_embeddings and articles:
+            titles = [a.headline for a in articles]
+            contents = [a.summary or a.headline for a in articles]
+
+            title_embeddings = await self.openrouter_client.generate_embeddings(titles)
+            content_embeddings = await self.openrouter_client.generate_embeddings(contents)
+
+            for i, article in enumerate(articles):
+                if i < len(title_embeddings) and title_embeddings[i]:
+                    article.title_embedding = title_embeddings[i]
+                if i < len(content_embeddings) and content_embeddings[i]:
+                    article.content_embedding = content_embeddings[i]
+
+        # Step 4: Batch persist (existing pattern)
+        await self.repository.upsert_batch(articles)
+        return articles
+
+    async def find_similar_articles(self,
+                                    query_text: str,
+                                    symbol: Optional[str] = None,
+                                    limit: int = 10) -> List[NewsArticle]:
+        """NEW: Semantic similarity search for News Analysts"""
+
+        # Generate query embedding
+        query_embeddings = await self.openrouter_client.generate_embeddings([query_text])
+        if not query_embeddings[0]:
+            # Fallback to text search
return await self.repository.find_by_text_search(query_text, symbol, limit) + + return await self.repository.find_similar_articles( + query_embeddings[0], symbol, limit + ) +``` + +## Job Scheduling Architecture + +### APScheduler Integration + +Robust scheduled execution using APScheduler: + +```python +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from apscheduler.jobstores.redis import RedisJobStore # Optional: persistent job store +from apscheduler.executors.asyncio import AsyncIOExecutor +import logging + +class ScheduledNewsCollector: + """Orchestrates scheduled news collection jobs""" + + def __init__(self, + news_service: NewsService, + config: TradingAgentsConfig, + job_config: NewsJobConfig): + self.news_service = news_service + self.config = config + self.job_config = job_config + + # Configure APScheduler + jobstores = { + 'default': {'type': 'memory'} # Use Redis for production + } + executors = { + 'default': AsyncIOExecutor(), + } + job_defaults = { + 'coalesce': False, # Don't combine missed jobs + 'max_instances': 1, # One job per ticker at a time + 'misfire_grace_time': 300 # 5 minute grace period + } + + self.scheduler = AsyncIOScheduler( + jobstores=jobstores, + executors=executors, + job_defaults=job_defaults, + timezone='UTC' + ) + + async def start(self): + """Start the scheduler and register jobs""" + + for ticker in self.job_config.tickers: + # Schedule daily collection for each ticker + self.scheduler.add_job( + func=self._collect_ticker_news, + trigger='cron', + hour=self.job_config.schedule_hour, + minute=0, + args=[ticker], + id=f"news_collection_{ticker}", + replace_existing=True, + max_instances=1 + ) + + self.scheduler.start() + logging.info(f"Started news collection scheduler for {len(self.job_config.tickers)} tickers") + + async def stop(self): + """Gracefully stop the scheduler""" + if self.scheduler.running: + self.scheduler.shutdown(wait=True) + + async def _collect_ticker_news(self, ticker: str): + """Execute news collection for a single ticker""" + + start_time = datetime.datetime.now() + + try: + logging.info(f"Starting news collection for {ticker}") + + articles = await self.news_service.update_company_news( + symbol=ticker, + lookback_days=self.job_config.lookback_days, + max_articles=self.job_config.max_articles_per_ticker, + include_sentiment=True, + include_embeddings=True + ) + + # Log metrics + sentiment_count = sum(1 for a in articles if a.has_reliable_sentiment()) + embedding_count = sum(1 for a in articles if a.title_embedding) + + duration = (datetime.datetime.now() - start_time).total_seconds() + + logging.info( + f"Completed news collection for {ticker}: " + f"{len(articles)} articles, {sentiment_count} with sentiment, " + f"{embedding_count} with embeddings in {duration:.1f}s" + ) + + except Exception as e: + logging.error(f"News collection failed for {ticker}: {str(e)}") + # Don't raise - let scheduler continue with other tickers + + def get_job_status(self) -> Dict[str, Any]: + """Get status of all scheduled jobs""" + jobs = self.scheduler.get_jobs() + return { + "scheduler_running": self.scheduler.running, + "job_count": len(jobs), + "jobs": [ + { + "id": job.id, + "next_run": job.next_run_time.isoformat() if job.next_run_time else None, + "trigger": str(job.trigger) + } + for job in jobs + ] + } +``` + +### Error Handling and Monitoring + +Comprehensive error handling for production reliability: + +```python +class NewsCollectionMonitor: + """Monitor and handle news collection job failures""" + + def __init__(self, 
collector: ScheduledNewsCollector): + self.collector = collector + self.failure_counts = defaultdict(int) + self.max_failures = 3 + + async def handle_job_failure(self, ticker: str, error: Exception): + """Handle job failure with exponential backoff""" + + self.failure_counts[ticker] += 1 + + if self.failure_counts[ticker] >= self.max_failures: + logging.error(f"Max failures reached for {ticker}, disabling job") + self.collector.scheduler.remove_job(f"news_collection_{ticker}") + # Could send alert here + else: + # Schedule retry with exponential backoff + delay_minutes = 2 ** self.failure_counts[ticker] + retry_time = datetime.datetime.now() + datetime.timedelta(minutes=delay_minutes) + + self.collector.scheduler.add_job( + func=self.collector._collect_ticker_news, + trigger='date', + run_date=retry_time, + args=[ticker], + id=f"news_retry_{ticker}_{int(retry_time.timestamp())}", + max_instances=1 + ) + + def reset_failure_count(self, ticker: str): + """Reset failure count on successful job""" + if ticker in self.failure_counts: + del self.failure_counts[ticker] +``` + +## Implementation Strategy + +### Phase 1: Entity and Database Enhancements (Week 1) + +**Deliverables:** +- [ ] Enhanced `NewsArticle` entity with `SentimentScore` and vector support +- [ ] New `NewsJobConfig` entity with validation +- [ ] Database migration for vector indexes and sentiment_score JSONB enhancement +- [ ] Repository method `find_similar_articles()` with pgvectorscale integration + +**Testing Focus:** +- Unit tests for entity validation and serialization +- Repository integration tests with vector similarity queries +- Database migration verification + +### Phase 2: OpenRouter Integration (Week 2) + +**Deliverables:** +- [ ] `OpenRouterClient` with sentiment analysis and embeddings +- [ ] Enhanced `NewsService.update_company_news()` with LLM integration +- [ ] Error handling for LLM failures (best-effort approach) +- [ ] Integration tests with OpenRouter API (using pytest-vcr) + +**Testing Focus:** +- Mock OpenRouter responses for consistent testing +- Error handling scenarios (API failures, malformed responses) +- Embedding dimension validation + +### Phase 3: Job Scheduling System (Week 3) + +**Deliverables:** +- [ ] `ScheduledNewsCollector` with APScheduler integration +- [ ] `NewsCollectionMonitor` for error handling and retries +- [ ] Configuration management for job scheduling +- [ ] Graceful startup and shutdown procedures + +**Testing Focus:** +- Scheduler lifecycle testing +- Job execution and failure handling +- Configuration validation + +### Phase 4: Testing and Performance Optimization (Week 4) + +**Deliverables:** +- [ ] Complete test coverage maintaining >85% threshold +- [ ] Performance optimization for vector queries +- [ ] Documentation and deployment guides +- [ ] Integration with existing News Analyst AgentToolkit + +**Testing Focus:** +- End-to-end integration tests +- Performance benchmarks for vector similarity queries +- Load testing for scheduled job execution + +## Testing Strategy + +### Test Architecture + +Following the existing pragmatic TDD approach with mock boundaries: + +``` +tests/domains/news/ +├── __init__.py +├── test_news_entities.py # Entity validation and serialization +├── test_news_service.py # Mock repository and OpenRouter client +├── test_news_repository.py # PostgreSQL test database +├── test_openrouter_client.py # pytest-vcr for API responses +├── test_scheduled_collector.py # Mock APScheduler and services +└── integration/ + ├── test_sentiment_pipeline.py # 
End-to-end sentiment analysis + ├── test_embedding_pipeline.py # End-to-end embedding generation + └── test_scheduled_execution.py # Full job execution cycle +``` + +### Key Test Categories + +**Entity Tests (Fast Unit Tests)** +```python +def test_news_article_sentiment_validation(): + """Test sentiment score validation and reliability checks""" + + # Valid sentiment + sentiment = SentimentScore( + sentiment="positive", + confidence=0.8, + reasoning="Strong positive language" + ) + + article = NewsArticle( + headline="Test headline", + url="https://example.com", + source="Test Source", + published_date=datetime.datetime.now(), + sentiment_score=sentiment + ) + + assert article.has_reliable_sentiment() == True + + # Low confidence sentiment + low_confidence = SentimentScore( + sentiment="neutral", + confidence=0.3, + reasoning="Ambiguous language" + ) + + article.sentiment_score = low_confidence + assert article.has_reliable_sentiment() == False + +def test_news_article_vector_validation(): + """Test vector embedding validation""" + + # Valid 1536-dimension embedding + valid_embedding = [0.1] * 1536 + article = NewsArticle( + headline="Test", + url="https://example.com", + source="Test", + published_date=datetime.datetime.now(), + title_embedding=valid_embedding + ) + + assert len(article.title_embedding) == 1536 + + # Invalid dimension should raise ValidationError + with pytest.raises(ValidationError): + NewsArticle( + headline="Test", + url="https://example.com", + source="Test", + published_date=datetime.datetime.now(), + title_embedding=[0.1] * 512 # Wrong dimension + ) +``` + +**Service Integration Tests (Mock Boundaries)** +```python +@pytest.mark.asyncio +async def test_news_service_with_sentiment_analysis(mock_openrouter_client, mock_repository): + """Test NewsService integration with mocked LLM client""" + + # Mock successful sentiment analysis + mock_sentiment = SentimentScore( + sentiment="positive", + confidence=0.9, + reasoning="Optimistic financial outlook" + ) + mock_openrouter_client.analyze_sentiment.return_value = mock_sentiment + + # Mock embeddings + mock_openrouter_client.generate_embeddings.return_value = [ + [0.1] * 1536, # title embedding + [0.2] * 1536 # content embedding + ] + + service = NewsService( + repository=mock_repository, + google_client=mock_google_client, + scraper_client=mock_scraper_client, + openrouter_client=mock_openrouter_client + ) + + articles = await service.update_company_news("AAPL", include_sentiment=True) + + # Verify LLM integration + assert len(articles) > 0 + assert articles[0].sentiment_score == mock_sentiment + assert articles[0].title_embedding == [0.1] * 1536 + assert mock_openrouter_client.analyze_sentiment.called + assert mock_openrouter_client.generate_embeddings.called +``` + +**Repository Integration Tests (Real Database)** +```python +@pytest.mark.asyncio +async def test_repository_vector_similarity_search(test_db): + """Test vector similarity search with real pgvectorscale""" + + repository = NewsRepository(test_db) + + # Insert articles with embeddings + article1 = NewsArticle( + headline="Apple reports strong iPhone sales", + url="https://example.com/1", + source="TechNews", + published_date=datetime.datetime.now(), + entities=["AAPL"], + title_embedding=[0.1, 0.2] + [0.0] * 1534 # Similar to query + ) + + article2 = NewsArticle( + headline="Microsoft launches new Azure features", + url="https://example.com/2", + source="CloudNews", + published_date=datetime.datetime.now(), + entities=["MSFT"], + title_embedding=[0.9, 0.8] + 
[0.0] * 1534 # Different from query + ) + + await repository.upsert_batch([article1, article2]) + + # Query with similar embedding + query_embedding = [0.15, 0.25] + [0.0] * 1534 + similar_articles = await repository.find_similar_articles( + query_embedding, symbol="AAPL", limit=1 + ) + + assert len(similar_articles) == 1 + assert similar_articles[0].headline == "Apple reports strong iPhone sales" +``` + +**API Integration Tests (pytest-vcr)** +```python +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_openrouter_sentiment_analysis(): + """Test real OpenRouter API calls with VCR cassettes""" + + config = TradingAgentsConfig.from_env() + client = OpenRouterClient(config) + + test_text = "Apple's quarterly earnings exceeded expectations with strong iPhone sales." + + sentiment = await client.analyze_sentiment(test_text) + + assert isinstance(sentiment, SentimentScore) + assert sentiment.sentiment in ["positive", "negative", "neutral"] + assert 0.0 <= sentiment.confidence <= 1.0 + assert len(sentiment.reasoning) > 0 + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_openrouter_embeddings_generation(): + """Test real OpenRouter embeddings API with VCR""" + + config = TradingAgentsConfig.from_env() + client = OpenRouterClient(config) + + texts = ["Apple stock rises", "Market volatility increases"] + + embeddings = await client.generate_embeddings(texts) + + assert len(embeddings) == 2 + assert all(len(emb) == 1536 for emb in embeddings) + assert all(isinstance(val, float) for emb in embeddings for val in emb) +``` + +### Coverage Requirements + +Maintain existing >85% coverage with new components: + +- **Entity Layer**: 95% coverage (comprehensive validation testing) +- **Service Layer**: 90% coverage (mock external dependencies) +- **Repository Layer**: 85% coverage (real database integration tests) +- **Client Layer**: 80% coverage (pytest-vcr for API calls) +- **Integration Tests**: End-to-end scenarios covering complete workflows + +### Performance Testing + +```python +@pytest.mark.performance +@pytest.mark.asyncio +async def test_vector_similarity_performance(): + """Ensure vector similarity queries perform under 100ms""" + + repository = NewsRepository(test_db) + + # Insert 1000 articles with embeddings + articles = [create_test_article_with_embedding() for _ in range(1000)] + await repository.upsert_batch(articles) + + query_embedding = [random.random() for _ in range(1536)] + + start_time = time.time() + results = await repository.find_similar_articles(query_embedding, limit=10) + duration = time.time() - start_time + + assert duration < 0.1 # Under 100ms + assert len(results) == 10 +``` + +## Integration Points + +### News Analyst AgentToolkit Integration + +The completed News domain integrates seamlessly with existing News Analyst agents: + +```python +class NewsAnalystToolkit: + """Enhanced toolkit with semantic search capabilities""" + + def __init__(self, news_service: NewsService): + self.news_service = news_service + + async def get_relevant_news(self, + ticker: str, + query: Optional[str] = None, + days_back: int = 30) -> List[Dict[str, Any]]: + """Get news with optional semantic search""" + + if query: + # Use semantic similarity search + articles = await self.news_service.find_similar_articles( + query_text=query, + symbol=ticker, + limit=20 + ) + else: + # Use time-based search (existing) + articles = await self.news_service.find_recent_news( + symbol=ticker, + days_back=days_back + ) + + return [ + { + "headline": article.headline, + "summary": 
article.summary,
+                "published_date": article.published_date.isoformat(),
+                "sentiment": article.sentiment_score.sentiment if article.sentiment_score else "unknown",
+                "confidence": article.sentiment_score.confidence if article.sentiment_score else 0.0,
+                "source": article.source,
+                "url": article.url
+            }
+            for article in articles
+        ]
+```
+
+### Configuration Integration
+
+Seamless integration with existing `TradingAgentsConfig`:
+
+```python
+# Enhanced configuration for news domain completion
+config = TradingAgentsConfig(
+    # Existing LLM configuration
+    llm_provider="openrouter",
+    openrouter_api_key=os.getenv("OPENROUTER_API_KEY"),
+    quick_think_llm="anthropic/claude-3.5-haiku",  # For sentiment analysis
+
+    # New news-specific settings
+    news_collection_enabled=True,
+    news_schedule_hour=6,  # UTC
+    news_sentiment_enabled=True,
+    news_embeddings_enabled=True,
+    news_max_articles_per_ticker=20,
+
+    # Database (existing)
+    database_url=os.getenv("DATABASE_URL"),
+)
+
+# Job configuration
+news_job_config = NewsJobConfig(
+    tickers=["AAPL", "GOOGL", "MSFT", "TSLA", "NVDA"],
+    schedule_hour=6,  # 6 AM UTC daily collection
+    sentiment_model=config.quick_think_llm,
+    embedding_model="text-embedding-3-small",
+    max_articles_per_ticker=20
+)
+```
+
+This design completes the final 5% of the News domain while leveraging the existing 95% infrastructure, maintaining architectural consistency, and providing the robust scheduled execution, LLM-powered sentiment analysis, and vector embeddings needed for advanced News Analyst capabilities.
\ No newline at end of file
diff --git a/docs/specs/news/requirements.json b/docs/specs/news/requirements.json
new file mode 100644
index 00000000..1e24fd98
--- /dev/null
+++ b/docs/specs/news/requirements.json
@@ -0,0 +1,6 @@
+{
+  "raw_user_story": "a) As a Dagster Job I want to fetch all the Google News articles for a ticker, fetch the article, perform sentiment analysis with LLMs and store it in the DB. b) As a News Analyst I want to fetch all relevant news data for a specific ticker and related tickers.",
+  "raw_criteria": "a) the news data is updated on a schedule, daily to start. b) I can update the news for a ticker c) I can get the news for a ticker d) News is stored in DB with embeddings e) News is fetched from DB",
+  "raw_rules": "a) best effort to fetch article; if it is paywalled or blocked, log a warning and continue",
+  "raw_scope": "Included: Only fetch data from the Google News XML feed, use newspaper4k to fetch article content, use LLM to run sentiment analysis. Excluded: Other news sources beyond Google News XML feed."
+}
\ No newline at end of file
diff --git a/docs/specs/news/spec-lite.md b/docs/specs/news/spec-lite.md
new file mode 100644
index 00000000..77bf0cd4
--- /dev/null
+++ b/docs/specs/news/spec-lite.md
@@ -0,0 +1,80 @@
+# News Domain Completion - Implementation Summary
+
+## Core Requirement
+Complete final 5% of news domain: add scheduled execution, LLM sentiment analysis, and vector embeddings to existing 95% complete infrastructure.
+
+## User Story
+**Dagster Job** automatically fetches Google News articles for tracked tickers, extracts content, performs LLM sentiment analysis, and stores with embeddings → **News Analysts** get comprehensive, up-to-date news data for trading decisions.
+
+## Essential Requirements
+
+### 1. Scheduled Execution
+- Daily job at 6 AM UTC for all configured tickers
+- APScheduler integration (no Dagster dependency)
+- Graceful error handling with comprehensive logging
+
+### 2. LLM Sentiment Analysis
+- OpenRouter integration using `quick_think_llm` (claude-3.5-haiku)
+- Structured output: `{"sentiment": "positive|negative|neutral", "confidence": 0.0-1.0}`
+- Best-effort processing - failures don't stop pipeline
+
+### 3. Vector Embeddings
+- 1536-dimension embeddings for title and content
+- pgvectorscale storage with similarity indexes
+- Semantic search capability for News Analysts
+
+## Technical Implementation
+
+### Architecture Pattern
+```
+ScheduledNewsJob → NewsService → NewsRepository → NewsArticle → PostgreSQL+pgvectorscale
+```
+
+### Database Changes
+```sql
+ALTER TABLE news_articles
+ADD COLUMN sentiment_score JSONB,
+ADD COLUMN title_embedding vector(1536),
+ADD COLUMN content_embedding vector(1536);
+```
+
+### Key Integration Points
+- **Existing NewsService**: Enhance `update_news_for_symbol` method
+- **LLM Integration**: OpenRouter unified provider for sentiment
+- **Vector Generation**: text-embedding-3-small model (1536 dims)
+- **Job Scheduling**: APScheduler with cron trigger
+
+## Implementation Phases
+1. **Scheduled Execution** (2-3h): APScheduler + config management
+2. **LLM Sentiment** (3-4h): OpenRouter integration + structured prompts
+3. **Vector Embeddings** (2-3h): Embedding generation + database schema
+4. **Testing & Monitoring** (2h): Coverage + performance validation
+
+**Total: 9-12 hours**
+
+## Success Criteria
+- ✅ Daily automated news collection without manual intervention
+- ✅ News retrieval with sentiment scores < 2 seconds response time
+- ✅ Vector embeddings enable semantic search for News Analysts
+- ✅ >95% article processing success rate despite paywall/blocking
+- ✅ Maintain >85% test coverage including new components
+
+## Dependencies
+- **APIs**: OpenRouter (sentiment), OpenAI (embeddings)
+- **Infrastructure**: PostgreSQL + TimescaleDB + pgvectorscale
+- **New Package**: `apscheduler` for job scheduling
+- **Existing**: 95% complete news domain components
+
+## Configuration
+```bash
+OPENROUTER_API_KEY="sk-or-..."
+OPENAI_API_KEY="sk-..."
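+# NEWS_* values below are the new collection settings introduced by this
+# spec (see spec.md Configuration Management for the annotated version)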
+NEWS_SCHEDULE_HOUR=6 +NEWS_TICKERS="AAPL,GOOGL,MSFT,TSLA" +``` + +## Risk Mitigation +- **API Rate Limits**: Exponential backoff + batch processing +- **Paywall Blocking**: Metadata-only storage with warnings +- **Job Failures**: Monitoring + alerting for operational visibility +- **Performance**: Vector indexes + query optimization for <2s target \ No newline at end of file diff --git a/docs/specs/news/spec.json b/docs/specs/news/spec.json new file mode 100644 index 00000000..256f6403 --- /dev/null +++ b/docs/specs/news/spec.json @@ -0,0 +1,68 @@ +{ + "feature": "news", + "user_story": "As a Dagster Job, I want to automatically fetch Google News articles for tracked tickers, extract content, perform LLM sentiment analysis, and store with embeddings in the database, so that News Analysts can access comprehensive, up-to-date news data for trading decisions", + "acceptance_criteria": [ + "GIVEN a scheduled job runs daily WHEN it executes THEN it fetches news for all configured tickers without manual intervention", + "GIVEN a news article is found WHEN content extraction fails due to paywall THEN a warning is logged and processing continues with available metadata", + "GIVEN a ticker symbol WHEN a News Analyst requests news data THEN they receive articles with sentiment scores and embeddings within 2 seconds", + "GIVEN news articles are processed WHEN LLM sentiment analysis runs THEN each article gets a structured sentiment score (positive/negative/neutral with confidence)", + "GIVEN news articles are stored WHEN saved to database THEN they include vector embeddings for both title and content for semantic search" + ], + "business_rules": [ + "Best effort article fetching - log warnings for paywalled/blocked content but continue processing", + "Daily schedule execution with configurable ticker list", + "Deduplication by URL to prevent duplicate articles", + "Sentiment analysis using OpenRouter LLM integration", + "Vector embeddings generated for semantic similarity search", + "Graceful error handling for network failures and API limits" + ], + "scope": { + "included": [ + "Scheduled news collection job using existing NewsService", + "LLM-based sentiment analysis replacing current keyword approach", + "Vector embedding generation for articles", + "Configuration management for ticker lists and schedules", + "Integration with existing GoogleNewsClient and ArticleScraperClient", + "Database storage using existing NewsRepository patterns" + ], + "excluded": [ + "Other news sources beyond Google News XML feed", + "Real-time news streaming (daily batch processing only)", + "Custom sentiment models (use OpenRouter LLMs only)", + "News source reliability scoring", + "Multi-language news support" + ] + }, + "current_implementation_status": "95% complete - core components exist", + "missing_components": [ + "Scheduled execution framework (Dagster alternative needed)", + "LLM sentiment analysis integration", + "Vector embedding generation", + "Configuration management for tickers and schedules", + "Pipeline monitoring and status tracking" + ], + "existing_components": [ + "NewsService with update_news_for_symbol method", + "GoogleNewsClient for RSS feed parsing", + "ArticleScraperClient with newspaper4k integration", + "NewsRepository with async PostgreSQL and vector schema", + "NewsArticle domain model with validation", + "Comprehensive test coverage with pytest-vcr" + ], + "aligns_with": "Multi-agent trading framework vision - provides news context for agent decision making", + "dependencies": [ + 
"OpenRouter API for LLM sentiment analysis", + "PostgreSQL with pgvectorscale for embeddings", + "Existing news domain components (95% complete)", + "APScheduler or similar for job scheduling (Dagster not in current dependencies)" + ], + "technical_details": { + "architecture_pattern": "Router → Service → Repository → Entity → Database", + "database_integration": "Async PostgreSQL with TimescaleDB optimization", + "llm_integration": "OpenRouter unified provider with two-tier model strategy", + "vector_storage": "1536-dimension embeddings using pgvectorscale", + "error_handling": "Graceful degradation with comprehensive logging", + "testing_strategy": "Domain-specific with pytest-vcr for HTTP mocking" + }, + "implementation_approach": "Complete the missing 5% by adding scheduled execution, LLM sentiment analysis, and vector embedding generation to existing NewsService infrastructure" +} \ No newline at end of file diff --git a/docs/specs/news/spec.md b/docs/specs/news/spec.md new file mode 100644 index 00000000..ebf7b2c2 --- /dev/null +++ b/docs/specs/news/spec.md @@ -0,0 +1,334 @@ +# News Domain Completion Specification + +## Feature Overview + +Complete the final 5% of the news domain by adding scheduled execution, LLM sentiment analysis, and vector embeddings to the existing 95% complete infrastructure. This enables automated daily news collection with advanced sentiment analysis and semantic search capabilities for News Analysts in the multi-agent trading framework. + +## User Story + +**Primary User**: Dagster Job (automated system) +**Secondary Users**: News Analysts (LLM agents) + +> As a Dagster Job, I want to automatically fetch Google News articles for tracked tickers, extract content, perform LLM sentiment analysis, and store with embeddings in the database, so that News Analysts can access comprehensive, up-to-date news data for trading decisions. 
+ +## Acceptance Criteria + +### AC1: Scheduled Execution +**GIVEN** a scheduled job runs daily +**WHEN** it executes +**THEN** it fetches news for all configured tickers without manual intervention + +**Validation**: +- Job executes at configured time (default: daily at 6 AM UTC) +- All tickers in configuration are processed +- Job completion status is logged with metrics + +### AC2: Content Extraction Resilience +**GIVEN** a news article is found +**WHEN** content extraction fails due to paywall +**THEN** a warning is logged and processing continues with available metadata + +**Validation**: +- Paywall detection doesn't halt processing +- Warning messages include article URL and error reason +- Metadata (title, source, publish_date) is still stored + +### AC3: Fast News Retrieval +**GIVEN** a ticker symbol +**WHEN** a News Analyst requests news data +**THEN** they receive articles with sentiment scores and embeddings within 2 seconds + +**Validation**: +- Database queries return results in < 2 seconds +- Results include sentiment scores and vector embeddings +- Pagination supports large result sets + +### AC4: LLM Sentiment Analysis +**GIVEN** news articles are processed +**WHEN** LLM sentiment analysis runs +**THEN** each article gets a structured sentiment score (positive/negative/neutral with confidence) + +**Validation**: +- Sentiment scores use structured format: `{"sentiment": "positive|negative|neutral", "confidence": 0.0-1.0}` +- LLM integration uses OpenRouter unified provider +- Failed sentiment analysis doesn't prevent article storage + +### AC5: Vector Embeddings Storage +**GIVEN** news articles are stored +**WHEN** saved to database +**THEN** they include vector embeddings for both title and content for semantic search + +**Validation**: +- 1536-dimension embeddings generated for title and content +- Embeddings stored in pgvectorscale-optimized columns +- Semantic similarity search returns relevant results + +## Business Rules + +### BR1: Best Effort Processing +- Log warnings for paywalled/blocked content but continue processing +- Network failures don't halt entire job execution +- API rate limits are respected with exponential backoff + +### BR2: Daily Schedule Execution +- Configurable ticker list supports adding/removing symbols +- Job execution time is configurable (default: daily at 6 AM UTC) +- Manual job execution available for testing and backfill + +### BR3: Data Quality Standards +- URL-based deduplication prevents duplicate articles +- Article publish dates must be within last 30 days +- Source URLs must be valid and accessible + +### BR4: LLM Integration Standards +- Use OpenRouter unified provider for sentiment analysis +- Quick-think LLM for sentiment processing (cost optimization) +- Structured prompts ensure consistent sentiment format + +### BR5: Vector Search Optimization +- Embeddings enable semantic similarity search for agents +- Vector indexes optimize query performance +- Embedding generation uses consistent model for coherence + +### BR6: Graceful Error Handling +- Individual article failures don't stop batch processing +- Comprehensive logging for monitoring and debugging +- Database transactions ensure data consistency + +## Technical Implementation + +### Architecture Alignment + +Follows established **Router → Service → Repository → Entity → Database** pattern: + +``` +ScheduledNewsJob → NewsService → NewsRepository → NewsArticle → PostgreSQL+pgvectorscale +``` + +### Database Schema Integration + +Leverages existing NewsRepository with vector 
extensions: + +```sql +-- Existing news_articles table enhanced with: +ALTER TABLE news_articles +ADD COLUMN IF NOT EXISTS sentiment_score JSONB, +ADD COLUMN IF NOT EXISTS title_embedding vector(1536), +ADD COLUMN IF NOT EXISTS content_embedding vector(1536); + +-- Vector similarity indexes +CREATE INDEX IF NOT EXISTS idx_title_embedding +ON news_articles USING ivfflat (title_embedding vector_cosine_ops); +``` + +### LLM Integration Pattern + +```python +# OpenRouter sentiment analysis +sentiment_result = await llm_client.analyze_sentiment( + text=article.content, + model="anthropic/claude-3.5-haiku", # quick_think_llm + structured_output=True +) + +# Expected response format +{ + "sentiment": "positive|negative|neutral", + "confidence": 0.85, + "reasoning": "Brief explanation" +} +``` + +### Vector Embedding Strategy + +```python +# Generate embeddings for semantic search +title_embedding = await embedding_client.create_embedding( + text=article.title, + model="text-embedding-3-small" # 1536 dimensions +) + +content_embedding = await embedding_client.create_embedding( + text=article.content[:8000], # Truncate for token limits + model="text-embedding-3-small" +) +``` + +### Scheduled Execution Framework + +Use APScheduler for job orchestration (Dagster not in current dependencies): + +```python +from apscheduler.schedulers.asyncio import AsyncIOScheduler + +scheduler = AsyncIOScheduler() +scheduler.add_job( + run_news_collection, + 'cron', + hour=6, # 6 AM UTC + minute=0, + timezone=timezone.utc, + id='daily_news_collection' +) +``` + +## Implementation Approach + +### Phase 1: Scheduled Execution (2-3 hours) +1. Configure APScheduler for daily news collection +2. Create job configuration management for ticker lists +3. Implement job monitoring and status tracking +4. Add manual execution capability for testing + +### Phase 2: LLM Sentiment Integration (3-4 hours) +1. Integrate OpenRouter LLM for sentiment analysis +2. Create structured sentiment analysis prompts +3. Update NewsService to include sentiment processing +4. Add sentiment data to NewsArticle domain model + +### Phase 3: Vector Embeddings (2-3 hours) +1. Add embedding generation to article processing +2. Update database schema for vector storage +3. Implement semantic search capabilities in NewsRepository +4. Create vector similarity query methods + +### Phase 4: Testing & Monitoring (2 hours) +1. Add comprehensive test coverage for new components +2. Implement job monitoring and alerting +3. Create configuration validation +4. 
+
+### Total Estimated Effort: 9-12 hours
+
+## Dependencies
+
+### Required APIs
+- **OpenRouter API**: LLM sentiment analysis (`OPENROUTER_API_KEY`)
+- **OpenAI API**: Vector embeddings (`OPENAI_API_KEY` for embeddings)
+
+### Database Requirements
+- **PostgreSQL**: Base storage with async support
+- **TimescaleDB**: Time-series optimization for news data
+- **pgvectorscale**: Vector storage and similarity search
+
+### Existing Infrastructure (95% Complete)
+- `NewsService` with `update_news_for_symbol` method
+- `GoogleNewsClient` for RSS feed parsing
+- `ArticleScraperClient` with newspaper4k integration
+- `NewsRepository` with async PostgreSQL operations
+- `NewsArticle` domain model with validation
+- Comprehensive test coverage with pytest-vcr
+
+### New Dependencies
+- `apscheduler` for job scheduling
+- Enhanced vector embedding capabilities
+- LLM client integration for sentiment analysis
+
+## Configuration Management
+
+### Environment Variables
+```bash
+# Existing
+OPENROUTER_API_KEY="sk-or-..."
+DATABASE_URL="postgresql://..."
+
+# New requirements
+OPENAI_API_KEY="sk-..."  # For embeddings
+NEWS_SCHEDULE_HOUR=6  # UTC hour for daily execution
+NEWS_TICKERS="AAPL,GOOGL,MSFT,TSLA"  # Comma-separated ticker list
+```
+
+### Configuration File Support
+```yaml
+# config/news_collection.yaml
+schedule:
+  hour: 6
+  minute: 0
+  timezone: "UTC"
+
+tickers:
+  - "AAPL"
+  - "GOOGL"
+  - "MSFT"
+  - "TSLA"
+
+sentiment:
+  llm_model: "anthropic/claude-3.5-haiku"
+  confidence_threshold: 0.5
+
+embeddings:
+  model: "text-embedding-3-small"
+  dimensions: 1536
+  content_max_length: 8000
+```
+
+## Success Metrics
+
+### Performance Targets
+- **Query Response Time**: < 2 seconds for news retrieval with sentiment
+- **Job Execution Time**: < 30 minutes for daily collection (4 tickers)
+- **Success Rate**: > 95% article processing success rate
+- **Test Coverage**: Maintain > 85% coverage including new components
+
+### Operational Metrics
+- Daily job completion status and execution time
+- Article processing success/failure rates per ticker
+- LLM sentiment analysis success rates
+- Vector embedding generation performance
+- Database query performance monitoring
+
+## Risk Mitigation
+
+### Technical Risks
+1. **LLM API Rate Limits**: Implement exponential backoff and batch processing
+2. **Vector Storage Performance**: Monitor query times and optimize indexes
+3. **Paywall Content Blocking**: Graceful degradation with metadata-only storage
+4. **Database Migration Complexity**: Test schema changes thoroughly
+
+### Operational Risks
+1. **Scheduled Job Failures**: Implement monitoring and alerting
+2. **API Key Management**: Secure configuration management
+3. **Data Quality Issues**: Validation at multiple pipeline stages
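+4. **Performance Degradation**: Regular performance monitoring and optimization
+
+Several of the mitigations above reduce to retry-with-exponential-backoff (see BR1 and Technical Risk 1). A minimal sketch of that pattern for any async callable; the names and limits are illustrative, not a fixed interface:
+
+```python
+import asyncio
+import random
+
+
+async def with_backoff(call, *, retries: int = 5, base_delay: float = 1.0, max_delay: float = 60.0):
+    """Retry an async callable, doubling the delay each attempt and adding jitter."""
+    for attempt in range(retries):
+        try:
+            return await call()
+        except Exception:
+            if attempt == retries - 1:
+                raise  # out of attempts; surface the original error
+            delay = min(max_delay, base_delay * 2 ** attempt)
+            await asyncio.sleep(delay + random.uniform(0, delay / 2))
+```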
+
+## Testing Strategy
+
+### Unit Testing (pytest with pytest-vcr)
+- Scheduled job execution logic
+- LLM sentiment analysis integration
+- Vector embedding generation
+- Configuration management
+
+### Integration Testing
+- End-to-end news collection pipeline
+- Database vector operations
+- LLM API integration
+- Job scheduling functionality
+
+### Performance Testing
+- Query response time validation (< 2 seconds)
+- Batch processing performance
+- Vector similarity search optimization
+- Concurrent job execution handling
+
+## Monitoring and Observability
+
+### Logging Strategy
+- Job execution start/completion with metrics
+- Individual article processing success/failure
+- LLM API call status and timing
+- Database operation performance
+
+### Health Checks
+- Daily job completion status
+- Database connectivity and performance
+- LLM API availability and response times
+- Vector search functionality
+
+### Alerting Triggers
+- Failed daily news collection jobs
+- API rate limit violations
+- Database query performance degradation
+- Sentiment analysis failure rates > 10%
+
+This specification completes the news domain infrastructure to support advanced news analysis for the multi-agent trading framework, providing News Analysts with comprehensive, sentiment-analyzed, and semantically searchable news data for informed trading decisions.
\ No newline at end of file
diff --git a/docs/specs/news/status.md b/docs/specs/news/status.md
new file mode 100644
index 00000000..162421dc
--- /dev/null
+++ b/docs/specs/news/status.md
@@ -0,0 +1,336 @@
+# News Domain Completion - Progress Status
+
+## Overview
+
+**Feature**: News Domain Final 5% Completion
+**Status**: Ready for Implementation
+**Total Estimated Time**: 12-16 hours with AI assistance
+**Target Timeline**: 3-4 days
+**Current Progress**: 95% complete (infrastructure ready)
+
+---
+
+## Progress Summary
+
+### Overall Completion: 0% of the final 5% (base infrastructure at 95%)
+
+| Phase | Status | Progress | Duration | Completion |
+|-------|--------|----------|----------|------------|
+| Phase 1: Foundation | ⏳ Not Started | 0/3 tasks | 0h/4-7h | ⬜⬜⬜⬜⬜⬜⬜ |
+| Phase 2: Data Access | ⏳ Not Started | 0/1 tasks | 0h/2-3h | ⬜⬜⬜ |
+| Phase 3: LLM Integration | ⏳ Not Started | 0/3 tasks | 0h/5-8h | ⬜⬜⬜⬜⬜⬜⬜⬜ |
+| Phase 4: Scheduling | ⏳ Not Started | 0/2 tasks | 0h/4-6h | ⬜⬜⬜⬜⬜⬜ |
+| Phase 5: Validation | ⏳ Not Started | 0/2 tasks | 0h/3-5h | ⬜⬜⬜⬜⬜ |
+
+**Legend**: ✅ Complete | 🟡 In Progress | ⏳ Not Started | ❌ Blocked
+
+---
+
+## Task Status Tracking
+
+### Phase 1: Foundation (0% Complete)
+
+#### ⏳ T001: Database Migration - NewsJobConfig Table
+- **Status**: Not Started
+- **Priority**: Critical
+- **Estimated**: 1-2 hours
+- **Dependencies**: None
+- **Progress**: 0%
+- **Acceptance Criteria**: 0/4 completed
+  - [ ] `news_job_configs` table created with UUID primary key
+  - [ ] JSONB fields for symbols and categories with validation
+  - [ ] Proper indexes for enabled/frequency queries
+  - [ ] Migration script tests with rollback capability
+- **Blocking Issues**: None
+- **Next Actions**: Create Alembic migration script
+
+#### ⏳ T002: Enhance NewsArticle Entity - Sentiment and Embeddings
+- **Status**: Not Started
+- **Priority**: Critical
+- **Estimated**: 2-3 hours
+- **Dependencies**: T001
+- **Progress**: 0%
+- **Acceptance Criteria**: 0/5 completed
+  - [ ] Add sentiment_score, sentiment_confidence, sentiment_label fields
+  - [ ] Add title_embedding and 
content_embedding vector fields + - [ ] Enhanced validate() method with sentiment range checks + - [ ] Updated transformations for vector handling + - [ ] Embedding dimension validation (1536) +- **Blocking Issues**: None +- **Next Actions**: Extend NewsArticle dataclass + +#### ⏳ T003: Create NewsJobConfig Entity +- **Status**: Not Started +- **Priority**: Critical +- **Estimated**: 1-2 hours +- **Dependencies**: T001 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] NewsJobConfig dataclass with all required fields + - [ ] Business rule validation for job configuration + - [ ] Cron expression validation for frequency + - [ ] Symbol list validation + - [ ] JSON serialization for database storage +- **Blocking Issues**: None +- **Next Actions**: Create new entity file + +### Phase 2: Data Access (0% Complete) + +#### ⏳ T004: Enhance NewsRepository - Vector and Job Operations +- **Status**: Not Started +- **Priority**: Critical +- **Estimated**: 2-3 hours +- **Dependencies**: T002, T003 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] Vector similarity search with cosine distance + - [ ] Batch embedding update operations + - [ ] NewsJobConfig CRUD methods + - [ ] Optimized query performance for vector operations + - [ ] Proper async connection handling +- **Blocking Issues**: Waiting for T002, T003 +- **Next Actions**: Extend NewsRepository class + +### Phase 3: LLM Integration (0% Complete) + +#### ⏳ T005: OpenRouter Client - Sentiment Analysis +- **Status**: Not Started +- **Priority**: Critical +- **Estimated**: 2-3 hours +- **Dependencies**: T002 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] OpenRouter API integration for sentiment analysis + - [ ] Structured prompts for financial news sentiment + - [ ] Response parsing with Pydantic models + - [ ] Error handling with graceful fallbacks + - [ ] Retry logic with exponential backoff +- **Blocking Issues**: Waiting for T002 +- **Next Actions**: Create OpenRouter sentiment client + +#### ⏳ T006: OpenRouter Client - Vector Embeddings +- **Status**: Not Started +- **Priority**: Critical +- **Estimated**: 1-2 hours +- **Dependencies**: T002 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] OpenRouter embeddings API integration + - [ ] Text preprocessing for embedding generation + - [ ] Batch processing for multiple articles + - [ ] 1536-dimensional vector validation + - [ ] Proper error handling and retries +- **Blocking Issues**: Waiting for T002 +- **Next Actions**: Create OpenRouter embeddings client + +#### ⏳ T007: Enhance NewsService - LLM Integration +- **Status**: Not Started +- **Priority**: Critical +- **Estimated**: 2-3 hours +- **Dependencies**: T005, T006 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] Replace keyword sentiment with LLM analysis + - [ ] Add embedding generation to article processing + - [ ] End-to-end article processing pipeline + - [ ] Proper error handling and fallback strategies + - [ ] Integration with existing service methods +- **Blocking Issues**: Waiting for T005, T006 +- **Next Actions**: Integrate LLM clients into NewsService + +### Phase 4: Scheduling (0% Complete) + +#### ⏳ T008: APScheduler Integration - Job Scheduling +- **Status**: Not Started +- **Priority**: High +- **Estimated**: 3-4 hours +- **Dependencies**: T003, T004, T007 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] APScheduler setup with PostgreSQL job store + - [ ] Scheduled job execution with proper error 
handling + - [ ] Job configuration loading and validation + - [ ] Status monitoring and failure recovery + - [ ] CLI integration for job management +- **Blocking Issues**: Waiting for T003, T004, T007 +- **Next Actions**: Implement ScheduledNewsCollector + +#### ⏳ T009: CLI Integration - Job Management Commands +- **Status**: Not Started +- **Priority**: Medium +- **Estimated**: 1-2 hours +- **Dependencies**: T008 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] CLI commands for job creation/management + - [ ] Manual job execution commands + - [ ] Job status and monitoring commands + - [ ] Integration with existing CLI structure + - [ ] Proper error handling and user feedback +- **Blocking Issues**: Waiting for T008 +- **Next Actions**: Extend CLI with news job commands + +### Phase 5: Validation (0% Complete) + +#### ⏳ T010: Integration Tests - End-to-End Workflow +- **Status**: Not Started +- **Priority**: High +- **Estimated**: 2-3 hours +- **Dependencies**: T007, T008 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] End-to-end workflow tests from RSS to vector storage + - [ ] Agent integration tests via AgentToolkit + - [ ] Performance tests for daily collection volumes + - [ ] Error recovery and fallback tests + - [ ] Test coverage maintained above 85% +- **Blocking Issues**: Waiting for T007, T008 +- **Next Actions**: Create comprehensive integration test suite + +#### ⏳ T011: Documentation and Monitoring +- **Status**: Not Started +- **Priority**: Medium +- **Estimated**: 1-2 hours +- **Dependencies**: T010 +- **Progress**: 0% +- **Acceptance Criteria**: 0/5 completed + - [ ] Updated API documentation for new methods + - [ ] Job scheduling configuration examples + - [ ] Performance monitoring dashboard queries + - [ ] Troubleshooting guide for common issues + - [ ] Agent integration documentation +- **Blocking Issues**: Waiting for T010 +- **Next Actions**: Update documentation and monitoring + +--- + +## Success Criteria Validation + +### Technical Requirements Status +- [ ] **OpenRouter-only LLM Integration**: Not started +- [ ] **Vector Embeddings with pgvectorscale**: Not started +- [ ] **APScheduler Job Execution**: Not started +- [ ] **Test Coverage >85%**: Baseline established (needs monitoring) +- [ ] **Query Performance <100ms**: Not tested +- [ ] **Vector Search Performance <1s**: Not tested +- [ ] **Backward Compatibility**: Not validated + +### Functional Requirements Status +- [ ] **Sentiment Analysis Pipeline**: Not implemented +- [ ] **Embedding Generation Pipeline**: Not implemented +- [ ] **Scheduled News Collection**: Not implemented +- [ ] **CLI Job Management**: Not implemented +- [ ] **AgentToolkit Integration**: Not validated +- [ ] **Error Handling & Fallbacks**: Not implemented + +### Quality Requirements Status +- [ ] **TDD Implementation**: Process defined, not applied +- [ ] **Layered Architecture**: Pattern defined, not validated +- [ ] **Async Connection Pooling**: Not implemented +- [ ] **Production Monitoring**: Not implemented +- [ ] **Documentation Completeness**: Not updated + +--- + +## Current Blocking Issues + +### Critical Blockers +**None currently** - All dependencies are internal to this implementation + +### Potential Risk Areas +1. **OpenRouter API Access**: Requires valid API keys and model access +2. **Database Migration**: Need proper PostgreSQL permissions for schema changes +3. **Vector Extension**: pgvectorscale must be properly installed and configured +4. 
**Performance Testing**: Need realistic data volumes for benchmark validation + +--- + +## Weekly Progress Targets + +### Week 1 Target (Days 1-2) +- **Goal**: Complete Phase 1 & 2 (Foundation + Data Access) +- **Expected Completion**: T001, T002, T003, T004 +- **Target Progress**: 45% overall completion + +### Week 1 Target (Days 3-4) +- **Goal**: Complete Phase 3 & 4 (LLM Integration + Scheduling) +- **Expected Completion**: T005, T006, T007, T008, T009 +- **Target Progress**: 90% overall completion + +### Week 2 Target (Day 1) +- **Goal**: Complete Phase 5 (Validation) +- **Expected Completion**: T010, T011 +- **Target Progress**: 100% overall completion + +--- + +## Metrics Dashboard + +### Code Coverage +- **Current**: 95% (existing infrastructure) +- **Target**: >85% (including new functionality) +- **Status**: ⏳ Pending implementation + +### Performance Benchmarks +- **Query Performance**: Not measured (Target: <100ms) +- **Vector Search**: Not measured (Target: <1s) +- **Batch Processing**: Not measured (Target: TBD) +- **Status**: ⏳ Pending implementation + +### Test Execution +- **Unit Tests**: 0/11 tasks have tests +- **Integration Tests**: 0/11 tasks have integration tests +- **VCR Tests**: 0/3 API clients have VCR tests +- **Status**: ⏳ Pending implementation + +--- + +## Communication & Reporting + +### Daily Standup Format +``` +Yesterday: [Tasks completed with IDs] +Today: [Tasks planned with IDs] +Blockers: [Any issues requiring attention] +Help Needed: [Specific areas for collaboration] +``` + +### Weekly Status Report Format +``` +Completed: [Phase progress with task counts] +In Progress: [Current focus areas] +Upcoming: [Next phase priorities] +Risks: [Technical or timeline concerns] +Metrics: [Coverage, performance, test results] +``` + +### Milestone Checkpoints +- **Checkpoint 1** (End of Day 2): Foundation Complete (T001-T004) +- **Checkpoint 2** (End of Day 4): LLM Integration Complete (T005-T009) +- **Checkpoint 3** (End of Day 5): Full Implementation Complete (T001-T011) + +--- + +## Notes + +### Implementation Context +- Building on 95% complete news domain infrastructure +- Focus on OpenRouter-only LLM integration (no other providers) +- Maintaining backward compatibility with AgentToolkit +- Following established TDD and layered architecture patterns + +### Key Success Factors +1. **Incremental Progress**: Validate each layer before proceeding +2. **Comprehensive Testing**: Maintain test coverage throughout +3. **Performance Monitoring**: Validate benchmarks at each step +4. **Error Resilience**: Implement fallbacks for all LLM dependencies +5. **Documentation**: Keep implementation and usage docs current + +### Last Updated +**Date**: 2024-08-30 +**By**: System +**Next Review**: Daily during implementation + +--- + +*This status document will be updated as implementation progresses. Use this as a single source of truth for current progress and blocking issues.* \ No newline at end of file diff --git a/docs/specs/news/tasks.md b/docs/specs/news/tasks.md new file mode 100644 index 00000000..19bfd021 --- /dev/null +++ b/docs/specs/news/tasks.md @@ -0,0 +1,1039 @@ +# News Domain Completion - Task Implementation Guide + +## Overview + +Complete the final 5% of the news domain by implementing OpenRouter-only LLM sentiment analysis, vector embeddings, and APScheduler job execution. This builds on 95% complete infrastructure with PostgreSQL + TimescaleDB + pgvectorscale stack. 
+ +**Total Estimated Time**: 12-16 hours with AI assistance +**Target Completion**: 3-4 days +**Test Coverage Requirement**: Maintain >85% +**Architecture Pattern**: Database → Entity → Repository → Service → Scheduling + +## Implementation Phases + +### Phase 1: Foundation (4-7 hours) +Database and entity layer enhancements for LLM integration + +### Phase 2: Data Access (2-3 hours) +Repository layer enhancements for vector and job operations + +### Phase 3: LLM Integration (5-8 hours) +OpenRouter clients and service integration + +### Phase 4: Scheduling (4-6 hours) +Job scheduling and CLI integration + +### Phase 5: Validation (3-5 hours) +Testing, documentation, and monitoring + +--- + +## Task Breakdown + +### Phase 1: Foundation + +#### T001: Database Migration - NewsJobConfig Table +**Priority**: Critical | **Duration**: 1-2 hours | **Dependencies**: None + +**Description**: Create database migration for news job configurations table with proper indexes + +**Acceptance Criteria**: +- [ ] `news_job_configs` table created with UUID primary key +- [ ] JSONB fields for symbols and categories with validation +- [ ] Proper indexes for enabled/frequency queries +- [ ] Migration script tests with rollback capability + +**Implementation Details**: +```python +# Migration structure +def upgrade(): + op.create_table( + 'news_job_configs', + sa.Column('id', postgresql.UUID(), primary_key=True), + sa.Column('name', sa.String(255), nullable=False), + sa.Column('symbols', postgresql.JSONB(), nullable=False), + sa.Column('categories', postgresql.JSONB(), nullable=False), + sa.Column('frequency_cron', sa.String(100), nullable=False), + sa.Column('enabled', sa.Boolean(), default=True), + sa.Column('last_run', sa.DateTime(timezone=True)), + sa.Column('created_at', sa.DateTime(timezone=True), default=func.now()), + sa.Column('updated_at', sa.DateTime(timezone=True), default=func.now()) + ) + + # Indexes + op.create_index('idx_news_jobs_enabled_frequency', 'news_job_configs', + ['enabled', 'frequency_cron']) + op.create_index('idx_news_jobs_last_run', 'news_job_configs', + ['last_run'], postgresql_where=sa.text('enabled = true')) +``` + +**Files to Modify**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/data/migrations/add_news_job_configs.py` + +**Test Requirements**: +- Migration up/down tests +- Index performance validation +- Constraint validation tests + +--- + +#### T002: Enhance NewsArticle Entity - Sentiment and Embeddings +**Priority**: Critical | **Duration**: 2-3 hours | **Dependencies**: T001 + +**Description**: Add LLM sentiment fields and embedding validation to NewsArticle entity + +**Acceptance Criteria**: +- [ ] Add `sentiment_score`, `sentiment_confidence`, `sentiment_label` fields +- [ ] Add `title_embedding` and `content_embedding` vector fields +- [ ] Enhanced `validate()` method with sentiment range checks +- [ ] Updated transformations for vector handling +- [ ] Embedding dimension validation (1536) + +**Implementation Details**: +```python +@dataclass +class NewsArticle: + # Existing fields... 
+ + # LLM sentiment fields + sentiment_score: Optional[float] = None # [-1.0, 1.0] + sentiment_confidence: Optional[float] = None # [0.0, 1.0] + sentiment_label: Optional[str] = None # "positive", "negative", "neutral" + + # Vector embedding fields + title_embedding: Optional[List[float]] = None # 1536 dimensions + content_embedding: Optional[List[float]] = None # 1536 dimensions + + def validate(self) -> Dict[str, List[str]]: + errors = super().validate() + + # Sentiment validation + if self.sentiment_score is not None: + if not -1.0 <= self.sentiment_score <= 1.0: + errors["sentiment_score"] = ["Must be between -1.0 and 1.0"] + + if self.sentiment_confidence is not None: + if not 0.0 <= self.sentiment_confidence <= 1.0: + errors["sentiment_confidence"] = ["Must be between 0.0 and 1.0"] + + # Vector dimension validation + for field, vector in [("title_embedding", self.title_embedding), + ("content_embedding", self.content_embedding)]: + if vector is not None and len(vector) != 1536: + errors[field] = ["Must be exactly 1536 dimensions"] + + return errors + + def to_record(self) -> Dict[str, Any]: + record = super().to_record() + # Convert vectors to pgvector format if present + if self.title_embedding: + record["title_embedding"] = self.title_embedding + if self.content_embedding: + record["content_embedding"] = self.content_embedding + return record +``` + +**Files to Modify**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/entities/news_article.py` + +**Test Requirements**: +- Sentiment validation tests (range checks) +- Vector dimension validation tests +- Transformation method tests +- Business rule violation tests + +--- + +#### T003: Create NewsJobConfig Entity +**Priority**: Critical | **Duration**: 1-2 hours | **Dependencies**: T001 + +**Description**: Implement NewsJobConfig entity for scheduled job management + +**Acceptance Criteria**: +- [ ] NewsJobConfig dataclass with all required fields +- [ ] Business rule validation for job configuration +- [ ] Cron expression validation for frequency +- [ ] Symbol list validation +- [ ] JSON serialization for database storage + +**Implementation Details**: +```python +@dataclass +class NewsJobConfig: + id: Optional[UUID] = None + name: str = "" + symbols: List[str] = field(default_factory=list) + categories: List[str] = field(default_factory=list) + frequency_cron: str = "" + enabled: bool = True + last_run: Optional[datetime] = None + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + + def validate(self) -> Dict[str, List[str]]: + errors = {} + + # Name validation + if not self.name or len(self.name) > 255: + errors["name"] = ["Name required and must be <= 255 characters"] + + # Symbol validation + if not self.symbols: + errors["symbols"] = ["At least one symbol required"] + for symbol in self.symbols: + if not symbol.isupper() or not symbol.isalpha(): + errors["symbols"] = ["Symbols must be uppercase letters only"] + + # Cron validation + try: + from croniter import croniter + if not croniter.is_valid(self.frequency_cron): + errors["frequency_cron"] = ["Invalid cron expression"] + except ImportError: + # Fallback validation for simple intervals + if self.frequency_cron not in ["hourly", "daily", "weekly"]: + errors["frequency_cron"] = ["Invalid frequency"] + + return errors +``` + +**Files to Create**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/entities/news_job_config.py` + +**Test Requirements**: +- Job configuration validation tests +- Schedule 
parsing tests
+- Symbol validation tests
+- Serialization/deserialization tests
+
+---
+
+### Phase 2: Data Access
+
+#### T004: Enhance NewsRepository - Vector and Job Operations
+**Priority**: Critical | **Duration**: 2-3 hours | **Dependencies**: T002, T003
+
+**Description**: Add vector similarity search and NewsJobConfig CRUD operations
+
+**Acceptance Criteria**:
+- [ ] Vector similarity search with cosine distance
+- [ ] Batch embedding update operations
+- [ ] NewsJobConfig CRUD methods
+- [ ] Optimized query performance for vector operations
+- [ ] Proper async connection handling
+
+**Implementation Details**:
+```python
+import json
+from uuid import uuid4
+
+class NewsRepository:
+    # Existing methods...
+
+    async def find_similar_articles(self,
+                                    embedding: List[float],
+                                    limit: int = 10,
+                                    threshold: float = 0.8) -> List[NewsArticle]:
+        """Find articles similar to given embedding using cosine distance"""
+        # asyncpg takes numbered placeholders ($1, $2, ...); $1 can be reused.
+        # Passing a Python list for $1::vector assumes the pgvector codec is registered.
+        query = """
+            SELECT *, 1 - (title_embedding <=> $1::vector) AS similarity
+            FROM news_articles
+            WHERE title_embedding IS NOT NULL
+              AND 1 - (title_embedding <=> $1::vector) > $2
+            ORDER BY title_embedding <=> $1::vector
+            LIMIT $3
+        """
+
+        async with self._get_connection() as conn:
+            rows = await conn.fetch(query, embedding, threshold, limit)
+            return [NewsArticle.from_record(dict(row)) for row in rows]
+
+    async def batch_update_embeddings(self,
+                                      articles: List[NewsArticle]) -> None:
+        """Efficiently update embeddings for multiple articles"""
+        if not articles:
+            return
+
+        query = """
+            UPDATE news_articles
+            SET title_embedding = $1, content_embedding = $2, updated_at = now()
+            WHERE id = $3
+        """
+
+        async with self._get_connection() as conn:
+            await conn.executemany(query, [
+                (article.title_embedding, article.content_embedding, article.id)
+                for article in articles
+                if article.id and (article.title_embedding or article.content_embedding)
+            ])
+
+    # NewsJobConfig CRUD operations
+    async def create_job_config(self, config: NewsJobConfig) -> NewsJobConfig:
+        """Create new job configuration"""
+        query = """
+            INSERT INTO news_job_configs (id, name, symbols, categories, frequency_cron, enabled)
+            VALUES ($1, $2, $3, $4, $5, $6)
+            RETURNING *
+        """
+
+        config.id = config.id or uuid4()
+        async with self._get_connection() as conn:
+            row = await conn.fetchrow(query,
+                config.id, config.name, json.dumps(config.symbols),
+                json.dumps(config.categories), config.frequency_cron, config.enabled)
+            return NewsJobConfig.from_record(dict(row))
+
+    async def get_active_job_configs(self) -> List[NewsJobConfig]:
+        """Get all enabled job configurations"""
+        query = "SELECT * FROM news_job_configs WHERE enabled = true"
+        async with self._get_connection() as conn:
+            rows = await conn.fetch(query)
+            return [NewsJobConfig.from_record(dict(row)) for row in rows]
+```
+
+**Files to Modify**:
+- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/repositories/news_repository.py`
+
+**Test Requirements**:
+- Vector similarity search tests with mock data
+- Batch operation performance tests
+- Job config CRUD tests
+- Database connection pooling tests
+
+---
+
+### Phase 3: LLM Integration
+
+#### T005: OpenRouter Client - Sentiment Analysis
+**Priority**: Critical | **Duration**: 2-3 hours | **Dependencies**: T002
+
+**Description**: Implement OpenRouter client for LLM sentiment analysis
+
+**Acceptance Criteria**:
+- [ ] OpenRouter API integration for sentiment analysis
+- [ ] Structured prompts for financial news sentiment
+- [ ] Response parsing with Pydantic models
+- [ ] Error handling 
with graceful fallbacks +- [ ] Retry logic with exponential backoff + +**Implementation Details**: +```python +class OpenRouterSentimentClient: + def __init__(self, config: TradingAgentsConfig): + self.api_key = config.openrouter_api_key + self.model = config.quick_think_llm + self.base_url = "https://openrouter.ai/api/v1" + + async def analyze_sentiment(self, title: str, content: str) -> SentimentResult: + """Analyze sentiment of news article""" + prompt = f""" + Analyze the sentiment of this financial news article: + + Title: {title} + Content: {content[:1000]}... + + Provide sentiment analysis as JSON: + {{ + "score": float between -1.0 (very negative) and 1.0 (very positive), + "confidence": float between 0.0 and 1.0, + "label": "positive" | "negative" | "neutral", + "reasoning": "brief explanation" + }} + """ + + try: + async with aiohttp.ClientSession() as session: + response = await self._make_request(session, prompt) + return self._parse_sentiment_response(response) + except Exception as e: + logger.warning(f"LLM sentiment analysis failed: {e}") + return self._fallback_sentiment(title, content) + + def _fallback_sentiment(self, title: str, content: str) -> SentimentResult: + """Keyword-based fallback sentiment analysis""" + # Simple keyword-based sentiment as fallback + positive_words = ["gain", "profit", "up", "growth", "buy"] + negative_words = ["loss", "down", "decline", "sell", "drop"] + + text = (title + " " + content).lower() + pos_count = sum(word in text for word in positive_words) + neg_count = sum(word in text for word in negative_words) + + if pos_count > neg_count: + return SentimentResult(score=0.3, confidence=0.5, label="positive") + elif neg_count > pos_count: + return SentimentResult(score=-0.3, confidence=0.5, label="negative") + else: + return SentimentResult(score=0.0, confidence=0.5, label="neutral") +``` + +**Files to Create**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/clients/openrouter_sentiment_client.py` + +**Test Requirements**: +- Sentiment analysis API tests with VCR +- Error handling tests +- Response parsing tests +- Fallback mechanism tests + +--- + +#### T006: OpenRouter Client - Vector Embeddings +**Priority**: Critical | **Duration**: 1-2 hours | **Dependencies**: T002 + +**Description**: Implement OpenRouter client for vector embeddings generation + +**Acceptance Criteria**: +- [ ] OpenRouter embeddings API integration +- [ ] Text preprocessing for embedding generation +- [ ] Batch processing for multiple articles +- [ ] 1536-dimensional vector validation +- [ ] Proper error handling and retries + +**Implementation Details**: +```python +class OpenRouterEmbeddingsClient: + def __init__(self, config: TradingAgentsConfig): + self.api_key = config.openrouter_api_key + self.model = "openai/text-embedding-ada-002" # Via OpenRouter + self.base_url = "https://openrouter.ai/api/v1" + + async def generate_embeddings(self, texts: List[str]) -> List[List[float]]: + """Generate embeddings for multiple texts""" + if not texts: + return [] + + try: + async with aiohttp.ClientSession() as session: + response = await self._make_embeddings_request(session, texts) + embeddings = self._parse_embeddings_response(response) + + # Validate dimensions + for i, embedding in enumerate(embeddings): + if len(embedding) != 1536: + raise ValueError(f"Invalid embedding dimension at index {i}: {len(embedding)}") + + return embeddings + except Exception as e: + logger.error(f"Embeddings generation failed: {e}") + # Return zero vectors as fallback + 
return [[0.0] * 1536 for _ in texts] + + async def generate_article_embeddings(self, article: NewsArticle) -> Tuple[List[float], List[float]]: + """Generate embeddings for article title and content""" + texts = [] + + # Prepare texts for embedding + if article.title: + texts.append(self._preprocess_text(article.title)) + if article.summary: + # Combine title and summary for comprehensive embedding + combined_text = f"{article.title} {article.summary}" + texts.append(self._preprocess_text(combined_text)) + + if not texts: + return [0.0] * 1536, [0.0] * 1536 + + embeddings = await self.generate_embeddings(texts) + title_embedding = embeddings[0] if len(embeddings) > 0 else [0.0] * 1536 + content_embedding = embeddings[1] if len(embeddings) > 1 else [0.0] * 1536 + + return title_embedding, content_embedding + + def _preprocess_text(self, text: str) -> str: + """Preprocess text for optimal embedding generation""" + # Remove extra whitespace and limit length + cleaned = " ".join(text.split()) + return cleaned[:8000] # OpenAI embedding limit +``` + +**Files to Create**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/clients/openrouter_embeddings_client.py` + +**Test Requirements**: +- Embeddings API tests with VCR +- Batch processing tests +- Vector dimension validation tests +- Text preprocessing tests + +--- + +#### T007: Enhance NewsService - LLM Integration +**Priority**: Critical | **Duration**: 2-3 hours | **Dependencies**: T005, T006 + +**Description**: Integrate OpenRouter LLM clients into NewsService workflow + +**Acceptance Criteria**: +- [ ] Replace keyword sentiment with LLM analysis +- [ ] Add embedding generation to article processing +- [ ] End-to-end article processing pipeline +- [ ] Proper error handling and fallback strategies +- [ ] Integration with existing service methods + +**Implementation Details**: +```python +class NewsService: + def __init__(self, + repository: NewsRepository, + config: TradingAgentsConfig): + self.repository = repository + self.config = config + self.sentiment_client = OpenRouterSentimentClient(config) + self.embeddings_client = OpenRouterEmbeddingsClient(config) + + async def process_articles_with_llm(self, articles: List[NewsArticle]) -> List[NewsArticle]: + """Process articles with LLM sentiment analysis and embeddings""" + processed_articles = [] + + for article in articles: + try: + # Generate sentiment analysis + sentiment_result = await self.sentiment_client.analyze_sentiment( + article.title, article.summary or "" + ) + + # Generate embeddings + title_embedding, content_embedding = await self.embeddings_client.generate_article_embeddings(article) + + # Update article with LLM results + article.sentiment_score = sentiment_result.score + article.sentiment_confidence = sentiment_result.confidence + article.sentiment_label = sentiment_result.label + article.title_embedding = title_embedding + article.content_embedding = content_embedding + + processed_articles.append(article) + + except Exception as e: + logger.warning(f"Failed to process article {article.id}: {e}") + # Add article without LLM processing + processed_articles.append(article) + + return processed_articles + + async def collect_and_process_news(self, symbols: List[str]) -> List[NewsArticle]: + """Complete pipeline: collect → process → store with LLM analysis""" + # Collect raw articles (existing functionality) + raw_articles = await self.collect_news_articles(symbols) + + # Process with LLM + processed_articles = await 
self.process_articles_with_llm(raw_articles) + + # Store processed articles + stored_articles = [] + for article in processed_articles: + stored_article = await self.repository.create_article(article) + stored_articles.append(stored_article) + + # Batch update embeddings for efficiency + articles_with_embeddings = [a for a in stored_articles + if a.title_embedding or a.content_embedding] + if articles_with_embeddings: + await self.repository.batch_update_embeddings(articles_with_embeddings) + + return stored_articles +``` + +**Files to Modify**: +- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/services/news_service.py` + +**Test Requirements**: +- Integration tests with mocked LLM clients +- Article processing pipeline tests +- Error handling and fallback tests +- Performance tests for batch operations + +--- + +### Phase 4: Scheduling + +#### T008: APScheduler Integration - Job Scheduling +**Priority**: High | **Duration**: 3-4 hours | **Dependencies**: T003, T004, T007 + +**Description**: Implement scheduled news collection using APScheduler + +**Acceptance Criteria**: +- [ ] APScheduler setup with PostgreSQL job store +- [ ] Scheduled job execution with proper error handling +- [ ] Job configuration loading and validation +- [ ] Status monitoring and failure recovery +- [ ] CLI integration for job management + +**Implementation Details**: +```python +class ScheduledNewsCollector: + def __init__(self, + news_service: NewsService, + repository: NewsRepository, + config: TradingAgentsConfig): + self.news_service = news_service + self.repository = repository + self.config = config + self.scheduler = None + + async def initialize_scheduler(self): + """Initialize APScheduler with PostgreSQL job store""" + from apscheduler.schedulers.asyncio import AsyncIOScheduler + from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore + + jobstore = SQLAlchemyJobStore(url=self.config.database_url, + tablename='apscheduler_jobs') + + self.scheduler = AsyncIOScheduler() + self.scheduler.add_jobstore(jobstore, 'default') + + async def load_job_configurations(self): + """Load and schedule all active job configurations""" + job_configs = await self.repository.get_active_job_configs() + + for config in job_configs: + try: + await self._schedule_job(config) + except Exception as e: + logger.error(f"Failed to schedule job {config.name}: {e}") + + async def _schedule_job(self, job_config: NewsJobConfig): + """Schedule a single job configuration""" + job_id = f"news_collection_{job_config.id}" + + # Remove existing job if present + if self.scheduler.get_job(job_id): + self.scheduler.remove_job(job_id) + + # Add new job + from apscheduler.triggers.cron import CronTrigger + trigger = CronTrigger.from_crontab(job_config.frequency_cron) + + self.scheduler.add_job( + self._execute_news_collection, + trigger=trigger, + id=job_id, + args=[job_config], + name=f"News collection: {job_config.name}", + replace_existing=True + ) + + async def _execute_news_collection(self, job_config: NewsJobConfig): + """Execute news collection for a job configuration""" + try: + logger.info(f"Starting news collection job: {job_config.name}") + + # Collect and process news + articles = await self.news_service.collect_and_process_news(job_config.symbols) + + # Update job last run timestamp + job_config.last_run = datetime.now(timezone.utc) + await self.repository.update_job_config(job_config) + + logger.info(f"Completed news collection job: {job_config.name}, " + f"collected {len(articles)} articles") + + except 
Exception as e:
+            logger.error(f"News collection job failed: {job_config.name}, error: {e}")
+            # Could implement notification/alerting here
+
+    async def start_scheduler(self):
+        """Start the scheduler"""
+        if not self.scheduler:
+            await self.initialize_scheduler()
+
+        await self.load_job_configurations()
+        self.scheduler.start()
+        logger.info("News collection scheduler started")
+
+    async def stop_scheduler(self):
+        """Stop the scheduler"""
+        if self.scheduler:
+            self.scheduler.shutdown(wait=True)
+            logger.info("News collection scheduler stopped")
+```
+
+**Files to Create**:
+- `/Users/martinrichards/code/TradingAgents/tradingagents/domains/news/services/scheduled_news_collector.py`
+
+**Test Requirements**:
+- Job scheduling tests with test scheduler
+- Job execution tests with mocked dependencies
+- Error handling and retry tests
+- Job configuration validation tests
+
+---
+
+#### T009: CLI Integration - Job Management Commands
+**Priority**: Medium | **Duration**: 1-2 hours | **Dependencies**: T008
+
+**Description**: Add CLI commands for news job management and manual execution
+
+**Acceptance Criteria**:
+- [ ] CLI commands for job creation/management
+- [ ] Manual job execution commands
+- [ ] Job status and monitoring commands
+- [ ] Integration with existing CLI structure
+- [ ] Proper error handling and user feedback
+
+**Implementation Details**:
+```python
+# Add to cli/commands/news_commands.py
+# Note: click callbacks are synchronous, so each command drives its async body
+# with asyncio.run() rather than being declared `async def` directly.
+import asyncio
+from uuid import UUID
+
+import click
+
+@click.group()
+def news():
+    """News domain management commands"""
+    pass
+
+@news.group()
+def job():
+    """Job management commands"""
+    pass
+
+@job.command()
+@click.option('--name', required=True, help='Job name')
+@click.option('--symbols', required=True, help='Comma-separated stock symbols')
+@click.option('--frequency', required=True, help='Cron expression or simple frequency')
+@click.option('--categories', help='Comma-separated news categories')
+def create(name: str, symbols: str, frequency: str, categories: str):
+    """Create a new news collection job"""
+    async def _create() -> None:
+        try:
+            symbol_list = [s.strip().upper() for s in symbols.split(',')]
+            category_list = [c.strip() for c in categories.split(',')] if categories else []
+
+            config = NewsJobConfig(
+                name=name,
+                symbols=symbol_list,
+                categories=category_list,
+                frequency_cron=frequency,
+                enabled=True
+            )
+
+            # Validate configuration
+            errors = config.validate()
+            if errors:
+                click.echo(f"❌ Invalid configuration: {errors}")
+                return
+
+            # Create job
+            repository = NewsRepository(get_database_config())
+            created_config = await repository.create_job_config(config)
+
+            click.echo(f"✅ Created job: {created_config.name} (ID: {created_config.id})")
+
+        except Exception as e:
+            click.echo(f"❌ Failed to create job: {e}")
+
+    asyncio.run(_create())
+
+@job.command(name='list')
+def list_jobs():
+    """List all job configurations"""
+    # Function named list_jobs so it does not shadow the builtin `list`
+    async def _list() -> None:
+        try:
+            repository = NewsRepository(get_database_config())
+            configs = await repository.get_all_job_configs()
+
+            if not configs:
+                click.echo("No jobs configured")
+                return
+
+            click.echo("\n📋 News Collection Jobs:")
+            click.echo("=" * 60)
+
+            for config in configs:
+                status = "🟢 Enabled" if config.enabled else "🔴 Disabled"
+                last_run = config.last_run.strftime("%Y-%m-%d %H:%M") if config.last_run else "Never"
+
+                click.echo(f"{config.name}")
+                click.echo(f"  Status: {status}")
+                click.echo(f"  Symbols: {', '.join(config.symbols)}")
+                click.echo(f"  Schedule: {config.frequency_cron}")
+                click.echo(f"  Last Run: {last_run}")
+                click.echo()
+
+        except Exception as e:
+            click.echo(f"❌ Failed to list jobs: {e}")
+
+    asyncio.run(_list())
+
+@job.command()
+@click.argument('job_id', type=str)
+def run(job_id: str):
+    """Manually execute a job"""
+    async def _run() -> None:
+        try:
+            repository = NewsRepository(get_database_config())
+            config = await repository.get_job_config(UUID(job_id))
+
+            if not config:
+                click.echo(f"❌ Job not found: {job_id}")
+                return
+
+            click.echo(f"🚀 Running job: {config.name}")
+
+            # Execute job
+            service = NewsService(repository, get_trading_config())
+            articles = await service.collect_and_process_news(config.symbols)
+
+            click.echo(f"✅ Completed: collected {len(articles)} articles")
+
+        except Exception as e:
+            click.echo(f"❌ Job execution failed: {e}")
+
+    asyncio.run(_run())
+```
+
+**Files to Modify**:
+- `/Users/martinrichards/code/TradingAgents/cli/commands/news_commands.py`
+
+**Test Requirements**:
+- CLI command tests with mocked services
+- User input validation tests
+- Output formatting tests
+
+---
+
+### Phase 5: Validation
+
+#### T010: Integration Tests - End-to-End Workflow
+**Priority**: High | **Duration**: 2-3 hours | **Dependencies**: T007, T008
+
+**Description**: Comprehensive integration tests for complete news domain workflow
+
+**Acceptance Criteria**:
+- [ ] End-to-end workflow tests from RSS to vector storage
+- [ ] Agent integration tests via AgentToolkit
+- [ ] Performance tests for daily collection volumes
+- [ ] Error recovery and fallback tests
+- [ ] Test coverage maintained above 85%
+
+**Implementation Details**:
+```python
+# tests/domains/news/integration/test_news_workflow.py
+class TestNewsWorkflowIntegration:
+
+    @pytest.mark.asyncio
+    async def test_complete_news_processing_pipeline(self, test_db, mock_openrouter):
+        """Test complete pipeline from RSS to vector storage"""
+        # Setup
+        config = TradingAgentsConfig.from_test_config()
+        repository = NewsRepository(test_db)
+        service = NewsService(repository, config)
+
+        # Mock OpenRouter responses
+        mock_openrouter.sentiment_response = {
+            "score": 0.7,
+            "confidence": 0.85,
+            "label": "positive"
+        }
+        mock_openrouter.embeddings_response = [[0.1] * 1536]
+
+        # Execute pipeline
+        articles = await service.collect_and_process_news(["AAPL"])
+
+        # Verify results
+        assert len(articles) > 0
+        assert all(a.sentiment_score is not None for a in articles)
+        assert all(a.title_embedding is not None for a in articles)
+
+        # Verify database storage
+        stored_articles = await repository.get_articles_by_symbol("AAPL")
+        assert len(stored_articles) == len(articles)
+
+        # Test vector similarity search
+        similar = await repository.find_similar_articles(
+            articles[0].title_embedding, limit=5
+        )
+        assert len(similar) > 0
+
+    @pytest.mark.asyncio
+    async def test_agent_toolkit_integration(self, test_db):
+        """Test integration with AgentToolkit for RAG queries"""
+        from tradingagents.agents.libs.toolkit import AgentToolkit
+
+        # Setup with real data
+        toolkit = AgentToolkit(test_db)
+
+        # Test news context retrieval
+        context = await toolkit.get_news_context("AAPL", days=7)
+        assert "articles" in context
+        assert "sentiment_summary" in context
+
+        # Test vector similarity for context
+        similar_context = await toolkit.get_similar_news(
+            "Apple earnings beat expectations", limit=5
+        )
+        assert len(similar_context) <= 5
+
+    @pytest.mark.asyncio
+    async def test_scheduler_integration(self, test_db):
+        """Test APScheduler integration with job management"""
+        config = TradingAgentsConfig.from_test_config()
+        repository = NewsRepository(test_db)
+        service = NewsService(repository, config)
+        scheduler = ScheduledNewsCollector(service, repository, config)
+
+        # Create test job configuration
+        
job_config = NewsJobConfig( + name="test_job", + symbols=["AAPL"], + frequency_cron="0 */6 * * *", # Every 6 hours + enabled=True + ) + await repository.create_job_config(job_config) + + # Test scheduler initialization + await scheduler.initialize_scheduler() + await scheduler.load_job_configurations() + + # Verify job was scheduled + assert scheduler.scheduler.get_job(f"news_collection_{job_config.id}") is not None + + # Test manual job execution + await scheduler._execute_news_collection(job_config) + + # Verify execution updated last_run + updated_config = await repository.get_job_config(job_config.id) + assert updated_config.last_run is not None + + @pytest.mark.asyncio + async def test_error_recovery_and_fallbacks(self, test_db): + """Test error handling and fallback mechanisms""" + config = TradingAgentsConfig.from_test_config() + repository = NewsRepository(test_db) + service = NewsService(repository, config) + + # Test with failing LLM client + with patch.object(service.sentiment_client, 'analyze_sentiment', side_effect=Exception("API Error")): + articles = await service.collect_and_process_news(["AAPL"]) + + # Should still process articles with fallback + assert len(articles) > 0 + # Should have fallback sentiment values + assert any(a.sentiment_score is not None for a in articles) + + @pytest.mark.asyncio + async def test_performance_benchmarks(self, test_db): + """Test performance meets requirements""" + config = TradingAgentsConfig.from_test_config() + repository = NewsRepository(test_db) + + # Create test articles with embeddings + test_articles = await self._create_test_articles_with_embeddings(repository, count=1000) + + # Test query performance (< 100ms requirement) + start_time = time.time() + articles = await repository.get_recent_articles_by_symbol("AAPL", days=30) + query_time = (time.time() - start_time) * 1000 + + assert query_time < 100, f"Query took {query_time}ms, should be < 100ms" + + # Test vector similarity performance (< 1s requirement) + start_time = time.time() + similar = await repository.find_similar_articles( + test_articles[0].title_embedding, limit=10 + ) + vector_time = (time.time() - start_time) * 1000 + + assert vector_time < 1000, f"Vector search took {vector_time}ms, should be < 1s" +``` + +**Files to Create**: +- `/Users/martinrichards/code/TradingAgents/tests/domains/news/integration/test_news_workflow.py` + +**Test Requirements**: +- Full workflow integration tests +- AgentToolkit integration tests +- Performance benchmark tests +- Error scenario tests + +--- + +#### T011: Documentation and Monitoring +**Priority**: Medium | **Duration**: 1-2 hours | **Dependencies**: T010 + +**Description**: Update documentation and add monitoring for new functionality + +**Acceptance Criteria**: +- [ ] Updated API documentation for new methods +- [ ] Job scheduling configuration examples +- [ ] Performance monitoring dashboard queries +- [ ] Troubleshooting guide for common issues +- [ ] Agent integration documentation + +**Files to Modify**: +- `/Users/martinrichards/code/TradingAgents/docs/domains/news.md` +- `/Users/martinrichards/code/TradingAgents/docs/api-reference.md` + +**Test Requirements**: +- Documentation accuracy validation +- Configuration example testing + +--- + +## Parallel Development Opportunities + +### AI Agent Collaboration Points + +**Tasks T005 & T006** can be developed in parallel: +- Both are independent OpenRouter client implementations +- Different LLM capabilities (sentiment vs embeddings) +- Can be tested independently with VCR 
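cassettes.
+
+For example, a minimal pytest-vcr shape that either client's tests can follow; the `sentiment_client` fixture is an assumed name, not an existing fixture:
+
+```python
+import pytest
+
+
+@pytest.mark.vcr()  # records the HTTP exchange once, then replays it
+@pytest.mark.asyncio
+async def test_sentiment_client_happy_path(sentiment_client):
+    # The first run hits OpenRouter and writes a cassette; later runs replay it,
+    # so the test is deterministic and needs no live API key.
+    result = await sentiment_client.analyze_sentiment("AAPL beats earnings", "")
+    assert result.label in {"positive", "negative", "neutral"}
+    assert 0.0 <= result.confidence <= 1.0
+```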
+
+**Phase 1 Tasks (T001, T002, T003)** have minimal dependencies:
+- T002 and T003 both depend on T001 but can be developed simultaneously
+- Entity layer changes are independent of each other
+
+### Critical Path Analysis
+
+**Critical Path**: T001 → T002/T003 → T004 → T005/T006 → T007 → T008
+
+**Parallel Opportunities**:
+1. **Foundation Phase**: T002 + T003 (after T001)
+2. **LLM Integration**: T005 + T006 (after T002)
+3. **Testing**: Unit tests alongside implementation
+
+### Risk Mitigation Strategies
+
+**LLM API Dependencies**:
+- Implement comprehensive fallback strategies
+- Use VCR for deterministic testing
+- Mock clients for unit tests
+
+**Database Performance**:
+- Test with realistic data volumes
+- Monitor query performance during development
+- Use proper indexes for vector operations
+
+**Integration Complexity**:
+- Build incrementally with testing at each step
+- Maintain backward compatibility
+- Use feature flags for gradual rollout
+
+---
+
+## Success Metrics
+
+**Technical Metrics**:
+- Test coverage >85% maintained
+- Query performance <100ms
+- Vector search performance <1s
+- Zero breaking changes to AgentToolkit
+
+**Functional Metrics**:
+- Successful OpenRouter-only LLM integration
+- Scheduled jobs executing reliably
+- Agent context enriched with sentiment and similarity
+
+**Quality Metrics**:
+- All acceptance criteria met
+- Comprehensive error handling
+- Production-ready monitoring and documentation
+
+---
+
+## Implementation Guidelines
+
+### TDD Approach
+**Every task follows**: Write test → Write code → Refactor
+
+### Layered Architecture Pattern
+**Strict adherence to**: Database → Entity → Repository → Service → Scheduling
+
+### Error Handling Strategy
+**Graceful fallbacks** for all LLM API dependencies
+
+### Performance Requirements
+**Async operations** with proper connection pooling throughout
+
+### Testing Strategy
+**Unit tests + Integration tests + VCR** for external API calls
+
+---
+
+This comprehensive task breakdown provides clear implementation guidance for completing the final 5% of the news domain while maintaining architectural consistency and leveraging AI-assisted development patterns.
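+
+As one way to satisfy the pooling guideline above, an asyncpg sketch; the DSN handling, pool sizes, and table/column names are placeholders rather than the project's actual schema:
+
+```python
+import asyncpg
+
+
+async def create_pool(dsn: str) -> asyncpg.Pool:
+    # One shared pool per process; repositories borrow connections per query.
+    return await asyncpg.create_pool(dsn, min_size=2, max_size=10)
+
+
+async def fetch_recent_titles(pool: asyncpg.Pool, symbol: str) -> list[str]:
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(
+            "SELECT title FROM news_articles "
+            "WHERE symbol = $1 AND created_at > now() - interval '30 days'",
+            symbol,
+        )
+    return [r["title"] for r in rows]
+```
+
+A single pool created at startup and shared across repositories keeps connection counts bounded even when scheduled jobs and agent queries run concurrently.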
\ No newline at end of file diff --git a/docs/specs/socialmedia/context.json b/docs/specs/socialmedia/context.json new file mode 100644 index 00000000..23468340 --- /dev/null +++ b/docs/specs/socialmedia/context.json @@ -0,0 +1,70 @@ +{ + "product_vision": "Multi-agent LLM financial trading framework that mirrors real-world trading firm dynamics for research-based market analysis and trading decisions with PostgreSQL + TimescaleDB + pgvectorscale architecture", + "existing_features": [ + "news_domain_95_complete", + "social_media_domain_stub_only", + "postgresql_timescaledb_stack", + "agent_toolkit_rag_integration", + "openrouter_llm_provider", + "reddit_client_empty_stub", + "social_repository_file_based" + ], + "architecture": { + "layer_pattern": "Router → Service → Repository → Entity → Database", + "database": "PostgreSQL + TimescaleDB + pgvectorscale", + "llm_provider": "OpenRouter unified interface", + "agent_orchestration": "LangGraph workflows", + "data_pipeline": "APScheduler/Dagster (planned, not implemented)", + "domain_structure": "news (95% complete), marketdata (planned), socialmedia (stub only)", + "testing_strategy": "Domain-specific: mocks for services, real DB for repositories, pytest-vcr for HTTP" + }, + "socialmedia_implementation_status": { + "current_components": { + "SocialMediaService": "Stub implementation with empty methods", + "SocialRepository": "File-based JSON storage with deduplication", + "RedditClient": "Empty stub class - needs full implementation", + "Data Models": "Basic SocialPost, PostData, SocialContext models exist" + }, + "missing_components": { + "PostgreSQL_migration": "Current file storage needs database migration", + "Reddit_API_integration": "RedditClient is empty - needs PRAW implementation", + "LLM_sentiment_analysis": "No sentiment analysis for social posts", + "Vector_embeddings": "No embedding generation or similarity search", + "Agent_toolkit_methods": "get_reddit_news and get_reddit_stock_info missing", + "Scheduled_execution": "No daily data collection pipeline" + }, + "implementation_gaps": [ + "SocialRepository uses file storage instead of PostgreSQL", + "No SQLAlchemy entity for social posts with vector support", + "RedditClient has no API integration code", + "No LLM integration for sentiment analysis", + "Agent toolkit missing social media methods", + "No scheduled execution framework" + ] + }, + "reference_patterns": { + "news_domain_success": { + "NewsService": "95% complete business logic orchestration", + "NewsRepository": "Async PostgreSQL with vector embeddings", + "GoogleNewsClient": "RSS feed integration with error handling", + "Agent_integration": "RAG-powered context via AgentToolkit" + }, + "database_patterns": "Async PostgreSQL with TimescaleDB optimization and pgvectorscale", + "llm_integration": "OpenRouter unified provider with two-tier model strategy", + "testing_approach": "pytest-vcr for HTTP, real DB for repositories, mocks for services" + }, + "technical_dependencies": { + "external": [ + "PRAW (Python Reddit API Wrapper) for Reddit data access", + "OpenRouter API for LLM sentiment analysis", + "PostgreSQL with pgvectorscale for embeddings", + "APScheduler or Dagster for scheduled execution" + ], + "internal": [ + "Existing database infrastructure from news domain", + "OpenRouter configuration in TradingAgentsConfig", + "DatabaseManager for connection management", + "AgentToolkit patterns for RAG integration" + ] + } +} \ No newline at end of file diff --git a/docs/specs/socialmedia/design.json 
b/docs/specs/socialmedia/design.json new file mode 100644 index 00000000..e3c78cd9 --- /dev/null +++ b/docs/specs/socialmedia/design.json @@ -0,0 +1,567 @@ +{ + "requirements": { + "entities": { + "SocialPost": "Core domain entity for Reddit posts with sentiment and engagement data", + "SocialMediaPostEntity": "New SQLAlchemy entity for PostgreSQL storage with vector embeddings" + }, + "data_persistence": { + "migration_required": "File-based JSON storage to PostgreSQL + TimescaleDB + pgvectorscale", + "schema": "social_media_posts table with vector embeddings, sentiment fields, and TimescaleDB optimization", + "deduplication": "Reddit post_id unique constraint prevents duplicates" + }, + "api_needed": { + "external_apis": [ + "PRAW (Python Reddit API Wrapper) for Reddit data collection", + "OpenRouter API for LLM sentiment analysis and embeddings" + ], + "internal_apis": [ + "AgentToolkit methods: get_reddit_news, get_reddit_stock_info", + "SocialMediaService orchestration methods", + "SocialRepository PostgreSQL operations" + ] + }, + "components": { + "reddit_client": "Complete PRAW implementation (currently empty stub)", + "repository": "PostgreSQL migration from file storage", + "service": "Business logic with LLM integration", + "agent_toolkit": "RAG methods for AI agents", + "dagster_pipeline": "Scheduled daily collection" + }, + "domains": { + "primary": "socialmedia (complete greenfield implementation)", + "integration": "Follows news domain patterns for consistency" + }, + "business_rules": [ + "Daily collection from financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis)", + "OpenRouter LLM sentiment analysis with structured scoring", + "Vector embeddings for semantic similarity search", + "Post deduplication by Reddit post_id", + "90-day data retention policy", + "Rate limiting compliance with Reddit API", + "Best effort processing for API failures" + ] + }, + "technical_needs": { + "domain_model": { + "entities": { + "SocialPost": { + "purpose": "Domain entity managing business rules and data transformations", + "responsibilities": [ + "fromRequest() - Create from Reddit API response", + "toRecord() - Transform for PostgreSQL storage", + "toResponse() - Format for agent consumption", + "validate() - Business rule validation", + "calculateSentiment() - Derived sentiment scoring", + "extractTickers() - Ticker symbol detection" + ], + "fields": [ + "post_id: str (Reddit unique ID)", + "title: str", + "content: str", + "author: str", + "subreddit: str", + "created_utc: datetime", + "upvotes: int", + "downvotes: int", + "comments_count: int", + "url: str", + "sentiment_score: float", + "sentiment_label: str", + "tickers: List[str]", + "embedding: Optional[List[float]]" + ] + }, + "SocialMediaPostEntity": { + "purpose": "SQLAlchemy entity for PostgreSQL persistence", + "table": "social_media_posts", + "hypertable": "TimescaleDB partitioned by created_utc", + "indexes": [ + "post_id (unique)", + "subreddit, created_utc", + "tickers (GIN array)", + "embedding (pgvectorscale HNSW)" + ] + } + } + }, + "persistence": { + "database_type": "PostgreSQL + TimescaleDB + pgvectorscale", + "schema_design": { + "table": "social_media_posts", + "columns": [ + "id: UUID PRIMARY KEY", + "post_id: VARCHAR(50) UNIQUE NOT NULL", + "title: TEXT", + "content: TEXT", + "author: VARCHAR(100)", + "subreddit: VARCHAR(50)", + "created_utc: TIMESTAMPTZ (hypertable partition key)", + "upvotes: INTEGER", + "downvotes: INTEGER", + "comments_count: INTEGER", + "url: TEXT", + "sentiment_score: 
FLOAT", + "sentiment_label: VARCHAR(20)", + "tickers: TEXT[] (array)", + "embedding: VECTOR(1536) (pgvectorscale)", + "inserted_at: TIMESTAMPTZ DEFAULT NOW()", + "updated_at: TIMESTAMPTZ DEFAULT NOW()" + ], + "constraints": [ + "UNIQUE(post_id)", + "CHECK(sentiment_score BETWEEN -1 AND 1)" + ] + }, + "access_patterns": [ + "Ticker-based queries: SELECT * WHERE 'AAPL' = ANY(tickers)", + "Time-range filtering: SELECT * WHERE created_utc BETWEEN ? AND ?", + "Vector similarity: SELECT * ORDER BY embedding <=> ? LIMIT 10", + "Sentiment aggregations: SELECT AVG(sentiment_score) GROUP BY subreddit" + ], + "data_volume": "~400+ posts daily, 90-day retention = ~36K posts max" + }, + "router": { + "type": "AgentToolkit Integration (No HTTP Router)", + "methods": [ + "get_reddit_news(ticker: str, days: int) -> List[SocialPost]", + "get_reddit_stock_info(ticker: str) -> Dict", + "search_similar_posts(query: str, limit: int) -> List[SocialPost]", + "get_subreddit_sentiment(subreddit: str, ticker: str) -> SentimentSummary" + ], + "dependencies": [ + "SocialMediaService for business orchestration", + "Entity transformations: SocialPost.toResponse()" + ] + }, + "events": { + "domain_events": [ + "SocialPostCollected: Published when new posts are scraped", + "SentimentAnalyzed: Published after LLM sentiment analysis", + "EmbeddingGenerated: Published after vector embedding creation" + ], + "integration_events": [ + "MarketDataRequested: Subscribe to ticker validation events", + "TradingDecisionMade: Consume for social sentiment correlation" + ] + }, + "dependencies": { + "external_services": [ + "Reddit API (PRAW): Post collection and metadata", + "OpenRouter API: Sentiment analysis and embeddings", + "PostgreSQL: Data persistence and queries", + "TimescaleDB: Time-series optimization", + "pgvectorscale: Vector similarity search" + ], + "internal_services": [ + "None (greenfield implementation)" + ], + "required_by": [ + "AI agents: Social sentiment context for trading decisions", + "Multi-agent workflows: RAG-powered social media analysis", + "Risk management: Social sentiment risk factors" + ], + "component_order": [ + "1. SocialMediaPostEntity (database schema)", + "2. SocialPost (domain entity with transformations)", + "3. RedditClient (PRAW implementation)", + "4. SocialRepository (PostgreSQL operations)", + "5. SocialMediaService (business orchestration + LLM)", + "6. AgentToolkit methods (RAG integration)", + "7. 
Dagster pipeline (scheduled collection)" + ] + } + }, + "design": { + "architecture_overview": { + "pattern": "Event-driven microservice with layered internal architecture", + "data_flow": "Dagster Pipeline → RedditClient → SocialMediaService → SocialRepository → PostgreSQL + pgvectorscale", + "agent_flow": "AgentToolkit → SocialMediaService → SocialRepository → Vector Similarity Search + Sentiment Aggregation", + "key_principles": [ + "Leverage news domain patterns for consistency", + "OpenRouter unified LLM provider", + "Best-effort processing for API failures", + "Vector-enhanced semantic search", + "Rate limiting compliance with Reddit API", + "Complete greenfield implementation from empty stubs" + ] + }, + "domain_model": { + "SentimentScore": { + "purpose": "Structured sentiment analysis result from OpenRouter LLM", + "fields": { + "sentiment": "Literal['positive', 'negative', 'neutral']", + "confidence": "float (0.0-1.0)", + "reasoning": "str (brief explanation)" + }, + "validation": [ + "confidence >= 0.5 for reliable sentiment", + "reasoning must be non-empty" + ] + }, + "SocialPost": { + "purpose": "Core domain entity with business rules and transformations", + "base_fields": { + "post_id": "str (Reddit unique ID, e.g., 't3_abc123')", + "title": "str", + "content": "Optional[str] (selftext for text posts)", + "author": "str", + "subreddit": "str", + "created_utc": "datetime", + "upvotes": "int (score)", + "downvotes": "int (calculated from score + upvote_ratio)", + "comments_count": "int (num_comments)", + "url": "str (permalink or external URL)" + }, + "enhanced_fields": { + "sentiment_score": "Optional[SentimentScore]", + "tickers": "List[str] (extracted ticker symbols)", + "title_embedding": "Optional[List[float]] (1536 dimensions)", + "content_embedding": "Optional[List[float]] (1536 dimensions)" + }, + "methods": { + "from_praw_submission": "Create from PRAW Submission object", + "to_entity": "Transform to SocialMediaPostEntity for database storage", + "from_entity": "Create from database entity", + "validate": "Business rule validation", + "extract_tickers": "Extract stock symbols from title and content", + "has_reliable_sentiment": "Check if sentiment confidence >= 0.5", + "to_response": "Format for agent consumption" + }, + "validation_rules": [ + "post_id must match Reddit format (starts with 't3_')", + "title cannot be empty", + "created_utc cannot be in future", + "sentiment_score confidence must be 0.0-1.0", + "embeddings must be 1536 dimensions if present", + "subreddit must be in allowed financial subreddits" + ] + }, + "SocialJobConfig": { + "purpose": "Configuration for scheduled Reddit collection", + "fields": { + "subreddits": "List[str] (financial subreddits to monitor)", + "schedule_times": "List[str] (cron expressions for collection)", + "sentiment_model": "str (OpenRouter model for sentiment)", + "embedding_model": "str (OpenRouter model for embeddings)", + "max_posts_per_subreddit": "int (limit per collection run)", + "lookback_hours": "int (how far back to collect)", + "min_score": "int (minimum upvotes threshold)", + "rate_limit_delay": "float (seconds between API calls)" + }, + "defaults": { + "subreddits": "['wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis']", + "schedule_times": "['0 6 * * *', '0 18 * * *']", + "sentiment_model": "anthropic/claude-3.5-haiku", + "embedding_model": "text-embedding-3-large", + "max_posts_per_subreddit": 50, + "lookback_hours": 12, + "min_score": 10, + "rate_limit_delay": 1.0 + } + } + }, + "data_persistence": { 
+ "database_schema": { + "table_definition": "CREATE TABLE social_media_posts (\n id UUID PRIMARY KEY DEFAULT uuid7(),\n post_id VARCHAR(50) UNIQUE NOT NULL,\n title TEXT NOT NULL,\n content TEXT,\n author VARCHAR(100) NOT NULL,\n subreddit VARCHAR(50) NOT NULL,\n created_utc TIMESTAMPTZ NOT NULL,\n upvotes INTEGER NOT NULL DEFAULT 0,\n downvotes INTEGER NOT NULL DEFAULT 0,\n comments_count INTEGER NOT NULL DEFAULT 0,\n url TEXT NOT NULL,\n sentiment_score JSONB,\n sentiment_label VARCHAR(20),\n tickers TEXT[] DEFAULT '{}',\n title_embedding VECTOR(1536),\n content_embedding VECTOR(1536),\n inserted_at TIMESTAMPTZ DEFAULT NOW(),\n updated_at TIMESTAMPTZ DEFAULT NOW()\n);", + "hypertable": "SELECT create_hypertable('social_media_posts', 'created_utc', chunk_time_interval => INTERVAL '1 day');", + "indexes": [ + "CREATE UNIQUE INDEX idx_social_posts_post_id ON social_media_posts (post_id);", + "CREATE INDEX idx_social_posts_subreddit_time ON social_media_posts (subreddit, created_utc DESC);", + "CREATE INDEX idx_social_posts_tickers_gin ON social_media_posts USING GIN (tickers);", + "CREATE INDEX idx_social_posts_title_embedding ON social_media_posts USING vectors (title_embedding vector_cosine_ops);", + "CREATE INDEX idx_social_posts_content_embedding ON social_media_posts USING vectors (content_embedding vector_cosine_ops);", + "CREATE INDEX idx_social_posts_sentiment ON social_media_posts (((sentiment_score->>'sentiment'))) WHERE sentiment_score IS NOT NULL;" + ], + "constraints": [ + "ALTER TABLE social_media_posts ADD CONSTRAINT chk_sentiment_score CHECK (sentiment_score IS NULL OR ((sentiment_score->>'confidence')::float BETWEEN 0 AND 1));", + "ALTER TABLE social_media_posts ADD CONSTRAINT chk_created_utc CHECK (created_utc <= NOW());" + ] + }, + "repository_methods": { + "find_by_ticker": "async def find_by_ticker(self, ticker: str, days: int = 30, limit: int = 50) -> List[SocialPost]", + "find_by_subreddit": "async def find_by_subreddit(self, subreddit: str, hours: int = 24, limit: int = 100) -> List[SocialPost]", + "find_similar_posts": "async def find_similar_posts(self, query_embedding: List[float], ticker: Optional[str] = None, limit: int = 10) -> List[SocialPost]", + "get_sentiment_summary": "async def get_sentiment_summary(self, ticker: str, subreddit: Optional[str] = None, hours: int = 24) -> Dict[str, Any]", + "upsert_batch": "async def upsert_batch(self, posts: List[SocialPost]) -> List[SocialPost]", + "cleanup_old_posts": "async def cleanup_old_posts(self, days: int = 90) -> int" + }, + "query_optimizations": [ + "TimescaleDB hypertables for time-based partitioning", + "pgvectorscale HNSW indexes for fast vector similarity", + "GIN indexes for ticker array queries", + "Composite indexes for common access patterns", + "Materialized views for sentiment aggregations" + ] + }, + "api_specification": { + "reddit_client": { + "class": "RedditClient", + "purpose": "PRAW wrapper with rate limiting and error handling", + "configuration": { + "client_id": "Reddit app client ID", + "client_secret": "Reddit app client secret", + "user_agent": "TradingAgents/1.0 by /u/tradingagents", + "rate_limit": "1 request per second", + "timeout": "30 seconds per request" + }, + "methods": { + "fetch_subreddit_posts": "async def fetch_subreddit_posts(self, subreddit: str, limit: int = 50, time_filter: str = 'day') -> List[Dict[str, Any]]", + "search_posts": "async def search_posts(self, query: str, subreddit: Optional[str] = None, limit: int = 25) -> List[Dict[str, Any]]", + "get_post_details": 
"async def get_post_details(self, post_id: str) -> Optional[Dict[str, Any]]" + }, + "error_handling": [ + "Rate limit exceeded: Exponential backoff", + "Authentication errors: Log and continue with next subreddit", + "Network timeouts: Retry up to 3 times", + "Invalid subreddit: Skip and log warning" + ] + }, + "openrouter_client": { + "reuse": "Leverage existing OpenRouterClient from news domain", + "enhancements": [ + "Social media specific prompts for sentiment analysis", + "Batch processing for Reddit post embeddings", + "Optimized token usage for short social media text" + ], + "sentiment_prompt": "Analyze this Reddit post about stocks/finance. Consider the informal language, memes, and community context. Respond with JSON: {\"sentiment\": \"positive|negative|neutral\", \"confidence\": 0.0-1.0, \"reasoning\": \"brief explanation\"}" + } + }, + "components": { + "RedditClient": { + "layer": "External API Integration", + "responsibilities": [ + "Authenticate with Reddit API using PRAW", + "Fetch posts from financial subreddits", + "Handle rate limiting and API errors", + "Transform PRAW responses to standard format" + ], + "dependencies": [ + "PRAW library", + "Reddit API credentials", + "Async HTTP client (httpx)" + ], + "error_handling": "Best-effort with graceful degradation" + }, + "SocialRepository": { + "layer": "Data Access", + "responsibilities": [ + "PostgreSQL + TimescaleDB operations", + "Vector similarity searches using pgvectorscale", + "Batch upsert operations for performance", + "Sentiment aggregation queries" + ], + "dependencies": [ + "AsyncSession (SQLAlchemy)", + "SocialMediaPostEntity", + "Vector similarity functions" + ], + "performance_targets": [ + "Batch upsert: <5s for 1000 posts", + "Vector similarity: <1s for top 10 results", + "Ticker queries: <100ms for 30-day range" + ] + }, + "SocialMediaService": { + "layer": "Business Logic", + "responsibilities": [ + "Orchestrate Reddit data collection", + "Coordinate LLM sentiment analysis", + "Generate vector embeddings", + "Apply business rules and validation" + ], + "methods": { + "collect_subreddit_posts": "async def collect_subreddit_posts(self, config: SocialJobConfig) -> int", + "update_post_sentiment": "async def update_post_sentiment(self, posts: List[SocialPost]) -> List[SocialPost]", + "generate_embeddings": "async def generate_embeddings(self, posts: List[SocialPost]) -> List[SocialPost]", + "find_trending_tickers": "async def find_trending_tickers(self, hours: int = 24) -> List[Dict[str, Any]]" + }, + "integration_patterns": [ + "OpenRouter for sentiment and embeddings", + "Repository for data persistence", + "Event publishing for domain events" + ] + }, + "AgentToolkit": { + "layer": "Agent Integration", + "responsibilities": [ + "Provide RAG methods for AI agents", + "Format social data for agent consumption", + "Semantic search for relevant posts", + "Sentiment aggregation and analysis" + ], + "methods": { + "get_reddit_sentiment": "async def get_reddit_sentiment(self, ticker: str, days: int = 7) -> Dict[str, Any]", + "search_social_posts": "async def search_social_posts(self, query: str, ticker: Optional[str] = None) -> List[Dict[str, Any]]", + "get_trending_discussions": "async def get_trending_discussions(self, ticker: str) -> List[Dict[str, Any]]", + "get_subreddit_analysis": "async def get_subreddit_analysis(self, subreddit: str, ticker: str) -> Dict[str, Any]" + ], + "response_format": [ + "Structured JSON with post content, metadata, and sentiment", + "Data quality indicators", + "Source 
attribution and confidence scores" + ] + } + }, + "events": { + "domain_events": { + "SocialPostCollected": { + "trigger": "New Reddit post successfully stored", + "payload": { + "post_id": "str", + "subreddit": "str", + "tickers": "List[str]", + "created_utc": "datetime", + "collection_timestamp": "datetime" + } + }, + "SentimentAnalyzed": { + "trigger": "LLM sentiment analysis completed", + "payload": { + "post_id": "str", + "sentiment": "str", + "confidence": "float", + "processing_time": "float" + } + }, + "EmbeddingGenerated": { + "trigger": "Vector embedding created and stored", + "payload": { + "post_id": "str", + "embedding_type": "str (title|content)", + "dimensions": "int", + "model_used": "str" + } + } + }, + "integration_events": { + "MarketDataRequested": { + "purpose": "Validate ticker symbols against market data", + "consumption": "Subscribe to ensure social posts reference valid tickers" + }, + "TradingDecisionRequested": { + "purpose": "Provide social sentiment context for trading decisions", + "consumption": "Publish social sentiment summaries when trading decisions are being made" + } + } + }, + "dependencies": { + "external_dependencies": { + "Reddit API": { + "library": "PRAW (Python Reddit API Wrapper)", + "authentication": "OAuth2 with client credentials", + "rate_limits": "60 requests per minute per OAuth client", + "required_credentials": ["client_id", "client_secret", "user_agent"] + }, + "OpenRouter API": { + "reuse": "Existing OpenRouterClient from news domain", + "models": { + "sentiment": "anthropic/claude-3.5-haiku", + "embeddings": "text-embedding-3-large" + }, + "cost_optimization": "Batch requests and token-efficient prompts" + }, + "PostgreSQL Stack": { + "database": "PostgreSQL 16+", + "extensions": ["TimescaleDB", "pgvectorscale", "uuid-ossp"], + "connection": "AsyncSession with asyncpg driver" + } + }, + "internal_dependencies": { + "news_domain": "Reference implementation patterns for consistency", + "config_management": "TradingAgentsConfig for unified configuration", + "database_manager": "Shared DatabaseManager and session handling" + }, + "implementation_order": [ + "1. Database migration: Create social_media_posts table with TimescaleDB and vector support", + "2. SocialMediaPostEntity: SQLAlchemy entity with proper field mappings", + "3. SocialPost: Domain entity with validation and transformation methods", + "4. RedditClient: PRAW integration with rate limiting and error handling", + "5. SocialRepository: Database operations with vector similarity search", + "6. SocialMediaService: Business logic orchestration with LLM integration", + "7. AgentToolkit integration: RAG methods for AI agent consumption", + "8. 
Dagster pipeline: Scheduled collection and processing" + ] + }, + "implementation_guidance": { + "database_setup": { + "migration_script": [ + "Create social_media_posts table with all columns", + "Add TimescaleDB hypertable partitioning on created_utc", + "Create all indexes including vector similarity indexes", + "Add constraints for data validation", + "Set up retention policy for 90-day data cleanup" + ], + "seed_data": "Optional test data with sample Reddit posts for development" + }, + "reddit_integration": { + "praw_setup": [ + "Create Reddit app at https://www.reddit.com/prefs/apps/", + "Configure OAuth2 credentials in environment variables", + "Implement rate limiting to respect API limits", + "Handle subreddit access and content filtering" + ], + "data_collection_strategy": [ + "Focus on financial subreddits: wallstreetbets, investing, stocks, SecurityAnalysis", + "Collect hot/trending posts twice daily (6 AM, 6 PM UTC)", + "Filter by minimum score threshold (10+ upvotes)", + "Extract ticker symbols from post titles and content", + "Deduplicate by Reddit post_id" + ] + }, + "llm_integration": { + "sentiment_analysis": [ + "Use OpenRouter with anthropic/claude-3.5-haiku for cost efficiency", + "Social media-specific prompts accounting for informal language and memes", + "Structured JSON output with sentiment, confidence, and reasoning", + "Best-effort processing: store posts even if sentiment analysis fails" + ], + "embeddings": [ + "Use text-embedding-3-large for 1536-dimension vectors", + "Batch process for efficiency", + "Generate embeddings for both title and content when available", + "Store NULL for failed embedding generation" + ] + }, + "testing_strategy": { + "unit_tests": [ + "Entity validation and transformation methods", + "Reddit client with mocked PRAW responses", + "Repository operations with test database", + "Service orchestration with mocked dependencies" + ], + "integration_tests": [ + "End-to-end collection pipeline", + "Vector similarity search with real pgvectorscale", + "LLM integration with pytest-vcr cassettes", + "Dagster pipeline execution" + ], + "performance_tests": [ + "Vector similarity query performance (<1s for top 10)", + "Batch upsert performance (<5s for 1000 posts)", + "Memory usage during large collection runs" + ] + }, + "monitoring_and_observability": { + "metrics": [ + "Posts collected per subreddit per day", + "Sentiment analysis success rate", + "Embedding generation success rate", + "Vector similarity query performance", + "Reddit API rate limit utilization" + ], + "logging": [ + "Collection job start/completion with statistics", + "API errors and retry attempts", + "Data quality issues and validation failures", + "Performance metrics for optimization" + ], + "alerts": [ + "Collection job failures", + "Reddit API authentication issues", + "High error rates in LLM processing", + "Database connection problems" + ] + } + } + } +} \ No newline at end of file diff --git a/docs/specs/socialmedia/design.md b/docs/specs/socialmedia/design.md new file mode 100644 index 00000000..9490d9e2 --- /dev/null +++ b/docs/specs/socialmedia/design.md @@ -0,0 +1,834 @@ +# Social Media Domain - Technical Design Document + +## Executive Summary + +This document specifies the complete greenfield implementation of the Social Media domain within TradingAgents, transitioning from empty stubs to a production-ready system for collecting and analyzing social media sentiment from financial subreddits. 
This domain will provide AI agents with social sentiment context for trading decisions through a PostgreSQL + TimescaleDB + pgvectorscale architecture with RAG-powered capabilities. + +**Implementation Scope**: Complete domain implementation (0% → 100% completion) +**Architecture**: PostgreSQL + TimescaleDB + pgvectorscale with PRAW Reddit integration and OpenRouter LLM processing +**Target**: 400+ posts daily across 4 financial subreddits with 85%+ test coverage + +--- + +## 1. Architecture Overview + +### 1.1 System Architecture + +The Social Media domain follows the established layered architecture pattern while introducing new capabilities for social media data collection and semantic search: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Dagster Pipeline │ +│ (Scheduled Collection) │ +└─────────────────────┬───────────────────────────────────────┘ + │ +┌─────────────────────▼───────────────────────────────────────┐ +│ RedditClient │ +│ (PRAW + Rate Limiting) │ +└─────────────────────┬───────────────────────────────────────┘ + │ +┌─────────────────────▼───────────────────────────────────────┐ +│ SocialMediaService │ +│ (Business Logic + LLM Integration) │ +└─────────────────────┬───────────────────────────────────────┘ + │ +┌─────────────────────▼───────────────────────────────────────┐ +│ SocialRepository │ +│ (PostgreSQL + TimescaleDB + pgvectorscale) │ +└─────────────────────┬───────────────────────────────────────┘ + │ +┌─────────────────────▼───────────────────────────────────────┐ +│ PostgreSQL + TimescaleDB + pgvectorscale │ +│ (Time-series + Vector Storage) │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 1.2 Data Flow Architecture + +**Collection Flow:** +``` +Reddit API → RedditClient → SocialMediaService → OpenRouter LLM → +SocialRepository → PostgreSQL + Vector Storage +``` + +**Agent Query Flow:** +``` +AgentToolkit → SocialMediaService → SocialRepository → +Vector Similarity Search + Sentiment Aggregation → Structured Response +``` + +### 1.3 Key Architectural Principles + +- **Consistent Patterns**: Follow news domain architecture for maintainability +- **Vector-Enhanced Search**: Semantic similarity using pgvectorscale for contextual social media analysis +- **Best-Effort Processing**: Continue operation even when LLM services are unavailable +- **Rate Limiting Compliance**: Respect Reddit API limits with exponential backoff +- **Event-Driven Design**: Publish domain events for system integration + +--- + +## 2. 
Domain Model + + ### 2.1 Core Entities + + #### SocialPost (Domain Entity) + + The primary domain entity managing business rules and data transformations: + + ```python + @dataclass + class SocialPost: + """Core domain entity for Reddit posts with sentiment and engagement data.""" + + # Core Reddit Data + post_id: str # Reddit unique ID (e.g., 't3_abc123') + title: str # Post title + content: Optional[str] # Post content (selftext for text posts) + author: str # Reddit username + subreddit: str # Subreddit name + created_utc: datetime # Post creation time + url: str # Reddit permalink or external URL + + # Engagement Metrics + upvotes: int # Post score + downvotes: int # Calculated from score + upvote_ratio + comments_count: int # Number of comments + + # Enhanced Data + sentiment_score: Optional[SentimentScore] = None + tickers: List[str] = field(default_factory=list) + title_embedding: Optional[List[float]] = None + content_embedding: Optional[List[float]] = None + + @classmethod + def from_praw_submission(cls, submission: praw.Submission) -> 'SocialPost': + """Create SocialPost from PRAW Submission object.""" + + def to_entity(self) -> SocialMediaPostEntity: + """Transform to database entity for storage.""" + + def validate(self) -> List[str]: + """Validate business rules and return errors.""" + + def extract_tickers(self) -> List[str]: + """Extract stock ticker symbols from title and content.""" + + def has_reliable_sentiment(self) -> bool: + """Check if sentiment confidence >= 0.5.""" + + def to_response(self) -> Dict[str, Any]: + """Format for agent consumption.""" + ``` + + **Validation Rules:** + - `post_id` must match Reddit format (starts with 't3_') + - `title` cannot be empty + - `created_utc` cannot be in the future + - `sentiment_score.confidence` must be 0.0-1.0 + - `embeddings` must be 1536 dimensions if present + - `subreddit` must be in allowed financial subreddits list + + #### SentimentScore (Value Object) + + Structured sentiment analysis result from OpenRouter LLM: + + ```python + @dataclass + class SentimentScore: + """Structured sentiment analysis result with confidence and reasoning.""" + + sentiment: Literal['positive', 'negative', 'neutral'] + confidence: float # 0.0-1.0 + reasoning: str # Brief explanation + + def is_reliable(self) -> bool: + """Check if confidence >= 0.5 for reliable sentiment.""" + return self.confidence >= 0.5 + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON storage.""" + ``` + + #### SocialJobConfig (Configuration) + + Configuration for scheduled Reddit collection: + + ```python + @dataclass + class SocialJobConfig: + """Configuration for scheduled Reddit data collection.""" + + # Collection Settings + subreddits: List[str] = field(default_factory=lambda: [ + 'wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis' + ]) + max_posts_per_subreddit: int = 50 + lookback_hours: int = 12 + min_score: int = 10 + + # Processing Settings + sentiment_model: str = "anthropic/claude-3.5-haiku" + embedding_model: str = "text-embedding-3-large" + + # Rate Limiting + rate_limit_delay: float = 1.0 # seconds between API calls + + # Scheduling + schedule_times: List[str] = field(default_factory=lambda: [ + '0 6 * * *', # 6 AM UTC + '0 18 * * *' # 6 PM UTC + ]) + ```
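+ 
+ The entity methods above lean on `extract_tickers()`, which this design leaves unspecified. A minimal sketch of one workable approach is shown here; the allowlist and regexes are illustrative assumptions, not part of this design:
+ 
+ ```python
+ import re
+ from typing import List, Optional
+ 
+ # Hypothetical allowlist; in practice this would come from the marketdata domain.
+ KNOWN_TICKERS = {"AAPL", "TSLA", "NVDA", "GME", "AMC", "SPY"}
+ 
+ CASHTAG_RE = re.compile(r"\$([A-Z]{1,5})\b")   # $-prefixed symbols, e.g. $TSLA
+ BARE_RE = re.compile(r"\b([A-Z]{1,5})\b")      # bare uppercase words, e.g. AAPL
+ 
+ def extract_tickers(title: str, content: Optional[str] = None) -> List[str]:
+     """Extract candidate ticker symbols from post title and content."""
+     text = f"{title} {content or ''}"
+     candidates = set(CASHTAG_RE.findall(text))
+     # Bare uppercase words are noisy ("YOLO", "DD"), so validate them
+     # against a known-ticker allowlist before accepting them.
+     candidates |= {m for m in BARE_RE.findall(text) if m in KNOWN_TICKERS}
+     return sorted(candidates)
+ ```
+ 
+ ---
+ 
+ ## 3.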
Database Design + + ### 3.1 Schema Definition + + The `social_media_posts` table leverages PostgreSQL with TimescaleDB for time-series optimization and pgvectorscale for vector similarity search: + + ```sql + -- Core table definition + -- (uuid7() is assumed to be provided by an extension or helper function; + -- it is not built into PostgreSQL 16) + CREATE TABLE social_media_posts ( + id UUID PRIMARY KEY DEFAULT uuid7(), + post_id VARCHAR(50) UNIQUE NOT NULL, + title TEXT NOT NULL, + content TEXT, + author VARCHAR(100) NOT NULL, + subreddit VARCHAR(50) NOT NULL, + created_utc TIMESTAMPTZ NOT NULL, + upvotes INTEGER NOT NULL DEFAULT 0, + downvotes INTEGER NOT NULL DEFAULT 0, + comments_count INTEGER NOT NULL DEFAULT 0, + url TEXT NOT NULL, + sentiment_score JSONB, + sentiment_label VARCHAR(20), + tickers TEXT[] DEFAULT '{}', + title_embedding VECTOR(1536), + content_embedding VECTOR(1536), + inserted_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() + ); + + -- TimescaleDB hypertable for time-series optimization + SELECT create_hypertable('social_media_posts', 'created_utc', + chunk_time_interval => INTERVAL '1 day'); + + -- Performance indexes + -- NOTE: TimescaleDB requires PRIMARY KEY and UNIQUE constraints on hypertables + -- to include the partition column, so in practice this must be + -- UNIQUE (post_id, created_utc). + CREATE UNIQUE INDEX idx_social_posts_post_id ON social_media_posts (post_id); + CREATE INDEX idx_social_posts_subreddit_time ON social_media_posts (subreddit, created_utc DESC); + CREATE INDEX idx_social_posts_tickers_gin ON social_media_posts USING GIN (tickers); + CREATE INDEX idx_social_posts_title_embedding ON social_media_posts + USING hnsw (title_embedding vector_cosine_ops); + CREATE INDEX idx_social_posts_content_embedding ON social_media_posts + USING hnsw (content_embedding vector_cosine_ops); + CREATE INDEX idx_social_posts_sentiment ON social_media_posts + (((sentiment_score->>'sentiment'))) WHERE sentiment_score IS NOT NULL; + + -- Data validation constraints + ALTER TABLE social_media_posts ADD CONSTRAINT chk_sentiment_score + CHECK (sentiment_score IS NULL OR + ((sentiment_score->>'confidence')::float BETWEEN 0 AND 1)); + ALTER TABLE social_media_posts ADD CONSTRAINT chk_created_utc + CHECK (created_utc <= NOW()); + ``` + + ### 3.2 SQLAlchemy Entity + + ```python + class SocialMediaPostEntity(Base): + """SQLAlchemy entity for PostgreSQL persistence with vector support.""" + + __tablename__ = "social_media_posts" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid7) + post_id = Column(String(50), unique=True, nullable=False, index=True) + title = Column(Text, nullable=False) + content = Column(Text) + author = Column(String(100), nullable=False) + subreddit = Column(String(50), nullable=False) + created_utc = Column(DateTime(timezone=True), nullable=False) + upvotes = Column(Integer, nullable=False, default=0) + downvotes = Column(Integer, nullable=False, default=0) + comments_count = Column(Integer, nullable=False, default=0) + url = Column(Text, nullable=False) + sentiment_score = Column(JSONB) + sentiment_label = Column(String(20)) + tickers = Column(ARRAY(String), default=list) + title_embedding = Column(Vector(1536)) + content_embedding = Column(Vector(1536)) + inserted_at = Column(DateTime(timezone=True), default=func.now()) + updated_at = Column(DateTime(timezone=True), default=func.now(), onupdate=func.now()) + + def to_domain(self) -> SocialPost: + """Convert to domain entity.""" + + @classmethod + def from_domain(cls, post: SocialPost) -> 'SocialMediaPostEntity': + """Create from domain entity.""" + ``` + + ### 3.3 Access Patterns and Query Optimization + + **Common Access Patterns:** + - Ticker-based queries: `SELECT * WHERE 'AAPL' = ANY(tickers)` + - Time-range filtering: `SELECT * WHERE created_utc BETWEEN ? AND ?` + - Vector similarity: `SELECT * ORDER BY embedding <=> ? LIMIT 10` + - Sentiment aggregations: `SELECT AVG(sentiment_score) GROUP BY subreddit` + + **Performance Targets:** + - Vector similarity queries: < 1s for top 10 results + - Batch upserts: < 5s for 1000 posts + - Ticker-based queries: < 100ms for 30-day ranges
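+ 
+ As an illustration of how the repository layer might serve the ticker, time-range, and similarity patterns in one query, here is a hedged sketch; the raw-SQL approach and parameter handling are assumptions, and the real `SocialRepository` may use the ORM instead:
+ 
+ ```python
+ from typing import Any, Optional
+ 
+ from sqlalchemy import text
+ from sqlalchemy.ext.asyncio import AsyncSession
+ 
+ # pgvector's <=> operator is cosine distance; the CAST lets us bind the
+ # embedding in its '[x,y,...]' text form without a registered vector type.
+ SIMILAR_POSTS_SQL = text("""
+     SELECT post_id, title, sentiment_label,
+            title_embedding <=> CAST(:query_embedding AS vector) AS distance
+     FROM social_media_posts
+     WHERE (CAST(:ticker AS TEXT) IS NULL OR :ticker = ANY(tickers))
+       AND created_utc >= NOW() - INTERVAL '30 days'
+     ORDER BY title_embedding <=> CAST(:query_embedding AS vector)
+     LIMIT :limit
+ """)
+ 
+ async def find_similar_posts(
+     session: AsyncSession,
+     query_embedding: list[float],
+     ticker: Optional[str] = None,
+     limit: int = 10,
+ ) -> list[dict[str, Any]]:
+     """Vector similarity search, optionally scoped to a ticker."""
+     result = await session.execute(
+         SIMILAR_POSTS_SQL,
+         {"query_embedding": str(query_embedding), "ticker": ticker, "limit": limit},
+     )
+     return [dict(row) for row in result.mappings()]
+ ```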
+ + --- + + ## 4. API Integration + + ### 4.1 Reddit Client (PRAW Integration) + + Complete implementation of Reddit data collection using PRAW (Python Reddit API Wrapper): + + ```python + class RedditClient: + """PRAW wrapper with rate limiting and error handling.""" + + def __init__(self, config: RedditClientConfig): + """Initialize Reddit client with OAuth2 credentials.""" + self.reddit = praw.Reddit( + client_id=config.client_id, + client_secret=config.client_secret, + user_agent=config.user_agent + ) + self.rate_limiter = AsyncLimiter(1, 1) # 1 request per second + + async def fetch_subreddit_posts( + self, + subreddit: str, + limit: int = 50, + time_filter: str = 'day' + ) -> List[Dict[str, Any]]: + """Fetch hot posts from subreddit with rate limiting.""" + + async def search_posts( + self, + query: str, + subreddit: Optional[str] = None, + limit: int = 25 + ) -> List[Dict[str, Any]]: + """Search posts with ticker symbols or keywords.""" + + async def get_post_details(self, post_id: str) -> Optional[Dict[str, Any]]: + """Get detailed information for a specific post.""" + ``` + + **Configuration Requirements:** + - Reddit App Credentials: `client_id`, `client_secret`, `user_agent` + - Rate Limiting: 1 request per second (60 requests/minute limit) + - Error Handling: Exponential backoff for rate limits, graceful degradation for authentication errors + + ### 4.2 OpenRouter LLM Integration + + Leverage existing OpenRouter infrastructure with social media-specific enhancements: + + **Sentiment Analysis Prompt:** + ``` + Analyze this Reddit post about stocks/finance. Consider the informal language, + memes, and community context typical of financial subreddits. + + Post: {title} - {content} + + Respond with valid JSON: + { + "sentiment": "positive|negative|neutral", + "confidence": 0.0-1.0, + "reasoning": "brief explanation considering context" + } + ``` + + **Embedding Configuration:** + - Model: `text-embedding-3-large`, truncated to 1536 dimensions via the `dimensions` request parameter (the model's native output is 3072 dimensions, which would not fit the VECTOR(1536) schema) + - Batch processing for efficiency + - Generate embeddings for both title and content when available + - Store NULL for failed embedding generation (best-effort processing)
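+ 
+ To make the sentiment flow concrete, here is a minimal sketch of the call through an OpenAI-compatible client pointed at OpenRouter. The client construction and error policy are assumptions (the project reuses its existing OpenRouter integration); `SentimentScore` is the value object from Section 2:
+ 
+ ```python
+ import json
+ from typing import Optional
+ 
+ from openai import AsyncOpenAI
+ 
+ # Assumed wiring; credentials come from configuration in practice.
+ client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1", api_key="...")
+ 
+ async def analyze_post(title: str, content: str) -> Optional[SentimentScore]:
+     """Best-effort sentiment analysis: returns None instead of raising."""
+     prompt = (
+         "Analyze this Reddit post about stocks/finance. Consider the informal "
+         "language, memes, and community context typical of financial subreddits.\n\n"
+         f"Post: {title} - {content}\n\n"
+         'Respond with valid JSON: {"sentiment": "positive|negative|neutral", '
+         '"confidence": 0.0-1.0, "reasoning": "brief explanation"}'
+     )
+     try:
+         resp = await client.chat.completions.create(
+             model="anthropic/claude-3.5-haiku",
+             messages=[{"role": "user", "content": prompt}],
+         )
+         data = json.loads(resp.choices[0].message.content)
+         score = SentimentScore(**data)
+         return score if 0.0 <= score.confidence <= 1.0 else None
+     except Exception:
+         # Best-effort rule: store the post without sentiment on any failure.
+         return None
+ ```
+ 
+ ---
+ 
+ ## 5.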
Component Architecture + +### 5.1 Repository Layer (Data Access) + +```python +class SocialRepository: + """Data access layer for social media posts with vector capabilities.""" + + def __init__(self, session: AsyncSession): + self.session = session + + async def find_by_ticker( + self, + ticker: str, + days: int = 30, + limit: int = 50 + ) -> List[SocialPost]: + """Find posts mentioning specific ticker within time range.""" + + async def find_similar_posts( + self, + query_embedding: List[float], + ticker: Optional[str] = None, + limit: int = 10 + ) -> List[SocialPost]: + """Find semantically similar posts using vector similarity.""" + + async def get_sentiment_summary( + self, + ticker: str, + subreddit: Optional[str] = None, + hours: int = 24 + ) -> Dict[str, Any]: + """Generate sentiment aggregation for ticker.""" + + async def upsert_batch(self, posts: List[SocialPost]) -> List[SocialPost]: + """Batch upsert posts with conflict resolution.""" + + async def cleanup_old_posts(self, days: int = 90) -> int: + """Remove posts older than retention period.""" +``` + +### 5.2 Service Layer (Business Logic) + +```python +class SocialMediaService: + """Business logic orchestration with LLM integration.""" + + def __init__( + self, + repository: SocialRepository, + reddit_client: RedditClient, + openrouter_client: OpenRouterClient + ): + self.repository = repository + self.reddit_client = reddit_client + self.openrouter_client = openrouter_client + + async def collect_subreddit_posts(self, config: SocialJobConfig) -> int: + """Orchestrate complete collection process for configured subreddits.""" + + async def update_post_sentiment( + self, + posts: List[SocialPost] + ) -> List[SocialPost]: + """Add sentiment analysis to posts using OpenRouter LLM.""" + + async def generate_embeddings( + self, + posts: List[SocialPost] + ) -> List[SocialPost]: + """Generate vector embeddings for semantic search.""" + + async def find_trending_tickers( + self, + hours: int = 24 + ) -> List[Dict[str, Any]]: + """Identify trending ticker mentions across subreddits.""" +``` + +### 5.3 Agent Integration Layer + +```python +class SocialMediaAgentToolkit: + """RAG methods for AI agent integration.""" + + def __init__(self, service: SocialMediaService): + self.service = service + + async def get_reddit_sentiment( + self, + ticker: str, + days: int = 7 + ) -> Dict[str, Any]: + """Get sentiment summary for ticker from Reddit discussions.""" + + async def search_social_posts( + self, + query: str, + ticker: Optional[str] = None + ) -> List[Dict[str, Any]]: + """Semantic search for relevant social media posts.""" + + async def get_trending_discussions( + self, + ticker: str + ) -> List[Dict[str, Any]]: + """Get trending discussions and sentiment for specific ticker.""" + + async def get_subreddit_analysis( + self, + subreddit: str, + ticker: str + ) -> Dict[str, Any]: + """Analyze sentiment and engagement for ticker in specific subreddit.""" +``` + +**Agent Response Format:** +```json +{ + "posts": [ + { + "post_id": "t3_abc123", + "title": "AAPL earnings beat expectations", + "subreddit": "stocks", + "created_utc": "2024-01-15T14:30:00Z", + "sentiment": { + "sentiment": "positive", + "confidence": 0.85, + "reasoning": "Strong positive language about earnings" + }, + "engagement": { + "upvotes": 245, + "comments_count": 67 + }, + "tickers": ["AAPL"], + "url": "https://reddit.com/r/stocks/comments/abc123" + } + ], + "summary": { + "total_posts": 15, + "sentiment_breakdown": { + "positive": 0.6, + "negative": 0.2, + "neutral": 
0.2 + }, + "avg_confidence": 0.78, + "data_quality": "high" + } + } + ``` + + --- + + ## 6. Dagster Pipeline Architecture + + ### 6.1 Scheduled Collection Pipeline + + ```python + @asset( + partitions_def=DailyPartitionsDefinition(start_date="2024-01-01"), + config_schema=SocialJobConfig.schema() # assumes a pydantic-style config exposing .schema() + ) + def reddit_posts_collection(context: AssetExecutionContext) -> MaterializeResult: + """Collect Reddit posts from financial subreddits.""" + + @asset(deps=[reddit_posts_collection]) + def reddit_sentiment_analysis(context: AssetExecutionContext) -> MaterializeResult: + """Add sentiment analysis to collected posts.""" + + @asset(deps=[reddit_sentiment_analysis]) + def reddit_embeddings_generation(context: AssetExecutionContext) -> MaterializeResult: + """Generate vector embeddings for semantic search.""" + + # Schedule: Twice daily collection + reddit_collection_schedule = ScheduleDefinition( + name="reddit_collection_schedule", + job=define_asset_job("reddit_collection", selection=[ + reddit_posts_collection, + reddit_sentiment_analysis, + reddit_embeddings_generation + ]), + cron_schedule="0 6,18 * * *" # 6 AM and 6 PM UTC + ) + ``` + + ### 6.2 Data Quality and Monitoring + + **Collection Metrics:** + - Posts collected per subreddit per run + - Sentiment analysis success rate + - Embedding generation success rate + - API error rates and retry attempts + + **Data Quality Checks:** + - Post deduplication verification + - Sentiment confidence distribution + - Embedding vector validation + - Reddit API rate limit utilization + + **Failure Handling:** + - Best-effort processing: Continue with remaining subreddits if one fails + - Exponential backoff for Reddit API rate limits (sketched below) + - Graceful degradation: Store posts without sentiment/embeddings if LLM fails + - Dead letter queue for failed posts with retry mechanism
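+ 
+ The exponential backoff referenced above is simple enough to sketch. This helper is illustrative rather than part of the design's API; which exceptions it should actually retry is left to the RedditClient:
+ 
+ ```python
+ import asyncio
+ import random
+ from typing import Awaitable, Callable, TypeVar
+ 
+ T = TypeVar("T")
+ 
+ async def with_backoff(
+     call: Callable[[], Awaitable[T]],
+     max_retries: int = 3,
+     base_delay: float = 1.0,
+ ) -> T:
+     """Retry an async call with exponential backoff plus jitter."""
+     for attempt in range(max_retries + 1):
+         try:
+             return await call()
+         except Exception:
+             if attempt == max_retries:
+                 raise
+             # 1s, 2s, 4s, ... with jitter to avoid synchronized retries.
+             await asyncio.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.5))
+     raise RuntimeError("unreachable")
+ ```
+ 
+ ---
+ 
+ ## 7.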
Testing Strategy + +### 7.1 Test Structure + +Following the project's pragmatic outside-in TDD approach: + +``` +tests/domains/socialmedia/ +├── __init__.py +├── test_social_post.py # Domain entity validation +├── test_social_repository.py # PostgreSQL + vector operations +├── test_reddit_client.py # PRAW integration with VCR +├── test_social_media_service.py # Business logic with mocked deps +├── test_social_agent_toolkit.py # Agent integration methods +└── fixtures/ + ├── reddit_responses.json # Sample PRAW responses + └── vcr_cassettes/ # HTTP cassettes for external APIs +``` + +### 7.2 Testing Approach + +**Unit Tests (Mock I/O boundaries):** +- `SocialPost` entity validation and transformations +- `SocialRepository` with test PostgreSQL database +- `RedditClient` with mocked PRAW responses +- `SocialMediaService` with mocked dependencies + +**Integration Tests (Real components):** +- End-to-end collection pipeline with test Reddit data +- Vector similarity search with actual pgvectorscale +- LLM integration with pytest-vcr cassettes +- Dagster pipeline execution + +**Performance Tests:** +- Vector similarity query performance (< 1s target) +- Batch upsert performance (< 5s for 1000 posts) +- Memory usage during large collection runs + +### 7.3 Test Fixtures and Mocking + +**Reddit API Mocking:** +```python +@pytest.fixture +def mock_reddit_response(): + """Sample Reddit API response for testing.""" + return { + "id": "abc123", + "title": "AAPL earnings discussion", + "selftext": "Strong quarter, bullish outlook", + "author": "test_user", + "subreddit_display_name": "stocks", + "created_utc": 1705315200, + "score": 150, + "upvote_ratio": 0.85, + "num_comments": 45, + "permalink": "/r/stocks/comments/abc123/aapl_earnings/" + } +``` + +**Vector Similarity Testing:** +```python +@pytest.mark.asyncio +async def test_vector_similarity_search(social_repository, sample_posts): + """Test semantic similarity search using pgvectorscale.""" + # Insert test posts with embeddings + await social_repository.upsert_batch(sample_posts) + + # Test similarity search + query_embedding = [0.1] * 1536 # Sample embedding + similar_posts = await social_repository.find_similar_posts( + query_embedding, limit=5 + ) + + assert len(similar_posts) <= 5 + assert all(post.title_embedding for post in similar_posts) +``` + +--- + +## 8. Implementation Roadmap + +### 8.1 Phase 1: Database Foundation (Week 1) + +**Priority 1: Database Schema** +1. Create PostgreSQL migration for `social_media_posts` table +2. Add TimescaleDB hypertable configuration +3. Set up pgvectorscale indexes for vector similarity +4. Implement data validation constraints + +**Priority 2: Core Entities** +1. `SocialMediaPostEntity` (SQLAlchemy entity) +2. `SocialPost` (domain entity with validation) +3. `SentimentScore` (value object) +4. Entity transformation methods (`to_domain`, `from_domain`) + +### 8.2 Phase 2: Data Collection (Week 2) + +**Priority 1: Reddit Integration** +1. `RedditClient` with PRAW implementation +2. Rate limiting and error handling +3. Subreddit post collection methods +4. Reddit API authentication setup + +**Priority 2: Repository Layer** +1. `SocialRepository` with PostgreSQL operations +2. Vector similarity search methods +3. Batch upsert operations +4. Sentiment aggregation queries + +### 8.3 Phase 3: Processing & Intelligence (Week 3) + +**Priority 1: Service Layer** +1. `SocialMediaService` business logic +2. OpenRouter LLM integration for sentiment +3. Vector embedding generation +4. 
Batch processing workflows + +**Priority 2: Agent Integration** +1. `SocialMediaAgentToolkit` RAG methods +2. Structured response formatting +3. Context-aware social media analysis +4. Integration with existing agent workflows + +### 8.4 Phase 4: Automation & Monitoring (Week 4) + +**Priority 1: Dagster Pipeline** +1. Scheduled Reddit collection assets +2. Processing pipeline orchestration +3. Data quality monitoring +4. Error handling and retry logic + +**Priority 2: Testing & Documentation** +1. Comprehensive test suite (>85% coverage) +2. Performance testing and optimization +3. API documentation updates +4. Integration with existing test infrastructure + +--- + +## 9. Monitoring and Observability + +### 9.1 Key Metrics + +**Collection Metrics:** +- Posts collected per subreddit per day +- Collection job success/failure rates +- Reddit API rate limit utilization +- Data deduplication effectiveness + +**Processing Metrics:** +- Sentiment analysis success rate and latency +- Embedding generation success rate and latency +- LLM token usage and costs +- Vector similarity query performance + +**Business Metrics:** +- Active tickers with social sentiment data +- Sentiment distribution across subreddits +- Trending ticker detection accuracy +- Agent query response times + +### 9.2 Alerting Strategy + +**Critical Alerts:** +- Collection job failures (> 2 consecutive failures) +- Reddit API authentication errors +- Database connection failures +- High LLM processing error rates (> 20%) + +**Warning Alerts:** +- Low collection volumes (< 50% of expected) +- High sentiment analysis latency (> 30s per batch) +- Vector similarity performance degradation +- Approaching Reddit API rate limits + +### 9.3 Logging and Debugging + +**Structured Logging Format:** +```json +{ + "timestamp": "2024-01-15T14:30:00Z", + "level": "INFO", + "component": "SocialMediaService", + "operation": "collect_subreddit_posts", + "subreddit": "stocks", + "posts_collected": 45, + "sentiment_analyzed": 43, + "embeddings_generated": 41, + "duration_ms": 12500, + "metadata": { + "reddit_api_calls": 3, + "llm_tokens_used": 15420 + } +} +``` + +--- + +## 10. Security and Compliance + +### 10.1 Data Privacy + +**Reddit Data Handling:** +- Store only publicly available Reddit posts +- Respect user privacy: hash usernames for analytics +- Implement data retention policies (90-day maximum) +- No collection of private or deleted content + +**API Key Management:** +- Environment variable storage for Reddit credentials +- OpenRouter API key rotation support +- No credential logging or persistence in plain text + +### 10.2 Rate Limiting Compliance + +**Reddit API Compliance:** +- Respect 60 requests per minute OAuth limit +- Implement exponential backoff for rate limit violations +- User-Agent string identification as required +- Monitor and log API usage statistics + +**OpenRouter Usage:** +- Monitor token usage and costs +- Implement request batching for efficiency +- Handle API rate limits gracefully +- Cost optimization through model selection + +--- + +## 11. 
Future Enhancements + +### 11.1 Extended Social Media Sources + +**Twitter/X Integration:** +- Similar architecture pattern for Twitter API v2 +- Real-time streaming for high-frequency updates +- Hashtag and mention tracking + +**News Comment Sections:** +- Integration with financial news comment sections +- Cross-platform sentiment correlation +- Enhanced context for news articles + +### 11.2 Advanced Analytics + +**Sentiment Trend Analysis:** +- Time-series sentiment tracking +- Volatility correlation with social sentiment +- Predictive sentiment modeling + +**Influence Network Analysis:** +- User influence scoring based on engagement +- Community detection within financial subreddits +- Viral content identification and tracking + +### 11.3 Real-time Processing + +**Streaming Architecture:** +- Real-time Reddit post collection +- Event-driven sentiment processing +- Live sentiment dashboards for agents + +**Market Hours Integration:** +- Increased collection frequency during market hours +- After-hours sentiment tracking +- Weekend vs. weekday sentiment patterns + +--- + +This technical design provides a comprehensive blueprint for implementing the complete Social Media domain from empty stubs to a production-ready system. The architecture leverages proven patterns from the news domain while introducing specialized capabilities for social media data collection, semantic search, and AI agent integration. \ No newline at end of file diff --git a/docs/specs/socialmedia/requirements.json b/docs/specs/socialmedia/requirements.json new file mode 100644 index 00000000..fc830ec5 --- /dev/null +++ b/docs/specs/socialmedia/requirements.json @@ -0,0 +1,6 @@ +{ + "raw_user_story": "a) As a dagster job I want to scrape specific sub reddits from market sentiment b) As an AI Agent I want to get relavent social media data about a specific ticker or market", + "raw_criteria": "a) All reddit posts are stored with sentiment analysis in db b) Agents can get RAG data from db", + "raw_rules": "updated daily", + "raw_scope": "Included: reddit. Excluded: Other social media platforms beyond Reddit." +} \ No newline at end of file diff --git a/docs/specs/socialmedia/spec-lite.md b/docs/specs/socialmedia/spec-lite.md new file mode 100644 index 00000000..eb0cfc5f --- /dev/null +++ b/docs/specs/socialmedia/spec-lite.md @@ -0,0 +1,105 @@ +# Social Media Domain - Specification Lite + +## Summary +Complete implementation of social media data collection from Reddit with LLM sentiment analysis and vector embeddings for AI agent RAG integration. 
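+ 
+ For orientation, agent-side usage is expected to look roughly like this; the calls use the AgentToolkit signatures sketched under Key Components below, and the wiring is an assumption, not a finished API:
+ 
+ ```python
+ import asyncio
+ 
+ async def main() -> None:
+     # Formatted Reddit context for the last week of AAPL discussion.
+     news = await get_reddit_news("AAPL", days=7)
+     # Semantic search over stored posts, scoped to a ticker.
+     info = await get_reddit_stock_info("AAPL", query="earnings guidance")
+     print(news, info)
+ 
+ asyncio.run(main())
+ ```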
+ + ## Core Requirements + + ### Data Collection + - **Daily Reddit collection** from financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis) + - **OpenRouter LLM sentiment analysis** with confidence scoring + - **Vector embeddings** for semantic similarity search + - **PostgreSQL storage** with TimescaleDB + pgvectorscale optimization + + ### Agent Integration + - **AgentToolkit methods**: `get_reddit_news()` and `get_reddit_stock_info()` + - **RAG-enhanced queries** with < 2 second response time + - **Vector similarity search** for contextual social media insights + + ## Technical Implementation + + ### Architecture Pattern + **Router → Service → Repository → Entity → Database** (matching news domain) + + ### Database Schema + ```sql + social_media_posts ( + post_id, ticker, subreddit, title, content, author, + created_at, upvotes, comment_count, + sentiment_score, sentiment_label, sentiment_confidence, + embedding vector(1536), -- pgvectorscale + data_quality_score, processing_status + ) + ``` + + ### Key Components + + #### 1. RedditClient + - PRAW integration with rate limiting + - Financial subreddit targeting + - Ticker-specific post filtering + + #### 2. SentimentAnalyzer + - OpenRouter LLM integration + - Structured sentiment scoring (-1.0 to +1.0) + - Financial context awareness + + #### 3. SocialRepository + - PostgreSQL with deduplication by post_id + - Vector similarity search using pgvectorscale + - TimescaleDB time-series optimization + + #### 4. SocialMediaService + - Orchestrates collection pipeline: Reddit → Sentiment → Embeddings → Storage + - Provides ticker-specific social context + - Calculates aggregate sentiment metrics + + #### 5. AgentToolkit Integration + ```python + async def get_reddit_news(ticker: str, days: int = 7) -> str: + """Return formatted social media context with sentiment analysis.""" + + async def get_reddit_stock_info(ticker: str, query: Optional[str] = None) -> str: + """Return semantic search results with sentiment aggregation.""" + ``` + + ## Implementation Scope + + ### Complete Implementation ✅ + - PostgreSQL migration from file storage + - Reddit API client (currently empty stub) + - SQLAlchemy entities with vector fields + - LLM sentiment analysis pipeline + - Vector embedding generation and search + - Dagster pipeline for scheduled collection + - Comprehensive test coverage (pytest-vcr for APIs) + + ### Current Status + **Basic stub implementation** - requires complete rebuild of all components + + ### Dependencies + - Reddit API credentials (PRAW) + - OpenRouter API access + - PostgreSQL with TimescaleDB + pgvectorscale + - Existing TradingAgentsConfig + - News domain patterns for consistency + + ## Data Flow + 1. **Dagster pipeline** triggers daily collection + 2. **RedditClient** fetches posts from financial subreddits + 3. **SentimentAnalyzer** processes posts via OpenRouter LLM + 4. **EmbeddingGenerator** creates vector embeddings + 5. **SocialRepository** stores in PostgreSQL with deduplication + 6.
**AI Agents** query via AgentToolkit with RAG-enhanced context + +## Testing Strategy +- **pytest-vcr** for Reddit API mocking +- **Real PostgreSQL** for repository integration tests +- **Service mocks** for business logic testing +- **85%+ coverage** matching project standards + +## Success Criteria +- Daily automated Reddit collection with sentiment analysis +- Sub-2-second agent queries with vector search +- Seamless RAG integration matching news domain patterns +- Production-ready reliability with comprehensive error handling \ No newline at end of file diff --git a/docs/specs/socialmedia/spec.json b/docs/specs/socialmedia/spec.json new file mode 100644 index 00000000..204eb33d --- /dev/null +++ b/docs/specs/socialmedia/spec.json @@ -0,0 +1,90 @@ +{ + "feature": "socialmedia", + "user_story": "As a Dagster pipeline, I want to collect Reddit posts from financial subreddits with LLM sentiment analysis and vector embeddings, so that AI Agents can access comprehensive social media context for ticker-specific trading decisions through RAG-powered queries", + "acceptance_criteria": [ + "GIVEN a scheduled Dagster pipeline WHEN it executes daily THEN it collects Reddit posts from configured financial subreddits without manual intervention", + "GIVEN Reddit posts are collected WHEN processed THEN they are stored in PostgreSQL with TimescaleDB optimization and vector embeddings for semantic search", + "GIVEN social media posts WHEN processed THEN each post receives OpenRouter LLM sentiment analysis with structured scores (positive/negative/neutral with confidence)", + "GIVEN a ticker symbol WHEN AI agents request social context THEN they receive relevant Reddit posts with sentiment scores and vector similarity ranking within 2 seconds", + "GIVEN social media data WHEN agents query THEN AgentToolkit provides RAG-enhanced context including post content, sentiment trends, and engagement metrics" + ], + "business_rules": [ + "Daily automated collection from configured financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis)", + "OpenRouter LLM sentiment analysis for all posts with confidence scoring", + "Vector embeddings generation for semantic similarity search", + "Post deduplication by Reddit post ID to prevent duplicates", + "Rate limiting compliance with Reddit API terms of service", + "Data retention policy: 90 days for social media posts", + "Best effort processing: API failures or rate limits don't block other posts" + ], + "scope": { + "included": [ + "Complete socialmedia domain implementation from stub to production", + "PostgreSQL migration from current file-based storage", + "Reddit API integration using PRAW (Python Reddit API Wrapper)", + "OpenRouter LLM sentiment analysis integration", + "Vector embeddings generation and similarity search", + "AgentToolkit integration with get_reddit_news and get_reddit_stock_info methods", + "Dagster pipeline for scheduled daily collection", + "SQLAlchemy entities with TimescaleDB and pgvectorscale support", + "Comprehensive test coverage with pytest-vcr for API mocking" + ], + "excluded": [ + "Other social media platforms beyond Reddit (Twitter, LinkedIn, etc.)", + "Real-time social media streaming (batch processing only)", + "Custom sentiment models (use OpenRouter LLMs only)", + "Social media influence scoring or user reputation tracking", + "Multi-language post support (English only)", + "Historical Reddit data backfilling beyond 30 days" + ] + }, + "current_implementation_status": "Basic stub implementation - requires 
complete rebuild", + "missing_components": [ + "PostgreSQL database migration from file storage", + "Reddit API client implementation (RedditClient is empty stub)", + "SQLAlchemy entity models for social posts with vector fields", + "LLM sentiment analysis integration via OpenRouter", + "Vector embedding generation and similarity search", + "AgentToolkit RAG methods (get_reddit_news, get_reddit_stock_info)", + "Dagster pipeline for scheduled data collection", + "Comprehensive test suite with domain-specific patterns" + ], + "existing_stub_components": [ + "SocialMediaService with empty method stubs", + "SocialRepository with file-based JSON storage", + "Basic data models: SocialPost, PostData, SocialContext", + "Empty RedditClient class requiring full implementation", + "Agent references to social methods (not yet implemented)" + ], + "aligns_with": "Multi-agent trading framework vision - provides social sentiment context for comprehensive market analysis alongside news and market data", + "dependencies": [ + "PRAW (Python Reddit API Wrapper) for Reddit API access", + "OpenRouter API for LLM sentiment analysis", + "PostgreSQL with TimescaleDB and pgvectorscale extensions", + "Existing database infrastructure from news domain", + "OpenRouter configuration in TradingAgentsConfig", + "Dagster orchestration framework for scheduled execution" + ], + "technical_details": { + "architecture_pattern": "Router → Service → Repository → Entity → Database (matching news domain)", + "database_integration": "PostgreSQL + TimescaleDB + pgvectorscale (consistent with news domain)", + "llm_integration": "OpenRouter unified provider with two-tier model strategy", + "vector_storage": "1536-dimension embeddings using pgvectorscale (consistent with news)", + "api_integration": "PRAW (Python Reddit API Wrapper) with rate limiting and error handling", + "testing_strategy": "pytest-vcr for HTTP mocking, real PostgreSQL for repository tests, service mocks for business logic" + }, + "implementation_approach": "Complete domain implementation following successful news domain patterns: database migration → entity models → Reddit client → repository → service → AgentToolkit → Dagster pipeline", + "reference_implementations": { + "news_domain_patterns": "Follow NewsService, NewsRepository, NewsArticleEntity patterns for consistency", + "database_schema": "Mirror NewsArticleEntity vector embedding approach for social posts", + "agent_integration": "Follow existing AgentToolkit get_news() pattern for social media methods", + "testing_approach": "Apply news domain testing patterns: VCR for API, real DB for repositories" + }, + "success_criteria": { + "functionality": "Daily Reddit collection with sentiment analysis and vector search", + "performance": "< 2 second social context queries, < 100ms repository operations", + "quality": "85%+ test coverage, comprehensive error handling", + "integration": "Seamless AgentToolkit RAG integration for AI agents", + "consistency": "Architecture and patterns match successful news domain implementation" + } +} \ No newline at end of file diff --git a/docs/specs/socialmedia/spec.md b/docs/specs/socialmedia/spec.md new file mode 100644 index 00000000..c21f6090 --- /dev/null +++ b/docs/specs/socialmedia/spec.md @@ -0,0 +1,740 @@ +# Social Media Domain Specification + +## Feature Overview + +**Complete implementation of social media data collection and analysis** - Transform the current stub implementation into a production-ready social media domain that provides comprehensive Reddit 
sentiment analysis for trading agents. + +### User Story + +As a Dagster pipeline, I want to collect Reddit posts from financial subreddits with LLM sentiment analysis and vector embeddings, so that AI Agents can access comprehensive social media context for ticker-specific trading decisions through RAG-powered queries. + +## Acceptance Criteria + +### Daily Data Collection +- **GIVEN** a scheduled Dagster pipeline **WHEN** it executes daily **THEN** it collects Reddit posts from configured financial subreddits without manual intervention +- **GIVEN** Reddit posts are collected **WHEN** processed **THEN** they are stored in PostgreSQL with TimescaleDB optimization and vector embeddings for semantic search + +### LLM Sentiment Analysis +- **GIVEN** social media posts **WHEN** processed **THEN** each post receives OpenRouter LLM sentiment analysis with structured scores (positive/negative/neutral with confidence) + +### Agent Integration +- **GIVEN** a ticker symbol **WHEN** AI agents request social context **THEN** they receive relevant Reddit posts with sentiment scores and vector similarity ranking within 2 seconds +- **GIVEN** social media data **WHEN** agents query **THEN** AgentToolkit provides RAG-enhanced context including post content, sentiment trends, and engagement metrics + +## Business Rules and Constraints + +### Data Collection Rules +1. **Daily automated collection** from configured financial subreddits (wallstreetbets, investing, stocks, SecurityAnalysis) +2. **OpenRouter LLM sentiment analysis** for all posts with confidence scoring +3. **Vector embeddings generation** for semantic similarity search +4. **Post deduplication** by Reddit post ID to prevent duplicates +5. **Rate limiting compliance** with Reddit API terms of service + +### Data Management +1. **Data retention policy**: 90 days for social media posts +2. **Best effort processing**: API failures or rate limits don't block other posts + +## Scope Definition + +### Included Features ✅ +- Complete socialmedia domain implementation from stub to production +- PostgreSQL migration from current file-based storage +- Reddit API integration using PRAW or Reddit API client +- OpenRouter LLM sentiment analysis integration +- Vector embeddings generation and similarity search +- AgentToolkit integration with `get_reddit_news` and `get_reddit_stock_info` methods +- Dagster pipeline for scheduled daily collection +- SQLAlchemy entities with TimescaleDB and pgvectorscale support +- Comprehensive test coverage with pytest-vcr for API mocking + +### Excluded Features ❌ +- Other social media platforms beyond Reddit (Twitter, LinkedIn, etc.) +- Real-time social media streaming (batch processing only) +- Custom sentiment models (use OpenRouter LLMs only) +- Social media influence scoring or user reputation tracking +- Multi-language post support (English only) +- Historical Reddit data backfilling beyond 30 days + +## Technical Implementation Details + +### Architecture Pattern +**Router → Service → Repository → Entity → Database** (matching news domain) + +### Current Implementation Status +**Basic stub implementation - requires complete rebuild** + +### Missing Components +1. PostgreSQL database migration from file storage +2. Reddit API client implementation (RedditClient is empty stub) +3. SQLAlchemy entity models for social posts with vector fields +4. LLM sentiment analysis integration via OpenRouter +5. Vector embedding generation and similarity search +6. 
AgentToolkit RAG methods (`get_reddit_news`, `get_reddit_stock_info`)
+7. Dagster pipeline for scheduled data collection
+8. Comprehensive test suite with domain-specific patterns
+
+### Existing Stub Components
+- SocialMediaService with empty method stubs
+- SocialRepository with file-based JSON storage
+- Basic data models: SocialPost, PostData, SocialContext
+- Empty RedditClient class requiring full implementation
+- Agent references to social methods (not yet implemented)
+
+## Database Integration
+
+### PostgreSQL Schema Design
+```sql
+-- Social media posts table with TimescaleDB optimization
+CREATE TABLE social_media_posts (
+    id SERIAL,
+    post_id VARCHAR(50) NOT NULL,              -- Reddit post ID
+    ticker VARCHAR(10),                        -- Associated ticker
+    subreddit VARCHAR(50) NOT NULL,            -- Source subreddit
+    title TEXT NOT NULL,                       -- Post title
+    content TEXT,                              -- Post content
+    author VARCHAR(50),                        -- Reddit username
+    created_at TIMESTAMPTZ NOT NULL,           -- Post creation time
+    collected_at TIMESTAMPTZ DEFAULT NOW(),    -- Data collection time
+    upvotes INTEGER DEFAULT 0,                 -- Reddit upvotes
+    downvotes INTEGER DEFAULT 0,               -- Reddit downvotes
+    comment_count INTEGER DEFAULT 0,           -- Number of comments
+    url TEXT,                                  -- Reddit URL
+    permalink TEXT,                            -- Reddit permalink
+
+    -- Sentiment analysis fields
+    sentiment_score DECIMAL(3,2),              -- -1.0 to +1.0
+    sentiment_label VARCHAR(20),               -- positive/negative/neutral
+    sentiment_confidence DECIMAL(3,2),         -- 0.0 to 1.0
+
+    -- Vector embeddings
+    embedding vector(1536),                    -- pgvectorscale embedding
+
+    -- Metadata
+    data_quality_score DECIMAL(3,2) DEFAULT 1.0,
+    processing_status VARCHAR(20) DEFAULT 'pending',
+    error_message TEXT,
+
+    -- TimescaleDB requires unique constraints to include the partition column
+    PRIMARY KEY (id, created_at),
+    UNIQUE (post_id, created_at)
+);
+
+-- TimescaleDB hypertable for time-series optimization
+SELECT create_hypertable('social_media_posts', 'created_at');
+
+-- Vector similarity index (DiskANN is pgvectorscale's access method)
+CREATE INDEX idx_social_posts_embedding ON social_media_posts USING diskann (embedding vector_cosine_ops);
+
+-- Performance indexes
+CREATE INDEX idx_social_posts_ticker ON social_media_posts (ticker, created_at DESC);
+CREATE INDEX idx_social_posts_subreddit ON social_media_posts (subreddit, created_at DESC);
+CREATE INDEX idx_social_posts_sentiment ON social_media_posts (sentiment_label, sentiment_score);
+```
+
+### Entity Model
+```python
+# tradingagents/domains/socialmedia/entities.py
+from sqlalchemy import Column, Integer, String, Text, DECIMAL, TIMESTAMP
+from sqlalchemy.sql import func
+from pgvector.sqlalchemy import Vector  # SQLAlchemy's postgresql dialect has no VECTOR type
+from tradingagents.database import Base
+from typing import Optional
+
+class SocialMediaPostEntity(Base):
+    __tablename__ = 'social_media_posts'
+
+    id = Column(Integer, primary_key=True)
+    post_id = Column(String(50), nullable=False, index=True)  # uniqueness enforced as (post_id, created_at) in the schema
+    ticker = Column(String(10), index=True)
+    subreddit = Column(String(50), nullable=False, index=True)
+    title = Column(Text, nullable=False)
+    content = Column(Text)
+    author = Column(String(50))
+    created_at = Column(TIMESTAMP(timezone=True), primary_key=True, nullable=False, index=True)  # part of the composite key required by the hypertable
+    collected_at = Column(TIMESTAMP(timezone=True), server_default=func.now())
+    upvotes = Column(Integer, default=0)
+    downvotes = Column(Integer, default=0)
+    comment_count = Column(Integer, default=0)
+    url = Column(Text)
+    permalink = Column(Text)
+
+    # Sentiment analysis
+    sentiment_score = Column(DECIMAL(3,2))
+    sentiment_label = Column(String(20))
+    sentiment_confidence = Column(DECIMAL(3,2))
+
+    # Vector embeddings
+    embedding = Column(Vector(1536))
+
+    # Metadata
+    data_quality_score = Column(DECIMAL(3,2), default=1.0)
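+    # Processing lifecycle for sentiment/embedding enrichment (assumed states: pending, processed, failed)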
+    processing_status = Column(String(20), default='pending')
+    error_message = Column(Text)
+
+    def to_domain(self) -> 'SocialPost':
+        """Convert entity to domain model"""
+        return SocialPost(
+            post_id=self.post_id,
+            ticker=self.ticker,
+            subreddit=self.subreddit,
+            title=self.title,
+            content=self.content,
+            author=self.author,
+            created_at=self.created_at,
+            upvotes=self.upvotes,
+            downvotes=self.downvotes,
+            comment_count=self.comment_count,
+            url=self.url,
+            # 'is not None' so a legitimate 0.00 score is not dropped
+            sentiment_score=float(self.sentiment_score) if self.sentiment_score is not None else None,
+            sentiment_label=self.sentiment_label,
+            sentiment_confidence=float(self.sentiment_confidence) if self.sentiment_confidence is not None else None
+        )
+
+    @classmethod
+    def from_domain(cls, post: 'SocialPost', embedding: Optional[list] = None) -> 'SocialMediaPostEntity':
+        """Create entity from domain model"""
+        return cls(
+            post_id=post.post_id,
+            ticker=post.ticker,
+            subreddit=post.subreddit,
+            title=post.title,
+            content=post.content,
+            author=post.author,
+            created_at=post.created_at,
+            upvotes=post.upvotes,
+            downvotes=post.downvotes,
+            comment_count=post.comment_count,
+            url=post.url,
+            sentiment_score=post.sentiment_score,
+            sentiment_label=post.sentiment_label,
+            sentiment_confidence=post.sentiment_confidence,
+            embedding=embedding
+        )
+```
+
+## Reddit API Integration
+
+### RedditClient Implementation
+```python
+# tradingagents/domains/socialmedia/clients.py
+import praw
+from typing import List, Optional, Dict, Any
+from datetime import datetime, timezone
+from tradingagents.config import TradingAgentsConfig
+
+class RedditClient:
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        self.reddit = praw.Reddit(
+            client_id=config.reddit_client_id,
+            client_secret=config.reddit_client_secret,
+            user_agent=config.reddit_user_agent
+        )
+
+    async def fetch_financial_posts(
+        self,
+        subreddits: List[str],
+        ticker: Optional[str] = None,
+        limit: int = 100,
+        time_filter: str = "day"
+    ) -> List[Dict[str, Any]]:
+        """Fetch financial posts from specified subreddits"""
+        posts = []
+
+        for subreddit_name in subreddits:
+            try:
+                subreddit = self.reddit.subreddit(subreddit_name)
+                submissions = subreddit.hot(limit=limit)
+
+                for submission in submissions:
+                    # Filter by ticker if specified
+                    if ticker and ticker.upper() not in submission.title.upper():
+                        continue
+
+                    post_data = {
+                        'post_id': submission.id,
+                        'subreddit': subreddit_name,
+                        'title': submission.title,
+                        'content': submission.selftext,
+                        'author': str(submission.author),
+                        # created_utc is an epoch UTC timestamp, so keep the value timezone-aware
+                        'created_at': datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
+                        'upvotes': submission.ups,
+                        'downvotes': submission.downs,
+                        'comment_count': submission.num_comments,
+                        'url': submission.url,
+                        'permalink': submission.permalink
+                    }
+                    posts.append(post_data)
+
+            except Exception as e:
+                # Log error but continue processing other subreddits
+                print(f"Error fetching from {subreddit_name}: {e}")
+                continue
+
+        return posts
+```
+
+## LLM Sentiment Analysis
+
+### OpenRouter Integration
+```python
+# tradingagents/domains/socialmedia/services.py
+from typing import Dict, Any, Optional, Tuple
+import json
+import openai
+from tradingagents.config import TradingAgentsConfig
+
+class SentimentAnalyzer:
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        # Async client so the awaited calls below are valid
+        self.client = openai.AsyncOpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=config.openrouter_api_key
+        )
+
+    async def analyze_sentiment(self, text: str) -> Tuple[float, str, float]:
+        """
+        Analyze sentiment of a social media post
+        Returns: (score, label, confidence)
+        """
+        prompt = f"""
+        Analyze the financial sentiment of this social media post.
+
+        Post: "{text}"
+
+        Return sentiment as JSON with:
+        - score: float from -1.0 (very negative) to +1.0 (very positive)
+        - label: "positive", "negative", or "neutral"
+        - confidence: float from 0.0 to 1.0 indicating confidence
+
+        Focus on financial and trading sentiment, not general sentiment.
+        """
+
+        try:
+            response = await self.client.chat.completions.create(
+                model=self.config.quick_think_llm,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=100,
+                temperature=0.1
+            )
+
+            result = json.loads(response.choices[0].message.content)
+            return result['score'], result['label'], result['confidence']
+
+        except Exception as e:
+            # Return neutral sentiment on error
+            return 0.0, "neutral", 0.0
+```
+
+## Vector Embeddings and Search
+
+### Embedding Generation
+```python
+# tradingagents/domains/socialmedia/embeddings.py
+import openai
+from typing import List, Optional, Dict, Any
+from tradingagents.config import TradingAgentsConfig
+
+class EmbeddingGenerator:
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        # Async client so generate_embedding can be awaited
+        self.client = openai.AsyncOpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=config.openrouter_api_key
+        )
+
+    async def generate_embedding(self, text: str) -> Optional[List[float]]:
+        """Generate vector embedding for text"""
+        try:
+            response = await self.client.embeddings.create(
+                model="text-embedding-3-small",
+                input=text,
+                encoding_format="float"
+            )
+            return response.data[0].embedding
+        except Exception as e:
+            print(f"Embedding generation failed: {e}")
+            return None
+
+    def prepare_text_for_embedding(self, post: Dict[str, Any]) -> str:
+        """Combine title and content for embedding"""
+        title = post.get('title', '')
+        content = post.get('content', '')
+        return f"{title} {content}".strip()
+```
+
+## Repository Implementation
+
+### SocialRepository with PostgreSQL
+```python
+# tradingagents/domains/socialmedia/repositories.py
+from typing import List, Optional, Dict, Any
+from sqlalchemy import desc, and_, text
+from tradingagents.domains.socialmedia.entities import SocialMediaPostEntity
+from tradingagents.domains.socialmedia.models import SocialPost, SocialContext
+from tradingagents.database import get_db_session
+from datetime import datetime, timedelta
+
+class SocialRepository:
+    def __init__(self):
+        self.session = get_db_session()
+
+    async def save_posts(self, posts: List[SocialPost]) -> List[str]:
+        """Save social media posts with deduplication"""
+        saved_ids = []
+
+        for post in posts:
+            # Check for existing post
+            existing = self.session.query(SocialMediaPostEntity).filter(
+                SocialMediaPostEntity.post_id == post.post_id
+            ).first()
+
+            if existing:
+                continue  # Skip duplicates
+
+            entity = SocialMediaPostEntity.from_domain(post)
+            self.session.add(entity)
+            saved_ids.append(post.post_id)
+
+        self.session.commit()
+        return saved_ids
+
+    async def get_posts_for_ticker(
+        self,
+        ticker: str,
+        days: int = 7,
+        limit: int = 50
+    ) -> List[SocialPost]:
+        """Get social media posts for specific ticker"""
+        cutoff_date = datetime.now() - timedelta(days=days)
+
+        results = self.session.query(SocialMediaPostEntity).filter(
+            and_(
+                SocialMediaPostEntity.ticker == ticker,
+                SocialMediaPostEntity.created_at >= cutoff_date
+            )
+        ).order_by(desc(SocialMediaPostEntity.created_at)).limit(limit).all()
+
+        return [entity.to_domain() for entity in results]
+
+    async def vector_similarity_search(
+        self,
+        query_embedding: List[float],
+        ticker: Optional[str] = None,
+        limit: int = 10
+    ) -> List[SocialPost]:
+        """Find similar posts using vector search"""
+        query = self.session.query(SocialMediaPostEntity)
+
+        if ticker:
+            query = query.filter(SocialMediaPostEntity.ticker == ticker)
+
+        # Vector similarity search using pgvectorscale; bind the embedding as a
+        # parameter instead of interpolating it into the SQL string
+        query = query.order_by(
+            text("embedding <-> CAST(:query_vec AS vector)").bindparams(
+                query_vec=str(query_embedding)
+            )
+        ).limit(limit)
+
+        results = query.all()
+        return [entity.to_domain() for entity in results]
+```
+
+## Service Layer
+
+### SocialMediaService
+```python
+# tradingagents/domains/socialmedia/services.py
+from typing import List, Optional, Dict, Any
+from tradingagents.domains.socialmedia.repositories import SocialRepository
+from tradingagents.domains.socialmedia.clients import RedditClient
+from tradingagents.domains.socialmedia.embeddings import EmbeddingGenerator
+from tradingagents.domains.socialmedia.models import SocialPost, SocialContext
+from tradingagents.config import TradingAgentsConfig
+
+# SentimentAnalyzer is defined earlier in this module (see LLM Sentiment Analysis above)
+
+class SocialMediaService:
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        self.repository = SocialRepository()
+        self.reddit_client = RedditClient(config)
+        self.sentiment_analyzer = SentimentAnalyzer(config)
+        self.embedding_generator = EmbeddingGenerator(config)
+
+    async def collect_social_data(
+        self,
+        ticker: Optional[str] = None,
+        subreddits: Optional[List[str]] = None
+    ) -> SocialContext:
+        """Main entry point for social media data collection"""
+
+        if not subreddits:
+            subreddits = ['wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis']
+
+        # Fetch posts from Reddit
+        raw_posts = await self.reddit_client.fetch_financial_posts(
+            subreddits=subreddits,
+            ticker=ticker,
+            limit=100
+        )
+
+        # Process posts: sentiment analysis + embeddings
+        processed_posts = []
+        for raw_post in raw_posts:
+            # Generate sentiment (content may be None for link posts)
+            text = f"{raw_post['title']} {raw_post['content'] or ''}"
+            score, label, confidence = await self.sentiment_analyzer.analyze_sentiment(text)
+
+            # Generate embedding
+            embedding = await self.embedding_generator.generate_embedding(text)
+
+            post = SocialPost(
+                **raw_post,
+                sentiment_score=score,
+                sentiment_label=label,
+                sentiment_confidence=confidence
+            )
+            processed_posts.append(post)
+
+        # Save to database
+        await self.repository.save_posts(processed_posts)
+
+        # Return context
+        return SocialContext(
+            posts=processed_posts,
+            ticker=ticker,
+            total_posts=len(processed_posts),
+            sentiment_summary=self._calculate_sentiment_summary(processed_posts)
+        )
+
+    def _calculate_sentiment_summary(self, posts: List[SocialPost]) -> Dict[str, Any]:
+        """Calculate aggregate sentiment metrics"""
+        if not posts:
+            return {}
+
+        scores = [p.sentiment_score for p in posts if p.sentiment_score is not None]
+        labels = [p.sentiment_label for p in posts if p.sentiment_label]
+
+        return {
+            'avg_sentiment': sum(scores) / len(scores) if scores else 0.0,
+            'positive_count': labels.count('positive'),
+            'negative_count': labels.count('negative'),
+            'neutral_count': labels.count('neutral'),
+            'total_posts': len(posts)
+        }
+```
+
+## AgentToolkit Integration
+
+### RAG-Enhanced Methods
+```python
+# tradingagents/agents/libs/agent_toolkit.py (additions)
+
+async def get_reddit_news(self, ticker: str, days: int = 7) -> str:
+    """Get Reddit posts related to a ticker with RAG context"""
+    try:
+        # Get recent posts for ticker
+        posts = await self.social_service.repository.get_posts_for_ticker(
+            ticker=ticker,
+            days=days,
+            limit=20
+        )
+
+        if not posts:
+            return f"No Reddit posts found for {ticker} in the last {days} days."
+
+        # Format for agent consumption
+        context = f"Reddit Social Media Context for {ticker} ({len(posts)} posts):\n\n"
+
+        for post in posts[:10]:  # Limit to top 10
+            sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"}.get(post.sentiment_label, "")
+            context += f"{sentiment_emoji} r/{post.subreddit} - {post.title}\n"
+            if post.sentiment_score is not None:  # posts may not have sentiment yet
+                context += f"  Sentiment: {post.sentiment_label} ({post.sentiment_score:.2f})\n"
+            context += f"  Engagement: {post.upvotes} upvotes, {post.comment_count} comments\n"
+            if post.content:
+                context += f"  Content: {post.content[:200]}...\n"
+            context += "\n"
+
+        return context
+
+    except Exception as e:
+        return f"Error fetching Reddit data for {ticker}: {str(e)}"
+
+async def get_reddit_stock_info(self, ticker: str, query: Optional[str] = None) -> str:
+    """Get Reddit stock information with semantic search"""
+    try:
+        if query:
+            # Generate embedding for semantic search
+            query_embedding = await self.social_service.embedding_generator.generate_embedding(query)
+            if query_embedding:
+                posts = await self.social_service.repository.vector_similarity_search(
+                    query_embedding=query_embedding,
+                    ticker=ticker,
+                    limit=10
+                )
+            else:
+                posts = await self.social_service.repository.get_posts_for_ticker(ticker, days=7)
+        else:
+            posts = await self.social_service.repository.get_posts_for_ticker(ticker, days=7)
+
+        if not posts:
+            return f"No relevant Reddit discussions found for {ticker}."
+
+        # Aggregate sentiment and key insights
+        sentiment_summary = self.social_service._calculate_sentiment_summary(posts)
+
+        context = f"Reddit Stock Analysis for {ticker}:\n\n"
+        context += f"Overall Sentiment: {sentiment_summary.get('avg_sentiment', 0):.2f} (scale -1.0 to +1.0)\n"
+        context += f"Posts: {sentiment_summary.get('positive_count', 0)} positive, "
+        context += f"{sentiment_summary.get('negative_count', 0)} negative, "
+        context += f"{sentiment_summary.get('neutral_count', 0)} neutral\n\n"
+
+        context += "Key Discussions:\n"
+        for post in posts[:5]:
+            context += f"• {post.title} (r/{post.subreddit})\n"
+            if post.sentiment_score is not None:
+                context += f"  Sentiment: {post.sentiment_label} ({post.sentiment_score:.2f})\n"
+
+        return context
+
+    except Exception as e:
+        return f"Error analyzing Reddit stock info for {ticker}: {str(e)}"
+```
+
+## Dagster Pipeline
+
+### Social Media Collection Asset
+```python
+# tradingagents/data/assets/social_media.py
+from typing import Dict, Any
+from dagster import asset, AssetExecutionContext
+from tradingagents.domains.socialmedia.services import SocialMediaService
+from tradingagents.config import TradingAgentsConfig
+
+@asset(
+    group_name="social_media",
+    description="Collect Reddit posts from financial subreddits with sentiment analysis"
+)
+async def reddit_financial_posts(context: AssetExecutionContext) -> Dict[str, Any]:
+    """Daily collection of Reddit financial posts"""
+
+    config = TradingAgentsConfig.from_env()
+    social_service = SocialMediaService(config)
+
+    # Collect from financial subreddits
+    subreddits = ['wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis']
+
+    total_collected = 0
+    results = {}
+
+    for subreddit in subreddits:
+        try:
+            social_context = await social_service.collect_social_data(
+                subreddits=[subreddit]
+            )
+
+            results[subreddit] = {
+                'posts_collected': len(social_context.posts),
+                'sentiment_summary': social_context.sentiment_summary
+            }
+            total_collected += len(social_context.posts)
+
+            context.log.info(f"Collected {len(social_context.posts)} posts from r/{subreddit}")
+
+        except Exception as e:
+            context.log.error(f"Failed to collect from r/{subreddit}: {e}")
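+            # Keep a per-subreddit error record so one failure doesn't hide the others
+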
results[subreddit] = {'error': str(e)} + + context.log.info(f"Total posts collected: {total_collected}") + return results +``` + +## Testing Strategy + +### Test Structure +``` +tests/domains/socialmedia/ +├── conftest.py # Fixtures and test setup +├── test_reddit_client.py # API integration tests with VCR +├── test_social_repository.py # PostgreSQL database tests +├── test_social_service.py # Business logic with mocks +├── test_sentiment_analyzer.py # LLM sentiment analysis tests +├── test_embedding_generator.py # Vector embedding tests +└── fixtures/ # VCR cassettes and test data + └── reddit_api_responses.yaml +``` + +### Key Test Patterns +```python +# tests/domains/socialmedia/test_social_service.py +import pytest +from unittest.mock import AsyncMock, MagicMock +from tradingagents.domains.socialmedia.services import SocialMediaService + +@pytest.mark.asyncio +async def test_collect_social_data_success(mock_social_service): + """Test successful social media data collection""" + # Mock Reddit API response + mock_posts = [ + { + 'post_id': 'abc123', + 'title': 'AAPL to the moon!', + 'subreddit': 'wallstreetbets', + # ... other fields + } + ] + + mock_social_service.reddit_client.fetch_financial_posts.return_value = mock_posts + mock_social_service.sentiment_analyzer.analyze_sentiment.return_value = (0.8, 'positive', 0.9) + + result = await mock_social_service.collect_social_data(ticker='AAPL') + + assert len(result.posts) == 1 + assert result.posts[0].sentiment_label == 'positive' + assert result.sentiment_summary['positive_count'] == 1 +``` + +## Dependencies + +### Technical Dependencies +- **Reddit API access** (PRAW or Reddit API client) +- **OpenRouter API** for LLM sentiment analysis +- **PostgreSQL** with TimescaleDB and pgvectorscale extensions +- **Existing database infrastructure** from news domain +- **OpenRouter configuration** in TradingAgentsConfig +- **Dagster orchestration framework** for scheduled execution + +### Reference Implementations +- **News domain patterns**: Follow NewsService, NewsRepository, NewsArticleEntity patterns for consistency +- **Database schema**: Mirror NewsArticleEntity vector embedding approach for social posts +- **Agent integration**: Follow existing AgentToolkit get_news() pattern for social media methods +- **Testing approach**: Apply news domain testing patterns: VCR for API, real DB for repositories + +## Success Criteria + +### Functionality +- Daily Reddit collection with sentiment analysis and vector search +- Seamless integration with existing multi-agent trading framework +- RAG-enhanced social context for AI agents + +### Performance +- < 2 second social context queries +- < 100ms repository operations +- Efficient vector similarity search + +### Quality +- 85%+ test coverage matching project standards +- Comprehensive error handling and resilience +- Data quality monitoring and validation + +### Integration +- Seamless AgentToolkit RAG integration for AI agents +- Architecture and patterns match successful news domain implementation +- Consistent with existing TradingAgents configuration and conventions + +## Implementation Approach + +**Complete domain implementation following successful news domain patterns:** + +1. **Database migration** from file storage to PostgreSQL +2. **Entity models** with TimescaleDB and vector support +3. **Reddit client** implementation with rate limiting +4. **Repository layer** with vector search capabilities +5. **Service layer** with sentiment analysis and embedding generation +6. 
**AgentToolkit integration** with RAG-enhanced methods
+7. **Dagster pipeline** for automated daily collection
+8. **Comprehensive testing** with VCR mocking and real database tests
+
+This comprehensive implementation transforms the social media domain from basic stubs into a production-ready system that seamlessly integrates with the existing TradingAgents framework.
\ No newline at end of file
diff --git a/docs/specs/socialmedia/status.md b/docs/specs/socialmedia/status.md
new file mode 100644
index 00000000..6da7072c
--- /dev/null
+++ b/docs/specs/socialmedia/status.md
@@ -0,0 +1,184 @@
+# Social Media Domain Implementation Status
+
+## Project Overview
+
+**Feature:** Complete socialmedia domain implementation from empty stubs to production
+**Total Estimated Time:** 32 hours across 3 phases
+**Approach:** Parallel development with multiple AI agents
+**Target:** >85% test coverage, PostgreSQL migration, PRAW Reddit integration, OpenRouter LLM sentiment analysis
+
+---
+
+## Progress Summary
+
+| Phase | Status | Completed | Total | Progress | Est. Time |
+|-------|--------|-----------|-------|----------|-----------|
+| **Phase 1: Foundation** | 🟡 Not Started | 0 | 4 | 0% | 12h |
+| **Phase 2: API Integration** | 🟡 Not Started | 0 | 4 | 0% | 12h |
+| **Phase 3: Integration** | 🟡 Not Started | 0 | 3 | 0% | 8h |
+| **Overall Progress** | 🟡 Not Started | **0** | **11** | **0%** | **32h** |
+
+---
+
+## Phase 1: Foundation (12 hours)
+
+### 🏗️ Database & Core Models
+
+| Task | Agent | Status | Progress | Time | Priority |
+|------|-------|--------|----------|------|----------|
+| **1.1** Database Schema Migration | Database Specialist | 🟡 Not Started | 0% | 3h | 🔴 Blocking |
+| **1.2** SQLAlchemy Entity Implementation | Entity Specialist | 🟡 Not Started | 0% | 3h | 🔴 Blocking |
+| **1.3** Domain Model Enhancement | Domain Specialist | 🟡 Not Started | 0% | 3h | 🔴 Blocking |
+| **1.4** Repository Implementation | Repository Specialist | 🟡 Not Started | 0% | 3h | 🟠 Medium |
+
+#### Phase 1 Dependencies
+- Task 1.1 → Task 1.2 (Entity requires database schema)
+- Task 1.4 depends on Tasks 1.1 + 1.2
+- Task 1.3 can run parallel with others
+
+#### Phase 1 Acceptance Criteria
+- [ ] PostgreSQL table `social_media_posts` with TimescaleDB + pgvectorscale
+- [ ] SocialMediaPostEntity with proper field mappings and transformations
+- [ ] SocialPost domain model with validation and business rules
+- [ ] SocialRepository with vector similarity search and sentiment aggregation
+
+---
+
+## Phase 2: API Integration & Processing (12 hours)
+
+### 🔌 Clients & Services
+
+| Task | Agent | Status | Progress | Time | Priority |
+|------|-------|--------|----------|------|----------|
+| **2.1** Reddit Client Implementation | API Integration Specialist | 🟡 Not Started | 0% | 4h | 🔴 Blocking |
+| **2.2** OpenRouter Sentiment Analysis | LLM Integration Specialist | 🟡 Not Started | 0% | 3h | 🟠 Medium |
+| **2.3** Vector Embedding Generation | ML Integration Specialist | 🟡 Not Started | 0% | 2h | 🟠 Medium |
+| **2.4** Service Layer Implementation | Service Integration Specialist | 🟡 Not Started | 0% | 3h | 🟠 Medium |
+
+#### Phase 2 Dependencies
+- All tasks can run in parallel initially
+- Task 2.4 depends on completion of Tasks 2.1, 2.2, 2.3
+
+#### Phase 2 Acceptance Criteria
+- [ ] PRAW Reddit client with rate limiting and error handling
+- [ ] OpenRouter sentiment analysis with social media-specific prompts
+- [ ] Vector embeddings (1536-dim) for titles and content using text-embedding-3-small
+- [ ] SocialMediaService orchestrating collection, sentiment, and embeddings
+
+---
+
+## Phase 3: Integration & Validation (8 hours)
+
+### 🎯 AgentToolkit & Pipeline
+
+| Task | Agent | Status | Progress | Time | Priority |
+|------|-------|--------|----------|------|----------|
+| **3.1** AgentToolkit Integration | Agent Integration Specialist | 🟡 Not Started | 0% | 3h | 🔴 High |
+| **3.2** Dagster Pipeline Implementation | Pipeline Specialist | 🟡 Not Started | 0% | 2h | 🟠 Medium |
+| **3.3** Comprehensive Testing Suite | Testing Specialist | 🟡 Not Started | 0% | 3h | 🔴 High |
+
+#### Phase 3 Dependencies
+- Task 3.1 depends on Task 2.4 (SocialMediaService)
+- Task 3.2 depends on Task 2.4
+- Task 3.3 can start after any component is implemented
+
+#### Phase 3 Acceptance Criteria
+- [ ] AgentToolkit RAG methods: `get_reddit_news()` and `get_reddit_stock_info()`
+- [ ] Daily Dagster pipeline with sentiment analysis and embedding generation
+- [ ] >85% test coverage with VCR cassettes and mocked dependencies
+
+---
+
+## Current Blocking Issues
+
+| Issue | Impact | Affected Tasks | Resolution |
+|-------|---------|----------------|------------|
+| No active blocking issues | - | - | Ready to start Phase 1 |
+
+---
+
+## Implementation Readiness
+
+### Prerequisites Status
+| Requirement | Status | Notes |
+|-------------|---------|-------|
+| PostgreSQL + Extensions | ✅ Available | TimescaleDB + pgvectorscale ready |
+| Reddit API Credentials | ⚠️ Required | Need REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET |
+| OpenRouter API Access | ✅ Available | Existing OpenRouterClient integration |
+| Database Migration System | ✅ Available | Existing migration infrastructure |
+| Testing Framework | ✅ Available | pytest, pytest-vcr, pytest-asyncio |
+
+### Risk Assessment
+| Risk Level | Tasks | Mitigation |
+|------------|-------|------------|
+| 🔴 **High** | 2.1 (Reddit Client) | Use proven PRAW library, implement circuit breaker |
+| 🟠 **Medium** | 1.1, 1.4, 2.2, 2.4 | Follow existing news domain patterns |
+| 🟢 **Low** | 1.2, 1.3, 2.3, 3.1, 3.2, 3.3 | Standard implementation patterns |
+
+---
+
+## Key Success Metrics
+
+### Technical Metrics
+- [ ] **Database Performance:** <1s vector similarity queries for top 10 results
+- [ ] **API Performance:** <2s social context generation for AI agents
+- [ ] **Processing Performance:** <5s batch processing for 1000 posts
+- [ ] **Test Coverage:** >85% across all socialmedia domain components
+- [ ] **Data Quality:** >80% posts with reliable sentiment analysis
+
+### Integration Metrics
+- [ ] **AgentToolkit Integration:** Both spec'd RAG methods (`get_reddit_news`, `get_reddit_stock_info`) implemented and tested
+- [ ] **Dagster Pipeline:** Daily automated collection with monitoring
+- [ ] **Architecture Consistency:** Follows news domain patterns exactly
+- [ ] **Error Resilience:** Graceful degradation on API failures
+
+### Business Metrics
+- [ ] **Data Collection:** 400+ posts collected daily from financial subreddits
+- [ ] **Sentiment Analysis:** Structured scoring with confidence levels
+- [ ] **Semantic Search:** Vector-based similarity search operational
+- [ ] **Agent Context:** Rich social media context for trading decisions
+
+---
+
+## Next Steps
+
+### Immediate Actions (Next Sprint)
+1. **🚀 Start Phase 1:** Begin database schema migration (Task 1.1)
+2. **📋 Environment Setup:** Configure Reddit API credentials
+3. **👥 Agent Assignment:** Assign specialized agents to parallel tasks
+4. 
**📊 Progress Tracking:** Update status after each task completion + +### Phase Transition Criteria +**Phase 1 → Phase 2:** All foundation tasks complete, database operational +**Phase 2 → Phase 3:** Service layer operational, sentiment and embeddings working +**Phase 3 → Production:** All tests passing, AgentToolkit integration complete + +--- + +## Change Log + +| Date | Change | Impact | Updated By | +|------|--------|---------|------------| +| 2024-08-30 | Initial status tracking setup | Baseline established | System | + +--- + +## Notes and Observations + +**Implementation Strategy:** +- Leverage existing news domain as reference implementation +- Prioritize blocking tasks (database, core models) first +- Enable parallel development in Phase 2 for efficiency +- Comprehensive testing throughout to maintain >85% coverage + +**Key Dependencies:** +- Reddit API reliability and rate limiting compliance +- OpenRouter LLM performance for sentiment analysis +- PostgreSQL vector extension performance at scale +- Integration with existing TradingAgents configuration + +**Success Indicators:** +- Clean migration from file-based to PostgreSQL storage +- Reliable daily data collection without manual intervention +- AI agents receiving rich social context within performance targets +- Production-ready error handling and monitoring diff --git a/docs/specs/socialmedia/tasks.md b/docs/specs/socialmedia/tasks.md new file mode 100644 index 00000000..de9360ef --- /dev/null +++ b/docs/specs/socialmedia/tasks.md @@ -0,0 +1,2729 @@ +# Social Media Domain Implementation Tasks + +## Overview + +Complete greenfield implementation of the socialmedia domain from empty stubs to production-ready system with PRAW Reddit API integration, PostgreSQL migration, OpenRouter LLM sentiment analysis, and AgentToolkit RAG methods. + +**Total Estimated Time: 32 hours (3-phase parallel development approach)** + +## Phase Structure + +### Phase 1: Foundation (12 hours) - Database & Core Models +**Parallel Execution Ready**: Multiple agents can work on different components simultaneously + +### Phase 2: API Integration & Processing (12 hours) - Clients & Services +**Parallel Execution Ready**: API clients and LLM services can be developed in parallel + +### Phase 3: Integration & Validation (8 hours) - AgentToolkit & Dagster +**Parallel Execution Ready**: AgentToolkit and pipeline development with comprehensive testing + +--- + +## Phase 1: Foundation (12 hours) + +### Task 1.1: Database Schema Migration (3 hours) +**Priority: Blocking** | **Agent: Database Specialist** + +Create PostgreSQL migration for social_media_posts table with TimescaleDB and pgvectorscale support. 
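+
+The migration is expected to run through the project's existing migration infrastructure; as a hand-run sketch (hypothetical file location, assuming psql and a `DATABASE_URL`):
+
+```bash
+# Apply the social media migration by hand (illustrative only)
+psql "$DATABASE_URL" -f migrations/003_create_social_media_posts.sql
+```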
+
+**Implementation:**
+```sql
+-- Migration: 003_create_social_media_posts.sql
+CREATE TABLE social_media_posts (
+    id UUID DEFAULT uuid7(),  -- assumes a UUIDv7 function/extension is installed
+    post_id VARCHAR(50) NOT NULL,
+    title TEXT NOT NULL,
+    content TEXT,
+    author VARCHAR(100) NOT NULL,
+    subreddit VARCHAR(50) NOT NULL,
+    created_utc TIMESTAMPTZ NOT NULL,
+    upvotes INTEGER NOT NULL DEFAULT 0,
+    downvotes INTEGER NOT NULL DEFAULT 0,
+    comments_count INTEGER NOT NULL DEFAULT 0,
+    url TEXT NOT NULL,
+    sentiment_score JSONB,
+    sentiment_label VARCHAR(20),
+    tickers TEXT[] DEFAULT '{}',
+    title_embedding VECTOR(1536),
+    content_embedding VECTOR(1536),
+    inserted_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+
+    -- TimescaleDB: unique constraints must include the partition column (created_utc)
+    PRIMARY KEY (id, created_utc),
+    UNIQUE (post_id, created_utc)
+);
+
+SELECT create_hypertable('social_media_posts', 'created_utc', chunk_time_interval => INTERVAL '1 day');
+
+-- Performance indexes
+CREATE UNIQUE INDEX idx_social_posts_post_id ON social_media_posts (post_id, created_utc);
+CREATE INDEX idx_social_posts_subreddit_time ON social_media_posts (subreddit, created_utc DESC);
+CREATE INDEX idx_social_posts_tickers_gin ON social_media_posts USING GIN (tickers);
+CREATE INDEX idx_social_posts_title_embedding ON social_media_posts USING diskann (title_embedding vector_cosine_ops);
+CREATE INDEX idx_social_posts_content_embedding ON social_media_posts USING diskann (content_embedding vector_cosine_ops);
+CREATE INDEX idx_social_posts_sentiment ON social_media_posts ((sentiment_score->>'sentiment')) WHERE sentiment_score IS NOT NULL;
+
+-- Constraints
+ALTER TABLE social_media_posts ADD CONSTRAINT chk_sentiment_score CHECK (
+    sentiment_score IS NULL OR ((sentiment_score->>'confidence')::float BETWEEN 0 AND 1)
+);
+ALTER TABLE social_media_posts ADD CONSTRAINT chk_created_utc CHECK (created_utc <= NOW());
+```
+
+**Acceptance Criteria:**
+- [ ] Migration script creates social_media_posts table
+- [ ] TimescaleDB hypertable configured for time-series optimization
+- [ ] pgvectorscale indexes for title_embedding and content_embedding
+- [ ] All constraints and indexes properly created
+- [ ] Migration runs successfully in test and development environments
+
+**Dependencies:** PostgreSQL + TimescaleDB + pgvectorscale installed
+**Risk:** Medium - Extension compatibility issues
+
+---
+
+### Task 1.2: SQLAlchemy Entity Implementation (3 hours)
+**Priority: Blocking** | **Agent: Entity Specialist**
+
+Create SocialMediaPostEntity with proper field mappings and domain transformations.
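+
+The entity below assumes pgvector's Python bindings are available for the embedding columns (an assumption about the environment); a sketch of adding the dependency with the project's uv tooling:
+
+```bash
+# pgvector's Python package provides pgvector.sqlalchemy.Vector (assumed dependency)
+uv add pgvector
+```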
+
+**File:** `tradingagents/domains/socialmedia/entities.py`
+
+**Implementation:**
+```python
+from sqlalchemy import Column, String, Text, Integer, TIMESTAMP
+from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
+from pgvector.sqlalchemy import Vector  # the postgresql dialect has no VECTOR type
+from sqlalchemy.sql import func
+from tradingagents.database.base import Base
+import uuid
+
+class SocialMediaPostEntity(Base):
+    __tablename__ = 'social_media_posts'
+
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    post_id = Column(String(50), nullable=False, index=True)  # unique per (post_id, created_utc) in the migration
+    title = Column(Text, nullable=False)
+    content = Column(Text)
+    author = Column(String(100), nullable=False)
+    subreddit = Column(String(50), nullable=False, index=True)
+    created_utc = Column(TIMESTAMP(timezone=True), primary_key=True, nullable=False, index=True)  # part of the composite key required by the hypertable
+    upvotes = Column(Integer, nullable=False, default=0)
+    downvotes = Column(Integer, nullable=False, default=0)
+    comments_count = Column(Integer, nullable=False, default=0)
+    url = Column(Text, nullable=False)
+
+    # Enhanced fields
+    sentiment_score = Column(JSONB)
+    sentiment_label = Column(String(20))
+    tickers = Column(ARRAY(String(10)), default=lambda: [])
+    title_embedding = Column(Vector(1536))
+    content_embedding = Column(Vector(1536))
+
+    # Metadata
+    inserted_at = Column(TIMESTAMP(timezone=True), server_default=func.now())
+    updated_at = Column(TIMESTAMP(timezone=True), server_default=func.now(), onupdate=func.now())
+
+    def to_domain(self) -> 'SocialPost':
+        """Convert entity to domain model with proper field mapping"""
+        sentiment_data = self.sentiment_score or {}
+        return SocialPost(
+            post_id=self.post_id,
+            title=self.title,
+            content=self.content,
+            author=self.author,
+            subreddit=self.subreddit,
+            created_utc=self.created_utc,
+            upvotes=self.upvotes,
+            downvotes=self.downvotes,
+            comments_count=self.comments_count,
+            url=self.url,
+            sentiment_score=sentiment_data.get('score'),
+            sentiment_label=self.sentiment_label,
+            sentiment_confidence=sentiment_data.get('confidence'),
+            tickers=list(self.tickers) if self.tickers else [],
+            # explicit None checks: pgvector returns array-like values whose truthiness is ambiguous
+            title_embedding=list(self.title_embedding) if self.title_embedding is not None else None,
+            content_embedding=list(self.content_embedding) if self.content_embedding is not None else None
+        )
+
+    @classmethod
+    def from_domain(cls, post: 'SocialPost') -> 'SocialMediaPostEntity':
+        """Create entity from domain model"""
+        sentiment_data = None
+        if post.sentiment_score is not None and post.sentiment_confidence is not None:
+            sentiment_data = {
+                'score': post.sentiment_score,
+                'confidence': post.sentiment_confidence,
+                'reasoning': getattr(post, 'sentiment_reasoning', None)
+            }
+
+        return cls(
+            post_id=post.post_id,
+            title=post.title,
+            content=post.content,
+            author=post.author,
+            subreddit=post.subreddit,
+            created_utc=post.created_utc,
+            upvotes=post.upvotes,
+            downvotes=post.downvotes,
+            comments_count=post.comments_count,
+            url=post.url,
+            sentiment_score=sentiment_data,
+            sentiment_label=post.sentiment_label,
+            tickers=post.tickers or [],
+            title_embedding=post.title_embedding,
+            content_embedding=post.content_embedding
+        )
+```
+
+**Acceptance Criteria:**
+- [ ] SocialMediaPostEntity properly maps all database fields
+- [ ] to_domain() and from_domain() methods handle all field conversions
+- [ ] Proper handling of vector fields and JSONB sentiment data
+- [ ] Entity integrates with existing database session management
+- [ ] All field types match database schema exactly
+
+**Dependencies:** Task 1.1 (database schema)
+**Risk:** Low - Standard SQLAlchemy patterns
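+
+As a quick sanity check on the mappings, a hedged round-trip sketch (hypothetical values; session wiring omitted, and the SocialPost model comes from Task 1.3):
+
+```python
+from datetime import datetime, timezone
+
+from tradingagents.domains.socialmedia.entities import SocialMediaPostEntity
+from tradingagents.domains.socialmedia.models import SocialPost
+
+# Domain -> entity -> domain round trip should preserve the Reddit fields
+post = SocialPost(
+    post_id="abc123",
+    title="AAPL earnings beat",
+    author="u_example",
+    subreddit="stocks",
+    created_utc=datetime.now(timezone.utc),
+    upvotes=10,
+    downvotes=1,
+    comments_count=3,
+    url="https://reddit.com/r/stocks/abc123",
+)
+entity = SocialMediaPostEntity.from_domain(post)
+assert entity.to_domain().post_id == post.post_id
+```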
+
+---
+
+### Task 1.3: Domain Model Enhancement (3 hours)
+**Priority: Blocking** | **Agent: Domain Specialist**
+
+Enhance SocialPost domain entity with comprehensive validation, transformations, and business rules.
+
+**File:** `tradingagents/domains/socialmedia/models.py`
+
+**Implementation:**
+```python
+from pydantic import BaseModel, Field, validator, root_validator
+from typing import Optional, List, Dict, Any, Literal
+from datetime import datetime, timezone
+import re
+
+class SentimentScore(BaseModel):
+    """Structured sentiment analysis result from OpenRouter LLM"""
+    sentiment: Literal['positive', 'negative', 'neutral']
+    confidence: float = Field(..., ge=0.0, le=1.0)
+    reasoning: Optional[str] = None
+
+    @validator('reasoning')
+    def reasoning_not_empty(cls, v):
+        if v is not None and len(v.strip()) == 0:
+            return None
+        return v
+
+class SocialPost(BaseModel):
+    """Core domain entity with business rules and transformations"""
+    # Base fields from Reddit API
+    post_id: str = Field(..., regex=r'^[a-zA-Z0-9_-]+$')
+    title: str = Field(..., min_length=1, max_length=300)
+    content: Optional[str] = None
+    author: str = Field(..., min_length=1, max_length=100)
+    subreddit: str = Field(..., min_length=1, max_length=50)
+    created_utc: datetime
+    upvotes: int = Field(..., ge=0)
+    downvotes: int = Field(..., ge=0)
+    comments_count: int = Field(..., ge=0)
+    url: str = Field(..., min_length=1)
+
+    # Enhanced fields
+    sentiment_score: Optional[float] = Field(None, ge=-1.0, le=1.0)
+    sentiment_label: Optional[str] = Field(None, regex=r'^(positive|negative|neutral)$')
+    sentiment_confidence: Optional[float] = Field(None, ge=0.0, le=1.0)
+    sentiment_reasoning: Optional[str] = None
+    tickers: Optional[List[str]] = Field(default_factory=list)
+    title_embedding: Optional[List[float]] = None
+    content_embedding: Optional[List[float]] = None
+
+    @validator('tickers')
+    def validate_tickers(cls, v):
+        """Validate ticker symbols format"""
+        if v is None:
+            return []
+        # Ensure tickers are uppercase and valid format
+        return [ticker.upper() for ticker in v if re.match(r'^[A-Z]{1,5}$', ticker.upper())]
+
+    @validator('title_embedding', 'content_embedding')
+    def validate_embedding_dimensions(cls, v):
+        """Ensure embeddings have correct dimensions"""
+        if v is not None and len(v) != 1536:
+            raise ValueError('Embedding must be 1536 dimensions')
+        return v
+
+    @root_validator
+    def validate_sentiment_consistency(cls, values):
+        """Ensure sentiment fields are consistent"""
+        score = values.get('sentiment_score')
+        label = values.get('sentiment_label')
+        confidence = values.get('sentiment_confidence')
+
+        # All sentiment fields should be present or all None
+        sentiment_fields = [score, label, confidence]
+        non_none_count = sum(1 for field in sentiment_fields if field is not None)
+
+        if non_none_count > 0 and non_none_count < 3:
+            raise ValueError('All sentiment fields (score, label, confidence) must be provided together')
+
+        return values
+
+    @classmethod
+    def from_praw_submission(cls, submission: Any) -> 'SocialPost':
+        """Create SocialPost from PRAW Reddit submission"""
+        return cls(
+            post_id=submission.id,
+            title=submission.title[:300],  # Truncate long titles
+            content=submission.selftext if submission.selftext else None,
+            author=str(submission.author) if submission.author else '[deleted]',
+            subreddit=submission.subreddit.display_name,
+            # created_utc is an epoch UTC timestamp; keep the datetime timezone-aware
+            created_utc=datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
+            upvotes=submission.ups if hasattr(submission, 'ups') else submission.score,
+            # score = ups - downs, so estimated downvotes are ups - score
+            downvotes=max(0, submission.ups - submission.score) if hasattr(submission, 'ups') else 0,
+            comments_count=submission.num_comments,
+            url=f"https://reddit.com{submission.permalink}"
+        )
+
+    def extract_tickers(self) -> List[str]:
+        """Extract ticker symbols from title and content"""
+        text = f"{self.title} {self.content or ''}"
+        # Look for $TICKER or TICKER patterns
+        ticker_pattern = r'\b(?:\$)?([A-Z]{1,5})\b'
+        potential_tickers = re.findall(ticker_pattern, text.upper())
+
+        # Filter out common words that look like tickers
+        excluded = {'THE', 'AND', 'OR', 'FOR', 'TO', 'OF', 'IN', 'ON', 'AT', 'BY', 'UP', 'IS', 'IT', 'BE', 'AS', 'ARE', 'WAS', 'HE', 'SHE', 'WE', 'YOU', 'THEY', 'ALL', 'ANY', 'CAN', 'HAD', 'HER', 'HIS', 'HOW', 'ITS', 'MAY', 'NEW', 'NOW', 'OLD', 'SEE', 'TWO', 'WHO', 'BOY', 'DID', 'HAS', 'LET', 'PUT', 'SAY', 'SIX', 'TEN', 'USE', 'WIN', 'YES'}
+
+        tickers = [ticker for ticker in potential_tickers if ticker not in excluded]
+        return list(set(tickers))  # Remove duplicates
+
+    def has_reliable_sentiment(self) -> bool:
+        """Check if sentiment analysis has sufficient confidence"""
+        return (self.sentiment_confidence is not None and
+                self.sentiment_confidence >= 0.5)
+
+    def to_agent_context(self) -> Dict[str, Any]:
+        """Format post for agent consumption"""
+        sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"}.get(self.sentiment_label, "❓")
+
+        return {
+            'post_id': self.post_id,
+            'subreddit': self.subreddit,
+            'title': self.title,
+            'content': self.content[:200] + '...' if self.content and len(self.content) > 200 else self.content,
+            'author': self.author,
+            'created_utc': self.created_utc.isoformat(),
+            'engagement': {
+                'upvotes': self.upvotes,
+                'comments_count': self.comments_count,
+                'score': self.upvotes - self.downvotes
+            },
+            'sentiment': {
+                'label': self.sentiment_label,
+                'score': self.sentiment_score,
+                'confidence': self.sentiment_confidence,
+                'emoji': sentiment_emoji,
+                'reliable': self.has_reliable_sentiment()
+            },
+            'tickers': self.tickers or [],
+            'url': self.url
+        }
+```
+
+**Acceptance Criteria:**
+- [ ] SocialPost model handles all Reddit API fields properly
+- [ ] Comprehensive validation for all fields including sentiment and embeddings
+- [ ] from_praw_submission() creates valid domain objects from Reddit data
+- [ ] extract_tickers() accurately finds ticker symbols in text
+- [ ] to_agent_context() formats data for AI agent consumption
+- [ ] Business rule validation prevents invalid state combinations
+
+**Dependencies:** None (can run parallel with other tasks)
+**Risk:** Low - Standard domain modeling
+
+---
+
+### Task 1.4: Repository Implementation (3 hours)
+**Priority: Medium** | **Agent: Repository Specialist**
+
+Implement SocialRepository with PostgreSQL operations, vector similarity search, and performance optimization.
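+
+For reference, the similarity semantics the repository relies on: pgvector's `<=>` operator returns cosine distance, so similarity is computed as `1 - distance`. A minimal sketch (vector literal abbreviated for illustration):
+
+```sql
+-- Top-10 most similar posts by title embedding (illustrative literal)
+SELECT post_id, 1 - (title_embedding <=> '[0.12, -0.03, ...]') AS similarity
+FROM social_media_posts
+ORDER BY title_embedding <=> '[0.12, -0.03, ...]'
+LIMIT 10;
+```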
+ +**File:** `tradingagents/domains/socialmedia/repositories.py` + +**Implementation:** +```python +from typing import List, Optional, Dict, Any, Tuple +from sqlalchemy import and_, or_, desc, text, func +from sqlalchemy.orm import Session +from sqlalchemy.exc import IntegrityError +from tradingagents.domains.socialmedia.entities import SocialMediaPostEntity +from tradingagents.domains.socialmedia.models import SocialPost +from tradingagents.database import DatabaseManager +from datetime import datetime, timedelta +import logging + +logger = logging.getLogger(__name__) + +class SocialRepository: + """PostgreSQL repository for social media posts with vector search capabilities""" + + def __init__(self, db_manager: DatabaseManager): + self.db_manager = db_manager + + async def upsert_batch(self, posts: List[SocialPost]) -> List[str]: + """Batch upsert social media posts with deduplication""" + async with self.db_manager.get_session() as session: + saved_ids = [] + + for post in posts: + try: + # Check for existing post + existing = await session.execute( + text("SELECT id FROM social_media_posts WHERE post_id = :post_id"), + {"post_id": post.post_id} + ) + + if existing.first(): + logger.debug(f"Skipping duplicate post: {post.post_id}") + continue + + entity = SocialMediaPostEntity.from_domain(post) + session.add(entity) + saved_ids.append(post.post_id) + + except IntegrityError as e: + logger.warning(f"Integrity error saving post {post.post_id}: {e}") + await session.rollback() + continue + + await session.commit() + logger.info(f"Saved {len(saved_ids)} new posts to database") + return saved_ids + + async def find_by_ticker(self, ticker: str, days: int = 30, limit: int = 50) -> List[SocialPost]: + """Find posts mentioning specific ticker symbol""" + async with self.db_manager.get_session() as session: + cutoff_date = datetime.now() - timedelta(days=days) + + result = await session.execute( + text(""" + SELECT * FROM social_media_posts + WHERE :ticker = ANY(tickers) + AND created_utc >= :cutoff_date + ORDER BY created_utc DESC + LIMIT :limit + """), + { + "ticker": ticker.upper(), + "cutoff_date": cutoff_date, + "limit": limit + } + ) + + entities = [SocialMediaPostEntity(**row) for row in result.mappings()] + return [entity.to_domain() for entity in entities] + + async def find_by_subreddit(self, subreddit: str, hours: int = 24, limit: int = 100) -> List[SocialPost]: + """Find recent posts from specific subreddit""" + async with self.db_manager.get_session() as session: + cutoff_date = datetime.now() - timedelta(hours=hours) + + result = await session.execute( + text(""" + SELECT * FROM social_media_posts + WHERE subreddit = :subreddit + AND created_utc >= :cutoff_date + ORDER BY created_utc DESC + LIMIT :limit + """), + { + "subreddit": subreddit, + "cutoff_date": cutoff_date, + "limit": limit + } + ) + + entities = [SocialMediaPostEntity(**row) for row in result.mappings()] + return [entity.to_domain() for entity in entities] + + async def find_similar_posts( + self, + query_embedding: List[float], + ticker: Optional[str] = None, + limit: int = 10, + similarity_threshold: float = 0.8 + ) -> List[Tuple[SocialPost, float]]: + """Find similar posts using vector similarity search""" + async with self.db_manager.get_session() as session: + embedding_str = str(query_embedding) + + base_query = """ + SELECT *, + LEAST( + 1 - (title_embedding <=> :embedding), + 1 - (content_embedding <=> :embedding) + ) as similarity_score + FROM social_media_posts + WHERE (title_embedding IS NOT NULL OR 
content_embedding IS NOT NULL) + """ + + params = {"embedding": embedding_str} + + if ticker: + base_query += " AND :ticker = ANY(tickers)" + params["ticker"] = ticker.upper() + + base_query += """ + AND LEAST( + 1 - (title_embedding <=> :embedding), + 1 - (content_embedding <=> :embedding) + ) >= :threshold + ORDER BY similarity_score DESC + LIMIT :limit + """ + + params.update({ + "threshold": similarity_threshold, + "limit": limit + }) + + result = await session.execute(text(base_query), params) + + posts_with_scores = [] + for row in result.mappings(): + entity = SocialMediaPostEntity(**{k: v for k, v in row.items() if k != 'similarity_score'}) + post = entity.to_domain() + similarity = row['similarity_score'] + posts_with_scores.append((post, similarity)) + + return posts_with_scores + + async def get_sentiment_summary( + self, + ticker: Optional[str] = None, + subreddit: Optional[str] = None, + hours: int = 24 + ) -> Dict[str, Any]: + """Get aggregated sentiment analysis for ticker or subreddit""" + async with self.db_manager.get_session() as session: + cutoff_date = datetime.now() - timedelta(hours=hours) + + base_query = """ + SELECT + sentiment_label, + COUNT(*) as count, + AVG((sentiment_score->>'score')::float) as avg_score, + AVG((sentiment_score->>'confidence')::float) as avg_confidence, + SUM(upvotes) as total_upvotes, + SUM(comments_count) as total_comments + FROM social_media_posts + WHERE created_utc >= :cutoff_date + AND sentiment_score IS NOT NULL + """ + + params = {"cutoff_date": cutoff_date} + + if ticker: + base_query += " AND :ticker = ANY(tickers)" + params["ticker"] = ticker.upper() + + if subreddit: + base_query += " AND subreddit = :subreddit" + params["subreddit"] = subreddit + + base_query += " GROUP BY sentiment_label" + + result = await session.execute(text(base_query), params) + + sentiment_counts = {} + total_posts = 0 + weighted_score = 0 + total_engagement = 0 + + for row in result.mappings(): + label = row['sentiment_label'] + count = row['count'] + avg_score = float(row['avg_score'] or 0) + engagement = (row['total_upvotes'] or 0) + (row['total_comments'] or 0) + + sentiment_counts[label] = { + 'count': count, + 'avg_score': avg_score, + 'avg_confidence': float(row['avg_confidence'] or 0), + 'engagement': engagement + } + + total_posts += count + weighted_score += avg_score * count + total_engagement += engagement + + return { + 'ticker': ticker, + 'subreddit': subreddit, + 'period_hours': hours, + 'total_posts': total_posts, + 'sentiment_breakdown': sentiment_counts, + 'overall_sentiment': weighted_score / total_posts if total_posts > 0 else 0.0, + 'total_engagement': total_engagement, + 'data_quality': { + 'posts_with_sentiment': total_posts, + 'period_start': cutoff_date.isoformat(), + 'generated_at': datetime.now().isoformat() + } + } + + async def cleanup_old_posts(self, days: int = 90) -> int: + """Remove posts older than specified days""" + async with self.db_manager.get_session() as session: + cutoff_date = datetime.now() - timedelta(days=days) + + result = await session.execute( + text("DELETE FROM social_media_posts WHERE created_utc < :cutoff_date"), + {"cutoff_date": cutoff_date} + ) + + deleted_count = result.rowcount + await session.commit() + + logger.info(f"Cleaned up {deleted_count} posts older than {days} days") + return deleted_count + + async def get_trending_tickers(self, hours: int = 24, min_mentions: int = 5) -> List[Dict[str, Any]]: + """Find trending ticker symbols by mention frequency and sentiment""" + async with 
self.db_manager.get_session() as session: + cutoff_date = datetime.now() - timedelta(hours=hours) + + result = await session.execute( + text(""" + SELECT + unnest(tickers) as ticker, + COUNT(*) as mention_count, + AVG((sentiment_score->>'score')::float) as avg_sentiment, + SUM(upvotes) as total_upvotes, + SUM(comments_count) as total_comments + FROM social_media_posts + WHERE created_utc >= :cutoff_date + AND sentiment_score IS NOT NULL + AND array_length(tickers, 1) > 0 + GROUP BY ticker + HAVING COUNT(*) >= :min_mentions + ORDER BY mention_count DESC, avg_sentiment DESC + LIMIT 20 + """), + { + "cutoff_date": cutoff_date, + "min_mentions": min_mentions + } + ) + + trending = [] + for row in result.mappings(): + trending.append({ + 'ticker': row['ticker'], + 'mention_count': row['mention_count'], + 'avg_sentiment': float(row['avg_sentiment'] or 0), + 'total_upvotes': row['total_upvotes'] or 0, + 'total_comments': row['total_comments'] or 0, + 'engagement_score': (row['total_upvotes'] or 0) + (row['total_comments'] or 0) + }) + + return trending +``` + +**Acceptance Criteria:** +- [ ] Batch upsert operations with proper deduplication +- [ ] Vector similarity search using pgvectorscale indexes +- [ ] Efficient ticker-based queries with TimescaleDB optimization +- [ ] Comprehensive sentiment aggregation with engagement metrics +- [ ] Data cleanup operations with configurable retention +- [ ] Trending ticker analysis with minimum mention thresholds +- [ ] Proper error handling and logging throughout + +**Dependencies:** Task 1.1 (database schema), Task 1.2 (entity model) +**Risk:** Medium - Complex vector search queries + +--- + +## Phase 2: API Integration & Processing (12 hours) + +### Task 2.1: Reddit Client Implementation (4 hours) +**Priority: Blocking** | **Agent: API Integration Specialist** + +Implement RedditClient using PRAW with comprehensive rate limiting, error handling, and financial subreddit focus. 
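+
+The client assumes Reddit script-app credentials surface through TradingAgentsConfig; a hedged `.env` sketch (the exact variable names are an assumption, following the prerequisites table in status.md):
+
+```bash
+# Reddit API credentials consumed by RedditClient (names assumed)
+REDDIT_CLIENT_ID=your_client_id
+REDDIT_CLIENT_SECRET=your_client_secret
+REDDIT_USER_AGENT="tradingagents/0.1 by u/your_username"
+```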
+
+**File:** `tradingagents/domains/socialmedia/clients.py`
+
+**Implementation:**
+```python
+import praw
+import asyncio
+from typing import List, Optional, Dict, Any
+from datetime import datetime, timezone
+from tradingagents.config import TradingAgentsConfig
+import logging
+import time
+
+logger = logging.getLogger(__name__)
+
+class RedditClient:
+    """PRAW-based Reddit client with rate limiting and error handling"""
+
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        self.reddit = None
+        self.last_request_time = 0
+        self.min_request_interval = 1.0  # 1 second between requests
+        self.financial_subreddits = [
+            'wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis',
+            'ValueInvesting', 'financialindependence', 'StockMarket',
+            'options', 'dividends', 'pennystocks'
+        ]
+
+    async def __aenter__(self):
+        """Async context manager entry"""
+        self._initialize_reddit()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit"""
+        pass
+
+    def _initialize_reddit(self):
+        """Initialize PRAW Reddit instance"""
+        try:
+            self.reddit = praw.Reddit(
+                client_id=self.config.reddit_client_id,
+                client_secret=self.config.reddit_client_secret,
+                user_agent=self.config.reddit_user_agent,
+                check_for_async=False
+            )
+
+            # Test authentication
+            self.reddit.user.me()
+            logger.info("Reddit client initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize Reddit client: {e}")
+            raise
+
+    async def _rate_limit_delay(self):
+        """Implement rate limiting between requests"""
+        current_time = time.time()
+        time_since_last = current_time - self.last_request_time
+
+        if time_since_last < self.min_request_interval:
+            delay = self.min_request_interval - time_since_last
+            await asyncio.sleep(delay)
+
+        self.last_request_time = time.time()
+
+    async def fetch_subreddit_posts(
+        self,
+        subreddit_name: str,
+        time_filter: str = 'day',
+        limit: int = 50,
+        sort_type: str = 'hot'
+    ) -> List[Dict[str, Any]]:
+        """Fetch posts from a specific subreddit"""
+        if not self.reddit:
+            self._initialize_reddit()
+
+        await self._rate_limit_delay()
+
+        try:
+            subreddit = self.reddit.subreddit(subreddit_name)
+
+            # Get submissions based on sort type
+            if sort_type == 'hot':
+                submissions = subreddit.hot(limit=limit)
+            elif sort_type == 'top':
+                submissions = subreddit.top(time_filter=time_filter, limit=limit)
+            elif sort_type == 'new':
+                submissions = subreddit.new(limit=limit)
+            else:
+                submissions = subreddit.hot(limit=limit)
+
+            posts = []
+            for submission in submissions:
+                # Skip removed or deleted posts
+                if submission.selftext == '[removed]' or submission.selftext == '[deleted]':
+                    continue
+
+                post_data = self._extract_post_data(submission, subreddit_name)
+                if post_data:  # _extract_post_data returns None on failure
+                    posts.append(post_data)
+
+            logger.info(f"Fetched {len(posts)} posts from r/{subreddit_name}")
+            return posts
+
+        except Exception as e:
+            logger.error(f"Error fetching posts from r/{subreddit_name}: {e}")
+            return []
+
+    async def fetch_financial_posts_batch(
+        self,
+        subreddits: Optional[List[str]] = None,
+        time_filter: str = 'day',
+        posts_per_subreddit: int = 50
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """Fetch posts from multiple financial subreddits"""
+        if not subreddits:
+            subreddits = self.financial_subreddits
+
+        results = {}
+
+        for subreddit_name in subreddits:
+            try:
+                posts = await self.fetch_subreddit_posts(
+                    subreddit_name=subreddit_name,
+                    time_filter=time_filter,
+                    limit=posts_per_subreddit
+                )
+                results[subreddit_name] = posts
+
+            except Exception as e:
+                logger.error(f"Failed to fetch from r/{subreddit_name}: {e}")
+                results[subreddit_name] = []
+
+        total_posts = sum(len(posts) for posts in results.values())
+        logger.info(f"Fetched {total_posts} total posts from {len(subreddits)} subreddits")
+
+        return results
+
+    async def search_posts(
+        self,
+        query: str,
+        subreddit_names: Optional[List[str]] = None,
+        time_filter: str = 'week',
+        limit: int = 25
+    ) -> List[Dict[str, Any]]:
+        """Search for posts containing specific terms"""
+        if not self.reddit:
+            self._initialize_reddit()
+
+        if not subreddit_names:
+            subreddit_names = self.financial_subreddits
+
+        all_posts = []
+
+        for subreddit_name in subreddit_names:
+            await self._rate_limit_delay()
+
+            try:
+                subreddit = self.reddit.subreddit(subreddit_name)
+                search_results = subreddit.search(
+                    query=query,
+                    time_filter=time_filter,
+                    limit=limit,
+                    sort='relevance'
+                )
+
+                for submission in search_results:
+                    if submission.selftext not in ['[removed]', '[deleted]']:
+                        post_data = self._extract_post_data(submission, subreddit_name)
+                        if post_data:  # skip posts that failed extraction
+                            all_posts.append(post_data)
+
+            except Exception as e:
+                logger.error(f"Search error in r/{subreddit_name}: {e}")
+                continue
+
+        logger.info(f"Found {len(all_posts)} posts matching query: {query}")
+        return all_posts
+
+    def _extract_post_data(self, submission: Any, subreddit_name: str) -> Optional[Dict[str, Any]]:
+        """Extract structured data from PRAW submission (None on failure)"""
+        try:
+            return {
+                'post_id': submission.id,
+                'title': submission.title[:300],  # Limit title length
+                'content': submission.selftext if submission.selftext else None,
+                'author': str(submission.author) if submission.author else '[deleted]',
+                'subreddit': subreddit_name,
+                'created_utc': datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
+                'upvotes': getattr(submission, 'ups', submission.score),
+                # score = ups - downs, so estimated downvotes are ups - score
+                'downvotes': max(0, getattr(submission, 'ups', submission.score) - submission.score),
+                'comments_count': submission.num_comments,
+                'url': f"https://reddit.com{submission.permalink}",
+                'reddit_score': submission.score,
+                'upvote_ratio': getattr(submission, 'upvote_ratio', 0.5),
+                'is_self': submission.is_self,
+                'domain': submission.domain,
+                'flair_text': getattr(submission, 'link_flair_text', None)
+            }
+        except Exception as e:
+            logger.error(f"Error extracting post data: {e}")
+            return None
+
+    async def get_post_details(self, post_id: str) -> Optional[Dict[str, Any]]:
+        """Get detailed information for a specific post"""
+        if not self.reddit:
+            self._initialize_reddit()
+
+        await self._rate_limit_delay()
+
+        try:
+            submission = self.reddit.submission(id=post_id)
+            return self._extract_post_data(submission, submission.subreddit.display_name)
+        except Exception as e:
+            logger.error(f"Error fetching post details for {post_id}: {e}")
+            return None
+
+    async def health_check(self) -> bool:
+        """Check if Reddit API is accessible"""
+        try:
+            if not self.reddit:
+                self._initialize_reddit()
+
+            # Simple API call to verify connectivity
+            self.reddit.subreddit('wallstreetbets').hot(limit=1)
+            return True
+        except Exception as e:
+            logger.error(f"Reddit health check failed: {e}")
+            return False
+```
+
+**Testing Implementation:**
+```python
+# tests/domains/socialmedia/test_reddit_client.py
+import pytest
+from tradingagents.domains.socialmedia.clients import RedditClient
+from tradingagents.config import TradingAgentsConfig
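+
+# Hypothetical fixtures assumed by the test below (would normally live in conftest.py)
+@pytest.fixture
+def trading_config():
+    return TradingAgentsConfig.from_env()
+
+@pytest.fixture
+def reddit_client(trading_config):
+    return RedditClient(trading_config)
+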
+@pytest_vcr.use_cassette('reddit_fetch_posts.yaml') +@pytest.mark.asyncio +async def test_fetch_subreddit_posts(reddit_client, trading_config): + """Test fetching posts from a specific subreddit""" + async with reddit_client: + posts = await reddit_client.fetch_subreddit_posts('wallstreetbets', limit=10) + + assert len(posts) > 0 + for post in posts: + assert 'post_id' in post + assert 'title' in post + assert 'subreddit' in post + assert post['subreddit'] == 'wallstreetbets' +``` + +**Acceptance Criteria:** +- [ ] PRAW Reddit client properly authenticated and initialized +- [ ] Rate limiting implemented (1 request per second minimum) +- [ ] Comprehensive error handling for network issues and API limits +- [ ] Financial subreddit focus with configurable subreddit lists +- [ ] Structured data extraction from Reddit submissions +- [ ] Search functionality across multiple subreddits +- [ ] Health check capabilities for monitoring +- [ ] Test coverage with pytest-vcr cassettes + +**Dependencies:** Reddit API credentials in TradingAgentsConfig +**Risk:** High - External API dependency, rate limiting complexity + +--- + +### Task 2.2: OpenRouter LLM Sentiment Analysis (3 hours) +**Priority: Medium** | **Agent: LLM Integration Specialist** + +Implement sentiment analysis using OpenRouter with social media-specific prompts and structured output parsing. + +**File:** `tradingagents/domains/socialmedia/sentiment.py` + +**Implementation:** +```python +from typing import Optional, Dict, Any, List +import json +import asyncio +from tradingagents.llm.openrouter_client import OpenRouterClient +from tradingagents.config import TradingAgentsConfig +from tradingagents.domains.socialmedia.models import SentimentScore +import logging + +logger = logging.getLogger(__name__) + +class SocialSentimentAnalyzer: + """OpenRouter-based sentiment analysis for social media posts""" + + def __init__(self, config: TradingAgentsConfig): + self.config = config + self.client = OpenRouterClient(config) + self.batch_size = 5 # Process posts in batches + + async def analyze_post_sentiment(self, post_text: str, ticker: Optional[str] = None) -> Optional[SentimentScore]: + """Analyze sentiment of a single social media post""" + prompt = self._create_sentiment_prompt(post_text, ticker) + + try: + response = await self.client.generate_response( + model=self.config.quick_think_llm, + messages=[{"role": "user", "content": prompt}], + max_tokens=150, + temperature=0.1, + response_format={"type": "json_object"} + ) + + result = json.loads(response) + + return SentimentScore( + sentiment=result.get('sentiment', 'neutral'), + confidence=float(result.get('confidence', 0.0)), + reasoning=result.get('reasoning') + ) + + except Exception as e: + logger.error(f"Sentiment analysis failed: {e}") + return None + + async def analyze_batch( + self, + posts: List[Dict[str, Any]], + include_ticker: bool = True + ) -> List[Optional[SentimentScore]]: + """Analyze sentiment for multiple posts with rate limiting""" + results = [] + + for i in range(0, len(posts), self.batch_size): + batch = posts[i:i + self.batch_size] + batch_tasks = [] + + for post in batch: + text = self._combine_post_text(post) + ticker = None + + if include_ticker and 'tickers' in post and post['tickers']: + ticker = post['tickers'][0] # Use first ticker if available + + task = self.analyze_post_sentiment(text, ticker) + batch_tasks.append(task) + + # Process batch with concurrency limit + batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) + + for result in 
batch_results: + if isinstance(result, Exception): + logger.error(f"Batch sentiment analysis error: {result}") + results.append(None) + else: + results.append(result) + + # Rate limiting between batches + if i + self.batch_size < len(posts): + await asyncio.sleep(1.0) + + successful_count = sum(1 for r in results if r is not None) + logger.info(f"Sentiment analysis completed: {successful_count}/{len(posts)} successful") + + return results + + def _create_sentiment_prompt(self, text: str, ticker: Optional[str] = None) -> str: + """Create social media-specific sentiment analysis prompt""" + ticker_context = f" for ticker ${ticker}" if ticker else "" + + return f""" +Analyze the financial sentiment of this Reddit post{ticker_context}. Consider: +- Trading/investment sentiment (not general mood) +- Informal language, slang, and memes common in financial social media +- Context clues like "diamond hands", "to the moon", "bearish", etc. +- Overall market outlook expressed in the post + +Post text: "{text}" + +Respond with JSON only: +{{ + "sentiment": "positive|negative|neutral", + "confidence": 0.0-1.0, + "reasoning": "brief explanation of key factors" +}} + +Guidelines: +- "positive": Bullish, optimistic about price/performance +- "negative": Bearish, pessimistic about price/performance +- "neutral": Mixed signals or no clear directional sentiment +- Confidence: How certain are you? (0.5+ for reliable sentiment) +- Reasoning: Key words/phrases that influenced the classification + """.strip() + + def _combine_post_text(self, post: Dict[str, Any]) -> str: + """Combine title and content for sentiment analysis""" + title = post.get('title', '') + content = post.get('content', '') + + if content: + # Limit total text length for efficient processing + combined = f"{title} {content}"[:1000] + else: + combined = title + + return combined.strip() + + async def analyze_market_sentiment( + self, + posts: List[Dict[str, Any]], + ticker: str + ) -> Dict[str, Any]: + """Analyze overall market sentiment for a ticker from multiple posts""" + sentiments = await self.analyze_batch(posts, include_ticker=True) + + # Filter out failed analyses + valid_sentiments = [s for s in sentiments if s is not None and s.confidence >= 0.5] + + if not valid_sentiments: + return { + 'ticker': ticker, + 'overall_sentiment': 'neutral', + 'confidence': 0.0, + 'post_count': len(posts), + 'analysis_success_rate': 0.0, + 'sentiment_distribution': {'positive': 0, 'negative': 0, 'neutral': 0} + } + + # Calculate sentiment distribution + sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0} + confidence_sum = 0 + + for sentiment in valid_sentiments: + sentiment_counts[sentiment.sentiment] += 1 + confidence_sum += sentiment.confidence + + # Determine overall sentiment + total_valid = len(valid_sentiments) + positive_ratio = sentiment_counts['positive'] / total_valid + negative_ratio = sentiment_counts['negative'] / total_valid + + if positive_ratio > 0.6: + overall_sentiment = 'positive' + elif negative_ratio > 0.6: + overall_sentiment = 'negative' + else: + overall_sentiment = 'neutral' + + return { + 'ticker': ticker, + 'overall_sentiment': overall_sentiment, + 'confidence': confidence_sum / total_valid, + 'post_count': len(posts), + 'analyzed_posts': total_valid, + 'analysis_success_rate': total_valid / len(posts), + 'sentiment_distribution': sentiment_counts, + 'positive_ratio': positive_ratio, + 'negative_ratio': negative_ratio, + 'neutral_ratio': sentiment_counts['neutral'] / total_valid + } +``` + +**Acceptance 
Criteria:** +- [ ] OpenRouter integration for sentiment analysis with structured JSON output +- [ ] Social media-specific prompts handling informal language and financial slang +- [ ] Batch processing with rate limiting and error handling +- [ ] Confidence scoring for sentiment reliability +- [ ] Market sentiment aggregation across multiple posts +- [ ] Comprehensive error handling and logging +- [ ] Test coverage with mocked LLM responses + +**Dependencies:** OpenRouter client implementation +**Risk:** Medium - LLM API reliability and cost management + +--- + +### Task 2.3: Vector Embedding Generation (2 hours) +**Priority: Medium** | **Agent: ML Integration Specialist** + +Implement vector embedding generation for semantic similarity search using OpenRouter embedding models. + +**File:** `tradingagents/domains/socialmedia/embeddings.py` + +**Implementation:** +```python +from typing import List, Optional, Dict, Any +import asyncio +import numpy as np +from tradingagents.llm.openrouter_client import OpenRouterClient +from tradingagents.config import TradingAgentsConfig +import logging + +logger = logging.getLogger(__name__) + +class SocialEmbeddingGenerator: + """Generate vector embeddings for social media posts using OpenRouter""" + + def __init__(self, config: TradingAgentsConfig): + self.config = config + self.client = OpenRouterClient(config) + self.embedding_model = "text-embedding-3-large" # 1536 dimensions + self.max_text_length = 8000 # Token limit for embedding model + self.batch_size = 10 + + async def generate_post_embeddings( + self, + post: Dict[str, Any] + ) -> Dict[str, Optional[List[float]]]: + """Generate embeddings for post title and content separately""" + embeddings = { + 'title_embedding': None, + 'content_embedding': None + } + + # Generate title embedding + title = post.get('title', '').strip() + if title: + embeddings['title_embedding'] = await self._generate_embedding(title) + + # Generate content embedding if content exists + content = post.get('content', '').strip() + if content: + # Combine title and content for content embedding + combined_text = f"{title} {content}"[:self.max_text_length] + embeddings['content_embedding'] = await self._generate_embedding(combined_text) + + return embeddings + + async def generate_batch_embeddings( + self, + posts: List[Dict[str, Any]] + ) -> List[Dict[str, Optional[List[float]]]]: + """Generate embeddings for multiple posts with batching""" + results = [] + + for i in range(0, len(posts), self.batch_size): + batch = posts[i:i + self.batch_size] + + # Create tasks for concurrent processing + tasks = [self.generate_post_embeddings(post) for post in batch] + batch_results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in batch_results: + if isinstance(result, Exception): + logger.error(f"Embedding generation error: {result}") + results.append({'title_embedding': None, 'content_embedding': None}) + else: + results.append(result) + + # Rate limiting between batches + if i + self.batch_size < len(posts): + await asyncio.sleep(0.5) + + successful_count = sum( + 1 for r in results + if r.get('title_embedding') is not None or r.get('content_embedding') is not None + ) + logger.info(f"Embedding generation completed: {successful_count}/{len(posts)} successful") + + return results + + async def generate_query_embedding(self, query: str) -> Optional[List[float]]: + """Generate embedding for search query""" + return await self._generate_embedding(query[:self.max_text_length]) + + async def _generate_embedding(self, 
text: str) -> Optional[List[float]]: + """Generate single embedding using OpenRouter""" + if not text.strip(): + return None + + try: + response = await self.client.create_embeddings( + model=self.embedding_model, + input=[text], + encoding_format="float" + ) + + if response and response.data: + embedding = response.data[0].embedding + + # Validate embedding dimensions + if len(embedding) != 1536: + logger.error(f"Unexpected embedding dimension: {len(embedding)}") + return None + + return embedding + + except Exception as e: + logger.error(f"Embedding generation failed for text: {e}") + return None + + return None + + def calculate_similarity( + self, + embedding1: List[float], + embedding2: List[float] + ) -> float: + """Calculate cosine similarity between two embeddings""" + try: + # Convert to numpy arrays for efficient computation + vec1 = np.array(embedding1) + vec2 = np.array(embedding2) + + # Cosine similarity: dot product / (magnitude1 * magnitude2) + dot_product = np.dot(vec1, vec2) + magnitude1 = np.linalg.norm(vec1) + magnitude2 = np.linalg.norm(vec2) + + if magnitude1 == 0 or magnitude2 == 0: + return 0.0 + + similarity = dot_product / (magnitude1 * magnitude2) + return float(similarity) + + except Exception as e: + logger.error(f"Similarity calculation error: {e}") + return 0.0 + + def find_most_similar( + self, + query_embedding: List[float], + post_embeddings: List[Dict[str, Any]], + top_k: int = 10 + ) -> List[Dict[str, Any]]: + """Find most similar posts to query embedding""" + similarities = [] + + for i, post_data in enumerate(post_embeddings): + max_similarity = 0.0 + best_embedding_type = None + + # Check title embedding similarity + title_emb = post_data.get('title_embedding') + if title_emb: + title_sim = self.calculate_similarity(query_embedding, title_emb) + if title_sim > max_similarity: + max_similarity = title_sim + best_embedding_type = 'title' + + # Check content embedding similarity + content_emb = post_data.get('content_embedding') + if content_emb: + content_sim = self.calculate_similarity(query_embedding, content_emb) + if content_sim > max_similarity: + max_similarity = content_sim + best_embedding_type = 'content' + + if max_similarity > 0: + similarities.append({ + 'post_index': i, + 'similarity_score': max_similarity, + 'embedding_type': best_embedding_type, + 'post_data': post_data + }) + + # Sort by similarity score and return top k + similarities.sort(key=lambda x: x['similarity_score'], reverse=True) + return similarities[:top_k] + + async def create_semantic_clusters( + self, + posts: List[Dict[str, Any]], + similarity_threshold: float = 0.8 + ) -> List[List[Dict[str, Any]]]: + """Group similar posts into semantic clusters""" + if not posts: + return [] + + # Generate embeddings for all posts + embeddings_data = await self.generate_batch_embeddings(posts) + + # Combine posts with their embeddings + posts_with_embeddings = [] + for post, embeddings in zip(posts, embeddings_data): + if embeddings.get('title_embedding') or embeddings.get('content_embedding'): + posts_with_embeddings.append({**post, **embeddings}) + + clusters = [] + processed = set() + + for i, post in enumerate(posts_with_embeddings): + if i in processed: + continue + + current_cluster = [post] + processed.add(i) + + # Find similar posts for current cluster + for j, other_post in enumerate(posts_with_embeddings): + if j in processed or i == j: + continue + + # Calculate similarity between posts + max_sim = 0.0 + + # Compare all embedding combinations + for emb1_type in 
['title_embedding', 'content_embedding']: + for emb2_type in ['title_embedding', 'content_embedding']: + emb1 = post.get(emb1_type) + emb2 = other_post.get(emb2_type) + + if emb1 and emb2: + sim = self.calculate_similarity(emb1, emb2) + max_sim = max(max_sim, sim) + + if max_sim >= similarity_threshold: + current_cluster.append(other_post) + processed.add(j) + + if len(current_cluster) > 1: # Only include clusters with multiple posts + clusters.append(current_cluster) + + logger.info(f"Created {len(clusters)} semantic clusters from {len(posts)} posts") + return clusters +``` + +**Acceptance Criteria:** +- [ ] Vector embedding generation for post titles and content separately +- [ ] Batch processing with rate limiting for efficiency +- [ ] Cosine similarity calculation for semantic search +- [ ] Query embedding generation for search functionality +- [ ] Semantic clustering capabilities for related post discovery +- [ ] Proper error handling and dimension validation +- [ ] Test coverage with mocked embedding responses + +**Dependencies:** OpenRouter client with embedding support +**Risk:** Low - Standard embedding generation patterns + +--- + +### Task 2.4: Service Layer Implementation (3 hours) +**Priority: Medium** | **Agent: Service Integration Specialist** + +Implement SocialMediaService that orchestrates Reddit collection, sentiment analysis, and embedding generation. + +**File:** `tradingagents/domains/socialmedia/services.py` + +**Implementation:** +```python +from typing import List, Optional, Dict, Any, Tuple +import asyncio +import logging +from datetime import datetime, timedelta + +from tradingagents.domains.socialmedia.clients import RedditClient +from tradingagents.domains.socialmedia.repositories import SocialRepository +from tradingagents.domains.socialmedia.sentiment import SocialSentimentAnalyzer +from tradingagents.domains.socialmedia.embeddings import SocialEmbeddingGenerator +from tradingagents.domains.socialmedia.models import SocialPost, SocialContext +from tradingagents.config import TradingAgentsConfig +from tradingagents.database import DatabaseManager + +logger = logging.getLogger(__name__) + +class SocialMediaService: + """Orchestrates social media data collection, analysis, and storage""" + + def __init__(self, config: TradingAgentsConfig, db_manager: DatabaseManager): + self.config = config + self.db_manager = db_manager + self.repository = SocialRepository(db_manager) + self.sentiment_analyzer = SocialSentimentAnalyzer(config) + self.embedding_generator = SocialEmbeddingGenerator(config) + + # Configuration + self.financial_subreddits = [ + 'wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis', + 'ValueInvesting', 'financialindependence', 'StockMarket' + ] + self.min_score_threshold = 10 # Minimum upvotes + self.max_posts_per_subreddit = 50 + + async def collect_and_process_posts( + self, + subreddits: Optional[List[str]] = None, + time_filter: str = 'day', + process_sentiment: bool = True, + generate_embeddings: bool = True + ) -> Dict[str, Any]: + """Main entry point for collecting and processing social media posts""" + if not subreddits: + subreddits = self.financial_subreddits + + collection_start = datetime.now() + logger.info(f"Starting social media collection from {len(subreddits)} subreddits") + + async with RedditClient(self.config) as reddit_client: + # Collect raw posts from Reddit + raw_posts_by_subreddit = await reddit_client.fetch_financial_posts_batch( + subreddits=subreddits, + time_filter=time_filter, + 
posts_per_subreddit=self.max_posts_per_subreddit + ) + + # Flatten and filter posts + all_raw_posts = [] + for subreddit, posts in raw_posts_by_subreddit.items(): + filtered_posts = [ + post for post in posts + if post and post.get('reddit_score', 0) >= self.min_score_threshold + ] + all_raw_posts.extend(filtered_posts) + + logger.info(f"Collected {len(all_raw_posts)} posts meeting quality thresholds") + + # Convert to domain objects and extract tickers + domain_posts = [] + for raw_post in all_raw_posts: + try: + post = SocialPost(**raw_post) + post.tickers = post.extract_tickers() # Extract tickers from content + domain_posts.append(post) + except Exception as e: + logger.error(f"Error creating domain object: {e}") + continue + + # Process sentiment analysis if requested + if process_sentiment and domain_posts: + await self._process_sentiment_analysis(domain_posts) + + # Generate embeddings if requested + if generate_embeddings and domain_posts: + await self._process_embeddings(domain_posts) + + # Save to database + saved_post_ids = await self.repository.upsert_batch(domain_posts) + + collection_end = datetime.now() + processing_time = (collection_end - collection_start).total_seconds() + + # Calculate success metrics + results = { + 'collection_timestamp': collection_start.isoformat(), + 'processing_time_seconds': processing_time, + 'subreddits_processed': subreddits, + 'total_posts_collected': len(all_raw_posts), + 'posts_processed': len(domain_posts), + 'posts_saved': len(saved_post_ids), + 'sentiment_analysis_enabled': process_sentiment, + 'embeddings_enabled': generate_embeddings, + 'subreddit_breakdown': {} + } + + # Add per-subreddit breakdown + for subreddit, posts in raw_posts_by_subreddit.items(): + results['subreddit_breakdown'][subreddit] = { + 'posts_collected': len(posts), + 'posts_filtered': len([p for p in posts if p.get('reddit_score', 0) >= self.min_score_threshold]) + } + + logger.info(f"Collection completed: {len(saved_post_ids)} posts saved in {processing_time:.2f}s") + return results + + async def get_social_context( + self, + ticker: str, + days: int = 7, + include_similar: bool = True, + similarity_query: Optional[str] = None + ) -> SocialContext: + """Get comprehensive social media context for a ticker""" + logger.info(f"Generating social context for {ticker} ({days} days)") + + # Get direct ticker mentions + ticker_posts = await self.repository.find_by_ticker(ticker, days=days, limit=50) + + similar_posts = [] + if include_similar and ticker_posts: + # Use semantic search to find related discussions + if similarity_query: + query_embedding = await self.embedding_generator.generate_query_embedding(similarity_query) + if query_embedding: + similar_results = await self.repository.find_similar_posts( + query_embedding=query_embedding, + ticker=ticker, + limit=10 + ) + similar_posts = [post for post, score in similar_results] + + # Get sentiment summary + sentiment_summary = await self.repository.get_sentiment_summary( + ticker=ticker, + hours=days * 24 + ) + + # Find trending discussions + trending_tickers = await self.repository.get_trending_tickers( + hours=days * 24, + min_mentions=3 + ) + ticker_trend = next( + (trend for trend in trending_tickers if trend['ticker'] == ticker.upper()), + None + ) + + return SocialContext( + ticker=ticker, + period_days=days, + direct_mentions=ticker_posts, + similar_posts=similar_posts, + sentiment_summary=sentiment_summary, + trending_info=ticker_trend, + total_posts=len(ticker_posts) + len(similar_posts), + 
data_quality_score=self._calculate_data_quality(ticker_posts + similar_posts) + ) + + async def search_posts_semantic( + self, + query: str, + ticker: Optional[str] = None, + limit: int = 10, + min_similarity: float = 0.7 + ) -> List[Tuple[SocialPost, float]]: + """Semantic search for social media posts""" + query_embedding = await self.embedding_generator.generate_query_embedding(query) + + if not query_embedding: + logger.error(f"Failed to generate query embedding for: {query}") + return [] + + return await self.repository.find_similar_posts( + query_embedding=query_embedding, + ticker=ticker, + limit=limit, + similarity_threshold=min_similarity + ) + + async def get_subreddit_analysis( + self, + subreddit: str, + hours: int = 24 + ) -> Dict[str, Any]: + """Get analysis of a specific subreddit's activity""" + posts = await self.repository.find_by_subreddit(subreddit, hours=hours) + + if not posts: + return { + 'subreddit': subreddit, + 'period_hours': hours, + 'total_posts': 0, + 'message': f'No posts found for r/{subreddit} in the last {hours} hours' + } + + # Analyze ticker mentions + ticker_counts = {} + for post in posts: + for ticker in post.tickers or []: + ticker_counts[ticker] = ticker_counts.get(ticker, 0) + 1 + + top_tickers = sorted(ticker_counts.items(), key=lambda x: x[1], reverse=True)[:10] + + # Analyze sentiment distribution + sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0} + reliable_sentiment_count = 0 + + for post in posts: + if post.sentiment_label: + sentiment_counts[post.sentiment_label] += 1 + if post.has_reliable_sentiment(): + reliable_sentiment_count += 1 + + # Calculate engagement metrics + total_upvotes = sum(post.upvotes for post in posts) + total_comments = sum(post.comments_count for post in posts) + avg_score = total_upvotes / len(posts) if posts else 0 + + return { + 'subreddit': subreddit, + 'period_hours': hours, + 'total_posts': len(posts), + 'engagement_metrics': { + 'total_upvotes': total_upvotes, + 'total_comments': total_comments, + 'avg_score': avg_score, + 'top_post_score': max(post.upvotes for post in posts) if posts else 0 + }, + 'sentiment_analysis': { + 'distribution': sentiment_counts, + 'reliable_sentiment_posts': reliable_sentiment_count, + 'sentiment_reliability': reliable_sentiment_count / len(posts) if posts else 0 + }, + 'ticker_mentions': { + 'top_tickers': top_tickers, + 'unique_tickers': len(ticker_counts), + 'total_mentions': sum(ticker_counts.values()) + }, + 'data_quality': self._calculate_data_quality(posts) + } + + async def _process_sentiment_analysis(self, posts: List[SocialPost]) -> None: + """Process sentiment analysis for posts""" + logger.info(f"Processing sentiment analysis for {len(posts)} posts") + + # Convert to dict format for sentiment analyzer + posts_data = [] + for post in posts: + post_dict = post.dict() + posts_data.append(post_dict) + + # Analyze sentiment in batches + sentiments = await self.sentiment_analyzer.analyze_batch(posts_data) + + # Update posts with sentiment results + for post, sentiment in zip(posts, sentiments): + if sentiment: + post.sentiment_score = sentiment.score if hasattr(sentiment, 'score') else None + post.sentiment_label = sentiment.sentiment + post.sentiment_confidence = sentiment.confidence + post.sentiment_reasoning = sentiment.reasoning + + successful_count = sum(1 for s in sentiments if s is not None) + logger.info(f"Sentiment analysis completed: {successful_count}/{len(posts)} successful") + + async def _process_embeddings(self, posts: List[SocialPost]) -> None: + 
"""Process embedding generation for posts""" + logger.info(f"Generating embeddings for {len(posts)} posts") + + # Convert to dict format for embedding generator + posts_data = [] + for post in posts: + post_dict = post.dict() + posts_data.append(post_dict) + + # Generate embeddings in batches + embeddings = await self.embedding_generator.generate_batch_embeddings(posts_data) + + # Update posts with embedding results + for post, embedding_data in zip(posts, embeddings): + post.title_embedding = embedding_data.get('title_embedding') + post.content_embedding = embedding_data.get('content_embedding') + + successful_count = sum( + 1 for e in embeddings + if e.get('title_embedding') is not None or e.get('content_embedding') is not None + ) + logger.info(f"Embedding generation completed: {successful_count}/{len(posts)} successful") + + def _calculate_data_quality(self, posts: List[SocialPost]) -> Dict[str, float]: + """Calculate data quality metrics for posts""" + if not posts: + return {'overall_score': 0.0} + + sentiment_coverage = sum(1 for p in posts if p.sentiment_label is not None) / len(posts) + reliable_sentiment = sum(1 for p in posts if p.has_reliable_sentiment()) / len(posts) + embedding_coverage = sum( + 1 for p in posts + if p.title_embedding is not None or p.content_embedding is not None + ) / len(posts) + ticker_extraction = sum(1 for p in posts if p.tickers) / len(posts) + + overall_score = (sentiment_coverage + reliable_sentiment + embedding_coverage + ticker_extraction) / 4 + + return { + 'overall_score': overall_score, + 'sentiment_coverage': sentiment_coverage, + 'reliable_sentiment_ratio': reliable_sentiment, + 'embedding_coverage': embedding_coverage, + 'ticker_extraction_ratio': ticker_extraction + } +``` + +**Acceptance Criteria:** +- [ ] Orchestrates complete collection, analysis, and storage pipeline +- [ ] Integrates Reddit client, sentiment analyzer, and embedding generator +- [ ] Handles batch processing with proper error handling and logging +- [ ] Provides ticker-specific social context with sentiment and similarity +- [ ] Semantic search capabilities with configurable similarity thresholds +- [ ] Subreddit analysis with engagement and sentiment metrics +- [ ] Data quality scoring and monitoring +- [ ] Comprehensive test coverage with mocked dependencies + +**Dependencies:** All Phase 2 tasks (clients, sentiment, embeddings) +**Risk:** Medium - Complex orchestration of multiple async services + +--- + +## Phase 3: Integration & Validation (8 hours) + +### Task 3.1: AgentToolkit Integration (3 hours) +**Priority: High** | **Agent: Agent Integration Specialist** + +Add RAG-enhanced social media methods to AgentToolkit for AI agent consumption. + +**File:** `tradingagents/agents/libs/agent_toolkit.py` (additions) + +**Implementation:** +```python +# Additional methods for AgentToolkit class + +async def get_reddit_sentiment( + self, + ticker: str, + days: int = 7, + include_context: bool = True +) -> str: + """Get Reddit sentiment analysis for a specific ticker with RAG context""" + try: + if not hasattr(self, 'social_service'): + self.social_service = SocialMediaService(self.config, self.db_manager) + + # Get comprehensive social context + social_context = await self.social_service.get_social_context( + ticker=ticker, + days=days, + include_similar=include_context + ) + + if not social_context.total_posts: + return f"No Reddit sentiment data found for ${ticker} in the last {days} days." 
+ + # Format for agent consumption + sentiment_summary = social_context.sentiment_summary + trending_info = social_context.trending_info + + context = f"Reddit Sentiment Analysis for ${ticker} ({days}-day period):\n\n" + + # Overall sentiment metrics + if sentiment_summary: + overall_score = sentiment_summary.get('overall_sentiment', 0.0) + sentiment_emoji = "📈" if overall_score > 0.1 else "📉" if overall_score < -0.1 else "➡️" + + context += f"{sentiment_emoji} Overall Sentiment: {overall_score:.2f}/1.0\n" + context += f"📊 Analysis Coverage: {social_context.total_posts} posts analyzed\n" + + # Sentiment breakdown + breakdown = sentiment_summary.get('sentiment_breakdown', {}) + if breakdown: + context += f" • Positive: {breakdown.get('positive', {}).get('count', 0)} posts\n" + context += f" • Negative: {breakdown.get('negative', {}).get('count', 0)} posts\n" + context += f" • Neutral: {breakdown.get('neutral', {}).get('count', 0)} posts\n" + + # Trending information + if trending_info: + context += f"\n🔥 Trending Status:\n" + context += f" • Mentions: {trending_info['mention_count']} posts\n" + context += f" • Engagement: {trending_info['engagement_score']} (upvotes + comments)\n" + context += f" • Avg Sentiment: {trending_info['avg_sentiment']:.2f}\n" + + # Top discussions (sample posts) + if social_context.direct_mentions: + context += f"\n💬 Recent Discussions:\n" + for i, post in enumerate(social_context.direct_mentions[:5]): + sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"}.get( + post.sentiment_label, "❓" + ) + context += f"{i+1}. {sentiment_emoji} r/{post.subreddit}: {post.title[:100]}...\n" + context += f" Score: {post.upvotes} upvotes, {post.comments_count} comments\n" + if post.has_reliable_sentiment(): + context += f" Sentiment: {post.sentiment_label} ({post.sentiment_confidence:.2f})\n" + + # Data quality indicators + quality = social_context.data_quality_score + context += f"\n📋 Data Quality: {quality.get('overall_score', 0):.1%} coverage\n" + + return context + + except Exception as e: + logger.error(f"Error getting Reddit sentiment for {ticker}: {e}") + return f"Error retrieving Reddit sentiment for ${ticker}: {str(e)}" + +async def get_reddit_stock_info( + self, + ticker: str, + query: Optional[str] = None, + days: int = 7 +) -> str: + """Get Reddit stock information with optional semantic search""" + try: + if not hasattr(self, 'social_service'): + self.social_service = SocialMediaService(self.config, self.db_manager) + + context = f"Reddit Stock Information for ${ticker}:\n\n" + + if query: + # Semantic search for specific information + search_results = await self.social_service.search_posts_semantic( + query=query, + ticker=ticker, + limit=10, + min_similarity=0.7 + ) + + if search_results: + context += f"🔍 Semantic Search Results for '{query}':\n" + for i, (post, similarity) in enumerate(search_results[:5]): + context += f"{i+1}. 
(Similarity: {similarity:.2f}) r/{post.subreddit}\n" + context += f" Title: {post.title}\n" + if post.content: + context += f" Content: {post.content[:150]}...\n" + context += f" Engagement: {post.upvotes} upvotes, {post.comments_count} comments\n\n" + else: + context += f"🔍 No relevant discussions found for '{query}' about ${ticker}\n\n" + + # Get general stock context + social_context = await self.social_service.get_social_context( + ticker=ticker, + days=days, + include_similar=False + ) + + if social_context.direct_mentions: + context += f"📈 Recent Stock Discussions ({len(social_context.direct_mentions)} posts):\n" + + # Group by subreddit for better organization + by_subreddit = {} + for post in social_context.direct_mentions: + if post.subreddit not in by_subreddit: + by_subreddit[post.subreddit] = [] + by_subreddit[post.subreddit].append(post) + + for subreddit, posts in by_subreddit.items(): + context += f"\nr/{subreddit} ({len(posts)} posts):\n" + for post in posts[:3]: # Top 3 per subreddit + sentiment_info = "" + if post.has_reliable_sentiment(): + sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"} + emoji = sentiment_emoji.get(post.sentiment_label, "❓") + sentiment_info = f" {emoji} {post.sentiment_label}" + + context += f" • {post.title[:80]}...{sentiment_info}\n" + context += f" {post.upvotes} upvotes, {post.comments_count} comments\n" + + # Add trending context if available + if social_context.trending_info: + trend = social_context.trending_info + context += f"\n📊 Trending Analysis:\n" + context += f" • Market attention: {trend['mention_count']} recent mentions\n" + context += f" • Community sentiment: {trend['avg_sentiment']:.2f}/1.0\n" + context += f" • Total engagement: {trend['engagement_score']}\n" + + return context + + except Exception as e: + logger.error(f"Error getting Reddit stock info for {ticker}: {e}") + return f"Error retrieving Reddit stock information for ${ticker}: {str(e)}" + +async def search_social_posts( + self, + query: str, + ticker: Optional[str] = None, + limit: int = 10, + days: int = 30 +) -> str: + """Search social media posts using semantic similarity""" + try: + if not hasattr(self, 'social_service'): + self.social_service = SocialMediaService(self.config, self.db_manager) + + # Perform semantic search + search_results = await self.social_service.search_posts_semantic( + query=query, + ticker=ticker, + limit=limit, + min_similarity=0.6 + ) + + if not search_results: + ticker_context = f" about ${ticker}" if ticker else "" + return f"No relevant social media posts found for '{query}'{ticker_context}." + + ticker_context = f" (${ticker})" if ticker else "" + context = f"Social Media Search Results for '{query}'{ticker_context}:\n\n" + context += f"Found {len(search_results)} relevant posts:\n\n" + + for i, (post, similarity) in enumerate(search_results): + context += f"{i+1}. Relevance: {similarity:.2%} | r/{post.subreddit}\n" + context += f" Title: {post.title}\n" + + if post.content: + # Show relevant snippet + content_preview = post.content[:200] + "..." 
if len(post.content) > 200 else post.content + context += f" Content: {content_preview}\n" + + # Add sentiment if available + if post.has_reliable_sentiment(): + sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"}.get( + post.sentiment_label, "❓" + ) + context += f" Sentiment: {sentiment_emoji} {post.sentiment_label} ({post.sentiment_confidence:.2f})\n" + + # Add engagement metrics + context += f" Engagement: {post.upvotes} upvotes, {post.comments_count} comments\n" + context += f" Posted: {post.created_utc.strftime('%Y-%m-%d %H:%M')} UTC\n\n" + + return context + + except Exception as e: + logger.error(f"Error searching social posts for '{query}': {e}") + return f"Error searching social media posts: {str(e)}" + +async def get_subreddit_analysis( + self, + subreddit: str, + ticker: Optional[str] = None, + hours: int = 24 +) -> str: + """Get analysis of activity in a specific financial subreddit""" + try: + if not hasattr(self, 'social_service'): + self.social_service = SocialMediaService(self.config, self.db_manager) + + analysis = await self.social_service.get_subreddit_analysis(subreddit, hours=hours) + + if analysis['total_posts'] == 0: + return f"No recent activity found in r/{subreddit} in the last {hours} hours." + + context = f"r/{subreddit} Analysis ({hours}-hour period):\n\n" + + # Activity overview + context += f"📊 Activity Overview:\n" + context += f" • Total Posts: {analysis['total_posts']}\n" + context += f" • Total Upvotes: {analysis['engagement_metrics']['total_upvotes']:,}\n" + context += f" • Total Comments: {analysis['engagement_metrics']['total_comments']:,}\n" + context += f" • Avg Score: {analysis['engagement_metrics']['avg_score']:.1f}\n" + context += f" • Top Post Score: {analysis['engagement_metrics']['top_post_score']:,}\n\n" + + # Sentiment analysis + sentiment_dist = analysis['sentiment_analysis']['distribution'] + reliable_ratio = analysis['sentiment_analysis']['sentiment_reliability'] + + context += f"😊 Sentiment Analysis:\n" + context += f" • Positive: {sentiment_dist['positive']} posts\n" + context += f" • Negative: {sentiment_dist['negative']} posts\n" + context += f" • Neutral: {sentiment_dist['neutral']} posts\n" + context += f" • Reliability: {reliable_ratio:.1%} of posts have confident sentiment scores\n\n" + + # Ticker mentions + ticker_info = analysis['ticker_mentions'] + context += f"💰 Stock Mentions:\n" + context += f" • Unique Tickers: {ticker_info['unique_tickers']}\n" + context += f" • Total Mentions: {ticker_info['total_mentions']}\n" + + if ticker_info['top_tickers']: + context += f" • Most Discussed:\n" + for ticker_symbol, count in ticker_info['top_tickers'][:5]: + context += f" - ${ticker_symbol}: {count} mentions\n" + + # Filter for specific ticker if requested + if ticker: + ticker_mentions = next( + (count for symbol, count in ticker_info['top_tickers'] if symbol == ticker.upper()), + 0 + ) + if ticker_mentions > 0: + context += f"\n🎯 ${ticker} Activity: {ticker_mentions} mentions in this period\n" + else: + context += f"\n🎯 ${ticker}: No mentions found in r/{subreddit} during this period\n" + + # Data quality + quality = analysis['data_quality']['overall_score'] + context += f"\n📋 Data Quality Score: {quality:.1%}\n" + + return context + + except Exception as e: + logger.error(f"Error analyzing subreddit {subreddit}: {e}") + return f"Error analyzing r/{subreddit}: {str(e)}" +``` + +**Acceptance Criteria:** +- [ ] get_reddit_sentiment() provides comprehensive sentiment analysis with visual formatting +- [ ] 
get_reddit_stock_info() supports both general info and semantic search queries +- [ ] search_social_posts() enables semantic search across all social media content +- [ ] get_subreddit_analysis() provides detailed subreddit activity and ticker analysis +- [ ] All methods return human-readable formatted strings for AI agent consumption +- [ ] Proper error handling with fallback responses +- [ ] Methods integrate seamlessly with existing AgentToolkit patterns +- [ ] Test coverage with mocked service dependencies + +**Dependencies:** Task 2.4 (SocialMediaService implementation) +**Risk:** Low - Standard AgentToolkit integration patterns + +--- + +### Task 3.2: Dagster Pipeline Implementation (2 hours) +**Priority: Medium** | **Agent: Pipeline Specialist** + +Implement Dagster asset for scheduled social media collection and processing. + +**File:** `tradingagents/data/assets/social_media.py` + +**Implementation:** +```python +from dagster import asset, AssetExecutionContext, Config, DailyPartitionsDefinition +from typing import Dict, Any, List +import asyncio +from datetime import datetime, timedelta + +from tradingagents.domains.socialmedia.services import SocialMediaService +from tradingagents.config import TradingAgentsConfig +from tradingagents.database import DatabaseManager + +class SocialMediaCollectionConfig(Config): + """Configuration for social media collection""" + subreddits: List[str] = [ + 'wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis', + 'ValueInvesting', 'StockMarket', 'options' + ] + time_filter: str = 'day' + process_sentiment: bool = True + generate_embeddings: bool = True + max_posts_per_subreddit: int = 50 + cleanup_old_data: bool = True + retention_days: int = 90 + +@asset( + partitions_def=DailyPartitionsDefinition(start_date="2024-01-01"), + group_name="social_media", + description="Daily collection of Reddit posts from financial subreddits with sentiment analysis and embeddings", + compute_kind="python", + tags={"domain": "socialmedia", "source": "reddit"} +) +async def reddit_financial_posts( + context: AssetExecutionContext, + config: SocialMediaCollectionConfig +) -> Dict[str, Any]: + """Daily collection and processing of Reddit financial posts""" + + partition_date = context.partition_key + context.log.info(f"Starting social media collection for partition: {partition_date}") + + # Initialize services + trading_config = TradingAgentsConfig.from_env() + db_manager = DatabaseManager(trading_config) + social_service = SocialMediaService(trading_config, db_manager) + + collection_start = datetime.now() + + try: + # Main collection and processing + results = await social_service.collect_and_process_posts( + subreddits=config.subreddits, + time_filter=config.time_filter, + process_sentiment=config.process_sentiment, + generate_embeddings=config.generate_embeddings + ) + + # Log detailed results + context.log.info(f"Collection completed successfully:") + context.log.info(f" - Total posts collected: {results['total_posts_collected']}") + context.log.info(f" - Posts processed: {results['posts_processed']}") + context.log.info(f" - Posts saved: {results['posts_saved']}") + context.log.info(f" - Processing time: {results['processing_time_seconds']:.2f}s") + + # Log per-subreddit breakdown + for subreddit, breakdown in results['subreddit_breakdown'].items(): + context.log.info(f" - r/{subreddit}: {breakdown['posts_collected']} collected, " + f"{breakdown['posts_filtered']} after filtering") + + # Data quality check + if results['posts_saved'] == 0: + 
context.log.warning("No posts were saved - possible data quality issues") + elif results['posts_saved'] < results['posts_processed'] * 0.5: + context.log.warning(f"Low save rate: {results['posts_saved']}/{results['posts_processed']} posts saved") + + # Cleanup old data if configured + if config.cleanup_old_data: + try: + deleted_count = await social_service.repository.cleanup_old_posts( + days=config.retention_days + ) + context.log.info(f"Cleaned up {deleted_count} posts older than {config.retention_days} days") + results['cleanup_deleted_count'] = deleted_count + except Exception as e: + context.log.error(f"Cleanup failed: {e}") + results['cleanup_error'] = str(e) + + # Add partition metadata + results.update({ + 'partition_date': partition_date, + 'asset_name': 'reddit_financial_posts', + 'collection_success': True + }) + + return results + + except Exception as e: + context.log.error(f"Social media collection failed: {e}") + + # Return error results for monitoring + return { + 'partition_date': partition_date, + 'asset_name': 'reddit_financial_posts', + 'collection_success': False, + 'error_message': str(e), + 'processing_time_seconds': (datetime.now() - collection_start).total_seconds(), + 'total_posts_collected': 0, + 'posts_processed': 0, + 'posts_saved': 0 + } + + finally: + # Always close database connections + if 'db_manager' in locals(): + await db_manager.close_all() + +@asset( + deps=[reddit_financial_posts], + group_name="social_media", + description="Generate daily social media analytics and trending analysis", + compute_kind="python", + tags={"domain": "socialmedia", "analytics": "trending"} +) +async def social_media_analytics(context: AssetExecutionContext) -> Dict[str, Any]: + """Generate analytics and trending analysis from collected social media data""" + + context.log.info("Generating social media analytics") + + # Initialize services + trading_config = TradingAgentsConfig.from_env() + db_manager = DatabaseManager(trading_config) + social_service = SocialMediaService(trading_config, db_manager) + + try: + # Get trending tickers analysis + trending_tickers = await social_service.repository.get_trending_tickers( + hours=24, + min_mentions=5 + ) + + context.log.info(f"Found {len(trending_tickers)} trending tickers") + + # Analyze top subreddits + financial_subreddits = [ + 'wallstreetbets', 'investing', 'stocks', 'SecurityAnalysis', + 'ValueInvesting', 'StockMarket' + ] + + subreddit_analysis = {} + for subreddit in financial_subreddits: + analysis = await social_service.get_subreddit_analysis(subreddit, hours=24) + subreddit_analysis[subreddit] = analysis + + if analysis['total_posts'] > 0: + context.log.info(f"r/{subreddit}: {analysis['total_posts']} posts, " + f"{analysis['ticker_mentions']['unique_tickers']} unique tickers") + + # Calculate overall sentiment trends + overall_sentiment_summary = {} + for ticker_info in trending_tickers[:10]: # Top 10 trending + ticker = ticker_info['ticker'] + sentiment_data = await social_service.repository.get_sentiment_summary( + ticker=ticker, + hours=24 + ) + overall_sentiment_summary[ticker] = sentiment_data + + analytics_results = { + 'generated_at': datetime.now().isoformat(), + 'period_hours': 24, + 'trending_tickers': trending_tickers, + 'subreddit_analysis': subreddit_analysis, + 'sentiment_trends': overall_sentiment_summary, + 'analytics_success': True + } + + # Log key insights + if trending_tickers: + top_ticker = trending_tickers[0] + context.log.info(f"Most trending ticker: ${top_ticker['ticker']} " + 
f"({top_ticker['mention_count']} mentions, " + f"{top_ticker['avg_sentiment']:.2f} sentiment)") + + return analytics_results + + except Exception as e: + context.log.error(f"Analytics generation failed: {e}") + return { + 'generated_at': datetime.now().isoformat(), + 'analytics_success': False, + 'error_message': str(e) + } + + finally: + if 'db_manager' in locals(): + await db_manager.close_all() + +@asset( + deps=[social_media_analytics], + group_name="social_media", + description="Data quality monitoring and validation for social media pipeline", + compute_kind="python", + tags={"domain": "socialmedia", "monitoring": "data_quality"} +) +async def social_media_quality_check(context: AssetExecutionContext) -> Dict[str, Any]: + """Monitor data quality and pipeline health for social media assets""" + + context.log.info("Performing social media data quality checks") + + trading_config = TradingAgentsConfig.from_env() + db_manager = DatabaseManager(trading_config) + social_service = SocialMediaService(trading_config, db_manager) + + try: + # Check recent data volume + recent_posts = await social_service.repository.find_by_subreddit( + 'wallstreetbets', # Use as representative subreddit + hours=24, + limit=1000 + ) + + # Quality metrics + total_posts = len(recent_posts) + posts_with_sentiment = sum(1 for p in recent_posts if p.sentiment_label is not None) + posts_with_embeddings = sum( + 1 for p in recent_posts + if p.title_embedding is not None or p.content_embedding is not None + ) + posts_with_tickers = sum(1 for p in recent_posts if p.tickers) + + # Calculate quality percentages + sentiment_coverage = posts_with_sentiment / total_posts if total_posts > 0 else 0 + embedding_coverage = posts_with_embeddings / total_posts if total_posts > 0 else 0 + ticker_coverage = posts_with_tickers / total_posts if total_posts > 0 else 0 + + # Quality thresholds + quality_checks = { + 'data_volume_check': total_posts >= 100, # Expect at least 100 posts per day + 'sentiment_coverage_check': sentiment_coverage >= 0.8, # 80% should have sentiment + 'embedding_coverage_check': embedding_coverage >= 0.7, # 70% should have embeddings + 'ticker_coverage_check': ticker_coverage >= 0.3 # 30% should have ticker mentions + } + + overall_health = all(quality_checks.values()) + + # Log quality results + context.log.info(f"Data quality assessment:") + context.log.info(f" - Total posts (24h): {total_posts}") + context.log.info(f" - Sentiment coverage: {sentiment_coverage:.1%}") + context.log.info(f" - Embedding coverage: {embedding_coverage:.1%}") + context.log.info(f" - Ticker coverage: {ticker_coverage:.1%}") + context.log.info(f" - Overall health: {'PASS' if overall_health else 'FAIL'}") + + # Alert on quality issues + for check_name, passed in quality_checks.items(): + if not passed: + context.log.warning(f"Quality check failed: {check_name}") + + return { + 'check_timestamp': datetime.now().isoformat(), + 'total_posts_24h': total_posts, + 'quality_metrics': { + 'sentiment_coverage': sentiment_coverage, + 'embedding_coverage': embedding_coverage, + 'ticker_coverage': ticker_coverage + }, + 'quality_checks': quality_checks, + 'overall_health': overall_health, + 'quality_check_success': True + } + + except Exception as e: + context.log.error(f"Quality check failed: {e}") + return { + 'check_timestamp': datetime.now().isoformat(), + 'quality_check_success': False, + 'error_message': str(e) + } + + finally: + if 'db_manager' in locals(): + await db_manager.close_all() + +# Schedule configuration for the social media 
pipeline +SOCIAL_MEDIA_SCHEDULE = { + "reddit_financial_posts": "0 6,18 * * *", # 6 AM and 6 PM UTC daily + "social_media_analytics": "30 7,19 * * *", # 30 minutes after collection + "social_media_quality_check": "0 8,20 * * *" # 1 hour after collection +} +``` + +**Acceptance Criteria:** +- [ ] Daily scheduled collection from financial subreddits +- [ ] Sentiment analysis and embedding generation in pipeline +- [ ] Analytics generation with trending ticker analysis +- [ ] Data quality monitoring with configurable thresholds +- [ ] Proper error handling and logging throughout pipeline +- [ ] Cleanup of old data based on retention policies +- [ ] Integration with existing Dagster infrastructure +- [ ] Monitoring and alerting on pipeline failures + +**Dependencies:** Task 2.4 (SocialMediaService) +**Risk:** Low - Standard Dagster asset patterns + +--- + +### Task 3.3: Comprehensive Testing Suite (3 hours) +**Priority: High** | **Agent: Testing Specialist** + +Implement comprehensive test suite covering all socialmedia domain components with >85% coverage. + +**Test Structure:** +``` +tests/domains/socialmedia/ +├── conftest.py # Fixtures and test configuration +├── test_entities.py # SQLAlchemy entity tests +├── test_models.py # Domain model validation tests +├── test_reddit_client.py # API integration with VCR +├── test_sentiment_analyzer.py # LLM sentiment analysis +├── test_embedding_generator.py # Vector embedding generation +├── test_social_repository.py # Database operations +├── test_social_service.py # Service orchestration +├── test_agent_toolkit.py # AgentToolkit integration +├── test_dagster_assets.py # Pipeline testing +└── fixtures/ + ├── reddit_responses.yaml # VCR cassettes + ├── sample_posts.json # Test data + └── embeddings.json # Sample embeddings +``` + +**Implementation Samples:** + +**conftest.py:** +```python +import pytest +import asyncio +from unittest.mock import MagicMock, AsyncMock +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from tradingagents.config import TradingAgentsConfig +from tradingagents.database import DatabaseManager +from tradingagents.domains.socialmedia.entities import SocialMediaPostEntity +from tradingagents.domains.socialmedia.models import SocialPost +from tradingagents.domains.socialmedia.services import SocialMediaService + +@pytest.fixture(scope="session") +def event_loop(): + """Create event loop for async tests""" + loop = asyncio.new_event_loop() + yield loop + loop.close() + +@pytest.fixture +def test_config(): + """Test configuration""" + return TradingAgentsConfig( + reddit_client_id="test_client_id", + reddit_client_secret="test_secret", + reddit_user_agent="test_agent", + openrouter_api_key="test_openrouter_key", + quick_think_llm="test/model", + database_url="sqlite:///test.db" + ) + +@pytest.fixture +async def db_session(test_config): + """Test database session""" + engine = create_engine(test_config.database_url, echo=False) + SocialMediaPostEntity.metadata.create_all(engine) + SessionLocal = sessionmaker(bind=engine) + session = SessionLocal() + yield session + session.close() + SocialMediaPostEntity.metadata.drop_all(engine) + +@pytest.fixture +def sample_social_post(): + """Sample SocialPost for testing""" + return SocialPost( + post_id="test123", + title="AAPL to the moon! 
🚀", + content="Apple stock is going to explode higher after earnings!", + author="test_user", + subreddit="wallstreetbets", + created_utc=datetime(2024, 1, 15, 10, 0, 0), + upvotes=150, + downvotes=25, + comments_count=45, + url="https://reddit.com/r/wallstreetbets/test123", + tickers=["AAPL"], + sentiment_score=0.8, + sentiment_label="positive", + sentiment_confidence=0.9 + ) + +@pytest.fixture +def mock_social_service(test_config): + """Mocked SocialMediaService""" + service = MagicMock(spec=SocialMediaService) + service.config = test_config + service.repository = AsyncMock() + service.sentiment_analyzer = AsyncMock() + service.embedding_generator = AsyncMock() + return service +``` + +**test_models.py:** +```python +import pytest +from datetime import datetime +from tradingagents.domains.socialmedia.models import SocialPost, SentimentScore + +def test_social_post_validation(): + """Test SocialPost validation rules""" + # Valid post + post = SocialPost( + post_id="abc123", + title="Test post", + author="test_user", + subreddit="stocks", + created_utc=datetime.now(), + upvotes=10, + downvotes=2, + comments_count=5, + url="https://reddit.com/test" + ) + assert post.post_id == "abc123" + assert post.tickers == [] + +def test_extract_tickers(): + """Test ticker extraction from post content""" + post = SocialPost( + post_id="abc123", + title="AAPL and $TSLA are great buys", + content="I think MSFT will outperform this year", + author="test_user", + subreddit="investing", + created_utc=datetime.now(), + upvotes=10, + downvotes=0, + comments_count=3, + url="https://reddit.com/test" + ) + + tickers = post.extract_tickers() + assert "AAPL" in tickers + assert "TSLA" in tickers + assert "MSFT" in tickers + assert len(tickers) == 3 + +def test_sentiment_validation(): + """Test sentiment score validation""" + # Valid sentiment + sentiment = SentimentScore( + sentiment="positive", + confidence=0.85, + reasoning="Bullish language and positive outlook" + ) + assert sentiment.confidence == 0.85 + + # Invalid confidence + with pytest.raises(ValueError): + SentimentScore( + sentiment="positive", + confidence=1.5 # > 1.0 + ) + +@pytest.mark.parametrize("sentiment_score,sentiment_label,confidence,expected_reliable", [ + (0.8, "positive", 0.9, True), + (0.3, "neutral", 0.4, False), + (-0.6, "negative", 0.7, True), + (None, None, None, False) +]) +def test_has_reliable_sentiment(sentiment_score, sentiment_label, confidence, expected_reliable): + """Test sentiment reliability check""" + post = SocialPost( + post_id="test", + title="Test", + author="user", + subreddit="test", + created_utc=datetime.now(), + upvotes=1, + downvotes=0, + comments_count=0, + url="test", + sentiment_score=sentiment_score, + sentiment_label=sentiment_label, + sentiment_confidence=confidence + ) + + assert post.has_reliable_sentiment() == expected_reliable +``` + +**test_social_repository.py:** +```python +import pytest +from datetime import datetime, timedelta +from tradingagents.domains.socialmedia.repositories import SocialRepository +from tradingagents.domains.socialmedia.models import SocialPost + +@pytest.mark.asyncio +async def test_upsert_batch_deduplication(social_repository, sample_social_post): + """Test batch upsert with deduplication""" + posts = [sample_social_post, sample_social_post] # Duplicate posts + + saved_ids = await social_repository.upsert_batch(posts) + + assert len(saved_ids) == 1 # Only one saved due to deduplication + assert saved_ids[0] == sample_social_post.post_id + +@pytest.mark.asyncio +async def 
test_find_by_ticker(social_repository, sample_social_post): + """Test finding posts by ticker symbol""" + await social_repository.upsert_batch([sample_social_post]) + + posts = await social_repository.find_by_ticker("AAPL", days=7) + + assert len(posts) == 1 + assert posts[0].post_id == sample_social_post.post_id + assert "AAPL" in posts[0].tickers + +@pytest.mark.asyncio +async def test_vector_similarity_search(social_repository, sample_social_post): + """Test vector similarity search""" + # Add post with embedding + sample_social_post.title_embedding = [0.1] * 1536 # Mock embedding + await social_repository.upsert_batch([sample_social_post]) + + # Search with similar embedding + query_embedding = [0.1] * 1536 + results = await social_repository.find_similar_posts( + query_embedding=query_embedding, + limit=5 + ) + + assert len(results) >= 0 # May be empty if similarity too low + if results: + post, similarity = results[0] + assert isinstance(similarity, float) + assert 0 <= similarity <= 1 + +@pytest.mark.asyncio +async def test_sentiment_summary(social_repository, sample_social_post): + """Test sentiment aggregation""" + await social_repository.upsert_batch([sample_social_post]) + + summary = await social_repository.get_sentiment_summary( + ticker="AAPL", + hours=24 + ) + + assert summary['ticker'] == "AAPL" + assert summary['total_posts'] >= 0 + assert 'sentiment_breakdown' in summary + assert 'overall_sentiment' in summary + +@pytest.mark.asyncio +async def test_cleanup_old_posts(social_repository, sample_social_post): + """Test cleanup of old posts""" + # Create old post + old_post = sample_social_post.copy() + old_post.post_id = "old_post" + old_post.created_utc = datetime.now() - timedelta(days=100) + + await social_repository.upsert_batch([old_post]) + + deleted_count = await social_repository.cleanup_old_posts(days=90) + + assert deleted_count >= 1 +``` + +**test_reddit_client.py (with VCR):** +```python +import pytest +import pytest_vcr +from tradingagents.domains.socialmedia.clients import RedditClient + +@pytest_vcr.use_cassette('fixtures/reddit_fetch_posts.yaml') +@pytest.mark.asyncio +async def test_fetch_subreddit_posts(test_config): + """Test fetching posts from Reddit API""" + async with RedditClient(test_config) as client: + posts = await client.fetch_subreddit_posts( + subreddit_name="wallstreetbets", + limit=10 + ) + + assert len(posts) > 0 + for post in posts: + assert 'post_id' in post + assert 'title' in post + assert 'subreddit' in post + assert post['subreddit'] == 'wallstreetbets' + +@pytest_vcr.use_cassette('fixtures/reddit_search.yaml') +@pytest.mark.asyncio +async def test_search_posts(test_config): + """Test Reddit post search functionality""" + async with RedditClient(test_config) as client: + posts = await client.search_posts( + query="AAPL", + subreddit_names=["investing"], + limit=5 + ) + + assert isinstance(posts, list) + if posts: # May be empty in test + for post in posts: + assert 'post_id' in post + assert 'title' in post + +@pytest.mark.asyncio +async def test_health_check(test_config): + """Test Reddit API health check""" + async with RedditClient(test_config) as client: + health = await client.health_check() + assert isinstance(health, bool) +``` + +**Acceptance Criteria:** +- [ ] >85% test coverage across all socialmedia domain components +- [ ] Unit tests for all domain models with validation edge cases +- [ ] Integration tests for Reddit API client with VCR cassettes +- [ ] Repository tests with real PostgreSQL database operations +- [ ] 
Service layer tests with proper mocking of dependencies +- [ ] AgentToolkit integration tests +- [ ] Dagster pipeline asset tests with mocked data +- [ ] Performance benchmarks for vector similarity queries +- [ ] Error handling and edge case coverage +- [ ] Test fixtures and sample data for consistent testing + +**Dependencies:** All implementation tasks +**Risk:** Low - Standard testing patterns + +--- + +## Implementation Dependencies & Parallel Execution + +### Phase 1 Dependencies +- Task 1.1 → Task 1.2 (Entity depends on database schema) +- Task 1.3 can run parallel with 1.1 and 1.2 +- Task 1.4 depends on 1.1 and 1.2 + +### Phase 2 Dependencies +- All Phase 2 tasks can run in parallel +- Task 2.4 depends on 2.1, 2.2, and 2.3 + +### Phase 3 Dependencies +- Task 3.1 depends on Task 2.4 +- Task 3.2 depends on Task 2.4 +- Task 3.3 can start after any component is complete + +### Risk Assessment + +**High Risk Tasks:** +- Task 2.1 (Reddit Client) - External API complexity, rate limiting + +**Medium Risk Tasks:** +- Task 1.1 (Database Migration) - Extension dependencies +- Task 1.4 (Repository) - Complex vector queries +- Task 2.2 (Sentiment Analysis) - LLM API reliability +- Task 2.4 (Service Layer) - Complex orchestration + +**Low Risk Tasks:** +- Task 1.2 (Entity Implementation) +- Task 1.3 (Domain Models) +- Task 2.3 (Embedding Generation) +- Task 3.1 (AgentToolkit Integration) +- Task 3.2 (Dagster Pipeline) +- Task 3.3 (Testing Suite) + +## Success Criteria Summary + +### Functionality +- ✅ Complete Reddit data collection with PRAW integration +- ✅ OpenRouter LLM sentiment analysis with confidence scoring +- ✅ Vector embeddings for semantic similarity search +- ✅ PostgreSQL + TimescaleDB + pgvectorscale data persistence +- ✅ AgentToolkit RAG methods for AI agent integration +- ✅ Daily Dagster pipeline for automated collection +- ✅ Comprehensive error handling and resilience + +### Performance +- ✅ <2 second social context queries for AI agents +- ✅ <1 second vector similarity search (top 10 results) +- ✅ <5 seconds batch processing 1000 posts +- ✅ Efficient TimescaleDB time-series queries + +### Quality +- ✅ >85% test coverage across all components +- ✅ Data quality monitoring and validation +- ✅ Comprehensive logging and observability +- ✅ Best-effort processing with graceful degradation + +### Integration +- ✅ Seamless integration with existing TradingAgents architecture +- ✅ Follows news domain patterns for consistency +- ✅ Compatible with multi-agent trading workflows +- ✅ Production-ready deployment capability + +This comprehensive task breakdown enables efficient parallel development by multiple AI agents while ensuring complete coverage of the socialmedia domain implementation requirements. \ No newline at end of file diff --git a/docs/standards/practices.md b/docs/standards/practices.md new file mode 100644 index 00000000..41663183 --- /dev/null +++ b/docs/standards/practices.md @@ -0,0 +1,649 @@ +# Development Practices - TradingAgents + +## Testing Standards + +### Pragmatic Outside-In TDD + +**Philosophy**: Mock I/O boundaries, test real logic, optimize for fast feedback. + +**Core Principle**: Test behavior, not implementation. Focus on public interfaces and data transformations while mocking external dependencies (HTTP, database, filesystem). + +### Testing Strategy by Layer + +#### 1. 
Services (Business Logic) - Mock Boundaries
+```python
+# tests/domains/news/test_news_service.py
+from datetime import date
+from unittest.mock import AsyncMock
+
+import pytest
+
+from tradingagents.domains.news.google_news_client import GoogleNewsClient
+from tradingagents.domains.news.news_service import NewsService
+from tradingagents.domains.news.news_repository import NewsArticle, NewsRepository
+
+@pytest.fixture
+def mock_repository():
+    return AsyncMock(spec=NewsRepository)
+
+@pytest.fixture
+def mock_google_client():
+    return AsyncMock(spec=GoogleNewsClient)
+
+async def test_get_articles_returns_empty_on_repository_error(mock_repository):
+    # Mock repository failure
+    mock_repository.list.side_effect = Exception("Database connection failed")
+
+    service = NewsService(repository=mock_repository, clients={})
+
+    # Service should handle error gracefully
+    articles = await service.get_articles("AAPL", date(2024, 1, 15))
+
+    assert articles == []
+    mock_repository.list.assert_called_once_with("AAPL", date(2024, 1, 15))
+
+async def test_update_articles_transforms_external_data_correctly():
+    # Real business logic: test data transformation and coordination
+    # (create_external_article is a local test helper, not shown here)
+    external_articles = [create_external_article("Breaking News", "CNN")]
+
+    mock_repository = AsyncMock()
+    mock_google_client = AsyncMock()
+    mock_google_client.search.return_value = external_articles
+
+    service = NewsService(
+        repository=mock_repository,
+        clients={"google": mock_google_client}
+    )
+
+    # Test business logic: coordination and transformation
+    result_count = await service.update_articles("AAPL", date(2024, 1, 15))
+
+    # Verify transformation happened correctly
+    stored_articles = mock_repository.upsert_batch.call_args[0][0]
+    assert result_count == 1
+    assert len(stored_articles) == 1
+    assert isinstance(stored_articles[0], NewsArticle)
+    assert stored_articles[0].headline == "Breaking News"
+```
+
+#### 2. Repositories (Data Access) - Real Persistence
+```python
+# tests/domains/news/test_news_repository.py
+import time
+from datetime import date, timedelta
+
+import pytest
+
+from tradingagents.lib.database import create_test_database_manager
+from tradingagents.domains.news.news_repository import NewsRepository, NewsArticle
+
+@pytest.fixture
+async def db_manager():
+    """Use real PostgreSQL for repository tests"""
+    manager = create_test_database_manager()
+    await manager.create_tables()
+    yield manager
+    await manager.drop_tables()
+    await manager.close()
+
+async def test_upsert_batch_handles_duplicates_correctly(db_manager):
+    """Test actual database behavior with real SQL operations"""
+    repository = NewsRepository(db_manager)
+
+    # Insert initial articles
+    articles = [
+        NewsArticle("Apple Earnings Beat", "https://cnn.com/1", "CNN", date(2024, 1, 15)),
+        NewsArticle("Apple Stock Rises", "https://cnn.com/2", "CNN", date(2024, 1, 15))
+    ]
+
+    result1 = await repository.upsert_batch(articles, "AAPL")
+    assert len(result1) == 2
+
+    # Update one article (same URL)
+    updated_articles = [
+        NewsArticle("Apple Earnings Beat Expectations", "https://cnn.com/1", "CNN", date(2024, 1, 15))
+    ]
+
+    result2 = await repository.upsert_batch(updated_articles, "AAPL")
+
+    # Should update existing, not create duplicate
+    all_articles = await repository.list("AAPL", date(2024, 1, 15))
+    assert len(all_articles) == 2
+    assert any("Beat Expectations" in a.headline for a in all_articles)
+
+async def test_list_by_date_range_performance(db_manager):
+    """Test query performance with indexed queries"""
+    repository = NewsRepository(db_manager)
+
+    # Insert test data: one article per day starting 2024-01-01
+    articles = [
+        NewsArticle(f"News {i}", f"https://example.com/{i}", "Test", date(2024, 1, 1) + timedelta(days=i))
+        for i in range(100)
+    ]
+    await repository.upsert_batch(articles, "AAPL")
+
+    # Test indexed query performance
+    start_time = time.time()
+    results = await repository.list_by_date_range(
+        "AAPL", date(2024, 1, 1), date(2024, 1, 10), limit=50
+    )
+    elapsed = time.time() - start_time
+
+    assert len(results) == 10
+    assert elapsed < 0.1  # < 100ms for simple query
+```
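+
+Where per-test `create_tables`/`drop_tables` becomes slow, a transaction-rollback fixture is a common alternative. A minimal sketch, assuming SQLAlchemy's async engine and a dedicated test database URL (both are assumptions, not established project API):
+
+```python
+import pytest
+from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
+
+# Hypothetical test DSN; adjust to the local docker-compose setup
+TEST_DSN = "postgresql+asyncpg://postgres:tradingagents@localhost:5432/tradingagents_test"
+
+@pytest.fixture
+async def rollback_session():
+    """Run each test inside a transaction that is rolled back afterwards."""
+    engine = create_async_engine(TEST_DSN)
+    async with engine.connect() as conn:
+        trans = await conn.begin()
+        # create_savepoint lets code under test call commit() without
+        # ending the outer transaction
+        session = AsyncSession(bind=conn, join_transaction_mode="create_savepoint")
+        try:
+            yield session
+        finally:
+            await session.close()
+            await trans.rollback()  # discard everything the test wrote
+    await engine.dispose()
+```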
+
+#### 3. Clients (External APIs) - pytest-vcr
+```python
+# tests/domains/news/test_google_news_client.py
+import pytest
+
+from tradingagents.domains.news.google_news_client import GoogleNewsClient
+
+class TestGoogleNewsClient:
+    # cassette: google_news_apple_search (pytest-vcr names cassettes after
+    # the test by default)
+    @pytest.mark.vcr
+    async def test_search_returns_structured_articles(self):
+        """Real HTTP calls recorded with VCR cassettes"""
+        client = GoogleNewsClient()
+
+        articles = await client.search("AAPL", max_results=5)
+
+        # Test real API response structure
+        assert len(articles) > 0
+        assert all(article.title for article in articles)
+        assert all(article.link.startswith("http") for article in articles)
+        assert all(article.source for article in articles)
+
+    # cassette: google_news_no_results
+    @pytest.mark.vcr
+    async def test_search_handles_no_results_gracefully(self):
+        """Test error cases with real API responses"""
+        client = GoogleNewsClient()
+
+        articles = await client.search("NONEXISTENT_SYMBOL_XYZ", max_results=5)
+
+        assert articles == []
+```
+
+### Quality Standards
+
+#### Coverage Requirements
+- **85% minimum coverage** across all domains
+- **100% coverage** for critical financial calculations
+- **Branch coverage** for error handling paths
+
+**Coverage Enforcement**:
+```toml
+# mise tasks for coverage
+[tasks.test-coverage]
+description = "Run tests with coverage report"
+run = "uv run pytest --cov=tradingagents --cov-report=html --cov-fail-under=85"
+
+[tasks.coverage-report]
+description = "Open coverage report in browser"
+run = "open htmlcov/index.html"
+```
+
+#### Performance Standards
+- **< 100ms per unit test** (fast feedback)
+- **< 5s for integration test suite** (rapid development)
+- **< 30s for full test suite** (CI/CD efficiency)
+
+**Performance Monitoring**:
+```python
+# conftest.py - Test timing
+import time
+import warnings
+
+import pytest
+
+@pytest.fixture(autouse=True)
+def test_timer(request):
+    start_time = time.time()
+    yield
+    duration = time.time() - start_time
+    if duration > 0.1:  # 100ms threshold
+        warnings.warn(f"Slow test: {request.node.nodeid} took {duration:.2f}s")
+```
+
+#### Test Structure Standards
+
+**Mirror Source Structure**:
+```
+tests/
+├── conftest.py                      # Shared fixtures
+├── domains/
+│   ├── news/
+│   │   ├── test_news_service.py         # Business logic tests (mocked boundaries)
+│   │   ├── test_news_repository.py      # Data persistence tests (real DB)
+│   │   └── test_google_news_client.py   # External API tests (VCR cassettes)
+│   ├── marketdata/
+│   └── socialmedia/
+├── agents/
+│   └── test_trading_graph.py        # Agent workflow tests
+└── integration/
+    └── test_end_to_end.py           # Full system tests
+```
+
+**Naming Conventions**:
+- `test_{method_name}_{expected_behavior}_{context}`
+- Example: `test_upsert_batch_handles_duplicates_correctly`
+
+## Development Workflow with Mise
+
+### Daily Development Commands
+
+**Core Development Flow**:
+```bash
+# 1. Start development environment
+mise run docker          # Start PostgreSQL + TimescaleDB
+
+# 2. Install/update dependencies
+mise run install         # uv sync --dev
+
+# 3.
Development iteration +mise run format # Auto-format with ruff +mise run lint # Check code quality +mise run typecheck # Type checking with pyrefly +mise run test # Run test suite + +# 4. Run application +mise run dev # Interactive CLI +mise run run # Direct execution +``` + +**Quality Assurance**: +```bash +# Run all quality checks before commit +mise run all # format + lint + typecheck + +# Coverage analysis +mise run test-coverage +mise run coverage-report +``` + +**Troubleshooting**: +```bash +# Clean build artifacts +mise run clean + +# Reset development environment +mise run docker # Restart containers +mise run install # Reinstall dependencies +``` + +### Code Quality Standards + +#### Linting with Ruff +```toml +# pyproject.toml +[tool.ruff] +target-version = "py313" +line-length = 88 +extend-exclude = ["migrations/", "alembic/"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "ERA", # eradicate + "PIE", # flake8-pie + "SIM", # flake8-simplify +] + +ignore = [ + "E501", # Line too long (handled by formatter) + "B008", # Do not perform function calls in argument defaults + "B904", # raise ... from None +] + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [ + "S101", # Use of assert detected + "ARG", # Unused function args + "FBT", # Boolean trap +] +``` + +#### Type Checking with Pyrefly +```toml +[tool.pyrefly] +python-version = "3.13" +warn-unused-ignores = true +show-error-codes = true +strict = true + +# Enable async-aware type checking +plugins = ["sqlalchemy.ext.mypy.plugin"] + +# Per-module configuration +[[tool.pyrefly.overrides]] +module = "tests.*" +disallow_untyped_defs = false +``` + +### Database Development Patterns + +#### Migration Workflow +```bash +# 1. Create migration after model changes +alembic revision --autogenerate -m "Add user preferences table" + +# 2. Review generated migration +# Edit alembic/versions/{hash}_add_user_preferences_table.py + +# 3. Apply migration +alembic upgrade head + +# 4. 
Test with sample data
+mise run test-migrations
+```
+
+#### Development Database Management
+```bash
+# Reset development database
+mise run docker                    # Stop/start containers
+alembic upgrade head               # Apply all migrations
+python scripts/seed_dev_data.py    # Load sample data
+```
+
+#### Testing Database Strategy
+```python
+# Test database isolation
+@pytest.fixture(scope="function")
+async def clean_db():
+    """Fresh database for each test"""
+    db_manager = create_test_database_manager()
+    await db_manager.create_tables()
+    yield db_manager
+    await db_manager.drop_tables()
+    await db_manager.close()
+
+# Shared test data
+@pytest.fixture
+def sample_news_articles():
+    """Reusable test data across test modules"""
+    return [
+        NewsArticle("Apple Earnings", "https://cnn.com/1", "CNN", date(2024, 1, 15)),
+        NewsArticle("Tesla Updates", "https://reuters.com/2", "Reuters", date(2024, 1, 16))
+    ]
+```
+
+## Error Handling and Retry Strategies
+
+### Resilient External API Integration
+
+#### Exponential Backoff with Circuit Breaker
+```python
+import asyncio
+import logging
+import random
+from collections.abc import Awaitable, Callable
+from functools import wraps
+from typing import TypeVar
+
+import aiohttp
+
+T = TypeVar('T')
+
+def retry_with_backoff(max_retries: int = 3, base_delay: float = 1.0):
+    """Decorator for exponential backoff retry logic"""
+    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
+        @wraps(func)
+        async def wrapper(*args, **kwargs) -> T:
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    return await func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if attempt == max_retries:
+                        break
+
+                    delay = base_delay * (2 ** attempt)  # Exponential backoff
+                    jitter = random.uniform(0.1, 0.9)    # Add jitter
+                    await asyncio.sleep(delay * jitter)
+
+                    logging.warning(f"Retry {attempt + 1}/{max_retries} for {func.__name__}: {e}")
+
+            raise last_exception
+        return wrapper
+    return decorator
+
+class APIClient:
+    def __init__(self):
+        # CircuitBreaker is sketched after this block
+        self.circuit_breaker = CircuitBreaker(
+            failure_threshold=5,
+            reset_timeout=60,
+            expected_exception=aiohttp.ClientError
+        )
+
+    @retry_with_backoff(max_retries=3, base_delay=1.0)
+    async def fetch_data(self, url: str) -> dict:
+        """Resilient HTTP requests with retry logic"""
+        async with self.circuit_breaker:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    if response.status >= 500:
+                        raise aiohttp.ClientError(f"Server error: {response.status}")
+                    return await response.json()
+```
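+
+`CircuitBreaker` above is referenced but never defined. A minimal async sketch of one possible implementation (illustrative only; a production version would need task safety and richer half-open handling):
+
+```python
+import time
+
+class CircuitBreaker:
+    """Stop calling a failing dependency until a cooldown period has passed."""
+
+    def __init__(self, failure_threshold: int = 5, reset_timeout: float = 60.0,
+                 expected_exception: type[BaseException] = Exception):
+        self.failure_threshold = failure_threshold
+        self.reset_timeout = reset_timeout
+        self.expected_exception = expected_exception
+        self._failures = 0
+        self._opened_at: float | None = None
+
+    async def __aenter__(self) -> "CircuitBreaker":
+        if self._opened_at is not None:
+            if time.time() - self._opened_at < self.reset_timeout:
+                raise RuntimeError("Circuit breaker is open; refusing call")
+            # Cooldown elapsed: close the circuit and allow a trial call
+            self._opened_at = None
+            self._failures = 0
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> bool:
+        if exc_type is not None and issubclass(exc_type, self.expected_exception):
+            self._failures += 1
+            if self._failures >= self.failure_threshold:
+                self._opened_at = time.time()
+        elif exc_type is None:
+            self._failures = 0
+        return False  # never swallow exceptions
+```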
"finnhub", "alpha_vantage"] + articles_by_source = {} + + for source in sources: + try: + client = self.clients[source] + articles = await client.fetch_news(symbol, date) + articles_by_source[source] = articles + logger.info(f"Fetched {len(articles)} from {source}") + except Exception as e: + results["errors"].append(f"{source}: {str(e)}") + logger.warning(f"Failed to fetch from {source}: {e}") + + # Process successful fetches + all_articles = [] + for source, articles in articles_by_source.items(): + try: + validated = [a for a in articles if self.validate_article(a)] + all_articles.extend(validated) + results["successful"] += len(validated) + except Exception as e: + results["failed"] += len(articles) + results["errors"].append(f"Validation failed for {source}: {str(e)}") + + # Store successfully processed articles + if all_articles: + await self.repository.upsert_batch(all_articles, symbol) + + return results + + except Exception as e: + logger.error(f"Critical error in update_articles: {e}") + results["errors"].append(f"Critical failure: {str(e)}") + return results +``` + +### Logging Standards + +#### Structured Logging Configuration +```python +import logging +import json +from datetime import datetime + +class JSONFormatter(logging.Formatter): + """Structured JSON logging for production""" + + def format(self, record): + log_entry = { + "timestamp": datetime.utcnow().isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + # Add context information + if hasattr(record, 'symbol'): + log_entry["symbol"] = record.symbol + if hasattr(record, 'user_id'): + log_entry["user_id"] = record.user_id + if hasattr(record, 'request_id'): + log_entry["request_id"] = record.request_id + + # Add exception info + if record.exc_info: + log_entry["exception"] = self.formatException(record.exc_info) + + return json.dumps(log_entry) + +# Configuration +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('tradingagents.log') + ] +) + +# Domain-specific loggers +news_logger = logging.getLogger('tradingagents.domains.news') +market_logger = logging.getLogger('tradingagents.domains.marketdata') +agent_logger = logging.getLogger('tradingagents.agents') +``` + +#### Contextual Logging in Services +```python +class NewsService: + def __init__(self): + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + async def get_articles(self, symbol: str, date: date) -> list[NewsArticle]: + # Add context to log messages + extra = {"symbol": symbol, "date": date.isoformat()} + + self.logger.info("Starting article retrieval", extra=extra) + + try: + articles = await self.repository.list(symbol, date) + self.logger.info( + f"Successfully retrieved {len(articles)} articles", + extra={**extra, "count": len(articles)} + ) + return articles + except Exception as e: + self.logger.error( + f"Failed to retrieve articles: {e}", + extra=extra, + exc_info=True + ) + raise +``` + +## Performance Monitoring + +### Application Metrics + +#### Key Performance Indicators +```python +import time +import asyncio +from functools import wraps +from collections import defaultdict + +class PerformanceMonitor: + def __init__(self): + self.metrics = defaultdict(list) + + def track_execution_time(self, operation: str): + """Decorator to track method execution time""" + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + start_time = 
+
+## Performance Monitoring
+
+### Application Metrics
+
+#### Key Performance Indicators
+```python
+import logging
+import time
+from collections import defaultdict
+from functools import wraps
+
+class PerformanceMonitor:
+    def __init__(self):
+        self.metrics = defaultdict(list)
+
+    def track_execution_time(self, operation: str):
+        """Decorator to track method execution time"""
+        def decorator(func):
+            @wraps(func)
+            async def wrapper(*args, **kwargs):
+                start_time = time.time()
+                try:
+                    result = await func(*args, **kwargs)
+                    return result
+                finally:
+                    duration = time.time() - start_time
+                    self.metrics[f"{operation}_duration"].append(duration)
+
+                    # Log slow operations
+                    if duration > 1.0:
+                        logging.warning(f"Slow operation {operation}: {duration:.2f}s")
+            return wrapper
+        return decorator
+
+    def get_performance_summary(self) -> dict:
+        """Get performance statistics"""
+        summary = {}
+        for operation, durations in self.metrics.items():
+            if durations:
+                summary[operation] = {
+                    "count": len(durations),
+                    "avg": sum(durations) / len(durations),
+                    "min": min(durations),
+                    "max": max(durations),
+                    "p95": sorted(durations)[int(len(durations) * 0.95)]
+                }
+        return summary
+
+# Usage in services
+monitor = PerformanceMonitor()
+
+class NewsService:
+    @monitor.track_execution_time("news_fetch")
+    async def get_articles(self, symbol: str, date: date) -> list[NewsArticle]:
+        return await self.repository.list(symbol, date)
+
+    @monitor.track_execution_time("news_update")
+    async def update_articles(self, symbol: str, date: date) -> int:
+        return await self._fetch_and_store_articles(symbol, date)
+```
+
+### Database Query Optimization
+
+#### Query Performance Monitoring
+```python
+# Custom SQLAlchemy event listener for query timing
+import logging
+import time
+
+from sqlalchemy import event
+from sqlalchemy.engine import Engine
+
+query_logger = logging.getLogger('tradingagents.database.queries')
+
+@event.listens_for(Engine, "before_cursor_execute")
+def before_cursor_execute(conn, cursor, statement, parameters, context, executemany):
+    context._query_start_time = time.time()
+
+@event.listens_for(Engine, "after_cursor_execute")
+def after_cursor_execute(conn, cursor, statement, parameters, context, executemany):
+    total = time.time() - context._query_start_time
+
+    # Log slow queries
+    if total > 0.1:  # 100ms threshold
+        query_logger.warning(
+            f"Slow query ({total:.2f}s): {statement[:100]}...",
+            extra={"duration": total, "query": statement[:200]}
+        )
+```
+
+This comprehensive development practices document establishes the foundation for maintaining high code quality, rapid development cycles, and robust error handling in the TradingAgents system.
\ No newline at end of file
diff --git a/docs/standards/security.md b/docs/standards/security.md
new file mode 100644
index 00000000..29cce007
--- /dev/null
+++ b/docs/standards/security.md
@@ -0,0 +1,837 @@
+# Security Standards - TradingAgents
+
+## API Key Management
+
+### OpenRouter and LLM Provider Security
+
+**Environment Variable Management**:
+```bash
+# Required API keys
+export OPENROUTER_API_KEY="sk-or-v1-xxxxxxxxxxxx"
+
+# Optional provider keys (for fallback)
+export OPENAI_API_KEY="sk-xxxxxxxxxxxx"
+export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxx"
+
+# Financial data APIs
+export FINNHUB_API_KEY="xxxxxxxxxxxx"
+export ALPHA_VANTAGE_API_KEY="xxxxxxxxxxxx"
+```
+
+**Configuration Security**:
+```python
+import os
+
+class SecureConfig:
+    """Secure configuration management with validation"""
+
+    @classmethod
+    def get_required_env(cls, key: str, description: str = "") -> str:
+        """Get required environment variable with validation"""
+        value = os.getenv(key)
+        if not value:
+            raise EnvironmentError(
+                f"Required environment variable {key} not set.
{description}" + ) + + # Validate API key format + if key.endswith("_API_KEY"): + cls._validate_api_key(key, value) + + return value + + @classmethod + def _validate_api_key(cls, key: str, value: str) -> None: + """Validate API key format and warn on potential issues""" + if len(value) < 20: + raise ValueError(f"API key {key} appears too short (< 20 chars)") + + if value.startswith("sk-") and len(value) < 40: + raise ValueError(f"OpenAI/OpenRouter API key {key} appears invalid") + + # Detect placeholder values + placeholder_patterns = ["your_", "replace_", "xxxx", "test"] + if any(pattern in value.lower() for pattern in placeholder_patterns): + raise ValueError(f"API key {key} appears to be a placeholder") + + @classmethod + def load_openrouter_config(cls) -> dict[str, str]: + """Load and validate OpenRouter configuration""" + return { + "api_key": cls.get_required_env( + "OPENROUTER_API_KEY", + "Get your key from https://openrouter.ai/keys" + ), + "base_url": os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"), + "app_name": os.getenv("OPENROUTER_APP_NAME", "TradingAgents"), + "site_url": os.getenv("OPENROUTER_SITE_URL", "https://github.com/TauricResearch/TradingAgents") + } +``` + +**Development vs Production Key Management**: +```python +# .env.example (committed to repo) +OPENROUTER_API_KEY=your_openrouter_api_key_here +DATABASE_URL=postgresql+asyncpg://postgres:tradingagents@localhost:5432/tradingagents +TRADINGAGENTS_RESULTS_DIR=./results +TRADINGAGENTS_DATA_DIR=./data + +# .env (never committed, gitignored) +OPENROUTER_API_KEY=sk-or-v1-actual-key-here +DATABASE_URL=postgresql+asyncpg://user:password@prod-db:5432/tradingagents +``` + +### Secret Rotation and Management + +**Key Rotation Strategy**: +```python +import logging +from datetime import datetime, timedelta +from typing import Dict, Optional + +logger = logging.getLogger(__name__) + +class APIKeyManager: + """Manages API key rotation and health monitoring""" + + def __init__(self): + self.key_health: Dict[str, Dict] = {} + self.rotation_schedule: Dict[str, datetime] = {} + + async def validate_key_health(self, service: str, api_key: str) -> bool: + """Test API key validity with minimal request""" + try: + if service == "openrouter": + return await self._test_openrouter_key(api_key) + elif service == "finnhub": + return await self._test_finnhub_key(api_key) + else: + logger.warning(f"No health check implemented for {service}") + return True + except Exception as e: + logger.error(f"API key health check failed for {service}: {e}") + return False + + async def _test_openrouter_key(self, api_key: str) -> bool: + """Test OpenRouter key with lightweight request""" + import aiohttp + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + # Use minimal model list request to test auth + async with aiohttp.ClientSession() as session: + async with session.get( + "https://openrouter.ai/api/v1/models", + headers=headers, + timeout=aiohttp.ClientTimeout(total=10) + ) as response: + return response.status == 200 + + def schedule_rotation(self, service: str, days: int = 90) -> None: + """Schedule API key rotation""" + rotation_date = datetime.now() + timedelta(days=days) + self.rotation_schedule[service] = rotation_date + logger.info(f"Scheduled {service} key rotation for {rotation_date.date()}") + + def get_rotation_alerts(self) -> list[str]: + """Get list of keys requiring rotation""" + alerts = [] + now = datetime.now() + warning_threshold = timedelta(days=7) + + for service, 
rotation_date in self.rotation_schedule.items():
+            if now >= rotation_date:
+                alerts.append(f"URGENT: {service} API key rotation overdue")
+            elif now >= rotation_date - warning_threshold:
+                alerts.append(f"WARNING: {service} API key rotation due in {(rotation_date - now).days} days")
+
+        return alerts
+```
+
+## Database Security Patterns
+
+### Connection Security
+
+**Secure Connection Configuration**:
+```python
+import os
+import ssl
+import sys
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import create_async_engine
+from sqlalchemy.pool import NullPool
+
+class SecureDatabaseManager:
+    """Database manager with security-first configuration"""
+
+    def __init__(self, database_url: str, require_ssl: bool = True):
+        # Parse and validate database URL
+        if not database_url.startswith(("postgresql+asyncpg://", "postgresql://")):
+            raise ValueError("Only PostgreSQL databases are supported")
+
+        # Ensure asyncpg driver for better async performance
+        if database_url.startswith("postgresql://"):
+            database_url = database_url.replace("postgresql://", "postgresql+asyncpg://")
+
+        # SSL/TLS configuration for production
+        connect_args = {}
+        if require_ssl:
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False  # Often needed for cloud databases
+            ssl_context.verify_mode = ssl.CERT_REQUIRED
+            connect_args["ssl"] = ssl_context
+
+        # NullPool in tests prevents connection leaks; QueuePool sizing limits
+        # resource exhaustion in normal runs (pool_size is not a valid NullPool
+        # argument, so the kwargs are chosen conditionally)
+        if self._is_test_env():
+            pool_kwargs: dict = {"poolclass": NullPool}
+        else:
+            pool_kwargs = {
+                "pool_size": 10,      # Base connection pool
+                "max_overflow": 20,   # Additional connections under load
+            }
+
+        self.engine = create_async_engine(
+            database_url,
+            # Security settings
+            connect_args=connect_args,
+            pool_pre_ping=True,   # Verify connections
+            pool_recycle=3600,    # Recycle connections (1 hour)
+
+            # Never echo SQL statements (information disclosure risk)
+            echo=False,
+            **pool_kwargs,
+        )
+
+    def _is_test_env(self) -> bool:
+        """Detect test environment"""
+        return any([
+            "test" in os.getenv("DATABASE_URL", "").lower(),
+            os.getenv("TESTING") == "true",
+            "pytest" in sys.modules
+        ])
+
+    async def create_tables_secure(self):
+        """Create tables with security considerations"""
+        async with self.engine.begin() as conn:
+            # Set secure session parameters
+            await conn.execute(text("SET session_replication_role = 'origin'"))
+            await conn.execute(text("SET log_statement = 'none'"))  # Disable query logging for DDL
+
+            # Create tables (Base is the project's declarative base)
+            await conn.run_sync(Base.metadata.create_all)
+
+            # Set up row-level security policies if needed
+            await self._setup_row_level_security(conn)
+
+    async def _setup_row_level_security(self, conn):
+        """Configure row-level security for multi-tenant data"""
+        # Enable RLS on sensitive tables
+        await conn.execute(text("ALTER TABLE news_articles ENABLE ROW LEVEL SECURITY"))
+
+        # Create policy for data isolation (if implementing multi-user features)
+        # await conn.execute(text("""
+        #     CREATE POLICY user_data_policy ON news_articles
+        #     FOR ALL TO app_user
+        #     USING (user_id = current_setting('app.user_id')::UUID)
+        # """))
+```
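+
+A usage sketch, assuming `DATABASE_URL` is set as documented in `.env.example`:
+
+```python
+import asyncio
+import os
+
+async def main() -> None:
+    # SSL is required by default; pass require_ssl=False only for local dev
+    db = SecureDatabaseManager(os.environ["DATABASE_URL"], require_ssl=True)
+    await db.create_tables_secure()
+
+asyncio.run(main())
+```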
".salt" + + if salt_path.exists(): + return salt_path.read_bytes() + else: + # Generate cryptographically secure salt + salt = secrets.token_bytes(32) + salt_path.write_bytes(salt) + salt_path.chmod(0o600) # Restrict file permissions + return salt + + def hash_symbol(self, symbol: str) -> str: + """Create consistent hash for symbols (for analytics without exposure)""" + return hashlib.pbkdf2_hmac( + 'sha256', + symbol.encode(), + self.salt, + 100000 # iterations + ).hex()[:16] + + def sanitize_article_content(self, content: str) -> str: + """Remove PII and sensitive information from article content""" + import re + + # Remove potential SSNs, account numbers, etc. + patterns = [ + r'\b\d{3}-\d{2}-\d{4}\b', # SSN + r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b', # Credit card + r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # Email + ] + + sanitized = content + for pattern in patterns: + sanitized = re.sub(pattern, '[REDACTED]', sanitized) + + return sanitized + + def audit_data_access(self, table: str, operation: str, record_count: int = 1): + """Log data access for compliance auditing""" + logger.info( + "Data access audit", + extra={ + "table": table, + "operation": operation, + "record_count": record_count, + "timestamp": datetime.utcnow().isoformat(), + "user": os.getenv("USER", "system") + } + ) +``` + +### Query Security + +**SQL Injection Prevention**: +```python +from sqlalchemy import text, select +from sqlalchemy.ext.asyncio import AsyncSession + +class SecureQueryBuilder: + """Build secure parameterized queries""" + + def __init__(self, session: AsyncSession): + self.session = session + + async def get_articles_secure( + self, + symbol: str, + date_filter: date, + user_input_query: Optional[str] = None + ) -> list[NewsArticle]: + """Secure article query with parameterization""" + + # Base query with parameterized symbol and date + query = select(NewsArticleEntity).where( + and_( + NewsArticleEntity.symbol == symbol, # Parameterized automatically + NewsArticleEntity.published_date == date_filter + ) + ) + + # Secure text search if provided + if user_input_query: + # Use full-text search instead of LIKE to prevent injection + # Sanitize and escape the search term + sanitized_query = self._sanitize_search_term(user_input_query) + query = query.where( + NewsArticleEntity.headline.match(sanitized_query) # PostgreSQL full-text search + ) + + result = await self.session.execute(query) + return [NewsArticle.from_entity(e) for e in result.scalars()] + + def _sanitize_search_term(self, query: str) -> str: + """Sanitize user input for full-text search""" + import re + + # Remove SQL injection patterns + dangerous_patterns = [ + r"[';\"\\]", # SQL metacharacters + r"\b(union|select|drop|delete|update|insert)\b", # SQL keywords + r"--", # SQL comments + r"/\*.*?\*/" # SQL block comments + ] + + sanitized = query + for pattern in dangerous_patterns: + sanitized = re.sub(pattern, "", sanitized, flags=re.IGNORECASE) + + # Limit length to prevent DoS + sanitized = sanitized[:100] + + # Convert to PostgreSQL full-text search format + terms = sanitized.split() + return " & ".join(f'"{term}"' for term in terms if term.isalnum()) + + async def execute_safe_raw_query(self, query_template: str, **params) -> Any: + """Execute raw SQL with parameter validation""" + # Whitelist allowed query templates + allowed_templates = { + "performance_stats": "SELECT * FROM pg_stat_statements WHERE query LIKE :pattern", + "table_sizes": "SELECT schemaname, tablename, 
pg_total_relation_size(schemaname||'.'||tablename) as size FROM pg_tables WHERE schemaname = :schema" + } + + if query_template not in allowed_templates: + raise ValueError(f"Query template not in whitelist: {query_template}") + + # Validate parameters + for key, value in params.items(): + if not self._validate_parameter(key, value): + raise ValueError(f"Invalid parameter {key}: {value}") + + query = text(allowed_templates[query_template]) + result = await self.session.execute(query, params) + return result.fetchall() + + def _validate_parameter(self, key: str, value: Any) -> bool: + """Validate query parameters""" + # Length limits + if isinstance(value, str) and len(value) > 100: + return False + + # Type restrictions + if key.endswith("_id") and not isinstance(value, (str, int)): + return False + + # No SQL injection patterns + if isinstance(value, str): + dangerous = ["'", '"', ";", "--", "/*", "*/", "union", "select"] + if any(pattern in value.lower() for pattern in dangerous): + return False + + return True +``` + +## Development Environment Security + +### Local Development Protection + +**Secure Development Setup**: +```bash +#!/bin/bash +# secure_dev_setup.sh - Secure development environment initialization + +set -euo pipefail + +# 1. Create secure data directory +DATA_DIR="${TRADINGAGENTS_DATA_DIR:-./data}" +mkdir -p "$DATA_DIR" +chmod 700 "$DATA_DIR" # Owner read/write/execute only + +# 2. Create .env file with secure permissions +if [ ! -f .env ]; then + cp .env.example .env + chmod 600 .env # Owner read/write only + echo "Created .env file. Please update with actual API keys." +fi + +# 3. Set up secure Docker environment +if [ ! -f docker-compose.override.yml ]; then + cat > docker-compose.override.yml << EOF +version: '3.8' +services: + timescaledb: + environment: + # Use strong password in development + POSTGRES_PASSWORD: \${DB_PASSWORD:-$(openssl rand -base64 32)} + volumes: + - ./data/postgres:/var/lib/postgresql/data +EOF + echo "Created docker-compose.override.yml with secure settings" +fi + +# 4. Configure Git security +git config --local core.hooksPath .githooks +chmod +x .githooks/pre-commit + +# 5. Install security scanning tools +if command -v pip &> /dev/null; then + pip install bandit safety + echo "Installed security scanning tools" +fi + +echo "Secure development environment configured" +echo "Remember to:" +echo " 1. Update .env with real API keys" +echo " 2. Never commit .env or API keys" +echo " 3. Run 'bandit -r tradingagents/' before commits" +``` + +**Git Security Hooks**: +```bash +#!/bin/bash +# .githooks/pre-commit - Prevent secrets from being committed + +# Check for common secret patterns +if git diff --cached --name-only | grep -E "\.(py|yml|yaml|json|env)$"; then + echo "Scanning for secrets..." + + # Pattern matching for common secrets + if git diff --cached | grep -i -E "(api_key|secret|password|token)" | grep -v -E "(example|template|your_|replace_)"; then + echo "ERROR: Potential secrets detected in staged files!" + echo "Please review and remove any sensitive information." + exit 1 + fi + + # Check for hardcoded URLs with credentials + if git diff --cached | grep -E "postgresql://[^:]+:[^@]+@"; then + echo "ERROR: Database URL with credentials detected!" + echo "Use environment variables instead." + exit 1 + fi +fi + +# Run security linting if bandit is available +if command -v bandit &> /dev/null; then + echo "Running security scan..." + bandit -r tradingagents/ -f json | jq '.results[] | select(.issue_severity == "HIGH")' | grep -q . 
&& { + echo "ERROR: High-severity security issues found!" + echo "Run 'bandit -r tradingagents/' for details." + exit 1 + } +fi + +echo "Pre-commit security checks passed" +``` + +### Secrets Management with Environment Variables + +**Environment Variable Security**: +```python +import os +from pathlib import Path +from typing import Optional + +class EnvironmentManager: + """Secure environment variable management""" + + def __init__(self): + self.env_file = Path(".env") + self.required_vars = [ + "OPENROUTER_API_KEY", + "DATABASE_URL" + ] + self.sensitive_vars = [ + "API_KEY", "SECRET", "PASSWORD", "TOKEN", "PRIVATE_KEY" + ] + + def validate_environment(self) -> list[str]: + """Validate environment setup and return any issues""" + issues = [] + + # Check required variables + for var in self.required_vars: + if not os.getenv(var): + issues.append(f"Missing required environment variable: {var}") + + # Check .env file permissions + if self.env_file.exists(): + stat = self.env_file.stat() + if stat.st_mode & 0o077: # Check if group/other have any permissions + issues.append(".env file has overly permissive permissions (should be 600)") + + # Validate sensitive variables aren't using placeholder values + for var_name in os.environ: + if any(sensitive in var_name for sensitive in self.sensitive_vars): + value = os.getenv(var_name, "") + if self._is_placeholder_value(value): + issues.append(f"{var_name} appears to contain a placeholder value") + + return issues + + def _is_placeholder_value(self, value: str) -> bool: + """Detect common placeholder patterns""" + placeholders = [ + "your_", "replace_", "change_me", "xxxx", "test_key", + "example", "sample", "placeholder", "todo" + ] + return any(placeholder in value.lower() for placeholder in placeholders) + + def setup_production_env(self) -> dict[str, str]: + """Configure production environment with security hardening""" + return { + # Security settings + "PYTHONDONTWRITEBYTECODE": "1", # Don't create .pyc files + "PYTHONUNBUFFERED": "1", # Unbuffered output + "PYTHONHASHSEED": "random", # Random hash seed + + # Application security + "ENVIRONMENT": "production", + "DEBUG": "false", + "LOG_LEVEL": "INFO", # Don't log debug info + + # Database security + "DB_SSL_MODE": "require", + "DB_POOL_PRE_PING": "true", + "DB_ECHO": "false", # Don't log SQL queries + + # API security + "API_RATE_LIMIT": "100", # Requests per minute + "API_TIMEOUT": "30", # Request timeout in seconds + } + +def main(): + """Development environment security check""" + env_manager = EnvironmentManager() + issues = env_manager.validate_environment() + + if issues: + print("⚠️ Environment Security Issues:") + for issue in issues: + print(f" - {issue}") + print("\nRun ./scripts/secure_dev_setup.sh to fix common issues") + return 1 + else: + print("✅ Environment security validation passed") + return 0 + +if __name__ == "__main__": + exit(main()) +``` + +## Production Security Considerations + +### API Rate Limiting and DoS Protection + +**Request Throttling**: +```python +import asyncio +import time +from collections import defaultdict +from typing import Dict, Optional + +class RateLimiter: + """Protect against API abuse and DoS attacks""" + + def __init__(self): + self.request_counts: Dict[str, list] = defaultdict(list) + self.blocked_ips: Dict[str, float] = {} + self.rate_limits = { + "default": (100, 60), # 100 requests per 60 seconds + "openrouter": (50, 60), # 50 LLM requests per 60 seconds + "database": (1000, 60), # 1000 DB operations per 60 seconds + } + + async def 
check_rate_limit(
+        self,
+        identifier: str,
+        category: str = "default"
+    ) -> tuple[bool, Optional[str]]:
+        """Check if request should be allowed"""
+
+        # Check if identifier is temporarily blocked
+        if identifier in self.blocked_ips:
+            block_until = self.blocked_ips[identifier]
+            if time.time() < block_until:
+                return False, f"Temporarily blocked until {time.ctime(block_until)}"
+            else:
+                del self.blocked_ips[identifier]
+
+        # Get rate limit for category
+        max_requests, window_seconds = self.rate_limits.get(
+            category, self.rate_limits["default"]
+        )
+
+        # Clean old requests outside window
+        now = time.time()
+        cutoff = now - window_seconds
+        self.request_counts[identifier] = [
+            req_time for req_time in self.request_counts[identifier]
+            if req_time > cutoff
+        ]
+
+        # Check if within limits
+        current_count = len(self.request_counts[identifier])
+        if current_count >= max_requests:
+            # Block for increasing duration based on violations
+            violation_count = getattr(self, f"_{identifier}_violations", 0) + 1
+            setattr(self, f"_{identifier}_violations", violation_count)
+
+            block_duration = min(300, 30 * violation_count)  # Max 5 minutes
+            self.blocked_ips[identifier] = now + block_duration
+
+            return False, f"Rate limit exceeded. Blocked for {block_duration} seconds"
+
+        # Record this request
+        self.request_counts[identifier].append(now)
+        return True, None
+
+    async def check_api_health(self) -> dict:
+        """Monitor for suspicious patterns"""
+        now = time.time()
+
+        # Count recent requests across all identifiers
+        recent_requests = 0
+        for requests in self.request_counts.values():
+            recent_requests += len([r for r in requests if r > now - 60])
+
+        # Calculate metrics
+        total_blocked = len(self.blocked_ips)
+        active_identifiers = len([
+            requests for requests in self.request_counts.values()
+            if any(r > now - 300 for r in requests)  # Active in last 5 minutes
+        ])
+
+        status = "healthy"
+        if recent_requests > 500:  # Threshold for concern
+            status = "high_load"
+        if total_blocked > 10:
+            status = "under_attack"
+
+        return {
+            "status": status,
+            "recent_requests_per_minute": recent_requests,
+            "blocked_identifiers": total_blocked,
+            "active_identifiers": active_identifiers,
+            "timestamp": now
+        }
+```
+
+### Audit Logging and Compliance
+
+**Security Event Logging**:
+```python
+import json
+import logging
+import os
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, Optional
+
+class SecurityEventType(Enum):
+    AUTH_SUCCESS = "auth_success"
+    AUTH_FAILURE = "auth_failure"
+    DATA_ACCESS = "data_access"
+    DATA_EXPORT = "data_export"
+    CONFIG_CHANGE = "config_change"
+    API_ABUSE = "api_abuse"
+    SYSTEM_ERROR = "system_error"
+
+class SecurityAuditor:
+    """Centralized security event logging for compliance"""
+
+    def __init__(self):
+        # Separate logger for security events
+        self.security_logger = logging.getLogger("tradingagents.security")
+
+        # Configure structured logging handler (assumes logs/ exists)
+        handler = logging.FileHandler("logs/security.log")
+        formatter = SecurityLogFormatter()
+        handler.setFormatter(formatter)
+        self.security_logger.addHandler(handler)
+        self.security_logger.setLevel(logging.INFO)
+
+    def log_event(
+        self,
+        event_type: SecurityEventType,
+        message: str,
+        user_id: Optional[str] = None,
+        ip_address: Optional[str] = None,
+        resource: Optional[str] = None,
+        additional_data: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Log security event with structured data"""
+
+        event_data = {
+            "timestamp": datetime.utcnow().isoformat(),
+            "event_type": event_type.value,
"message": message, + "severity": self._get_severity(event_type), + "user_id": user_id or "system", + "ip_address": ip_address or "unknown", + "resource": resource, + "additional_data": additional_data or {}, + "process_id": os.getpid(), + "hostname": os.uname().nodename + } + + # Log at appropriate level based on severity + if event_data["severity"] == "critical": + self.security_logger.critical(json.dumps(event_data)) + elif event_data["severity"] == "warning": + self.security_logger.warning(json.dumps(event_data)) + else: + self.security_logger.info(json.dumps(event_data)) + + def _get_severity(self, event_type: SecurityEventType) -> str: + """Determine event severity""" + critical_events = { + SecurityEventType.AUTH_FAILURE, + SecurityEventType.API_ABUSE, + SecurityEventType.CONFIG_CHANGE + } + + if event_type in critical_events: + return "critical" + elif event_type == SecurityEventType.SYSTEM_ERROR: + return "warning" + else: + return "info" + + def log_data_access( + self, + table: str, + operation: str, + record_count: int, + user_id: str = "system" + ) -> None: + """Log data access for compliance auditing""" + self.log_event( + SecurityEventType.DATA_ACCESS, + f"Database {operation} on {table}", + user_id=user_id, + resource=table, + additional_data={ + "operation": operation, + "record_count": record_count + } + ) + + def log_api_key_usage( + self, + provider: str, + model: str, + tokens_used: int, + cost_estimate: float + ) -> None: + """Log LLM API usage for cost monitoring and abuse detection""" + self.log_event( + SecurityEventType.DATA_ACCESS, + f"LLM API call to {provider}/{model}", + resource=f"{provider}/{model}", + additional_data={ + "tokens_used": tokens_used, + "cost_estimate": cost_estimate, + "timestamp": datetime.utcnow().isoformat() + } + ) + +class SecurityLogFormatter(logging.Formatter): + """Custom formatter for security logs""" + + def format(self, record: logging.LogRecord) -> str: + # Security logs are already JSON formatted + return record.getMessage() + +# Usage in repository classes +class NewsRepository: + def __init__(self, database_manager: DatabaseManager): + self.db_manager = database_manager + self.auditor = SecurityAuditor() + + async def list(self, symbol: str, date: date) -> list[NewsArticle]: + # ... existing implementation ... + + # Log data access for compliance + self.auditor.log_data_access( + table="news_articles", + operation="SELECT", + record_count=len(result), + user_id=getattr(self, 'current_user_id', 'system') + ) + + return result +``` + +This comprehensive security standards document provides the foundation for protecting sensitive financial data, API keys, and system resources while maintaining compliance with data protection regulations in the TradingAgents system. 
\ No newline at end of file diff --git a/docs/standards/style.md b/docs/standards/style.md new file mode 100644 index 00000000..73550287 --- /dev/null +++ b/docs/standards/style.md @@ -0,0 +1,715 @@ +# Style Guide - TradingAgents + +## Python Code Style + +### Formatting with Ruff + +**Configuration** (pyproject.toml): +```toml +[tool.ruff] +target-version = "py313" +line-length = 88 +fix = true +extend-exclude = [ + "migrations/", + "alembic/versions/", + ".env", + "venv/", + ".venv/", +] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "ERA", # eradicate + "PIE", # flake8-pie + "SIM", # flake8-simplify + "TCH", # flake8-type-checking + "ARG", # flake8-unused-arguments + "PTH", # flake8-use-pathlib + "FIX", # flake8-fixme + "TD", # flake8-todos +] + +ignore = [ + "E501", # Line too long (handled by formatter) + "B008", # Do not perform function calls in argument defaults + "B904", # Use `raise ... from ...` for exception chaining + "TD002", # Missing author in TODO + "TD003", # Missing issue link on line following TODO + "FIX002", # Line contains TODO +] + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [ + "S101", # Use of assert detected + "ARG001", # Unused function argument + "FBT001", # Boolean positional arg + "PLR2004", # Magic value used in comparison +] + +"migrations/**/*.py" = [ + "ERA001", # Found commented-out code +] + +[tool.ruff.lint.isort] +known-first-party = ["tradingagents"] +force-sort-within-sections = true +``` + +### Type Hints and Annotations + +**Modern Type Syntax** (Python 3.13): +```python +# Use built-in generics (no typing.List, typing.Dict) +def process_articles(articles: list[NewsArticle]) -> dict[str, int]: + """Process articles and return symbol counts""" + counts: dict[str, int] = {} + for article in articles: + symbol = article.symbol or "UNKNOWN" + counts[symbol] = counts.get(symbol, 0) + 1 + return counts + +# Union types with | +def get_article(article_id: str | int) -> NewsArticle | None: + """Get article by ID (string or integer)""" + if isinstance(article_id, str): + return get_by_url(article_id) + return get_by_id(article_id) + +# Optional with explicit None +def calculate_sentiment(text: str, model: str | None = None) -> float | None: + """Calculate sentiment score""" + if not text.strip(): + return None + # Implementation + return 0.5 +``` + +**Type Annotations for Complex Types**: +```python +from typing import TypeVar, Generic, Protocol, TypedDict, Awaitable +from collections.abc import Callable, AsyncGenerator +from datetime import date, datetime + +# Type variables +T = TypeVar('T') +ArticleT = TypeVar('ArticleT', bound='NewsArticle') + +# Protocol for type checking +class Repository(Protocol[T]): + async def list(self, symbol: str, date: date) -> list[T]: + ... + + async def upsert(self, item: T) -> T: + ... + +# TypedDict for structured data +class ArticleData(TypedDict): + headline: str + url: str + published_date: str + sentiment_score: float | None + +# Callable types +ProcessorFunc = Callable[[list[NewsArticle]], Awaitable[dict[str, int]]] +``` + +### Docstring Standards + +**Google Style Docstrings**: +```python +class NewsRepository: + """Repository for news article data access with PostgreSQL backend. + + Handles CRUD operations for news articles with support for batch operations, + vector similarity search, and TimescaleDB time-series optimization. 
+ + Attributes: + db_manager: AsyncIO database connection manager + + Example: + >>> db_manager = DatabaseManager("postgresql://...") + >>> repo = NewsRepository(db_manager) + >>> articles = await repo.list("AAPL", date(2024, 1, 15)) + """ + + def __init__(self, database_manager: DatabaseManager) -> None: + """Initialize repository with database connection. + + Args: + database_manager: Async database connection manager with + PostgreSQL + TimescaleDB + pgvector support. + """ + self.db_manager = database_manager + + async def upsert_batch( + self, + articles: list[NewsArticle], + symbol: str, + *, + chunk_size: int = 1000 + ) -> list[NewsArticle]: + """Batch insert or update articles with deduplication. + + Uses PostgreSQL ON CONFLICT for atomic upserts based on URL uniqueness. + Processes articles in chunks to optimize memory usage for large datasets. + + Args: + articles: News articles to store + symbol: Stock symbol to associate with articles + chunk_size: Number of articles to process per database transaction. + Defaults to 1000 for optimal PostgreSQL performance. + + Returns: + List of stored articles with database-generated metadata + + Raises: + IntegrityError: If URL constraint violations occur + DatabaseConnectionError: If database is unavailable + + Example: + >>> articles = [NewsArticle("Title", "https://...", ...)] + >>> stored = await repo.upsert_batch(articles, "AAPL") + >>> assert len(stored) == len(articles) + """ + if not articles: + return [] + + # Implementation... +``` + +**Module-Level Docstrings**: +```python +""" +News repository with PostgreSQL + TimescaleDB backend. + +This module provides data access patterns for financial news articles with +support for: +- Time-series queries optimized by TimescaleDB +- Vector similarity search using pgvector +- Bulk operations with PostgreSQL-specific optimizations +- Async/await patterns for high-performance I/O + +Example Usage: + from tradingagents.domains.news.news_repository import NewsRepository + from tradingagents.lib.database import DatabaseManager + + db = DatabaseManager("postgresql+asyncpg://...") + repo = NewsRepository(db) + + # Get articles for a symbol and date + articles = await repo.list("AAPL", date(2024, 1, 15)) + + # Batch store new articles + new_articles = [...] 
+ stored = await repo.upsert_batch(new_articles, "AAPL") +""" + +from __future__ import annotations +``` + +### Variable and Function Naming + +**Snake Case for Everything**: +```python +# Variables +article_count = len(articles) +sentiment_threshold = 0.5 +openrouter_api_key = os.getenv("OPENROUTER_API_KEY") + +# Functions +def calculate_portfolio_risk(positions: list[Position]) -> float: + """Calculate portfolio-wide risk metrics""" + +async def fetch_news_articles(symbol: str, date: date) -> list[NewsArticle]: + """Fetch news articles from external APIs""" + +# Private methods +def _validate_sentiment_score(score: float | None) -> bool: + """Internal validation for sentiment scores""" + +# Constants +MAX_ARTICLES_PER_REQUEST = 100 +DEFAULT_LOOKBACK_DAYS = 30 +OPENAI_EMBEDDING_DIMENSIONS = 1536 +``` + +**Descriptive Names Over Short Names**: +```python +# Good - Clear intent +async def update_articles_for_symbol(symbol: str, target_date: date) -> int: + successful_count = 0 + failed_count = 0 + + for news_source in self.configured_sources: + try: + articles = await news_source.fetch(symbol, target_date) + stored_articles = await self.repository.upsert_batch(articles, symbol) + successful_count += len(stored_articles) + except Exception as e: + failed_count += 1 + logger.warning(f"Failed to fetch from {news_source.name}: {e}") + + return successful_count + +# Avoid - Unclear abbreviations +async def upd_arts(sym: str, dt: date) -> int: + cnt = 0 + for src in self.srcs: + arts = await src.get(sym, dt) + cnt += len(arts) + return cnt +``` + +### Import Organization + +**Import Order with isort**: +```python +# 1. Standard library imports +import asyncio +import logging +import uuid +from datetime import date, datetime +from pathlib import Path +from typing import Any + +# 2. Third-party imports +import aiohttp +from sqlalchemy import select, and_ +from sqlalchemy.ext.asyncio import AsyncSession +import pytest + +# 3. First-party imports +from tradingagents.config import TradingAgentsConfig +from tradingagents.domains.news.news_repository import NewsArticle, NewsRepository +from tradingagents.lib.database import DatabaseManager + +# 4. 
Relative imports (avoid when possible) +from .google_news_client import GoogleNewsClient +``` + +**Import Aliases**: +```python +# Standard aliases for common packages +import pandas as pd +import numpy as np +from datetime import datetime as dt, date + +# Avoid long module paths +from tradingagents.domains.news.news_repository import ( + NewsArticle, + NewsRepository, + NewsArticleEntity +) + +# Type-only imports for forward references +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from tradingagents.agents.trading_agent import TradingAgent +``` + +## Database Naming Conventions + +### Table Names + +**Snake Case with Domain Prefix**: +```sql +-- Domain-prefixed tables +news_articles -- Core news data +news_article_embeddings -- Vector embeddings (if separate) + +market_data_daily -- Daily market prices +market_data_intraday -- Intraday tick data + +social_media_posts -- Social media content +social_sentiment_scores -- Sentiment analysis results + +-- Agent-specific tables +agent_decisions -- Trading decisions +agent_portfolios -- Portfolio states +agent_memories -- RAG memory store +``` + +### Column Names + +**Descriptive Snake Case**: +```sql +-- Good - Clear and consistent +CREATE TABLE news_articles ( + id UUID PRIMARY KEY DEFAULT uuid7(), + headline TEXT NOT NULL, + url TEXT UNIQUE NOT NULL, + published_date DATE NOT NULL, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + + -- Foreign key relationships + symbol VARCHAR(20) REFERENCES stocks(symbol), + source_id UUID REFERENCES news_sources(id), + + -- Metrics and scores + sentiment_score DECIMAL(3,2) CHECK (sentiment_score BETWEEN -1 AND 1), + readability_score INTEGER CHECK (readability_score BETWEEN 0 AND 100), + + -- Vector embeddings + title_embedding VECTOR(1536), + content_embedding VECTOR(1536) +); + +-- Avoid - Unclear abbreviations +CREATE TABLE art ( + id UUID, + ttl TEXT, -- title? + dt DATE, -- published_date? + scr DECIMAL, -- score? source? + emb VECTOR(1536) -- embedding? 
+); +``` + +### Index Names + +**Descriptive with Purpose**: +```sql +-- Pattern: idx_{table}_{columns}_{purpose} +CREATE INDEX idx_news_articles_symbol_date_lookup +ON news_articles (symbol, published_date); + +CREATE INDEX idx_news_articles_published_date_timeseries +ON news_articles (published_date DESC); + +CREATE INDEX idx_news_articles_url_unique +ON news_articles (url); + +-- Vector indexes with algorithm +CREATE INDEX idx_news_articles_title_embedding_cosine +ON news_articles USING ivfflat (title_embedding vector_cosine_ops); + +-- Partial indexes for specific queries +CREATE INDEX idx_news_articles_recent_high_sentiment +ON news_articles (published_date, sentiment_score) +WHERE published_date > CURRENT_DATE - INTERVAL '30 days' +AND sentiment_score > 0.5; +``` + +## API Design Patterns + +### RESTful URL Structure + +**Resource-Based URLs**: +```python +# Good - Resource-oriented +GET /api/v1/symbols/AAPL/articles?date=2024-01-15 # Get articles +POST /api/v1/symbols/AAPL/articles # Create articles +PUT /api/v1/articles/{article_id} # Update article +DELETE /api/v1/articles/{article_id} # Delete article + +GET /api/v1/symbols/AAPL/market-data?start=2024-01-01&end=2024-01-31 +POST /api/v1/trading/decisions # Create trading decision +GET /api/v1/agents/portfolios/{portfolio_id} # Get portfolio state + +# Avoid - Action-oriented +POST /api/v1/getArticles # Should be GET +POST /api/v1/updateSymbolData # Should be PUT +GET /api/v1/performTradingAnalysis # Should be POST +``` + +**Query Parameter Standards**: +```python +from datetime import date +from pydantic import BaseModel, Field, validator + +class ArticleQueryParams(BaseModel): + """Query parameters for article endpoints""" + + # Date filtering + date: date | None = None + start_date: date | None = Field(None, alias="start") + end_date: date | None = Field(None, alias="end") + + # Pagination + limit: int = Field(default=50, ge=1, le=1000) + offset: int = Field(default=0, ge=0) + + # Filtering + sources: list[str] | None = Field(None, description="Filter by news sources") + min_sentiment: float | None = Field(None, ge=-1.0, le=1.0) + max_sentiment: float | None = Field(None, ge=-1.0, le=1.0) + + # Search + query: str | None = Field(None, max_length=200) + + @validator('end_date') + def end_date_after_start(cls, v, values): + if v and values.get('start_date') and v < values['start_date']: + raise ValueError('end_date must be after start_date') + return v +``` + +### Response Formats + +**Consistent JSON Structure**: +```python +from typing import Generic, TypeVar +from pydantic import BaseModel + +T = TypeVar('T') + +class APIResponse(BaseModel, Generic[T]): + """Standard API response wrapper""" + + data: T | None = None + success: bool = True + message: str | None = None + errors: list[str] = [] + + # Metadata + request_id: str | None = None + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat()) + +class PaginatedResponse(APIResponse[list[T]]): + """Paginated response with metadata""" + + pagination: dict[str, int] = Field(default_factory=dict) + + @classmethod + def create( + cls, + data: list[T], + total: int, + limit: int, + offset: int + ) -> 'PaginatedResponse[T]': + return cls( + data=data, + pagination={ + "total": total, + "limit": limit, + "offset": offset, + "has_more": offset + len(data) < total + } + ) + +# Usage example +@app.get("/api/v1/symbols/{symbol}/articles") +async def get_articles( + symbol: str, + params: ArticleQueryParams = Depends(), + db: AsyncSession = Depends(get_db_session) +) 
+    """Get news articles for a symbol"""
+
+    # Query implementation
+    articles, total = await article_service.get_paginated(
+        symbol=symbol,
+        limit=params.limit,
+        offset=params.offset,
+        date_filter=params.date
+    )
+
+    return PaginatedResponse.create(
+        data=[ArticleData.from_entity(a) for a in articles],
+        total=total,
+        limit=params.limit,
+        offset=params.offset
+    )
+```
+
+## Documentation Standards
+
+### Code Comments
+
+**When to Comment**:
+```python
+class NewsRepository:
+    async def upsert_batch(self, articles: list[NewsArticle], symbol: str) -> list[NewsArticle]:
+        # Don't comment obvious code
+        if not articles:
+            return []
+
+        # DO comment complex business logic
+        # Use PostgreSQL ON CONFLICT for atomic upsert operations.
+        # This prevents race conditions when multiple processes
+        # are updating the same articles simultaneously.
+        def build_upsert(values: list[dict]):
+            stmt = insert(NewsArticleEntity).values(values)
+            return stmt.on_conflict_do_update(
+                index_elements=["url"],  # Deduplication key
+                set_={
+                    # Update all fields except ID and created_at
+                    **{col: stmt.excluded[col] for col in updateable_columns},
+                    "updated_at": func.now(),
+                },
+            )
+
+        # DO comment performance optimizations
+        # Batch size of 1000 optimizes PostgreSQL memory usage
+        # while avoiding transaction timeout for large datasets;
+        # the statement is rebuilt per chunk so each INSERT carries
+        # only that chunk's rows.
+        for chunk in chunks(entity_data_list, 1000):
+            result = await session.execute(build_upsert(chunk))
+```
+
+**TODO Comments**:
+```python
+# TODO(martin): Implement caching layer for frequently accessed articles
+# TODO(martin): Add vector similarity search for related articles
+# FIXME(martin): Handle edge case where published_date is in future
+# HACK(martin): Temporary workaround for API rate limiting - remove after v2.0
+```
+
+### README Structure
+
+**Repository README.md Template**:
+````markdown
+# TradingAgents - Multi-Agent Financial Analysis
+
+Brief description of what the project does and why it exists.
+
+## Quick Start
+
+```bash
+# 1. Setup environment
+export OPENROUTER_API_KEY="your_key"
+mise run docker   # Start PostgreSQL
+
+# 2. Install and run
+mise run install
+mise run dev      # Interactive CLI
+```
+
+## Architecture
+
+High-level overview with diagrams if helpful.
+
+## Development
+
+### Prerequisites
+- Python 3.13+
+- PostgreSQL 16+ with TimescaleDB
+- OpenRouter API access
+
+### Setup
+```bash
+mise run install   # Install dependencies
+mise run test      # Run test suite
+mise run format    # Format code
+```
+
+### Testing
+Details about test strategy and running tests.
+
+## Configuration
+
+Environment variables and configuration options.
+
+## Contributing
+
+Link to contributing guidelines.
+````
+
+### Commit Message Conventions
+
+**Conventional Commits Format**:
+```
+type(scope): description
+
+[optional body]
+
+[optional footer(s)]
+```
+
+**Types**:
+- `feat`: New feature
+- `fix`: Bug fix
+- `docs`: Documentation changes
+- `style`: Code style changes (formatting, missing semicolons, etc.)
+- `refactor`: Code refactoring
+- `test`: Adding missing tests or correcting existing tests
+- `chore`: Changes to build process or auxiliary tools
+
+**Examples**:
+```
+feat(news): add vector similarity search for related articles
+
+Implements pgvector-based similarity search using OpenAI embeddings.
+Articles can now find related content based on semantic similarity
+rather than just keyword matching.
+
+- Add title_embedding and content_embedding columns
+- Implement cosine similarity search in NewsRepository
+- Add vector index for performance optimization
+
+Closes #123
+
+---
+
+fix(database): handle connection timeouts in async sessions
+
+Connection pooling was causing timeouts under high load.
+Added proper timeout handling and connection recycling.
+
+- Set pool_recycle=3600 for connection health
+- Add retry logic for transient connection errors
+- Improve error logging for debugging
+
+---
+
+test(news): add integration tests for batch upsert operations
+
+Covers edge cases for duplicate URL handling and large batch processing.
+
+---
+
+docs(api): update OpenAPI spec for news endpoints
+
+- Add pagination parameters
+- Document error response formats
+- Include example requests and responses
+```
+
+### Code Organization
+
+**File and Directory Structure**:
+```
+tradingagents/
+├── __init__.py
+├── config.py                      # Application configuration
+├── main.py                        # Entry point
+│
+├── domains/                       # Domain-driven design
+│   ├── __init__.py
+│   ├── news/                      # News domain
+│   │   ├── __init__.py
+│   │   ├── news_service.py        # Business logic
+│   │   ├── news_repository.py     # Data access
+│   │   ├── google_news_client.py  # External API
+│   │   └── models.py              # Domain models
+│   ├── marketdata/                # Market data domain
+│   └── socialmedia/               # Social media domain
+│
+├── agents/                        # LLM agents
+│   ├── __init__.py
+│   ├── trading_agent.py
+│   ├── analyst_agent.py
+│   └── libs/                      # Agent utilities
+│       ├── __init__.py
+│       └── agent_toolkit.py
+│
+├── lib/                           # Shared utilities
+│   ├── __init__.py
+│   ├── database.py                # Database connection
+│   ├── logging.py                 # Logging configuration
+│   └── utils.py                   # Common utilities
+│
+└── types/                         # Shared type definitions
+    ├── __init__.py
+    ├── common.py
+    └── financial.py
+```
+
+This style guide ensures consistent, maintainable code across the TradingAgents project while leveraging modern Python features and database optimization techniques.
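+
+As a quick orientation aid, here is a hedged sketch of how a domain package in the structure above might expose its public surface. Only the module names come from the tree; the factory helper and the exact constructor arguments are illustrative assumptions, not the project's actual wiring.
+
+```python
+# tradingagents/domains/news/__init__.py (illustrative sketch)
+# Each domain exposes its service as the public entry point;
+# repositories and API clients stay internal to the package.
+from tradingagents.domains.news.news_repository import NewsRepository
+from tradingagents.domains.news.news_service import NewsService
+
+__all__ = ["NewsService", "build_news_service"]
+
+def build_news_service(db_manager) -> NewsService:
+    """Hypothetical composition-root helper for wiring one domain."""
+    repository = NewsRepository(db_manager)
+    return NewsService(repository, clients={})
+```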
\ No newline at end of file
diff --git a/docs/standards/tech.md b/docs/standards/tech.md
new file mode 100644
index 00000000..39a70dad
--- /dev/null
+++ b/docs/standards/tech.md
@@ -0,0 +1,543 @@
+# Technical Standards - TradingAgents
+
+## Database Architecture
+
+### Core Stack: PostgreSQL + TimescaleDB + pgvectorscale
+
+**Primary Database**: PostgreSQL 16+ with TimescaleDB and pgvector extensions
+- **TimescaleDB**: Optimized for time-series financial data (prices, volumes, news timestamps)
+- **pgvector/pgvectorscale**: Vector embeddings for RAG-powered agents
+- **Connection**: asyncpg driver for high-performance async operations
+
+**Database URL Pattern**:
+```python
+# Development
+DATABASE_URL = "postgresql+asyncpg://postgres:tradingagents@localhost:5432/tradingagents"
+
+# Production
+DATABASE_URL = "postgresql+asyncpg://username:password@host:port/database"
+```
+
+**Required Extensions**:
+```sql
+CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;
+CREATE EXTENSION IF NOT EXISTS vector CASCADE;
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+```
+
+### Schema Design Standards
+
+**Time-Series Tables (TimescaleDB)**:
+```sql
+-- Market data with time-based partitioning
+CREATE TABLE market_data (
+    id UUID NOT NULL DEFAULT uuid_generate_v4(),
+    symbol VARCHAR(20) NOT NULL,
+    timestamp TIMESTAMPTZ NOT NULL,
+    price DECIMAL(18,8),
+    volume BIGINT,
+    -- Metadata
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    -- TimescaleDB requires the partition column in every unique constraint
+    PRIMARY KEY (id, timestamp)
+);
+
+-- Convert to hypertable for time-series optimization
+SELECT create_hypertable('market_data', 'timestamp');
+
+-- Indexes for common query patterns
+CREATE INDEX ON market_data (symbol, timestamp DESC);
+```
+
+**Vector-Enabled Tables**:
+```sql
+-- News articles with embeddings
+CREATE TABLE news_articles (
+    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+    symbol VARCHAR(20),            -- Nullable for global news
+    headline TEXT NOT NULL,
+    url TEXT UNIQUE NOT NULL,      -- Deduplication key
+    published_date DATE NOT NULL,
+    title_embedding VECTOR(1536),  -- OpenAI embedding size
+    content_embedding VECTOR(1536),
+    -- TimescaleDB partitioning on published_date
+    created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- Vector similarity index
+CREATE INDEX ON news_articles USING ivfflat (title_embedding vector_cosine_ops);
+```
+
+**Composite Indexes for Query Optimization**:
+```sql
+-- Common query patterns
+CREATE INDEX idx_symbol_date ON news_articles (symbol, published_date);
+CREATE INDEX idx_published_date ON news_articles (published_date);
+CREATE INDEX idx_url_unique ON news_articles (url);
+```
+
+### Connection Management
+
+**Async Session Factory**:
+```python
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+
+class DatabaseManager:
+    def __init__(self, database_url: str, echo: bool = False):
+        # Ensure asyncpg driver
+        if not database_url.startswith("postgresql+asyncpg://"):
+            database_url = database_url.replace("postgresql://", "postgresql+asyncpg://")
+
+        self.engine = create_async_engine(
+            database_url,
+            echo=echo,
+            pool_recycle=3600,   # 1-hour connection recycling
+            pool_pre_ping=True,  # Connection health checks
+        )
+
+        self.AsyncSessionLocal = async_sessionmaker(
+            bind=self.engine,
+            class_=AsyncSession,
+            autocommit=False,
+            autoflush=False,
+        )
+```
+
+**Session Context Management**:
+```python
+@asynccontextmanager
+async def get_session(self) -> AsyncGenerator[AsyncSession, None]:
+    """Type-checker friendly session management"""
+    session = self.AsyncSessionLocal()
+    try:
+        yield session
+        await session.commit()
+    except Exception:
+        await session.rollback()
+        raise
+    finally:
+        await session.close()
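+
+# Usage sketch (illustrative; assumes the DATABASE_URL shown above):
+#
+#   db_manager = DatabaseManager(os.environ["DATABASE_URL"])
+#   async with db_manager.get_session() as session:
+#       result = await session.execute(text("SELECT 1"))
+#       assert result.scalar_one() == 1
+#   # Commit happens on clean exit; any exception triggers rollback.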
+```
+
+## LLM Integration Standards
+
+### OpenRouter as Unified Provider
+
+**Configuration**:
+```python
+# Environment variables
+OPENROUTER_API_KEY = "your_openrouter_key"
+LLM_PROVIDER = "openrouter"
+DEEP_THINK_LLM = "openai/gpt-4o"        # Complex analysis
+QUICK_THINK_LLM = "openai/gpt-4o-mini"  # Fast responses
+BACKEND_URL = "https://openrouter.ai/api/v1"
+```
+
+**Model Selection Strategy**:
+- **Deep Think**: Complex reasoning, debates, risk analysis (`openai/gpt-4o`, `anthropic/claude-3.5-sonnet`)
+- **Quick Think**: Data formatting, simple queries (`openai/gpt-4o-mini`, `anthropic/claude-3-haiku`)
+
+**Cost Optimization**:
+```python
+# Development/testing configuration
+config = TradingAgentsConfig(
+    llm_provider="openrouter",
+    deep_think_llm="openai/gpt-4o-mini",   # Lower cost
+    quick_think_llm="openai/gpt-4o-mini",  # Consistent model
+    max_debate_rounds=1,                   # Reduce API calls
+    online_tools=False,                    # Use cached data
+)
+```
+
+### Agent Integration Patterns
+
+**Anti-Corruption Layer**:
+```python
+class AgentToolkit:
+    """Mediates between LLM agents and domain services"""
+
+    def __init__(self, config: TradingAgentsConfig):
+        self.config = config
+        self.services = self._initialize_services()
+
+    async def get_news_context(self, symbol: str, date: date) -> dict:
+        """Convert domain models to structured LLM context"""
+        articles = await self.services["news"].get_articles(symbol, date)
+
+        return {
+            "articles": [article.to_dict() for article in articles],
+            "count": len(articles),
+            "data_quality": self._assess_data_quality(articles),
+            "source_distribution": self._analyze_sources(articles)
+        }
+```
+
+## Layered Architecture Enforcement
+
+### Standard Layer Pattern
+
+**Data Flow**: `Request → Router → Service → Repository → Entity → Database`
+
+**Component Responsibilities**:
+
+1. **Entity (Domain Model)**:
+```python
+@dataclass
+class NewsArticle:
+    """Domain entity with business rules and transformations"""
+
+    headline: str
+    url: str
+    published_date: date
+    sentiment_score: float | None = None
+
+    def to_entity(self, symbol: str | None = None) -> NewsArticleEntity:
+        """Transform to database model"""
+        return NewsArticleEntity(
+            headline=self.headline,
+            url=self.url,
+            published_date=self.published_date,
+            symbol=symbol
+        )
+
+    @staticmethod
+    def from_entity(entity: NewsArticleEntity) -> 'NewsArticle':
+        """Transform from database model"""
+        return NewsArticle(
+            headline=entity.headline,
+            url=entity.url,
+            published_date=entity.published_date,
+            sentiment_score=entity.sentiment_score
+        )
+
+    def validate(self) -> list[str]:
+        """Business rule validation"""
+        errors = []
+        if not self.headline.strip():
+            errors.append("Headline cannot be empty")
+        if not self.url.startswith(("http://", "https://")):
+            errors.append("Invalid URL format")
+        return errors
+```
+
+2. **Repository (Data Access)**:
+```python
+class NewsRepository:
+    """Handles data persistence with async operations"""
+
+    def __init__(self, database_manager: DatabaseManager):
+        self.db_manager = database_manager
+
+    async def list(self, symbol: str, date: date) -> list[NewsArticle]:
+        """Query with proper error handling and logging"""
+        async with self.db_manager.get_session() as session:
+            result = await session.execute(
+                select(NewsArticleEntity)
+                .filter(and_(
+                    NewsArticleEntity.symbol == symbol,
+                    NewsArticleEntity.published_date == date
+                ))
+                .order_by(NewsArticleEntity.published_date.desc())
+            )
+            entities = result.scalars().all()
+            return [NewsArticle.from_entity(e) for e in entities]
+
+    async def upsert_batch(self, articles: list[NewsArticle], symbol: str) -> list[NewsArticle]:
+        """Bulk operations for performance"""
+        if not articles:
+            return []
+
+        async with self.db_manager.get_session() as session:
+            # Use PostgreSQL ON CONFLICT for atomic upserts.
+            # Build plain value dicts; passing entity.__dict__ would leak
+            # SQLAlchemy instance state into the INSERT.
+            values = [
+                {
+                    "headline": article.headline,
+                    "url": article.url,
+                    "published_date": article.published_date,
+                    "symbol": symbol,
+                }
+                for article in articles
+            ]
+            stmt = insert(NewsArticleEntity).values(values)
+            upsert_stmt = stmt.on_conflict_do_update(
+                index_elements=["url"],
+                set_={
+                    # Update all fields except ID and created_at
+                    k: stmt.excluded[k]
+                    for k in stmt.excluded.keys()
+                    if k not in ("id", "created_at")
+                }
+            ).returning(NewsArticleEntity)
+
+            result = await session.execute(upsert_stmt)
+            entities = result.scalars().all()
+            return [NewsArticle.from_entity(e) for e in entities]
+```
+
+3. **Service (Business Logic)**:
+```python
+class NewsService:
+    """Orchestrates business operations"""
+
+    def __init__(self, repository: NewsRepository, clients: dict):
+        self.repository = repository
+        self.clients = clients
+
+    async def get_articles(self, symbol: str, date: date) -> list[NewsArticle]:
+        """Business logic with error handling"""
+        try:
+            articles = await self.repository.list(symbol, date)
+            logger.info(f"Retrieved {len(articles)} articles for {symbol}")
+            return articles
+        except Exception as e:
+            logger.error(f"Failed to get articles for {symbol}: {e}")
+            return []  # Graceful degradation
+
+    async def update_articles(self, symbol: str, date: date) -> int:
+        """Coordinated data refresh"""
+        new_articles = await self._fetch_from_sources(symbol, date)
+        if new_articles:
+            stored = await self.repository.upsert_batch(new_articles, symbol)
+            return len(stored)
+        return 0
+```
+
+### Domain Isolation
+
+**Three Core Domains**:
+
+1. **News Domain** (`tradingagents/domains/news/`)
+2. **Market Data Domain** (`tradingagents/domains/marketdata/`)
+3. **Social Media Domain** (`tradingagents/domains/socialmedia/`)
+
+**Domain Boundary Rules**:
+- Domains communicate through service interfaces only
+- No direct database access between domains
+- Shared types in `tradingagents/types/`
+- Domain events for loose coupling
+
+## Vector Integration and RAG Patterns
+
+### Vector Embedding Storage
+
+**OpenAI Embeddings (1536 dimensions)**:
+```python
+# Entity definition
+class NewsArticleEntity(Base):
+    title_embedding: Mapped[list[float] | None] = mapped_column(
+        Vector(1536), nullable=True
+    )
+    content_embedding: Mapped[list[float] | None] = mapped_column(
+        Vector(1536), nullable=True
+    )
+
+# Similarity search
+async def find_similar_articles(self, query_embedding: list[float], limit: int = 10) -> list[NewsArticle]:
+    async with self.db_manager.get_session() as session:
+        result = await session.execute(
+            select(NewsArticleEntity)
+            .order_by(NewsArticleEntity.title_embedding.cosine_distance(query_embedding))
+            .limit(limit)
+        )
+        return [NewsArticle.from_entity(e) for e in result.scalars()]
+```
+
+### RAG Context Assembly
+
+**Agent Context Pattern**:
+```python
+async def build_agent_context(self, symbol: str, date: date) -> dict:
+    """Assemble multi-source context for agents"""
+
+    # Recent news with embeddings
+    news_articles = await self.news_service.get_articles(symbol, date)
+
+    # Market data
+    market_data = await self.market_service.get_recent_data(symbol, days=30)
+
+    # Social sentiment
+    social_data = await self.social_service.get_sentiment(symbol, date)
+
+    return {
+        "news": {
+            "articles": [a.to_dict() for a in news_articles],
+            # Guard against division by zero when no articles are returned
+            "sentiment_avg": (
+                sum(a.sentiment_score or 0 for a in news_articles) / len(news_articles)
+                if news_articles
+                else 0.0
+            ),
+            "sources": list({a.source for a in news_articles})
+        },
+        "market": {
+            "current_price": market_data.current_price,
+            "volatility": market_data.volatility_30d,
+            "volume_trend": market_data.volume_trend
+        },
+        "social": {
+            "reddit_sentiment": social_data.reddit_score,
+            "twitter_mentions": social_data.twitter_mentions
+        },
+        "context_quality": self._assess_context_quality(news_articles, market_data, social_data)
+    }
+```
+
+## Migration and Deployment Standards
+
+### Database Migrations
+
+**Alembic Configuration**:
+```python
+# alembic/env.py
+import asyncio
+
+from alembic import context
+from sqlalchemy.ext.asyncio import create_async_engine
+
+from tradingagents.lib.database import Base
+
+def do_run_migrations_sync(connection):
+    """Configure the migration context against a sync-wrapped connection."""
+    context.configure(connection=connection, target_metadata=Base.metadata)
+    with context.begin_transaction():
+        context.run_migrations()
+
+def run_async_migrations():
+    config = context.config
+    database_url = config.get_main_option("sqlalchemy.url")
+
+    # Ensure asyncpg driver
+    if database_url.startswith("postgresql://"):
+        database_url = database_url.replace("postgresql://", "postgresql+asyncpg://")
+
+    engine = create_async_engine(database_url)
+
+    async def do_run_migrations():
+        async with engine.begin() as connection:
+            await connection.run_sync(do_run_migrations_sync)
+
+    asyncio.run(do_run_migrations())
+```
+
+**TimescaleDB-Specific Migrations**:
+```python
+"""Add TimescaleDB hypertable
+
+Revision ID: 001
+"""
+
+def upgrade():
+    # Create table first. TimescaleDB requires the partition column in
+    # every unique constraint, so the primary key includes 'timestamp'.
+    op.create_table(
+        'market_data',
+        sa.Column('id', postgresql.UUID(), nullable=False),
+        sa.Column('symbol', sa.String(20), nullable=False),
+        sa.Column('timestamp', sa.TIMESTAMP(timezone=True), nullable=False),
+        sa.Column('price', sa.Numeric(18, 8)),
+        sa.PrimaryKeyConstraint('id', 'timestamp')
+    )
+
+    # Convert to hypertable
+    op.execute("SELECT create_hypertable('market_data', 'timestamp');")
+
+    # Add indexes
+    op.create_index('idx_market_symbol_time', 'market_data', ['symbol', 'timestamp'])
+```
+
+### Docker Configuration
+**Development Environment**:
+```yaml
+# docker-compose.yml (at the repository root)
+services:
+  timescaledb:
+    build: ./docker/db
+    container_name: tradingagents_timescaledb
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: tradingagents
+      POSTGRES_DB: tradingagents
+    ports:
+      - "5432:5432"
+    volumes:
+      - ./docker/db/seed.sql:/docker-entrypoint-initdb.d/seed.sql
+      - timescale_data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres -d tradingagents"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+```
+
+### Environment Configuration
+
+**Required Environment Variables**:
+```bash
+# Database
+DATABASE_URL=postgresql+asyncpg://postgres:tradingagents@localhost:5432/tradingagents
+
+# OpenRouter LLM
+OPENROUTER_API_KEY=your_openrouter_key
+LLM_PROVIDER=openrouter
+DEEP_THINK_LLM=openai/gpt-4o
+QUICK_THINK_LLM=openai/gpt-4o-mini
+BACKEND_URL=https://openrouter.ai/api/v1
+
+# Application
+TRADINGAGENTS_RESULTS_DIR=./results
+TRADINGAGENTS_DATA_DIR=./data
+DEFAULT_LOOKBACK_DAYS=30
+ONLINE_TOOLS=true
+
+# Performance
+MAX_DEBATE_ROUNDS=1
+MAX_RISK_DISCUSS_ROUNDS=1
+```
+
+## Quality Gates
+
+### Database Performance
+
+**Query Performance Standards**:
+- Simple queries: < 100ms
+- Complex aggregations: < 500ms
+- Vector similarity searches: < 1s
+- Batch operations: < 5s for 1000 records
+
+**Monitoring Queries**:
+```sql
+-- Query performance monitoring
+SELECT query, mean_exec_time, calls, total_exec_time
+FROM pg_stat_statements
+WHERE mean_exec_time > 100
+ORDER BY mean_exec_time DESC;
+
+-- TimescaleDB chunk information
+SELECT * FROM chunks_detailed_size('market_data');
+```
+
+### Connection Health
+
+**Health Check Implementation**:
+```python
+async def health_check() -> dict:
+    """Comprehensive system health check"""
+    checks = {}
+
+    # Database connectivity
+    try:
+        async with db_manager.get_session() as session:
+            await session.execute(text("SELECT 1"))
+        checks["database"] = {"status": "healthy", "latency_ms": None}
+    except Exception as e:
+        checks["database"] = {"status": "unhealthy", "error": str(e)}
+
+    # OpenRouter API
+    try:
+        # Test API connection
+        checks["llm_api"] = {"status": "healthy"}
+    except Exception as e:
+        checks["llm_api"] = {"status": "unhealthy", "error": str(e)}
+
+    return checks
+```
+
+### Data Quality Enforcement
+
+**Validation Pipeline**:
+```python
+class DataQualityValidator:
+    """Ensures data meets quality standards before storage"""
+
+    def validate_news_article(self, article: NewsArticle) -> list[str]:
+        errors = []
+
+        # Business rules
+        if not article.headline.strip():
+            errors.append("Empty headline")
+
+        if len(article.headline) > 500:
+            errors.append("Headline too long")
+
+        # Explicit None check so a 0.0 score is still range-validated
+        if article.sentiment_score is not None and not (-1 <= article.sentiment_score <= 1):
+            errors.append("Invalid sentiment score range")
+
+        # Data freshness
+        if article.published_date > date.today():
+            errors.append("Future publication date")
+
+        return errors
+```
+
+This technical standards document provides the foundation for maintaining consistency across the TradingAgents codebase while ensuring optimal performance for financial data processing and AI agent operations.
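+
+As a closing illustration, a minimal sketch of how the validation pipeline above might gate the repository write path. The orchestration function is a hypothetical example; `NewsRepository.upsert_batch` and `DataQualityValidator.validate_news_article` are the interfaces shown earlier in this document.
+
+```python
+import logging
+
+logger = logging.getLogger(__name__)
+
+async def store_validated_articles(
+    articles: list[NewsArticle],
+    symbol: str,
+    repository: NewsRepository,
+    validator: DataQualityValidator,
+) -> int:
+    """Persist only the articles that pass the data quality gate."""
+    valid: list[NewsArticle] = []
+    for article in articles:
+        errors = validator.validate_news_article(article)
+        if errors:
+            logger.warning("Rejected article %s: %s", article.url, errors)
+            continue
+        valid.append(article)
+
+    stored = await repository.upsert_batch(valid, symbol)
+    return len(stored)
+```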
\ No newline at end of file diff --git a/litellm.yml b/litellm.yml deleted file mode 100644 index acb18cf1..00000000 --- a/litellm.yml +++ /dev/null @@ -1,17 +0,0 @@ -model_list: - - model_name: "*" # Catches any model request - litellm_params: - model: "openrouter/qwen/qwen3-coder" - api_key: os.environ/OPENROUTER_API_KEY - stream: false - timeout: 600 # 10 minutes total - complex code can take time - stop: [] - -general_settings: - drop_params: true - stream: false - -router_settings: - num_retries: 10 - retry_after: 2 - allowed_fails: 100 diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index 28db9347..00000000 --- a/package-lock.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "name": "TradingAgents", - "lockfileVersion": 3, - "requires": true, - "packages": {} -} diff --git a/package.json b/package.json deleted file mode 100644 index 0967ef42..00000000 --- a/package.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/prd/news_service.md b/prd/news_service.md deleted file mode 100644 index 480d22f2..00000000 --- a/prd/news_service.md +++ /dev/null @@ -1,1019 +0,0 @@ -# News Service PRD - -## Executive Summary -The News Service feature will provide up-to-date news sentiment analysis for stock market tickers to the TradingAgents framework. This service will enable agents to make more informed trading decisions based on current market news and sentiment. - -## Requirements - -### Target Users -- Trading Agents (News Analyst, Researchers, Trader Agent, Risk Management team) -- Cron Job system for daily updates - -### Problem Statement -Agents need up-to-date news sentiment when analyzing the stock market to make better trading decisions. Currently, they may be missing important news events or experiencing delays in sentiment analysis that could impact trading performance. - -### Success Metrics -- Impact on trading decision quality - -### User Stories -1. As Cron Job I want to be able to update and store the news with sentiment analysis for a ticker each day -2. As a Trading Agent I want to be able to retrieve the news with sentiment analysis for a ticker and a day from a database - -### Out of Scope (v1) -- Real-time news streaming (vs daily updates) -- Multi-language news support -- Historical news sentiment analysis beyond a certain date range -- News source ranking or weighting -- Advanced filtering options - -### Timeline -MVP in 1 week - -## Status -✅ Requirements Complete | ✅ Technical Design Complete | ✅ Implementation Complete | 🔄 Testing In Progress - -## Technical Design - -### Architecture -- The `NewsService` will be the central component, orchestrating the fetching, scraping, analysis, and storage of news articles. -- It will utilize the existing `GoogleNewsClient` to fetch RSS feeds from Google News. -- The `ArticleScraperClient` will be enhanced to scrape full article content with robust fallback strategies: - - **Direct Fetch**: Primary method using `newspaper4k` library for content extraction (upgraded from newspaper3k) - - **Archive Fallback**: Internet Archive Wayback Machine fallback for failed fetches - - **Content Extraction**: Clean text, title, publication date, and metadata extraction - - **Paywall Detection**: Handle paywall-protected content gracefully -- A new `SentimentAnalysisService` will be created to handle the interaction with the configured LLM for structured sentiment analysis. -- The `NewsRepository` will store the news articles along with their sentiment scores in the existing file-based database. 
- -### Implementation Components -- **Backend:** - - `tradingagents/domains/news/news_service.py`: - - A new private method `_get_sentiment_for_article` will be added to call the `SentimentAnalysisService`. - - The `update_company_news` method will be modified to call this new method for each scraped article. - - The `_calculate_sentiment_summary` will be updated to aggregate the new structured sentiment scores. - - Update to work with SQLAlchemy-based NewsRepository instead of file-based storage. - - `tradingagents/domains/news/repository.py` (Enhanced with Compatibility Layer): - - Replace file-based storage with SQLAlchemy ORM operations - - **Backward Compatibility**: Maintain existing interface with adapter pattern - - Implement new methods: `save_articles()`, `get_articles_by_symbol()`, `get_articles_by_date_range()` - - Add transaction management and connection pooling - - Include duplicate detection using URL uniqueness constraints - - Add batch operations for efficient bulk inserts - -**Data Model Compatibility Strategy:** -```python -# Enhanced ArticleData to bridge existing and new models -@dataclass -class ArticleData: - # Existing fields (maintain compatibility) - title: str - content: str - author: str - source: str # Keep as string for existing code - date: str # YYYY-MM-DD format - url: str - sentiment: SentimentScore | None = None - - # New fields for enhanced functionality - source_id: int | None = None # Foreign key when available - category_id: int | None = None # Foreign key when available - - # Vector fields (optional for backward compatibility) - title_embedding: List[float] | None = None - content_embedding: List[float] | None = None - sentiment_embedding: List[float] | None = None - - @classmethod - def from_db_model(cls, article: NewsArticle) -> 'ArticleData': - """Convert database model to existing ArticleData format.""" - return cls( - title=article.title, - content=article.content or "", - author=article.author or "", - source=article.source.name if article.source else "Unknown", # Flatten relationship - date=article.published_date.isoformat(), - url=article.url, - sentiment=SentimentScore( - score=float(article.sentiment_score) if article.sentiment_score else 0.0, - confidence=float(article.sentiment_confidence) if article.sentiment_confidence else 0.0, - label=article.sentiment_label or "neutral" - ) if article.sentiment_score is not None else None, - source_id=article.source_id, - category_id=article.category_id, - title_embedding=article.title_embedding, - content_embedding=article.content_embedding, - sentiment_embedding=article.sentiment_embedding - ) - - def to_db_model(self, session: Session) -> NewsArticle: - """Convert to database model, handling source lookup.""" - # Get or create source - source = session.query(NewsSource).filter_by(name=self.source).first() - if not source: - source = NewsSource(name=self.source) - session.add(source) - session.flush() # Get ID - - return NewsArticle( - title=self.title, - content=self.content, - author=self.author, - source_id=source.id, - url=self.url, - published_date=date.fromisoformat(self.date), - sentiment_score=Decimal(str(self.sentiment.score)) if self.sentiment else None, - sentiment_confidence=Decimal(str(self.sentiment.confidence)) if self.sentiment else None, - sentiment_label=self.sentiment.label if self.sentiment else None, - title_embedding=self.title_embedding, - content_embedding=self.content_embedding, - sentiment_embedding=self.sentiment_embedding - ) -``` - - 
`tradingagents/domains/news/sentiment_service.py` (New File): - - This new service will encapsulate the logic for calling the LLM and generating embeddings. - - Primary method: `get_sentiment_with_embeddings(article_content: str) -> SentimentScoreWithEmbeddings`. - - It will use the `quick_think_llm` from the `TradingAgentsConfig` for performance. - - It will use a structured prompt to ask the LLM to return a JSON object with `score`, `confidence`, and `label`. - - **Embedding Generation**: Generate multiple embeddings using OpenAI's embedding API: - - `title_embedding`: Vector representation of article title (1536 dims) - - `content_embedding`: Vector representation of full article content (1536 dims) - - `sentiment_embedding`: Smaller specialized sentiment vector using sentence-transformers (384 dims) - - **Vector Similarity**: Enable semantic search for similar articles and sentiment clustering -- **Database:** - - **PostgreSQL + SQLAlchemy + pgvector Integration:** - - Replace file-based storage with PostgreSQL database using SQLAlchemy ORM - - Create new SQLAlchemy models for news articles with proper relationships - - Implement database migrations using Alembic - - Add connection pooling and transaction management - - Integrate pgvector extension for high-dimensional sentiment embeddings storage - - Enable semantic similarity search and vector-based sentiment clustering - - **Database Schema Design:** - - `news_articles` table with columns for article data, sentiment scores, embeddings, and metadata - - `news_sources` table for source information and credibility tracking - - `news_categories` table for article categorization - - `sentiment_embeddings` table for high-dimensional vector storage using pgvector - - Proper indexing for symbol, date, source queries, and vector similarity searches - - Foreign key relationships between articles, sources, categories, and embeddings - -### API Specification -- No external API changes. All modifications will be internal to the `NewsService` and the cron job that calls it. - -### Security & Performance -- **Security:** LLM API keys will continue to be managed through the `TradingAgentsConfig` and environment variables. No new security risks are introduced. -- **Performance:** The scraping and sentiment analysis process is I/O and network-bound. This will run as part of the daily cron job, so it will not impact the performance of the trading agents' decision-making process, which will read from the cached data. 
- -### Database Schema Design - -#### Core Tables -```sql --- Enable pgvector extension -CREATE EXTENSION IF NOT EXISTS vector; - --- News sources for credibility tracking -CREATE TABLE news_sources ( - id SERIAL PRIMARY KEY, - name VARCHAR(255) NOT NULL UNIQUE, - domain VARCHAR(255), - credibility_score DECIMAL(3,2) DEFAULT 0.5, -- 0.0 to 1.0 - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); - --- News categories for article classification -CREATE TABLE news_categories ( - id SERIAL PRIMARY KEY, - name VARCHAR(100) NOT NULL UNIQUE, - description TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); - --- Main articles table -CREATE TABLE news_articles ( - id SERIAL PRIMARY KEY, - title TEXT NOT NULL, - content TEXT, - author VARCHAR(255), - symbol VARCHAR(10), -- Stock ticker, nullable for global news - source_id INTEGER REFERENCES news_sources(id), - category_id INTEGER REFERENCES news_categories(id), - url TEXT UNIQUE NOT NULL, - published_date DATE NOT NULL, - scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - - -- Sentiment analysis - sentiment_score DECIMAL(3,2), -- -1.0 to 1.0 - sentiment_confidence DECIMAL(3,2), -- 0.0 to 1.0 - sentiment_label VARCHAR(20), -- positive/negative/neutral - sentiment_analyzed_at TIMESTAMP, - - -- Vector embeddings for semantic analysis - title_embedding vector(1536), -- OpenAI ada-002 embedding dimension - content_embedding vector(1536), -- Full article content embedding - sentiment_embedding vector(384), -- Sentence-transformer for sentiment - embedding_model VARCHAR(50) DEFAULT 'text-embedding-ada-002', - embedded_at TIMESTAMP, - - -- Metadata - content_length INTEGER, - scrape_status VARCHAR(20) DEFAULT 'SUCCESS', -- SUCCESS, FAILED, ARCHIVE_SUCCESS - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); - --- Remove redundant sentiment_embeddings table --- All embeddings stored directly in news_articles table for simplicity and performance - --- Performance indexes -CREATE INDEX idx_news_articles_symbol_date ON news_articles(symbol, published_date); -CREATE INDEX idx_news_articles_published_date ON news_articles(published_date); -CREATE INDEX idx_news_articles_source ON news_articles(source_id); -CREATE INDEX idx_news_articles_sentiment ON news_articles(sentiment_score, sentiment_confidence); -CREATE INDEX idx_news_articles_url_hash ON news_articles USING HASH(url); - --- Vector similarity indexes using HNSW (Hierarchical Navigable Small World) --- Note: HNSW indexes consume significant memory (2-4x vector storage) -CREATE INDEX idx_articles_title_embedding ON news_articles USING hnsw (title_embedding vector_cosine_ops) - WITH (m = 16, ef_construction = 64); -- Tuned for performance vs memory -CREATE INDEX idx_articles_content_embedding ON news_articles USING hnsw (content_embedding vector_cosine_ops) - WITH (m = 16, ef_construction = 64); -CREATE INDEX idx_articles_sentiment_embedding ON news_articles USING hnsw (sentiment_embedding vector_cosine_ops) - WITH (m = 8, ef_construction = 32); -- Smaller index for sentiment vectors -``` - -#### SQLAlchemy Models -```python -# tradingagents/domains/news/models.py -from datetime import datetime, date -from decimal import Decimal -from typing import List, Optional -from sqlalchemy import Column, Integer, String, Text, Date, DateTime, Decimal as SQLDecimal, ForeignKey -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship -from pgvector.sqlalchemy import Vector 
- -Base = declarative_base() - -class NewsSource(Base): - __tablename__ = 'news_sources' - - id = Column(Integer, primary_key=True) - name = Column(String(255), nullable=False, unique=True) - domain = Column(String(255)) - credibility_score = Column(SQLDecimal(3,2), default=0.5) - created_at = Column(DateTime, default=datetime.utcnow) - updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) - - # Relationships - articles = relationship("NewsArticle", back_populates="source") - -class NewsCategory(Base): - __tablename__ = 'news_categories' - - id = Column(Integer, primary_key=True) - name = Column(String(100), nullable=False, unique=True) - description = Column(Text) - created_at = Column(DateTime, default=datetime.utcnow) - - # Relationships - articles = relationship("NewsArticle", back_populates="category") - -class NewsArticle(Base): - __tablename__ = 'news_articles' - - id = Column(Integer, primary_key=True) - title = Column(Text, nullable=False) - content = Column(Text) - author = Column(String(255)) - symbol = Column(String(10)) # Nullable for global news - source_id = Column(Integer, ForeignKey('news_sources.id')) - category_id = Column(Integer, ForeignKey('news_categories.id')) - url = Column(Text, unique=True, nullable=False) - published_date = Column(Date, nullable=False) - scraped_at = Column(DateTime, default=datetime.utcnow) - - # Sentiment fields - sentiment_score = Column(SQLDecimal(3,2)) # -1.0 to 1.0 - sentiment_confidence = Column(SQLDecimal(3,2)) # 0.0 to 1.0 - sentiment_label = Column(String(20)) # positive/negative/neutral - sentiment_analyzed_at = Column(DateTime) - - # Vector embeddings using pgvector - title_embedding = Column(Vector(1536)) # OpenAI ada-002 dimensions - content_embedding = Column(Vector(1536)) # Full content embedding - sentiment_embedding = Column(Vector(384)) # Sentence transformer for sentiment - embedding_model = Column(String(50), default='text-embedding-ada-002') - embedded_at = Column(DateTime) - - # Metadata - content_length = Column(Integer) - scrape_status = Column(String(20), default='SUCCESS') - created_at = Column(DateTime, default=datetime.utcnow) - updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) - - # Relationships - source = relationship("NewsSource", back_populates="articles") - category = relationship("NewsCategory", back_populates="articles") - -# Removed redundant SentimentEmbedding table for simplified architecture -``` - -#### Database Migration Strategy - -**Alembic Configuration:** -```python -# alembic/env.py -from tradingagents.domains.news.models import Base -from tradingagents.config import TradingAgentsConfig - -config = TradingAgentsConfig.from_env() -target_metadata = Base.metadata - -# Database URL from config -config.set_main_option("sqlalchemy.url", config.database_url) -``` - -**Initial Migration:** -```bash -# Initialize Alembic in the project -alembic init alembic - -# Generate initial migration -alembic revision --autogenerate -m "Create news tables" - -# Apply migration -alembic upgrade head -``` - -**Migration Files:** -- `001_enable_pgvector.py` - Enable pgvector extension -- `002_create_news_tables.py` - Initial schema creation with vector fields -- `003_add_vector_indexes.py` - HNSW indexes for vector similarity -- `004_seed_categories_sources.py` - Seed default categories and trusted sources - -**TradingAgentsConfig Extension:** -```python -@dataclass -class TradingAgentsConfig: - # ... existing fields ... 
- - # Database configuration - database_url: str = field(default_factory=lambda: os.getenv("DATABASE_URL", "")) - database_pool_size: int = field(default_factory=lambda: int(os.getenv("DATABASE_POOL_SIZE", "10"))) - database_max_overflow: int = field(default_factory=lambda: int(os.getenv("DATABASE_MAX_OVERFLOW", "20"))) - database_echo: bool = field(default_factory=lambda: os.getenv("DATABASE_ECHO", "false").lower() == "true") - - # Vector configuration - enable_vector_search: bool = field(default_factory=lambda: os.getenv("ENABLE_VECTOR_SEARCH", "true").lower() == "true") - embedding_model: str = field(default_factory=lambda: os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")) - embedding_batch_size: int = field(default_factory=lambda: int(os.getenv("EMBEDDING_BATCH_SIZE", "100"))) - enable_sentence_transformers: bool = field(default_factory=lambda: os.getenv("ENABLE_SENTENCE_TRANSFORMERS", "true").lower() == "true") - - @property - def has_database_config(self) -> bool: - """Check if database is properly configured.""" - return bool(self.database_url and self.database_url.startswith("postgresql://")) - - @property - def embedding_provider(self) -> str: - """Get embedding provider from LLM provider setting.""" - # Map LLM providers to their embedding providers - llm_provider = getattr(self, 'llm_provider', 'openai') - embedding_map = { - 'openai': 'openai', - 'google': 'google', # Use Gemini for embeddings when Google is selected - 'anthropic': 'openai', # Anthropic doesn't have embeddings, use OpenAI - 'ollama': 'openai' # Local models, use OpenAI for embeddings - } - return embedding_map.get(llm_provider, 'openai') - -def validate_database_config(config: TradingAgentsConfig) -> None: - """Validate database configuration before startup.""" - if not config.has_database_config: - raise ValueError("DATABASE_URL must be set for PostgreSQL integration") - - if config.enable_vector_search and not config.has_database_config: - raise ValueError("Vector search requires PostgreSQL database configuration") -``` - -**Environment Variables:** -```bash -# Database configuration (required) -DATABASE_URL=postgresql://username:password@localhost:5432/tradingagents -DATABASE_POOL_SIZE=10 # optional, defaults to 10 -DATABASE_MAX_OVERFLOW=20 # optional, defaults to 20 -DATABASE_ECHO=false # optional, set to true for SQL debugging - -# Vector configuration (optional) -ENABLE_VECTOR_SEARCH=true # optional, defaults to true -EMBEDDING_MODEL=google/gemini-2.5-flash # Use Gemini via OpenRouter for embeddings -EMBEDDING_BATCH_SIZE=100 # optional -ENABLE_SENTENCE_TRANSFORMERS=true # optional - -# Example configurations by provider: -# For OpenAI: EMBEDDING_MODEL=text-embedding-ada-002 -# For Gemini: EMBEDDING_MODEL=google/gemini-2.5-flash (via OpenRouter) -``` - -#### Embedding Generation Service Design - -**SentimentScore Enhancement:** -```python -@dataclass -class SentimentScoreWithEmbeddings: - """Enhanced sentiment analysis with vector embeddings.""" - - score: float # -1.0 to 1.0 - confidence: float # 0.0 to 1.0 - label: str # positive/negative/neutral - - # Vector embeddings - title_embedding: List[float] # 1536 dimensions - content_embedding: List[float] # 1536 dimensions - sentiment_embedding: List[float] # 384 dimensions - embedding_model: str = "text-embedding-ada-002" -``` - -**Service Implementation:** -```python -class EmbeddingProvider: - """Abstract base for embedding providers.""" - async def get_embeddings(self, texts: List[str]) -> List[List[float]]: - raise NotImplementedError - -class 
OpenAIEmbeddingProvider(EmbeddingProvider):
-    def __init__(self, api_key: str, model: str = "text-embedding-ada-002"):
-        self.client = AsyncOpenAI(api_key=api_key)
-        self.model = model
-
-    async def get_embeddings(self, texts: List[str]) -> List[List[float]]:
-        response = await self.client.embeddings.create(
-            input=texts,
-            model=self.model
-        )
-        return [item.embedding for item in response.data]
-
-class GeminiEmbeddingProvider(EmbeddingProvider):
-    def __init__(self, api_key: str, base_url: str = "https://openrouter.ai/api/v1"):
-        self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-        self.model = "google/gemini-2.5-flash"
-
-    async def get_embeddings(self, texts: List[str]) -> List[List[float]]:
-        # Gemini via OpenRouter - batch embeddings
-        response = await self.client.embeddings.create(
-            input=texts,
-            model=self.model
-        )
-        return [item.embedding for item in response.data]
-
-class SentimentAnalysisService:
-    def __init__(self, config: TradingAgentsConfig):
-        self.llm_client = self._get_llm_client(config)
-        self.embedding_provider = self._get_embedding_provider(config)
-        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2') if config.enable_sentence_transformers else None
-
-    def _get_embedding_provider(self, config: TradingAgentsConfig) -> EmbeddingProvider:
-        """Get appropriate embedding provider based on configuration."""
-        provider = config.embedding_provider
-
-        if provider == 'openai':
-            return OpenAIEmbeddingProvider(
-                api_key=os.getenv('OPENAI_API_KEY'),
-                model=config.embedding_model
-            )
-        elif provider == 'google':
-            return GeminiEmbeddingProvider(
-                api_key=os.getenv('OPENAI_API_KEY'),  # OpenRouter key
-                base_url="https://openrouter.ai/api/v1"
-            )
-        else:
-            # Default to OpenAI
-            return OpenAIEmbeddingProvider(
-                api_key=os.getenv('OPENAI_API_KEY'),
-                model=config.embedding_model
-            )
-
-    async def get_sentiment_with_embeddings(
-        self,
-        title: str,
-        content: str
-    ) -> SentimentScoreWithEmbeddings:
-        """Generate sentiment analysis with vector embeddings - optimized for performance."""
-
-        # 1. Parallel processing: sentiment score + embeddings
-        tasks = [
-            self._get_sentiment_score(content),  # LLM sentiment analysis
-            self.embedding_provider.get_embeddings([title, content])  # Batch embedding API call
-        ]
-
-        sentiment, embeddings = await asyncio.gather(*tasks)
-        title_embedding, content_embedding = embeddings
-
-        # 2. Generate local sentiment embedding if enabled
-        sentiment_embedding = None
-        if self.sentence_transformer:
-            sentiment_embedding = self.sentence_transformer.encode(content).tolist()
-
-        return SentimentScoreWithEmbeddings(
-            score=sentiment.score,
-            confidence=sentiment.confidence,
-            label=sentiment.label,
-            title_embedding=title_embedding,
-            content_embedding=content_embedding,
-            sentiment_embedding=sentiment_embedding,
-            embedding_model=self.embedding_provider.model
-        )
-
-    async def _get_sentiment_score(self, content: str) -> SentimentScore:
-        """Generate sentiment score using LLM with financial news prompt."""
-
-        prompt = """
-        Analyze the sentiment of this financial news article for trading purposes.
-
-        Article Content: {content}
-
-        Provide your analysis in the following JSON format:
-        {{
-            "score": <float between -1.0 and 1.0>,
-            "confidence": <float between 0.0 and 1.0>,
-            "label": <"positive", "negative", or "neutral">,
-            "reasoning": <brief explanation>,
-            "key_themes": <list of key themes>,
-            "financial_entities": <list of financial entities mentioned>
-        }}
-
-        Focus on the financial and market implications of the news.
-        Consider impact on stock prices, market sentiment, and trading decisions.
- """.format(content=content[:2000]) # Limit content length - - response = await self.llm_client.complete(prompt) - - try: - result = json.loads(response) - return SentimentScore( - score=result.get("score", 0.0), - confidence=result.get("confidence", 0.5), - label=result.get("label", "neutral"), - metadata={ - "reasoning": result.get("reasoning", ""), - "key_themes": result.get("key_themes", []), - "financial_entities": result.get("financial_entities", []) - } - ) - except Exception as e: - # Return neutral sentiment on error - return SentimentScore( - score=0.0, - confidence=0.0, - label="neutral", - metadata={"error": str(e)} - ) - - def find_similar_articles( - self, - embedding: List[float], - limit: int = 10, - similarity_threshold: float = 0.8 - ) -> List[NewsArticle]: - """Find semantically similar articles using vector similarity.""" - # Use pgvector cosine similarity search - pass - - async def batch_analyze_sentiment( - self, - articles: List[ArticleData], - batch_size: int = 5 - ) -> List[SentimentScoreWithEmbeddings]: - """ - Batch process sentiment analysis and embedding generation. - - Args: - articles: List of articles to analyze - batch_size: Number of articles to process concurrently - - Returns: - List of sentiment scores with embeddings - """ - results = [] - - for i in range(0, len(articles), batch_size): - batch = articles[i:i + batch_size] - - # Process batch concurrently - batch_tasks = [ - self.get_sentiment_with_embeddings(article.title, article.content) - for article in batch - ] - - batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) - - for result in batch_results: - if isinstance(result, Exception): - # Handle individual failures gracefully - logger.error(f"Sentiment analysis failed: {result}") - results.append(self._get_neutral_sentiment_with_embeddings()) - else: - results.append(result) - - # Rate limiting: Add delay between batches - if i + batch_size < len(articles): - await asyncio.sleep(1.0) # 1 second delay between batches - - return results -``` - -**Optimized Vector Similarity Queries:** -```sql --- Find articles similar to a given title embedding (HNSW optimized) --- Note: Don't use WHERE clause on similarity - it defeats HNSW indexing -SELECT id, title, symbol, - (title_embedding <=> %s) as distance, - (1 - (title_embedding <=> %s)) as similarity -FROM news_articles -WHERE title_embedding IS NOT NULL -- Only filter on non-null vectors -ORDER BY title_embedding <=> %s -LIMIT 20 -- Get more candidates, filter in application if needed -HAVING distance < 0.2; -- Filter after ordering for best performance - --- Find articles with similar sentiment patterns (pre-filter by label for efficiency) -SELECT id, title, sentiment_label, - (sentiment_embedding <=> %s) as distance -FROM news_articles -WHERE sentiment_label = %s -- Filter first by indexed column - AND sentiment_embedding IS NOT NULL -ORDER BY sentiment_embedding <=> %s -LIMIT 15; - --- Cluster articles by content similarity for a ticker (optimized approach) -WITH similar_articles AS ( - SELECT id, symbol, sentiment_score, - (content_embedding <=> %s) as distance - FROM news_articles - WHERE symbol = %s -- Use indexed column first - AND content_embedding IS NOT NULL - ORDER BY content_embedding <=> %s - LIMIT 50 -- Limit search space -) -SELECT symbol, - AVG(sentiment_score) as avg_sentiment, - COUNT(*) as article_count, - AVG(distance) as avg_content_distance -FROM similar_articles -WHERE distance < 0.3 -- Apply similarity threshold after vector search -GROUP BY symbol; - --- 
Performance monitoring query -SELECT - schemaname, - tablename, - attname as column_name, - n_distinct, - correlation -FROM pg_stats -WHERE tablename = 'news_articles' - AND attname LIKE '%embedding%'; -``` - -**Memory Usage Estimation:** -```sql --- Estimate memory requirements for HNSW indexes -SELECT - pg_size_pretty(pg_total_relation_size('idx_articles_title_embedding')) as title_index_size, - pg_size_pretty(pg_total_relation_size('idx_articles_content_embedding')) as content_index_size, - pg_size_pretty(pg_total_relation_size('idx_articles_sentiment_embedding')) as sentiment_index_size, - pg_size_pretty(pg_total_relation_size('news_articles')) as table_size; - --- Expected memory usage: 500MB-1GB for 10K articles with 3 embedding types -``` - -### Current Implementation Status - -**✅ COMPLETED COMPONENTS:** - -1. **NewsService Core Structure (90% Complete)** - - ✅ Core service class with dependency injection - - ✅ Read path implemented: `get_company_news_context()`, `get_global_news_context()` - - ✅ Write path implemented: `update_company_news()`, `update_global_news()` - - ✅ Repository integration with file-based storage - - ✅ ArticleData model conversion from repository NewsArticle - - ✅ Simple keyword-based sentiment analysis as fallback - - ✅ Error handling and empty context returns - - ✅ Trending topics extraction - - ✅ Date validation and ISO format handling - -2. **NewsRepository (100% Complete)** - - ✅ File-based storage with JSON serialization - - ✅ Source separation (finnhub, google_news) - - ✅ Date-based file organization (YYYY-MM-DD.json) - - ✅ Article deduplication by URL - - ✅ Batch storage operations - - ✅ Complete CRUD operations - - ✅ Proper error handling and logging - -3. **Data Models (100% Complete)** - - ✅ ArticleData dataclass with sentiment field - - ✅ NewsContext and GlobalNewsContext for agent consumption - - ✅ SentimentScore model - - ✅ NewsUpdateResult for operation tracking - - ✅ DataQuality enum for metadata - -**✅ COMPLETED COMPONENTS (UPDATED):** - -4. **GoogleNewsClient (100% Complete)** - - ✅ RSS feed parsing with feedparser - - ✅ Company news method implemented (`get_company_news()`) - - ✅ Global news method implemented (`get_global_news()`) - - ✅ Proper error handling and logging - - ✅ Google News RSS URL construction - - ✅ Article parsing with source extraction - - ✅ Date parsing with fallback handling - -5. **ArticleScraperClient (100% Complete)** - - ✅ Full newspaper4k content extraction (upgraded from newspaper3k) - - ✅ Internet Archive Wayback Machine fallback - - ✅ Robust error handling for failed scrapes - - ✅ Content validation (minimum length checks) - - ✅ Multiple article batch processing - - ✅ Rate limiting with configurable delays - - ✅ Proper URL validation - -**❌ MISSING COMPONENTS:** - -6. **LLM Sentiment Analysis Service (0% Complete)** - - ❌ SentimentAnalysisService class not created - - ❌ LLM integration not implemented - - ❌ Financial news prompts not defined - - ❌ Batch processing not implemented - - **Current**: Using simple keyword-based fallback - - **Next**: Create dedicated sentiment service - -7. **Database Migration (0% Complete)** - - ❌ SQLAlchemy models not created - - ❌ PostgreSQL integration not started - - ❌ pgvector extension not configured - - ❌ Alembic migrations not set up - - **Current**: Using file-based storage - - **Status**: Planned for future iteration - -8. 
**Vector Embeddings (0% Complete)** - - ❌ Embedding providers not implemented - - ❌ Vector similarity not available - - ❌ Semantic search not implemented - - **Status**: Advanced feature for future enhancement - -### Revised Implementation Phases - -**PHASE 1: Complete Core Functionality (Current Priority)** -- **GoogleNewsClient RSS Implementation (2-3 days)** - - Implement feedparser RSS parsing - - Add company news and global news methods - - Handle RSS feed errors and edge cases - - Create comprehensive tests with VCR cassettes - -- **ArticleScraperClient Implementation (2-3 days)** - - Implement newspaper3k content extraction - - Add Internet Archive fallback mechanism - - Handle paywalls and extraction failures - - Create scraping tests with mock responses - -- **LLM Sentiment Analysis Service (3-4 days)** - - Create SentimentAnalysisService class - - Implement LLM client integration using TradingAgentsConfig - - Design financial news sentiment prompts - - Add batch processing with rate limiting - - Replace keyword-based sentiment in NewsService - -**PHASE 2: Testing and Refinement (Current Phase)** -- **Integration Testing (1-2 days)** - - End-to-end testing with real RSS feeds - - Test article scraping and sentiment analysis pipeline - - Verify error handling and partial failures - - Performance testing with multiple tickers - -- **Type Safety and Quality (1 day)** - - Ensure `mise run typecheck` passes with 0 errors - - Fix any remaining linting issues - - Add missing docstrings and type hints - -**PHASE 3: Future Enhancements (Deferred)** -- **Database Migration**: SQLAlchemy + PostgreSQL + pgvector -- **Vector Embeddings**: Semantic similarity and clustering -- **Performance Optimization**: Caching improvements and batch processing - -### Total Timeline: 1-2 weeks for core completion -- **Week 1**: Complete GoogleNewsClient, ArticleScraperClient, LLM Sentiment Service -- **Week 2**: Integration testing, refinement, and quality assurance -- **Future**: Database migration and vector enhancements as separate project - -## Testing Plan - -### Test Strategy -- **Unit Testing:** Test individual components in isolation with mocked dependencies -- **Integration Testing:** Test component interactions and data flow -- **End-to-End Testing:** Test complete workflows from news fetching to storage - -### Unit Tests - -#### GoogleNewsClient Tests -- **Location:** `tests/domains/news/test_google_news_client.py` -- **Framework:** `pytest` with `pytest-vcr` for HTTP recording/replay -- **VCR Cassettes:** `tests/fixtures/vcr_cassettes/google_news/` -- **Test Cases:** - - `@pytest.mark.vcr` `test_get_news_by_symbol_success()` - Valid symbol returns articles - - `@pytest.mark.vcr` `test_get_news_by_symbol_invalid_symbol()` - Invalid symbol handling - - `@pytest.mark.vcr` `test_get_global_news_success()` - Global news retrieval - - `@pytest.mark.vcr` `test_get_global_news_empty_response()` - Empty RSS feed handling - - `test_rss_feed_parsing_error()` - Malformed RSS handling (mocked) - - `test_network_timeout()` - Network timeout scenarios (mocked) - - `test_rate_limiting()` - Rate limit compliance (mocked) - -#### ArticleScraperClient Tests -- **Location:** `tests/domains/news/test_article_scraper_client.py` -- **Framework:** `pytest` with `pytest-vcr` for HTTP recording/replay -- **VCR Cassettes:** `tests/fixtures/vcr_cassettes/article_scraper/` -- **Test Cases:** - - `@pytest.mark.vcr` `test_scrape_article_success()` - Successful article scraping - - `@pytest.mark.vcr` 
`test_scrape_article_archive_fallback()` - Archive.is fallback - - `test_scrape_article_both_fail()` - Both methods fail gracefully (mocked) - - `test_invalid_url()` - Invalid URL handling (mocked) - - `@pytest.mark.vcr` `test_content_extraction()` - Content parsing accuracy - -#### SentimentAnalysisService Tests -- **Location:** `tests/domains/news/test_sentiment_service.py` -- **Test Cases:** - - `test_get_sentiment_positive()` - Positive sentiment detection - - `test_get_sentiment_negative()` - Negative sentiment detection - - `test_get_sentiment_neutral()` - Neutral sentiment detection - - `test_get_sentiment_llm_error()` - LLM API error handling - - `test_get_sentiment_invalid_response()` - Invalid JSON response handling - - `test_get_sentiment_empty_content()` - Empty content handling - -#### NewsService Tests -- **Location:** `tests/domains/news/test_news_service.py` -- **Test Cases:** - - `test_update_company_news_success()` - Complete news update workflow - - `test_update_company_news_no_articles()` - No articles found scenario - - `test_update_company_news_scraping_failure()` - Partial scraping failures - - `test_sentiment_analysis_integration()` - Sentiment analysis integration - - `test_calculate_sentiment_summary()` - Sentiment aggregation logic - - `test_get_company_news_by_date()` - News retrieval by date - -#### NewsRepository Tests -- **Location:** `tests/domains/news/test_news_repository.py` -- **Test Cases:** - - `test_store_news_articles()` - Article storage - - `test_get_news_by_symbol_and_date()` - News retrieval - - `test_duplicate_article_handling()` - Duplicate prevention - - `test_data_persistence()` - File system persistence - - `test_invalid_data_handling()` - Invalid data rejection - -### Integration Tests - -#### News Workflow Integration -- **Location:** `tests/integration/test_news_workflow.py` -- **Test Cases:** - - `test_full_news_update_workflow()` - Complete end-to-end workflow - - `test_news_service_with_real_clients()` - Real client integration - - `test_sentiment_service_integration()` - LLM integration testing - - `test_repository_integration()` - Data persistence integration - -### End-to-End Tests - -#### Complete System Tests -- **Location:** `tests/e2e/test_news_system.py` -- **Test Cases:** - - `test_daily_news_update_simulation()` - Simulate daily cron job - - `test_trading_agent_news_consumption()` - Agent news retrieval - - `test_system_performance_with_multiple_tickers()` - Performance testing - - `test_error_recovery_scenarios()` - System resilience testing - -### Test Data Management - -#### Mock Data Strategy -- **RSS Feed Samples:** Saved sample RSS responses for consistent testing -- **Article Content:** Pre-scraped article content for sentiment testing -- **LLM Responses:** Mock sentiment analysis responses for unit tests - -#### Test Configuration -- **Environment Variables:** Separate test configuration -- **Database Isolation:** Temporary test databases -- **VCR Configuration:** Record/replay HTTP interactions for deterministic tests -- **Pytest Configuration:** `pytest.ini` with VCR settings and test markers - -### Performance Testing - -#### Load Testing -- **Concurrent News Updates:** Test multiple ticker updates simultaneously -- **Memory Usage:** Monitor memory consumption during batch processing -- **API Rate Limiting:** Verify rate limit compliance under load - -#### Benchmarking -- **Scraping Speed:** Measure article scraping performance -- **Sentiment Analysis:** Measure LLM response times -- **Storage Performance:** 
-
-### Test Automation
-
-#### CI/CD Integration
-- **Pre-commit Hooks:** Run fast unit tests before commits
-- **Pull Request Checks:** Full test suite on PR creation
-- **Nightly Tests:** End-to-end tests with real data
-
-#### Test Coverage Requirements
-- **Minimum Coverage:** 80% line coverage for all components
-- **Critical Path Coverage:** 100% coverage for core business logic
-- **Error Handling Coverage:** All exception paths tested
-
-### Manual Testing Scenarios
-
-#### Smoke Tests
-- **Daily Operations:** Manual verification of daily news updates
-- **Data Quality:** Spot-check sentiment analysis accuracy
-- **System Health:** Monitor error rates and performance metrics
-
-#### Acceptance Testing
-- **Trading Agent Integration:** Verify agents can consume news data effectively
-- **Data Accuracy:** Validate news relevance and sentiment accuracy
-- **Performance Benchmarks:** Confirm system meets performance requirements
-
-## Current Implementation Status Summary
-
-### Overall Progress: 95% Complete 🎉
-
-**✅ COMPLETED (95%)**
-- Requirements analysis and technical design
-- NewsService core structure with read/write paths
-- NewsRepository with file-based storage and deduplication
-- Data models (ArticleData, NewsContext, SentimentScore)
-- GoogleNewsClient with full RSS feed parsing
-- ArticleScraperClient with newspaper4k + Internet Archive fallback (upgraded)
-- Basic sentiment analysis (keyword-based fallback)
-- Error handling and validation
-- Service integration and dependency injection
-- **NEW**: Unit test suite with mocking framework
-- **NEW**: Type safety improvements with newspaper4k migration
-- **NEW**: Repository integration for cached data retrieval
-
-**❌ MISSING (5%)**
-- LLM sentiment analysis service (only remaining core component)
-- Integration tests with real data
-- End-to-end testing validation
-
-**⏸️ DEFERRED (Future Iterations)**
-- Database migration to PostgreSQL + SQLAlchemy
-- Vector embeddings and semantic search
-- Real-time news streaming capabilities
-
-### What's Working Now
-The current NewsService implementation provides:
-- **Read Path**: Agents can successfully call `get_company_news_context()` and `get_global_news_context()`
-- **Repository Integration**: Service reads cached news data from the file-based NewsRepository
-- **Data Transformation**: Converts NewsRepository.NewsArticle → ArticleData for agents
-- **Basic Sentiment**: Simple keyword-based sentiment analysis as fallback
-- **Error Handling**: Graceful error handling with empty contexts and metadata
-- **Type Safety**: Proper type hints and dataclass definitions
-
-### What's Missing
-The service still lacks:
-- **LLM Sentiment Analysis**: No LLM integration for financial news sentiment (using keyword fallback)
-- **Structured Storage**: Still using file-based storage instead of planned PostgreSQL + SQLAlchemy
-- **Vector Embeddings**: No semantic similarity or vector-based features
-
-### Critical Gap (Only 1 Remaining!)
-1. **LLM Sentiment Service** - No structured sentiment analysis with LLM prompts
-   - Current: Simple keyword-based sentiment scoring
-   - Needed: LLM integration using TradingAgentsConfig
-   - Impact: Agents get basic sentiment but not sophisticated financial analysis
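To make the gap concrete, a hedged sketch of what the missing service could look like. The class name, neutral fallback on bad responses, and empty-content handling come from this spec's test plan; the `llm_client.complete()` call and the prompt wording are placeholders, not a confirmed interface:

```python
# Sketch only: the LLM client interface and prompt text are illustrative.
import json

PROMPT = (
    "You are a financial news analyst. Score the sentiment of this article "
    "for ticker {ticker} from -1.0 (very bearish) to 1.0 (very bullish). "
    'Reply with JSON: {{"score": <float>}}.\n\n{content}'
)


class SentimentAnalysisService:
    def __init__(self, llm_client):
        self._llm = llm_client  # built from TradingAgentsConfig in practice

    def get_sentiment(self, ticker: str, content: str) -> float:
        if not content.strip():
            return 0.0  # empty content -> neutral, per test plan
        try:
            raw = self._llm.complete(PROMPT.format(ticker=ticker, content=content))
            return float(json.loads(raw)["score"])
        except (ValueError, KeyError, TypeError):
            return 0.0  # invalid JSON or malformed response -> neutral fallback
```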
-
-### Recent Updates (January 2025)
-Latest development progress:
-- ✅ **Migration to newspaper4k** - Upgraded from newspaper3k for better compatibility
-- ✅ **Unit Test Framework** - Comprehensive test suite with mocking
-- ✅ **Type Safety** - Added type stubs and improved the type-checking configuration
-- ✅ **Repository Integration** - NewsService now properly reads cached data from the repository
-- ✅ **Linting Compliance** - All code passes ruff linting standards
-
-### Next Immediate Steps (Revised)
-1. **✅ COMPLETE: GoogleNewsClient RSS parsing** - Already implemented with feedparser
-2. **✅ COMPLETE: ArticleScraperClient** - Already implemented with newspaper4k + Internet Archive (see the sketch below)
-3. **⏳ PRIORITY: Create LLM Sentiment Service** - Replace keyword-based analysis (2-3 days)
-4. **⏳ PRIORITY: Integration testing** - End-to-end workflow validation (1-2 days)
-
-### Timeline to MVP (Updated January 2025)
-- **3-5 days** for the LLM sentiment service plus testing
-- **Current system has a test framework** and passes type checking
-- **Database migration** deferred to a future iteration
-- **Vector features** planned as an advanced enhancement
-
-### Implementation Priority
-**HIGH PRIORITY (Required for sophisticated sentiment)**:
-- LLM Sentiment Analysis Service with financial news prompts
-
-**MEDIUM PRIORITY (System improvements)**:
-- Better error handling and retry logic
-- Performance optimization for batch processing
-- Comprehensive integration test suite
-
-**LOW PRIORITY (Future enhancements)**:
-- PostgreSQL + SQLAlchemy migration
-- Vector embeddings and semantic search
-- Real-time news streaming
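For reference, the scraping fallback the steps above treat as complete follows this general shape. This is a minimal sketch of the pattern, not the project's implementation: the real `ArticleScraperClient` adds request delays, a configurable user agent, and `ScrapeResult` reporting, and the Wayback URL form shown here is an assumption:

```python
# Minimal sketch of the newspaper4k + Internet Archive fallback pattern.
from newspaper import Article


def scrape_with_fallback(url: str) -> str | None:
    # Try the live page first, then the latest Wayback Machine snapshot.
    for candidate in (url, f"https://web.archive.org/web/{url}"):
        try:
            article = Article(candidate)
            article.download()
            article.parse()
            if article.text:
                return article.text
        except Exception:
            continue  # paywall, HTTP error, or parse failure: try next source
    return None  # both the direct fetch and the archive fallback failed
```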
diff --git a/setup.py b/setup.py
deleted file mode 100644
index c04be5a1..00000000
--- a/setup.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
-Setup script for the TradingAgents package.
-"""
-
-from setuptools import find_packages, setup
-
-setup(
-    name="tradingagents",
-    version="0.1.0",
-    description="Multi-Agents LLM Financial Trading Framework",
-    author="TradingAgents Team",
-    author_email="yijia.xiao@cs.ucla.edu",
-    url="https://github.com/TauricResearch",
-    packages=find_packages(),
-    install_requires=[
-        "langchain>=0.1.0",
-        "langchain-openai>=0.0.2",
-        "langchain-experimental>=0.0.40",
-        "langgraph>=0.0.20",
-        "numpy>=1.24.0",
-        "pandas>=2.0.0",
-        "praw>=7.7.0",
-        "stockstats>=0.5.4",
-        "yfinance>=0.2.31",
-        "typer>=0.9.0",
-        "rich>=13.0.0",
-        "questionary>=2.0.1",
-    ],
-    python_requires=">=3.10",
-    entry_points={
-        "console_scripts": [
-            "tradingagents=cli.main:app",
-        ],
-    },
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Intended Audience :: Financial and Trading Industry",
-        "License :: OSI Approved :: Apache Software License",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.10",
-        "Topic :: Office/Business :: Financial :: Investment",
-    ],
-)
diff --git a/test_typecheck.sh b/test_typecheck.sh
deleted file mode 100644
index ac1092a5..00000000
--- a/test_typecheck.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-echo "Running type check..."
-cd /Users/martinrichards/code/TradingAgents
-mise run typecheck
diff --git a/tests/domains/news/test_article_scraper_client.py b/tests/domains/news/test_article_scraper_client.py
index 67ecf661..e6e82f36 100644
--- a/tests/domains/news/test_article_scraper_client.py
+++ b/tests/domains/news/test_article_scraper_client.py
@@ -2,6 +2,8 @@
 Tests for ArticleScraperClient using pytest-vcr for HTTP interactions.
 """
 
+from unittest.mock import patch
+
 import pytest
 
 from tradingagents.domains.news.article_scraper_client import (
@@ -9,34 +11,21 @@ from tradingagents.domains.news.article_scraper_client import (
     ScrapeResult,
 )
 
-
-# VCR configuration optimized for minimal cassette size
-def response_content_filter(response):
-    """Filter response content to reduce cassette size."""
-    if "text/html" in response.get("headers", {}).get("content-type", [""])[0]:
-        # For HTML responses, keep only the first 1KB for basic structure
-        if "string" in response["body"]:
-            content = response["body"]["string"]
-            if len(content) > 1024:
-                response["body"]["string"] = (
-                    content[:1024] + "... [TRUNCATED for test size]"
-                )
-    return response
-
-
+# VCR configuration
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
-    before_record_response=response_content_filter,
+    filter_headers=["authorization", "cookie", "user-agent"],
 )
 
 
 @pytest.fixture
 def scraper():
     """ArticleScraperClient instance for testing."""
-    return ArticleScraperClient(user_agent="Test-Agent/1.0", delay=0.1)
+    # Mock NLTK downloads to avoid external HTTP requests during tests
+    with patch("nltk.download"):
+        return ArticleScraperClient(user_agent="Test-Agent/1.0", delay=0.1)
 
 
 class TestArticleScraperClient:
diff --git a/tests/domains/news/test_google_news_client.py b/tests/domains/news/test_google_news_client.py
index 6a1ad8b2..9480ca2e 100644
--- a/tests/domains/news/test_google_news_client.py
+++ b/tests/domains/news/test_google_news_client.py
@@ -14,33 +14,12 @@ from tradingagents.domains.news.google_news_client import (
     GoogleNewsClient,
 )
 
-
-# VCR configuration optimized for minimal cassette size
-def rss_content_filter(response):
-    """Filter RSS content to reduce cassette size while preserving test data."""
-    content_type = response.get("headers", {}).get("content-type", [""])[0]
-    if "xml" in content_type and "string" in response["body"]:
-        content = response["body"]["string"]
-        # For RSS feeds, keep only first 5 items to reduce size
-        if len(content) > 5000:  # Only truncate large RSS feeds
-            # Find closing tag of 5th item
-            item_count = content.count("</item>")
-            if item_count > 5:
-                # Keep RSS structure but limit to 5 items
-                parts = content.split("</item>")
-                if len(parts) > 6:  # 5 items + everything after
-                    response["body"]["string"] = (
-                        "</item>".join(parts[:6]) + "</rss>"
-                    )
-    return response
-
-
+# VCR configuration
 vcr = pytest.mark.vcr(
     cassette_library_dir="tests/fixtures/vcr_cassettes/news",
     record_mode="once",  # Record once, then replay
     match_on=["uri", "method"],
-    filter_headers=["authorization", "cookie", "user-agent", "set-cookie"],
-    before_record_response=rss_content_filter,
+    filter_headers=["authorization", "cookie"],
 )