MarketData Domain: PostgreSQL Migration Technical Design
Project Overview
Project Type: Migration Project (85% complete CSV → PostgreSQL + TimescaleDB + pgvectorscale)
Business Impact: 10x performance improvement with sub-100ms query times and RAG capabilities
API Compatibility: 100% preservation of existing MarketDataService, FundamentalDataService, InsiderDataService APIs
Data Volume: 10 years OHLC, 5 years fundamentals, 3 years insider data for 500+ tickers
Architecture Overview
Current State (85% Complete)
CSV File Storage (./data/market_data/)
├── OHLC data in CSV files
├── Fundamental data in CSV files
├── Insider transaction data in CSV files
└── Manual file-based operations
Target Architecture
PostgreSQL + TimescaleDB + pgvectorscale
├── TimescaleDB hypertables for time-series optimization
├── pgvectorscale for RAG vector embeddings
├── Async PostgreSQL operations for concurrent agent access
├── Dagster automation for daily data collection
└── 100% API-compatible service layer
Component Relationships
External APIs (YFinance + FinnHub) → Dagster Pipeline → PostgreSQL Storage → Repository Layer → Service Layer → Agents
                                                                ↓
                                                       pgvectorscale (RAG)
Domain Model
MarketDataEntity
Purpose: OHLC price data with TimescaleDB optimization and vector embeddings
from typing import List, Optional

import pandas as pd
from sqlalchemy import BigInteger, Column, DateTime, Integer, Numeric, String
from sqlalchemy.orm import declarative_base
from pgvector.sqlalchemy import Vector  # pgvectorscale builds on pgvector's column type

Base = declarative_base()
class MarketDataEntity(Base):
__tablename__ = 'market_data'
    # Note: hypertable conversion (time column 'timestamp', 1-day chunks) is
    # performed in the Alembic migration via create_hypertable(); plain
    # SQLAlchemy does not accept a 'timescaledb' __table_args__ key.
    id = Column(Integer, primary_key=True)
    symbol = Column(String(10), nullable=False, index=True)
    # timestamp joins the composite primary key: TimescaleDB requires the
    # partitioning column in every unique constraint on a hypertable
    timestamp = Column(DateTime, nullable=False, index=True, primary_key=True)
open_price = Column(Numeric(10, 2), nullable=False)
high_price = Column(Numeric(10, 2), nullable=False)
low_price = Column(Numeric(10, 2), nullable=False)
close_price = Column(Numeric(10, 2), nullable=False)
    volume = Column(BigInteger, nullable=False)  # matches BIGINT in the schema
adjusted_close = Column(Numeric(10, 2), nullable=False)
# Vector embeddings for RAG
technical_pattern_embedding = Column(Vector(384)) # Technical analysis patterns
price_movement_embedding = Column(Vector(384)) # Price movement patterns
# Business rules
@classmethod
def from_csv_record(cls, csv_data: dict) -> 'MarketDataEntity':
"""Transform CSV data to entity"""
return cls(
symbol=csv_data['symbol'],
timestamp=pd.to_datetime(csv_data['timestamp']),
open_price=csv_data['open'],
high_price=csv_data['high'],
low_price=csv_data['low'],
close_price=csv_data['close'],
volume=csv_data['volume'],
adjusted_close=csv_data['adj_close']
)
def to_service_response(self) -> dict:
"""Transform entity to service API format"""
return {
'symbol': self.symbol,
'timestamp': self.timestamp.isoformat(),
'open': float(self.open_price),
'high': float(self.high_price),
'low': float(self.low_price),
'close': float(self.close_price),
'volume': self.volume,
'adj_close': float(self.adjusted_close)
}
def validate(self) -> bool:
"""Validate business rules"""
return (
self.high_price >= self.low_price and
self.high_price >= self.open_price and
self.high_price >= self.close_price and
self.low_price <= self.open_price and
self.low_price <= self.close_price and
self.volume >= 0
)
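A minimal round-trip sketch of the transformation hooks above (sample values are illustrative):
row = {
    'symbol': 'AAPL', 'timestamp': '2024-01-02',
    'open': 185.00, 'high': 186.50, 'low': 184.25,
    'close': 186.00, 'volume': 52_000_000, 'adj_close': 186.00,
}
entity = MarketDataEntity.from_csv_record(row)
assert entity.validate()                 # high/low/open/close ordering and volume checks
payload = entity.to_service_response()   # dict in the existing service API shape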
FundamentalDataEntity
Purpose: Financial statement data with PostgreSQL storage
class FundamentalDataEntity(Base):
__tablename__ = 'fundamental_data'
id = Column(Integer, primary_key=True)
symbol = Column(String(10), nullable=False, index=True)
report_date = Column(DateTime, nullable=False, index=True)
period_type = Column(String(10), nullable=False) # Q, Y
# Balance Sheet
total_assets = Column(Numeric(15, 2))
total_liabilities = Column(Numeric(15, 2))
shareholders_equity = Column(Numeric(15, 2))
# Income Statement
total_revenue = Column(Numeric(15, 2))
net_income = Column(Numeric(15, 2))
earnings_per_share = Column(Numeric(8, 4))
# Cash Flow
operating_cash_flow = Column(Numeric(15, 2))
capital_expenditures = Column(Numeric(15, 2))
free_cash_flow = Column(Numeric(15, 2))
# Ratios (calculated)
pe_ratio = Column(Numeric(8, 2))
pb_ratio = Column(Numeric(8, 2))
roe = Column(Numeric(8, 4))
roa = Column(Numeric(8, 4))
debt_to_equity = Column(Numeric(8, 4))
# Vector embeddings for RAG
financial_health_embedding = Column(Vector(384))
@classmethod
def from_finnhub_response(cls, finnhub_data: dict) -> 'FundamentalDataEntity':
"""Transform FinnHub API response to entity"""
return cls(
symbol=finnhub_data['symbol'],
report_date=pd.to_datetime(finnhub_data['reportedDate']),
period_type=finnhub_data['period'],
total_assets=finnhub_data.get('totalAssets'),
total_revenue=finnhub_data.get('totalRevenue'),
# ... map all fields
)
    def calculate_ratios(self, current_price: float, shares_outstanding: Optional[float] = None):
        """Calculate financial ratios from statement data"""
        if self.earnings_per_share and self.earnings_per_share > 0:
            self.pe_ratio = current_price / float(self.earnings_per_share)
        # Book value per share requires shares outstanding (equity alone is not per-share)
        if self.shareholders_equity and self.shareholders_equity > 0 and shares_outstanding:
            self.pb_ratio = current_price / (float(self.shareholders_equity) / shares_outstanding)
        if self.shareholders_equity and self.net_income:
            self.roe = float(self.net_income) / float(self.shareholders_equity)
        if self.total_assets and self.net_income:
            self.roa = float(self.net_income) / float(self.total_assets)
InsiderDataEntity
Purpose: SEC insider transaction records with sentiment analysis
class InsiderDataEntity(Base):
__tablename__ = 'insider_data'
id = Column(Integer, primary_key=True)
symbol = Column(String(10), nullable=False, index=True)
transaction_date = Column(DateTime, nullable=False, index=True)
# Insider information
insider_name = Column(String(200), nullable=False)
insider_position = Column(String(100))
# Transaction details
transaction_type = Column(String(20), nullable=False) # Buy, Sell
shares_traded = Column(Integer, nullable=False)
transaction_price = Column(Numeric(10, 2))
shares_owned_after = Column(Integer)
# Derived fields
transaction_value = Column(Numeric(15, 2)) # shares * price
sentiment_score = Column(Numeric(3, 2)) # -1 to 1
# Vector embeddings for RAG
transaction_pattern_embedding = Column(Vector(384))
@classmethod
def from_finnhub_response(cls, finnhub_data: dict) -> 'InsiderDataEntity':
"""Transform FinnHub insider data to entity"""
        entity = cls(
            symbol=finnhub_data['symbol'],
            transaction_date=pd.to_datetime(finnhub_data['transactionDate']),
            insider_name=finnhub_data['personName'],
            insider_position=finnhub_data.get('position'),
            transaction_type='Buy' if finnhub_data['change'] > 0 else 'Sell',
            shares_traded=abs(finnhub_data['change']),
            transaction_price=finnhub_data.get('transactionPrice'),  # when the API provides it
            shares_owned_after=finnhub_data['currentShares']
        )
        # Populate the derived column (shares * price) when a price is present
        if entity.transaction_price:
            entity.transaction_value = entity.shares_traded * entity.transaction_price
        entity.calculate_sentiment()
        return entity
def calculate_sentiment(self):
"""Calculate sentiment score based on transaction type and insider position"""
base_score = 0.7 if self.transaction_type == 'Buy' else -0.7
# Adjust based on position
if self.insider_position and 'ceo' in self.insider_position.lower():
base_score *= 1.2
elif self.insider_position and 'cfo' in self.insider_position.lower():
base_score *= 1.1
self.sentiment_score = max(-1.0, min(1.0, base_score))
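A quick worked check of the scoring rule above (names and values are illustrative):
from datetime import datetime

tx = InsiderDataEntity(
    symbol='AAPL', transaction_date=datetime(2024, 1, 15),
    insider_name='Jane Doe', insider_position='CEO',
    transaction_type='Buy', shares_traded=10_000,
)
tx.calculate_sentiment()
# 0.7 base for a Buy, scaled by the 1.2 CEO multiplier, clamped to [-1, 1]
assert abs(float(tx.sentiment_score) - 0.84) < 1e-9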
TechnicalIndicatorEntity
Purpose: Calculated TA-Lib indicator values with vector embeddings
class TechnicalIndicatorEntity(Base):
__tablename__ = 'technical_indicators'
id = Column(Integer, primary_key=True)
symbol = Column(String(10), nullable=False, index=True)
    timestamp = Column(DateTime, nullable=False, index=True, primary_key=True)  # composite PK for the hypertable
# Moving Averages
sma_20 = Column(Numeric(10, 2))
sma_50 = Column(Numeric(10, 2))
ema_12 = Column(Numeric(10, 2))
ema_26 = Column(Numeric(10, 2))
# Momentum Indicators
rsi_14 = Column(Numeric(5, 2))
macd = Column(Numeric(10, 4))
macd_signal = Column(Numeric(10, 4))
macd_histogram = Column(Numeric(10, 4))
# Volatility Indicators
bollinger_upper = Column(Numeric(10, 2))
bollinger_lower = Column(Numeric(10, 2))
atr_14 = Column(Numeric(10, 4))
# Volume Indicators
obv = Column(Numeric(15, 0))
volume_sma_20 = Column(Numeric(15, 0))
# Pattern Recognition (0-100 scores)
pattern_doji = Column(Integer)
pattern_hammer = Column(Integer)
pattern_engulfing = Column(Integer)
# Vector embeddings for RAG pattern matching
indicator_pattern_embedding = Column(Vector(384))
@classmethod
def calculate_from_ohlc(cls, symbol: str, ohlc_data: pd.DataFrame) -> List['TechnicalIndicatorEntity']:
"""Calculate all technical indicators from OHLC data"""
import talib
indicators = []
        # TA-Lib's function API expects float64 numpy arrays, not pandas Series
        close = ohlc_data['close'].to_numpy(dtype='float64')
        sma_20 = talib.SMA(close, timeperiod=20)
        rsi_14 = talib.RSI(close, timeperiod=14)
        macd, macd_signal, macd_hist = talib.MACD(close)
        # ... calculate all indicators
        for i, timestamp in enumerate(ohlc_data.index):
            if pd.notna(sma_20[i]):  # Only create records with valid data
                indicators.append(cls(
                    symbol=symbol,
                    timestamp=timestamp,
                    sma_20=sma_20[i],
                    rsi_14=rsi_14[i],
                    macd=macd[i],
                    macd_signal=macd_signal[i],
                    macd_histogram=macd_hist[i]
                    # ... set all calculated values
                ))
return indicators
Database Design
PostgreSQL + TimescaleDB + pgvectorscale Schema
-- Enable extensions
CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;
CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;
-- Market Data (TimescaleDB hypertable)
CREATE TABLE market_data (
    id BIGSERIAL,
    symbol VARCHAR(10) NOT NULL,
    timestamp TIMESTAMPTZ NOT NULL,
    open_price DECIMAL(10,2) NOT NULL,
    high_price DECIMAL(10,2) NOT NULL,
    low_price DECIMAL(10,2) NOT NULL,
    close_price DECIMAL(10,2) NOT NULL,
    volume BIGINT NOT NULL,
    adjusted_close DECIMAL(10,2) NOT NULL,
    technical_pattern_embedding vector(384),
    price_movement_embedding vector(384),
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),
    -- TimescaleDB requires the partitioning column in every unique constraint,
    -- so id alone cannot be the primary key on a hypertable
    PRIMARY KEY (id, timestamp),
    UNIQUE (symbol, timestamp)
);
-- Convert to TimescaleDB hypertable
SELECT create_hypertable('market_data', 'timestamp', chunk_time_interval => INTERVAL '1 day');
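Optionally, older chunks can be compressed to keep the 10-year history cheap to store and scan; the 90-day horizon below is an assumption to tune per workload:
-- Optional: columnar-compress chunks older than 90 days, segmented by symbol
ALTER TABLE market_data SET (
    timescaledb.compress,
    timescaledb.compress_segmentby = 'symbol'
);
SELECT add_compression_policy('market_data', INTERVAL '90 days');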
-- Indexes for performance
CREATE INDEX idx_market_data_symbol_time ON market_data (symbol, timestamp DESC);
CREATE INDEX idx_market_data_symbol ON market_data (symbol);
-- Vector indexes for RAG
CREATE INDEX idx_market_data_technical_embedding
ON market_data USING diskann (technical_pattern_embedding);
CREATE INDEX idx_market_data_price_embedding
ON market_data USING diskann (price_movement_embedding);
-- Fundamental Data
CREATE TABLE fundamental_data (
id SERIAL PRIMARY KEY,
symbol VARCHAR(10) NOT NULL,
report_date TIMESTAMPTZ NOT NULL,
period_type VARCHAR(10) NOT NULL,
-- Balance Sheet
total_assets DECIMAL(15,2),
total_liabilities DECIMAL(15,2),
shareholders_equity DECIMAL(15,2),
-- Income Statement
total_revenue DECIMAL(15,2),
net_income DECIMAL(15,2),
earnings_per_share DECIMAL(8,4),
-- Cash Flow
operating_cash_flow DECIMAL(15,2),
capital_expenditures DECIMAL(15,2),
free_cash_flow DECIMAL(15,2),
-- Ratios
pe_ratio DECIMAL(8,2),
pb_ratio DECIMAL(8,2),
roe DECIMAL(8,4),
roa DECIMAL(8,4),
debt_to_equity DECIMAL(8,4),
-- RAG embedding
financial_health_embedding vector(384),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(symbol, report_date, period_type)
);
CREATE INDEX idx_fundamental_symbol_date ON fundamental_data (symbol, report_date DESC);
CREATE INDEX idx_fundamental_embedding ON fundamental_data USING diskann (financial_health_embedding);
-- Insider Data
CREATE TABLE insider_data (
id SERIAL PRIMARY KEY,
symbol VARCHAR(10) NOT NULL,
transaction_date TIMESTAMPTZ NOT NULL,
insider_name VARCHAR(200) NOT NULL,
insider_position VARCHAR(100),
transaction_type VARCHAR(20) NOT NULL,
shares_traded INTEGER NOT NULL,
transaction_price DECIMAL(10,2),
shares_owned_after INTEGER,
transaction_value DECIMAL(15,2),
sentiment_score DECIMAL(3,2),
transaction_pattern_embedding vector(384),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX idx_insider_symbol_date ON insider_data (symbol, transaction_date DESC);
CREATE INDEX idx_insider_embedding ON insider_data USING diskann (transaction_pattern_embedding);
-- Technical Indicators (TimescaleDB hypertable)
CREATE TABLE technical_indicators (
    id BIGSERIAL,
symbol VARCHAR(10) NOT NULL,
timestamp TIMESTAMPTZ NOT NULL,
-- Moving Averages
sma_20 DECIMAL(10,2),
sma_50 DECIMAL(10,2),
ema_12 DECIMAL(10,2),
ema_26 DECIMAL(10,2),
-- Momentum
rsi_14 DECIMAL(5,2),
macd DECIMAL(10,4),
macd_signal DECIMAL(10,4),
macd_histogram DECIMAL(10,4),
-- Volatility
bollinger_upper DECIMAL(10,2),
bollinger_lower DECIMAL(10,2),
atr_14 DECIMAL(10,4),
-- Volume
obv DECIMAL(15,0),
volume_sma_20 DECIMAL(15,0),
-- Patterns
pattern_doji INTEGER,
pattern_hammer INTEGER,
pattern_engulfing INTEGER,
-- RAG embedding
indicator_pattern_embedding vector(384),
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (id, timestamp),  -- hypertable-compatible composite key
    UNIQUE (symbol, timestamp)
);
SELECT create_hypertable('technical_indicators', 'timestamp', chunk_time_interval => INTERVAL '1 day');
CREATE INDEX idx_technical_symbol_time ON technical_indicators (symbol, timestamp DESC);
CREATE INDEX idx_technical_embedding ON technical_indicators USING diskann (indicator_pattern_embedding);
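The composite (symbol, timestamp DESC) indexes above serve the dominant access pattern; a representative query to validate plans against during setup:
-- Hot-path query shape: EXPLAIN ANALYZE should show an index scan confined to recent chunks
EXPLAIN ANALYZE
SELECT timestamp, open_price, high_price, low_price, close_price, volume
FROM market_data
WHERE symbol = 'AAPL'
  AND timestamp >= NOW() - INTERVAL '30 days'
ORDER BY timestamp DESC;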
Migration Strategy Scripts
# migrations/001_create_market_data_tables.py
from alembic import op
import sqlalchemy as sa
from pgvector.sqlalchemy import Vector

def upgrade():
    # Extensions must exist before the vector columns and hypertable DDL below
    op.execute("CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;")
    op.execute("CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;")
    # Create market_data table
    op.create_table('market_data',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('symbol', sa.String(10), nullable=False),
sa.Column('timestamp', sa.DateTime(timezone=True), nullable=False),
sa.Column('open_price', sa.Numeric(10, 2), nullable=False),
sa.Column('high_price', sa.Numeric(10, 2), nullable=False),
sa.Column('low_price', sa.Numeric(10, 2), nullable=False),
sa.Column('close_price', sa.Numeric(10, 2), nullable=False),
sa.Column('volume', sa.BigInteger(), nullable=False),
sa.Column('adjusted_close', sa.Numeric(10, 2), nullable=False),
        sa.Column('technical_pattern_embedding', Vector(384), nullable=True),
        sa.Column('price_movement_embedding', Vector(384), nullable=True),
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=True),
sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=True),
        sa.PrimaryKeyConstraint('id', 'timestamp'),  # hypertables need the time column in the PK
        sa.UniqueConstraint('symbol', 'timestamp')
)
# Convert to hypertable
op.execute("SELECT create_hypertable('market_data', 'timestamp', chunk_time_interval => INTERVAL '1 day');")
# Create indexes
op.create_index('idx_market_data_symbol_time', 'market_data', ['symbol', 'timestamp'])
op.create_index('idx_market_data_symbol', 'market_data', ['symbol'])
def downgrade():
op.drop_table('market_data')
API Preservation
100% Compatible Service Layer
MarketDataService: Preserve all existing methods with PostgreSQL backend
from typing import List, Dict, Any, Optional
import pandas as pd
from datetime import datetime, timedelta
class MarketDataService:
"""API-compatible service with PostgreSQL backend"""
def __init__(self, repository: MarketDataRepository):
self.repository = repository
async def get_ohlc_data(self, symbol: str, start_date: str, end_date: str) -> pd.DataFrame:
"""Get OHLC data - 100% API compatible"""
entities = await self.repository.get_ohlc_data(symbol, start_date, end_date)
# Transform to same DataFrame format as CSV version
return pd.DataFrame([
{
'timestamp': entity.timestamp,
'open': float(entity.open_price),
'high': float(entity.high_price),
'low': float(entity.low_price),
'close': float(entity.close_price),
'volume': entity.volume,
'adj_close': float(entity.adjusted_close)
}
for entity in entities
]).set_index('timestamp')
async def get_technical_indicators(self, symbol: str, start_date: str, end_date: str) -> Dict[str, List[float]]:
"""Get all technical indicators - 100% API compatible"""
indicators = await self.repository.get_technical_indicators(symbol, start_date, end_date)
        # Use "is not None" so legitimate zero values (e.g. MACD at a crossover) are kept
        return {
            'sma_20': [float(ind.sma_20) if ind.sma_20 is not None else None for ind in indicators],
            'rsi_14': [float(ind.rsi_14) if ind.rsi_14 is not None else None for ind in indicators],
            'macd': [float(ind.macd) if ind.macd is not None else None for ind in indicators],
            'macd_signal': [float(ind.macd_signal) if ind.macd_signal is not None else None for ind in indicators],
            # ... all indicators
}
async def get_trading_style_preset(self, style: str, symbol: str, lookback_days: int = 30) -> Dict[str, Any]:
"""Get trading style analysis - 100% API compatible"""
end_date = datetime.now()
start_date = end_date - timedelta(days=lookback_days)
ohlc_data = await self.get_ohlc_data(symbol, start_date.isoformat(), end_date.isoformat())
indicators = await self.get_technical_indicators(symbol, start_date.isoformat(), end_date.isoformat())
if style == 'momentum':
return await self._analyze_momentum(ohlc_data, indicators)
elif style == 'mean_reversion':
return await self._analyze_mean_reversion(ohlc_data, indicators)
elif style == 'breakout':
return await self._analyze_breakout(ohlc_data, indicators)
# ... all trading styles
async def _analyze_momentum(self, ohlc_data: pd.DataFrame, indicators: Dict) -> Dict[str, Any]:
"""Momentum analysis with RAG enhancement"""
latest_rsi = indicators['rsi_14'][-1] if indicators['rsi_14'] else 50
latest_macd = indicators['macd'][-1] if indicators['macd'] else 0
# RAG: Find similar momentum patterns
similar_patterns = await self.repository.find_similar_momentum_patterns(
latest_rsi, latest_macd, limit=10
)
return {
'signal': 'BUY' if latest_rsi > 70 and latest_macd > 0 else 'HOLD',
'confidence': 0.85,
'indicators': {
'rsi': latest_rsi,
'macd': latest_macd
},
'similar_patterns': [p.to_dict() for p in similar_patterns],
'rag_enhanced': True
}
FundamentalDataService: Complete API preservation
class FundamentalDataService:
"""API-compatible fundamental analysis with PostgreSQL backend"""
def __init__(self, repository: FundamentalDataRepository):
self.repository = repository
async def get_financial_ratios(self, symbol: str, period_type: str = 'Q') -> Dict[str, float]:
"""Get latest financial ratios - 100% API compatible"""
latest_data = await self.repository.get_latest_fundamental_data(symbol, period_type)
if not latest_data:
return {}
        return {
            'pe_ratio': float(latest_data.pe_ratio) if latest_data.pe_ratio is not None else None,
            'pb_ratio': float(latest_data.pb_ratio) if latest_data.pb_ratio is not None else None,
            'roe': float(latest_data.roe) if latest_data.roe is not None else None,
            'roa': float(latest_data.roa) if latest_data.roa is not None else None,
            'debt_to_equity': float(latest_data.debt_to_equity) if latest_data.debt_to_equity is not None else None
}
async def analyze_financial_health(self, symbol: str) -> Dict[str, Any]:
"""Financial health analysis with RAG - 100% API compatible"""
latest_data = await self.repository.get_latest_fundamental_data(symbol)
historical_data = await self.repository.get_fundamental_history(symbol, quarters=8)
# RAG: Find companies with similar financial profiles
similar_companies = await self.repository.find_similar_financial_profiles(
latest_data.financial_health_embedding, limit=10
)
return {
'health_score': self._calculate_health_score(latest_data, historical_data),
'trend_analysis': self._analyze_trends(historical_data),
'peer_comparison': [comp.to_dict() for comp in similar_companies],
'rag_enhanced': True
}
InsiderDataService: Complete API preservation
class InsiderDataService:
"""API-compatible insider analysis with PostgreSQL backend"""
def __init__(self, repository: InsiderDataRepository):
self.repository = repository
async def get_recent_insider_activity(self, symbol: str, days: int = 90) -> List[Dict[str, Any]]:
"""Get recent insider transactions - 100% API compatible"""
end_date = datetime.now()
start_date = end_date - timedelta(days=days)
transactions = await self.repository.get_insider_transactions(symbol, start_date, end_date)
return [
{
'insider_name': trans.insider_name,
'position': trans.insider_position,
'transaction_date': trans.transaction_date.isoformat(),
'transaction_type': trans.transaction_type,
'shares_traded': trans.shares_traded,
                'transaction_price': float(trans.transaction_price) if trans.transaction_price is not None else None,
                'transaction_value': float(trans.transaction_value) if trans.transaction_value is not None else None,
                'sentiment_score': float(trans.sentiment_score) if trans.sentiment_score is not None else None
}
for trans in transactions
]
async def analyze_insider_sentiment(self, symbol: str, days: int = 180) -> Dict[str, Any]:
"""Insider sentiment analysis with RAG - 100% API compatible"""
transactions = await self.get_recent_insider_activity(symbol, days)
# RAG: Find similar insider activity patterns
similar_patterns = await self.repository.find_similar_insider_patterns(
symbol, days, limit=10
)
        buy_volume = sum(t['shares_traded'] for t in transactions if t['transaction_type'] == 'Buy')
        sell_volume = sum(t['shares_traded'] for t in transactions if t['transaction_type'] == 'Sell')
        net_sentiment = buy_volume - sell_volume
        scored = [t['sentiment_score'] for t in transactions if t['sentiment_score'] is not None]
        return {
            'net_sentiment': net_sentiment,
            'buy_transactions': len([t for t in transactions if t['transaction_type'] == 'Buy']),
            'sell_transactions': len([t for t in transactions if t['transaction_type'] == 'Sell']),
            # Average only over transactions that actually carry a score
            'average_sentiment_score': sum(scored) / len(scored) if scored else 0,
'similar_patterns': [p.to_dict() for p in similar_patterns],
'rag_enhanced': True
}
Component Architecture
Repository Migration Pattern
AsyncRepository with PostgreSQL Operations
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from sqlalchemy import select, and_, desc
from typing import List, Optional
from datetime import datetime
class MarketDataRepository:
"""Async PostgreSQL repository with RAG capabilities"""
def __init__(self, session_factory: async_sessionmaker[AsyncSession]):
self.session_factory = session_factory
async def get_ohlc_data(self, symbol: str, start_date: str, end_date: str) -> List[MarketDataEntity]:
"""Get OHLC data with sub-100ms performance"""
async with self.session_factory() as session:
stmt = select(MarketDataEntity).where(
and_(
MarketDataEntity.symbol == symbol,
MarketDataEntity.timestamp >= datetime.fromisoformat(start_date),
MarketDataEntity.timestamp <= datetime.fromisoformat(end_date)
)
).order_by(MarketDataEntity.timestamp)
result = await session.execute(stmt)
return result.scalars().all()
    async def save_ohlc_batch(self, entities: List[MarketDataEntity]) -> None:
        """Batch insert; for conflict resolution see the upsert sketch after this class"""
        async with self.session_factory() as session:
            session.add_all(entities)
            await session.commit()
async def find_similar_momentum_patterns(self, rsi: float, macd: float, limit: int = 10) -> List[TechnicalIndicatorEntity]:
"""RAG: Find similar technical patterns using vector similarity"""
target_embedding = self._encode_momentum_pattern(rsi, macd)
async with self.session_factory() as session:
# Using pgvectorscale cosine similarity
stmt = select(TechnicalIndicatorEntity).order_by(
TechnicalIndicatorEntity.indicator_pattern_embedding.cosine_distance(target_embedding)
).limit(limit)
result = await session.execute(stmt)
return result.scalars().all()
    def _encode_momentum_pattern(self, rsi: float, macd: float) -> List[float]:
        """Encode momentum indicators to a vector for similarity search"""
        from sentence_transformers import SentenceTransformer
        # Cache the model on the instance; reloading it on every call is very slow
        if not hasattr(self, '_embedding_model'):
            self._embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        pattern_text = f"RSI: {rsi:.2f}, MACD: {macd:.4f}, momentum pattern"
        return self._embedding_model.encode(pattern_text).tolist()
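Where true conflict resolution is needed (e.g. re-running a backfill), a minimal upsert sketch keyed on the UNIQUE (symbol, timestamp) constraint from the schema; the method name and the set of refreshed columns are illustrative, not part of the existing API:
from sqlalchemy.dialects.postgresql import insert as pg_insert

async def upsert_ohlc_batch(self, entities: List[MarketDataEntity]) -> None:
    """Sketch: batch upsert keyed on UNIQUE (symbol, timestamp)"""
    rows = [{
        'symbol': e.symbol, 'timestamp': e.timestamp,
        'open_price': e.open_price, 'high_price': e.high_price,
        'low_price': e.low_price, 'close_price': e.close_price,
        'volume': e.volume, 'adjusted_close': e.adjusted_close,
    } for e in entities]
    async with self.session_factory() as session:
        insert_stmt = pg_insert(MarketDataEntity).values(rows)
        upsert_stmt = insert_stmt.on_conflict_do_update(
            index_elements=['symbol', 'timestamp'],
            # excluded refers to the incoming row that hit the conflict
            set_={
                'close_price': insert_stmt.excluded.close_price,
                'adjusted_close': insert_stmt.excluded.adjusted_close,
                'volume': insert_stmt.excluded.volume,
            },
        )
        await session.execute(upsert_stmt)
        await session.commit()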
Migration Data Processing
4-Phase Migration Strategy
class MarketDataMigrator:
"""Migrate from CSV files to PostgreSQL with data validation"""
def __init__(self, csv_data_path: str, repository: MarketDataRepository):
self.csv_data_path = csv_data_path
self.repository = repository
async def migrate_all_data(self) -> Dict[str, int]:
"""Execute 4-phase migration strategy"""
results = {}
# Phase 1: Market Data (OHLC)
results['market_data'] = await self._migrate_market_data()
# Phase 2: Fundamental Data
results['fundamental_data'] = await self._migrate_fundamental_data()
# Phase 3: Insider Data
results['insider_data'] = await self._migrate_insider_data()
# Phase 4: Calculate Technical Indicators
results['technical_indicators'] = await self._calculate_technical_indicators()
return results
async def _migrate_market_data(self) -> int:
"""Migrate OHLC data from CSV files"""
csv_files = glob.glob(f"{self.csv_data_path}/market_data/*.csv")
total_records = 0
for csv_file in csv_files:
symbol = self._extract_symbol_from_filename(csv_file)
df = pd.read_csv(csv_file)
# Transform to entities
entities = []
for _, row in df.iterrows():
entity = MarketDataEntity.from_csv_record({
'symbol': symbol,
'timestamp': row['Date'],
'open': row['Open'],
'high': row['High'],
'low': row['Low'],
'close': row['Close'],
'volume': row['Volume'],
'adj_close': row['Adj Close']
})
if entity.validate():
entities.append(entity)
# Batch insert
await self.repository.save_ohlc_batch(entities)
total_records += len(entities)
print(f"Migrated {len(entities)} records for {symbol}")
return total_records
async def _calculate_technical_indicators(self) -> int:
"""Calculate and store technical indicators for all symbols"""
symbols = await self.repository.get_all_symbols()
total_indicators = 0
for symbol in symbols:
# Get OHLC data
ohlc_data = await self.repository.get_ohlc_data(
symbol, "2020-01-01", datetime.now().isoformat()
)
# Convert to DataFrame
df = pd.DataFrame([{
'timestamp': entity.timestamp,
'close': float(entity.close_price),
'high': float(entity.high_price),
'low': float(entity.low_price),
'volume': entity.volume
} for entity in ohlc_data])
df.set_index('timestamp', inplace=True)
# Calculate indicators
indicators = TechnicalIndicatorEntity.calculate_from_ohlc(symbol, df)
# Generate embeddings
for indicator in indicators:
indicator.indicator_pattern_embedding = self._generate_indicator_embedding(indicator)
# Save indicators
await self.repository.save_technical_indicators(indicators)
total_indicators += len(indicators)
print(f"Calculated {len(indicators)} indicators for {symbol}")
return total_indicators
RAG Integration
Vector Embeddings for Historical Pattern Matching
Embedding Generation Strategy
from sentence_transformers import SentenceTransformer
import numpy as np
class MarketDataEmbeddingService:
"""Generate vector embeddings for RAG pattern matching"""
def __init__(self):
self.model = SentenceTransformer('all-MiniLM-L6-v2')
    def generate_technical_pattern_embedding(self, indicator_data: TechnicalIndicatorEntity) -> List[float]:
        """Generate embedding for technical analysis patterns"""
        def fmt(value, spec='.2f'):
            # Nullable columns (e.g. sma_50 before 50 bars exist) render as N/A
            return f"{value:{spec}}" if value is not None else 'N/A'
        bullish = (
            indicator_data.rsi_14 is not None and indicator_data.rsi_14 > 50
            and indicator_data.macd is not None and indicator_data.macd_signal is not None
            and indicator_data.macd > indicator_data.macd_signal
        )
        pattern_description = f"""
        Technical Analysis Pattern:
        RSI: {fmt(indicator_data.rsi_14)}
        MACD: {fmt(indicator_data.macd, '.4f')}, Signal: {fmt(indicator_data.macd_signal, '.4f')}
        SMA 20: {fmt(indicator_data.sma_20)}, SMA 50: {fmt(indicator_data.sma_50)}
        Bollinger Bands: Upper {fmt(indicator_data.bollinger_upper)}, Lower {fmt(indicator_data.bollinger_lower)}
        Volume SMA 20: {fmt(indicator_data.volume_sma_20, '.0f')}
        Pattern: {'Bullish' if bullish else 'Bearish'}
        """
        return self.model.encode(pattern_description.strip()).tolist()
def generate_price_movement_embedding(self, market_data: MarketDataEntity) -> List[float]:
"""Generate embedding for price movement patterns"""
price_change = (market_data.close_price - market_data.open_price) / market_data.open_price * 100
volatility = (market_data.high_price - market_data.low_price) / market_data.open_price * 100
movement_description = f"""
Price Movement Pattern:
Price Change: {price_change:.2f}%
Intraday Volatility: {volatility:.2f}%
Volume Profile: {'High' if market_data.volume > 1000000 else 'Normal'}
Price Level: {'Above' if market_data.close_price > market_data.open_price else 'Below'} opening
Candle Type: {'Green' if market_data.close_price >= market_data.open_price else 'Red'}
"""
return self.model.encode(movement_description.strip()).tolist()
    def generate_financial_health_embedding(self, fundamental_data: FundamentalDataEntity) -> List[float]:
        """Generate embedding for financial health patterns"""
        # Pre-format optional fields; conditionals are not valid inside f-string format specs
        pe = f"{fundamental_data.pe_ratio:.2f}" if fundamental_data.pe_ratio else "N/A"
        roe = f"{fundamental_data.roe * 100:.2f}%" if fundamental_data.roe else "N/A"
        dte = f"{fundamental_data.debt_to_equity:.2f}" if fundamental_data.debt_to_equity else "N/A"
        health_description = f"""
        Financial Health Profile:
        PE Ratio: {pe}
        ROE: {roe}
        Debt to Equity: {dte}
        Revenue Growth: {'Positive' if fundamental_data.total_revenue and fundamental_data.total_revenue > 0 else 'Unknown'}
        Profitability: {'Profitable' if fundamental_data.net_income and fundamental_data.net_income > 0 else 'Loss'}
        Financial Strength: {'Strong' if fundamental_data.pe_ratio and fundamental_data.pe_ratio < 20 else 'Average'}
        """
        return self.model.encode(health_description.strip()).tolist()
RAG Query Service
class MarketDataRAGService:
"""RAG-powered market analysis with vector similarity search"""
def __init__(self, repository: MarketDataRepository):
self.repository = repository
self.embedding_service = MarketDataEmbeddingService()
async def find_similar_market_conditions(
self,
symbol: str,
current_date: datetime,
similarity_threshold: float = 0.8,
limit: int = 10
) -> Dict[str, Any]:
"""Find historically similar market conditions for decision support"""
# Get current market state
current_ohlc = await self.repository.get_latest_ohlc(symbol, current_date)
current_indicators = await self.repository.get_latest_indicators(symbol, current_date)
current_fundamentals = await self.repository.get_latest_fundamentals(symbol)
# Generate query embeddings
current_pattern_embedding = self.embedding_service.generate_technical_pattern_embedding(current_indicators)
# Find similar patterns
similar_patterns = await self.repository.find_similar_patterns(
current_pattern_embedding,
similarity_threshold,
limit
)
# Analyze outcomes of similar patterns
pattern_outcomes = []
for pattern in similar_patterns:
# Get price movement 5-10 days after similar pattern
future_price = await self.repository.get_price_after_date(
pattern.symbol,
pattern.timestamp,
days=7
)
if future_price:
price_change = (future_price.close_price - pattern.close_price) / pattern.close_price
pattern_outcomes.append({
'symbol': pattern.symbol,
'date': pattern.timestamp,
'similarity_score': pattern.similarity_score,
'future_return': float(price_change),
'pattern_type': self._classify_pattern(pattern)
})
return {
'current_symbol': symbol,
'analysis_date': current_date,
'similar_patterns_found': len(similar_patterns),
'historical_outcomes': pattern_outcomes,
'average_return': np.mean([p['future_return'] for p in pattern_outcomes]) if pattern_outcomes else 0,
'success_rate': len([p for p in pattern_outcomes if p['future_return'] > 0]) / len(pattern_outcomes) if pattern_outcomes else 0,
'confidence_score': self._calculate_confidence(pattern_outcomes)
}
async def get_peer_analysis(self, symbol: str) -> Dict[str, Any]:
"""Find peer companies with similar financial profiles"""
fundamentals = await self.repository.get_latest_fundamentals(symbol)
if not fundamentals:
return {'error': 'No fundamental data available'}
# Find companies with similar financial health embeddings
similar_companies = await self.repository.find_similar_financial_profiles(
fundamentals.financial_health_embedding,
exclude_symbol=symbol,
limit=10
)
# Get recent performance comparison
peer_performance = []
for peer in similar_companies:
recent_performance = await self.repository.get_recent_performance(peer.symbol, days=30)
peer_performance.append({
'symbol': peer.symbol,
'similarity_score': peer.similarity_score,
'recent_return': recent_performance.get('return', 0),
'volatility': recent_performance.get('volatility', 0),
'pe_ratio': float(peer.pe_ratio) if peer.pe_ratio else None
})
        # Compare the target symbol itself, not the loop variable left over from the peer loop above
        target_performance = await self.repository.get_recent_performance(symbol, days=30)
        peer_average_return = np.mean([p['recent_return'] for p in peer_performance]) if peer_performance else 0
        return {
            'target_symbol': symbol,
            'peer_companies': peer_performance,
            'peer_average_return': peer_average_return,
            'relative_performance': 'Above average' if target_performance.get('return', 0) > peer_average_return else 'Below average'
        }
Migration Strategy
4-Phase Migration Approach
Phase 1: Database Infrastructure Setup
# 1. Set up PostgreSQL with extensions
docker-compose up -d postgres
# 2. Run database migrations
alembic upgrade head
# 3. Verify TimescaleDB and pgvectorscale extensions
psql $DATABASE_URL -c "SELECT * FROM pg_extension WHERE extname IN ('timescaledb', 'vectorscale');"
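If no compose file is wired up yet, a single-container stand-in for step 1; the timescale/timescaledb-ha image is assumed here because recent tags bundle both TimescaleDB and pgvectorscale (verify against your deployment):
# Hypothetical single-container equivalent of docker-compose
docker run -d --name marketdata-postgres \
  -e POSTGRES_PASSWORD=postgres \
  -p 5432:5432 \
  timescale/timescaledb-ha:pg16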
Phase 2: Data Migration with Validation
# Migration script: migrate_market_data.py
import asyncio
import os
from config.database import DatabaseConfig
from tradingagents.domains.marketdata.migration.migrator import MarketDataMigrator
from tradingagents.domains.marketdata.repository import MarketDataRepository

async def main():
    # Initialize components (DatabaseConfig is defined under Configuration Updates)
    db_config = DatabaseConfig(os.environ["DATABASE_URL"])
    repository = MarketDataRepository(db_config.session_factory)
    migrator = MarketDataMigrator("./data/market_data", repository)
# Execute migration
print("Starting MarketData migration...")
results = await migrator.migrate_all_data()
print(f"Migration completed:")
print(f" Market Data: {results['market_data']} records")
print(f" Fundamental Data: {results['fundamental_data']} records")
print(f" Insider Data: {results['insider_data']} records")
print(f" Technical Indicators: {results['technical_indicators']} records")
# Validate migration
validation_results = await migrator.validate_migration()
print(f"Validation: {validation_results}")
if __name__ == "__main__":
asyncio.run(main())
Phase 3: Service Layer Migration
# Switch services to PostgreSQL repositories
# Update dependency injection in service factory
class ServiceFactory:
def __init__(self, session_factory: async_sessionmaker[AsyncSession]):
self.session_factory = session_factory
def create_market_data_service(self) -> MarketDataService:
repository = MarketDataRepository(self.session_factory)
return MarketDataService(repository)
def create_fundamental_data_service(self) -> FundamentalDataService:
repository = FundamentalDataRepository(self.session_factory)
return FundamentalDataService(repository)
def create_insider_data_service(self) -> InsiderDataService:
repository = InsiderDataRepository(self.session_factory)
return InsiderDataService(repository)
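A wiring sketch for application startup, assuming the DatabaseConfig helper defined under Configuration Updates below:
import os

db_config = DatabaseConfig(os.environ["DATABASE_URL"])
factory = ServiceFactory(db_config.session_factory)

market_data_service = factory.create_market_data_service()
fundamental_data_service = factory.create_fundamental_data_service()
insider_data_service = factory.create_insider_data_service()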
Phase 4: RAG Enhancement and Testing
# Generate embeddings for existing data
async def enhance_with_rag(repository: MarketDataRepository):
    embedding_service = MarketDataEmbeddingService()
# Generate embeddings for all technical indicators
indicators = await repository.get_all_technical_indicators()
for indicator in indicators:
embedding = embedding_service.generate_technical_pattern_embedding(indicator)
await repository.update_indicator_embedding(indicator.id, embedding)
print(f"Generated embeddings for {len(indicators)} technical indicators")
# Test RAG functionality
rag_service = MarketDataRAGService(repository)
test_results = await rag_service.find_similar_market_conditions("AAPL", datetime.now())
print(f"RAG test completed: {test_results}")
Testing Strategy
Migration Testing
Data Integrity Tests
class TestMarketDataMigration:
"""Validate PostgreSQL migration maintains data integrity"""
async def test_ohlc_data_accuracy(self):
"""Verify OHLC data matches CSV files exactly"""
# Load original CSV data
original_df = pd.read_csv("./data/market_data/AAPL.csv")
# Get PostgreSQL data
postgres_entities = await self.repository.get_ohlc_data("AAPL", "2020-01-01", "2024-01-01")
        postgres_df = pd.DataFrame([entity.to_service_response() for entity in postgres_entities])
        # Compare datasets (to_service_response uses the 'close'/'volume' key names)
        assert len(original_df) == len(postgres_df)
        assert np.allclose(original_df['Close'].values, postgres_df['close'].values)
        assert np.allclose(original_df['Volume'].values, postgres_df['volume'].values)
async def test_performance_improvement(self):
"""Verify 10x performance improvement"""
import time
# Test PostgreSQL query time
start = time.time()
postgres_data = await self.repository.get_ohlc_data("AAPL", "2023-01-01", "2024-01-01")
postgres_time = time.time() - start
# Historical CSV time (from benchmarks)
csv_baseline_time = 0.500 # 500ms baseline
assert postgres_time < 0.100 # Sub-100ms requirement
assert postgres_time < csv_baseline_time / 10 # 10x improvement
async def test_rag_functionality(self):
"""Verify RAG vector similarity search works"""
rag_service = MarketDataRAGService(self.repository)
# Test pattern matching
results = await rag_service.find_similar_market_conditions("AAPL", datetime(2023, 6, 1))
assert len(results['historical_outcomes']) > 0
assert 'confidence_score' in results
assert results['average_return'] is not None
API Compatibility Tests
class TestAPICompatibility:
"""Ensure 100% API compatibility after migration"""
async def test_market_data_service_api(self):
"""Verify MarketDataService API unchanged"""
service = MarketDataService(self.repository)
# Test all existing methods
ohlc_data = await service.get_ohlc_data("AAPL", "2023-01-01", "2023-12-31")
assert isinstance(ohlc_data, pd.DataFrame)
assert list(ohlc_data.columns) == ['open', 'high', 'low', 'close', 'volume', 'adj_close']
indicators = await service.get_technical_indicators("AAPL", "2023-01-01", "2023-12-31")
assert isinstance(indicators, dict)
assert 'sma_20' in indicators
assert 'rsi_14' in indicators
momentum_analysis = await service.get_trading_style_preset("momentum", "AAPL")
assert 'signal' in momentum_analysis
assert 'confidence' in momentum_analysis
async def test_fundamental_service_api(self):
"""Verify FundamentalDataService API unchanged"""
service = FundamentalDataService(self.repository)
ratios = await service.get_financial_ratios("AAPL")
assert isinstance(ratios, dict)
assert 'pe_ratio' in ratios
health = await service.analyze_financial_health("AAPL")
assert 'health_score' in health
assert 'trend_analysis' in health
Performance Validation
class TestPerformanceRequirements:
"""Validate performance requirements are met"""
async def test_sub_100ms_queries(self):
"""Verify sub-100ms query performance"""
import time
queries = [
lambda: self.repository.get_ohlc_data("AAPL", "2023-12-01", "2023-12-31"),
lambda: self.repository.get_technical_indicators("AAPL", "2023-12-01", "2023-12-31"),
lambda: self.repository.get_latest_fundamentals("AAPL")
]
for query in queries:
start = time.time()
await query()
elapsed = time.time() - start
assert elapsed < 0.100, f"Query took {elapsed:.3f}s, exceeds 100ms requirement"
async def test_rag_query_performance(self):
"""Verify sub-200ms RAG query performance"""
rag_service = MarketDataRAGService(self.repository)
start = time.time()
results = await rag_service.find_similar_market_conditions("AAPL", datetime.now())
elapsed = time.time() - start
assert elapsed < 0.200, f"RAG query took {elapsed:.3f}s, exceeds 200ms requirement"
    async def test_concurrent_access(self):
        """Verify concurrent agent access performance"""
        import asyncio
        import time
        async def concurrent_query(symbol: str):
return await self.repository.get_ohlc_data(symbol, "2023-12-01", "2023-12-31")
# Simulate 10 concurrent agents
tasks = [concurrent_query(f"SYMBOL_{i}") for i in range(10)]
start = time.time()
results = await asyncio.gather(*tasks)
elapsed = time.time() - start
assert elapsed < 1.0, f"Concurrent queries took {elapsed:.3f}s, too slow for agent workload"
assert len(results) == 10
Implementation Guidance
Step-by-Step Implementation
Week 1: Database Setup and Schema Migration
- Set up PostgreSQL with TimescaleDB and pgvectorscale extensions
- Create database schemas with proper indexing
- Run Alembic migrations to create all tables
- Test hypertable creation and vector index performance
Week 2: Entity Models and Repository Layer
- Implement all SQLAlchemy entity models with business logic
- Create async repository classes with PostgreSQL operations
- Implement batch operations for high-performance data loading
- Add vector similarity search capabilities
Week 3: Data Migration and Validation
- Build CSV-to-PostgreSQL migration scripts
- Migrate all historical data with integrity validation
- Generate vector embeddings for all existing data
- Validate data accuracy and performance benchmarks
Week 4: Service Layer and API Preservation
- Update service layer to use PostgreSQL repositories
- Ensure 100% API compatibility with existing interfaces
- Implement RAG-enhanced analysis features
- Complete integration testing and performance validation
Code Organization
tradingagents/domains/marketdata/
├── entities/
│ ├── __init__.py
│ ├── market_data_entity.py
│ ├── fundamental_data_entity.py
│ ├── insider_data_entity.py
│ └── technical_indicator_entity.py
├── repositories/
│ ├── __init__.py
│ ├── market_data_repository.py
│ ├── fundamental_data_repository.py
│ └── insider_data_repository.py
├── services/
│ ├── __init__.py
│ ├── market_data_service.py
│ ├── fundamental_data_service.py
│ ├── insider_data_service.py
│ └── embedding_service.py
├── migration/
│ ├── __init__.py
│ ├── migrator.py
│ └── validation.py
├── rag/
│ ├── __init__.py
│ ├── rag_service.py
│ └── pattern_matcher.py
└── clients/
├── __init__.py
├── yfinance_client.py # Already implemented
└── finnhub_client.py # Already implemented
Configuration Updates
Database Configuration
# config/database.py
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker

class DatabaseConfig:
    def __init__(self, database_url: str):
        self.engine = create_async_engine(
            database_url,
            # async engines use AsyncAdaptedQueuePool by default; the sync
            # QueuePool is not compatible with create_async_engine
            pool_size=20,
            max_overflow=30,
            pool_pre_ping=True,
            echo=False  # Set True for SQL debugging
        )
        self.session_factory = async_sessionmaker(
            bind=self.engine,
            expire_on_commit=False,
            autoflush=True  # autocommit was removed in SQLAlchemy 2.0
        )
    async def health_check(self) -> bool:
        """Verify database connectivity and extensions"""
        async with self.session_factory() as session:
            # SQLAlchemy 2.0 requires textual SQL to be wrapped in text()
            result = await session.execute(
                text("SELECT extname FROM pg_extension WHERE extname IN ('timescaledb', 'vectorscale')")
            )
            extensions = result.fetchall()
            return len(extensions) == 2
Monitoring and Observability
Performance Monitoring
from datetime import datetime, timedelta
from typing import Any, Dict

import numpy as np

class MarketDataMetrics:
"""Monitor PostgreSQL migration performance"""
def __init__(self):
self.query_times = []
self.rag_query_times = []
async def track_query_performance(self, operation: str, execution_time: float):
"""Track query performance metrics"""
self.query_times.append({
'operation': operation,
'time': execution_time,
'timestamp': datetime.now()
})
# Alert if performance degrades
if execution_time > 0.100:
print(f"WARNING: {operation} took {execution_time:.3f}s, exceeds 100ms SLA")
def generate_performance_report(self) -> Dict[str, Any]:
"""Generate performance analytics report"""
recent_queries = [q for q in self.query_times if q['timestamp'] > datetime.now() - timedelta(hours=1)]
return {
'total_queries': len(recent_queries),
'average_query_time': np.mean([q['time'] for q in recent_queries]),
'p95_query_time': np.percentile([q['time'] for q in recent_queries], 95),
'sla_violations': len([q for q in recent_queries if q['time'] > 0.100]),
'performance_trend': 'stable' # Could implement trending analysis
}
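A usage sketch for the tracker inside an async call path (symbol and dates are illustrative):
import time

async def timed_fetch(repository: MarketDataRepository, metrics: MarketDataMetrics):
    start = time.perf_counter()
    data = await repository.get_ohlc_data("AAPL", "2024-01-01", "2024-02-01")
    await metrics.track_query_performance("get_ohlc_data", time.perf_counter() - start)
    return data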
This technical design provides a comprehensive strategy for migrating the 85%-complete CSV-based system to a high-performance PostgreSQL + TimescaleDB + pgvectorscale architecture, preserving 100% API compatibility while adding RAG capabilities for historical pattern matching.
The design emphasizes:
- Performance: Sub-100ms queries with TimescaleDB optimization
- Compatibility: Zero API changes for existing services
- Intelligence: RAG-enhanced analysis with vector similarity search
- Scalability: Async PostgreSQL operations for concurrent agent access
- Quality: Comprehensive testing strategy for migration validation
Implementation should follow the 4-phase approach with weekly milestones to ensure smooth migration and immediate performance benefits.