TradingAgents/scripts/anonymize_dataset.py

276 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""
Ticker Anonymization Script - The "Blindfire Protocol"
This script anonymizes historical trading data by replacing:
- Ticker symbols (AAPL → ASSET_042)
- Company names (Apple Inc. → Company ASSET_042)
- Product names (iPhone → Product A, MacBook → Product C)
This prevents LLMs from using memorized knowledge about specific companies.
"""
import hashlib
import re
import json
from pathlib import Path
from typing import Dict, List
import pandas as pd
class TickerAnonymizer:
    """Anonymize tickers and company-specific information.

    The anonymizer keeps three pieces of state:

    * ``ticker_map`` / ``reverse_map`` - the ticker <-> ``ASSET_NNN`` mapping,
      filled lazily by :meth:`anonymize_ticker`.
    * ``company_names`` - ticker -> company name, registered through
      :meth:`set_company_name`.
    * ``product_map`` - a fixed table of product names and their neutral
      replacements.
    """

    # Number of distinct ASSET_NNN labels (three decimal digits).
    _LABEL_SPACE = 1000

    def __init__(self, seed: str = "blindfire_v1"):
        """Create an anonymizer.

        Args:
            seed: Salt mixed into the ticker hash; changing it changes every
                generated label.
        """
        self.seed = seed
        self.ticker_map: Dict[str, str] = {}
        self.reverse_map: Dict[str, str] = {}
        self.company_names: Dict[str, str] = {}
        self.product_map: Dict[str, str] = {
            # Apple products
            "iPhone": "Product A",
            "iPad": "Product B",
            "MacBook": "Product C",
            "Apple Watch": "Product D",
            "AirPods": "Product E",
            # Nvidia products
            "GeForce": "Product X",
            "RTX": "Product Y",
            "H100": "Product Z",
            "A100": "Product W",
            # Microsoft products
            "Windows": "Software Platform A",
            "Office": "Software Platform B",
            "Azure": "Cloud Platform A",
            # Meta products
            "Facebook": "Social Platform A",
            "Instagram": "Social Platform B",
            "WhatsApp": "Messaging Platform A",
            # Google products
            "Search": "Platform Service A",
            "YouTube": "Video Platform A",
            "Android": "Mobile OS A",
        }

    def anonymize_ticker(self, ticker: str) -> str:
        """Map a ticker to its anonymous label.

        Example: AAPL → ASSET_042

        The label is derived from an MD5 hash of the seed and the ticker,
        reduced to three digits. If that label already belongs to a different
        ticker, the next free label is used instead so that ``reverse_map``
        never loses an entry; a colliding ticker's label therefore depends on
        the order in which tickers were first seen.

        Args:
            ticker: Ticker symbol to anonymize.

        Returns:
            The ``ASSET_NNN`` label assigned to ``ticker``.

        Raises:
            RuntimeError: If every label in the three-digit space is taken.
        """
        if ticker in self.ticker_map:
            return self.ticker_map[ticker]

        digest = hashlib.md5(f"{self.seed}_{ticker}".encode()).hexdigest()
        start = int(digest, 16) % self._LABEL_SPACE
        # Linear probing: offset 0 reproduces the plain hash-based label.
        for offset in range(self._LABEL_SPACE):
            anon_label = f"ASSET_{(start + offset) % self._LABEL_SPACE:03d}"
            if anon_label not in self.reverse_map:
                break
        else:
            raise RuntimeError(
                f"No free anonymous label left for ticker {ticker!r}"
            )

        self.ticker_map[ticker] = anon_label
        self.reverse_map[anon_label] = ticker
        return anon_label

    def set_company_name(self, ticker: str, company_name: str):
        """Store the company name to be replaced for ``ticker``."""
        self.company_names[ticker] = company_name

    @staticmethod
    def _replace_term(text: str, term: str, replacement: str) -> str:
        """Replace whole-term, case-insensitive occurrences of ``term``."""
        if not term:
            return text
        # Lookarounds instead of \b: \b cannot match after a trailing
        # non-word character such as the "." in "Apple Inc.".
        pattern = rf'(?<!\w){re.escape(term)}(?!\w)'
        # A callable replacement is inserted literally (no backslash escapes).
        return re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)

    def anonymize_text(self, text: str, ticker: str) -> str:
        """Replace all company-specific information in text.

        Replacements are applied in this order: registered company name,
        ticker symbol, then every entry of ``product_map``. All matching is
        case-insensitive and limited to whole terms.

        Args:
            text: Text to anonymize (news article, earnings report, etc.)
            ticker: Ticker symbol for context

        Returns:
            Anonymized text with ASSET_XXX labels. Empty or ``None`` input is
            returned unchanged.
        """
        if not text:
            return text

        anon_ticker = self.anonymize_ticker(ticker)

        # Company name first: a case-insensitive ticker pass would otherwise
        # rewrite part of names such as "Meta Platforms Inc." (ticker META)
        # and the full name would no longer match.
        company_name = self.company_names.get(ticker)
        if company_name:
            text = self._replace_term(text, company_name, f"Company {anon_ticker}")

        # Ticker symbol (escaped, so symbols like "BRK.B" are matched literally).
        text = self._replace_term(text, ticker, anon_ticker)

        # Product names
        for product, anon_product in self.product_map.items():
            text = self._replace_term(text, product, anon_product)

        return text

    def normalize_price_series(self, df: pd.DataFrame, base_value: float = 100.0) -> pd.DataFrame:
        """Normalize price series to a base index to hide absolute price levels.

        This prevents the "Price Scale Leak" where an LLM can identify NVDA by
        seeing $480 prices.

        Each of the Open/High/Low/Close columns present is divided by its own
        first value and multiplied by ``base_value``. Because every column has
        its own baseline, relationships between columns (e.g. High >= Low)
        are not preserved. A column whose first value is not greater than zero
        is left unchanged. Volume and any other columns are not modified.

        Args:
            df: DataFrame with OHLCV columns
            base_value: Starting index value (default 100.0)

        Returns:
            A new DataFrame with normalized prices; the input is not mutated.
            An empty input yields an empty copy.

        Example:
            Original: Close = [150, 153, 149, 155]
            Normalized: Close = [100.0, 102.0, 99.33, 103.33]
        """
        df_normalized = df.copy()
        if df.empty:
            # No first row to use as a baseline.
            return df_normalized

        first_row = df.iloc[0]
        for col in ('Open', 'High', 'Low', 'Close'):
            if col not in df.columns:
                continue
            baseline = first_row[col]
            if baseline > 0:
                df_normalized[col] = (df[col] / baseline) * base_value
        return df_normalized

    def normalize_price_value(self, value: float, baseline: float, base_value: float = 100.0) -> float:
        """Normalize a single price value.

        Args:
            value: Current price
            baseline: Reference price (e.g., first price in series)
            base_value: Target baseline (default 100.0)

        Returns:
            Normalized price, or ``value`` unchanged when ``baseline`` is not
            positive.
        """
        if baseline <= 0:
            return value
        return (value / baseline) * base_value

    def anonymize_csv(self, input_path: Path, output_path: Path, ticker: str):
        """Anonymize a CSV file containing market data.

        Column names have the ticker replaced by its anonymous label, and
        every text column is passed through :meth:`anonymize_text`. Numeric
        columns are written back as read.

        Args:
            input_path: CSV file to read.
            output_path: Destination CSV file (written without the index).
            ticker: Ticker symbol the file belongs to.
        """
        df = pd.read_csv(input_path)

        # Replace ticker in column names if present
        anon_ticker = self.anonymize_ticker(ticker)
        df.columns = [col.replace(ticker, anon_ticker) for col in df.columns]

        # Anonymize any text columns (object dtype or a dedicated string dtype)
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                df[col] = column.apply(
                    lambda x: self.anonymize_text(str(x), ticker) if pd.notna(x) else x
                )

        df.to_csv(output_path, index=False)
        print(f"✅ Anonymized {input_path.name} → {output_path.name}")

    def save_mapping(self, output_path: Path):
        """Save the ticker mapping as JSON for later de-anonymization.

        The file contains ``ticker_map``, ``reverse_map`` and
        ``company_names``.
        """
        mapping = {
            "ticker_map": self.ticker_map,
            "reverse_map": self.reverse_map,
            "company_names": self.company_names,
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, indent=2)
        print(f"✅ Saved mapping to {output_path}")
def main():
    """Anonymize dataset for TradingAgents testing.

    For each configured ticker, looks for ``<TICKER>_prices.csv``,
    ``<TICKER>_news.csv`` and ``<TICKER>_fundamentals.csv`` under
    ``data/raw``, writes anonymized copies named after the anonymous label
    into ``data/anonymized``, and finally saves the ticker mapping there as
    ``ticker_mapping.json``. Missing input files are skipped.

    Usage:
        python scripts/anonymize_dataset.py
    """
    # Configuration
    tickers = ["AAPL", "NVDA", "MSFT", "META", "GOOGL"]
    company_names = {
        "AAPL": "Apple Inc.",
        "NVDA": "NVIDIA Corporation",
        "MSFT": "Microsoft Corporation",
        "META": "Meta Platforms Inc.",
        "GOOGL": "Alphabet Inc.",
    }
    # One input file per ticker and dataset kind: <TICKER>_<kind>.csv
    dataset_kinds = ("prices", "news", "fundamentals")

    # Paths
    data_dir = Path("data/raw")
    output_dir = Path("data/anonymized")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize anonymizer and register company names
    anonymizer = TickerAnonymizer(seed="blindfire_v1")
    for ticker, name in company_names.items():
        anonymizer.set_company_name(ticker, name)

    print("🔒 BLINDFIRE PROTOCOL - Anonymizing Dataset")
    print("=" * 60)

    # Anonymize each ticker's data
    for ticker in tickers:
        anon_ticker = anonymizer.anonymize_ticker(ticker)
        print(f"\n📊 Processing {ticker} → {anon_ticker}")

        for kind in dataset_kinds:
            source_file = data_dir / f"{ticker}_{kind}.csv"
            if source_file.exists():
                anonymizer.anonymize_csv(
                    source_file,
                    output_dir / f"{anon_ticker}_{kind}.csv",
                    ticker,
                )

    # Save mapping for de-anonymization
    # NOTE(review): the mapping is written next to the anonymized files;
    # confirm that directory is never exposed to the model under test.
    anonymizer.save_mapping(output_dir / "ticker_mapping.json")

    print("\n" + "=" * 60)
    print("✅ ANONYMIZATION COMPLETE")
    print(f"📁 Anonymized data saved to: {output_dir}")
    print("\n🎯 Next Steps:")
    print("1. Update TradingAgents config to use anonymized data")
    print("2. Modify analyst prompts to remove {ticker} references")
    print("3. Run backtests on anonymized dataset")
    print("4. Compare results to original (should be similar if no contamination)")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()