#!/usr/bin/env python3
|
|
"""
|
|
Ticker Anonymization Script - The "Blindfire Protocol"

This script anonymizes historical trading data by replacing:
- Ticker symbols (AAPL → ASSET_042)
- Company names (Apple Inc. → Company ASSET_042)
- Product names (iPhone → Product A, MacBook → Product C)

This prevents LLMs from using memorized knowledge about specific companies.
|
|
"""
|
|
|
|
import hashlib
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
import pandas as pd
|
|
|
|
|
|
class TickerAnonymizer:
    """Anonymize tickers and company-specific information.

    Attributes:
        seed: Salt mixed into the hash that picks each ticker's label.
        ticker_map: Real ticker -> anonymous label (e.g. ``ASSET_042``).
        reverse_map: Anonymous label -> real ticker (inverse of ``ticker_map``).
        company_names: Real ticker -> company name to scrub from text.
        product_map: Product name -> generic placeholder used in text.
    """

    # Size of the primary label space ``ASSET_000`` .. ``ASSET_999``.
    _LABEL_SPACE = 1000

    def __init__(self, seed: str = "blindfire_v1"):
        self.seed = seed
        self.ticker_map: Dict[str, str] = {}
        self.reverse_map: Dict[str, str] = {}
        self.company_names: Dict[str, str] = {}
        self.product_map: Dict[str, str] = {
            # Apple products
            "iPhone": "Product A",
            "iPad": "Product B",
            "MacBook": "Product C",
            "Apple Watch": "Product D",
            "AirPods": "Product E",
            # Nvidia products
            "GeForce": "Product X",
            "RTX": "Product Y",
            "H100": "Product Z",
            "A100": "Product W",
            # Microsoft products
            "Windows": "Software Platform A",
            "Office": "Software Platform B",
            "Azure": "Cloud Platform A",
            # Meta products
            "Facebook": "Social Platform A",
            "Instagram": "Social Platform B",
            "WhatsApp": "Messaging Platform A",
            # Google products
            "Search": "Platform Service A",
            "YouTube": "Video Platform A",
            "Android": "Mobile OS A",
        }

    def _allocate_label(self, preferred_slot: int) -> str:
        """Return a label not yet present in ``reverse_map``.

        The search starts at ``preferred_slot`` and probes linearly (wrapping
        around) through the three-digit space. Only if all of those labels are
        taken does it fall back to wider numbers (``ASSET_1000`` and up).
        """
        for step in range(self._LABEL_SPACE):
            slot = (preferred_slot + step) % self._LABEL_SPACE
            label = f"ASSET_{slot:03d}"
            if label not in self.reverse_map:
                return label

        # Primary space exhausted: keep labels unique by widening the number.
        overflow = max(self._LABEL_SPACE, len(self.reverse_map))
        while f"ASSET_{overflow}" in self.reverse_map:
            overflow += 1
        return f"ASSET_{overflow}"

    def anonymize_ticker(self, ticker: str) -> str:
        """
        Map ticker to anonymous label.

        Example: AAPL → ASSET_042

        The preferred label is ``md5(f"{seed}_{ticker}") % 1000``. If that
        label already belongs to a different ticker, the next free label is
        used instead, so two tickers never share a label and ``reverse_map``
        stays a true inverse of ``ticker_map``. A ticker that had to be moved
        gets a label that depends on the order tickers were first seen; the
        mapping written by ``save_mapping`` is the authoritative record.

        Args:
            ticker: Real ticker symbol.

        Returns:
            The anonymous label; repeated calls return the same label.
        """
        if ticker not in self.ticker_map:
            hash_input = f"{self.seed}_{ticker}"
            hash_val = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
            anon_label = self._allocate_label(hash_val % self._LABEL_SPACE)
            self.ticker_map[ticker] = anon_label
            self.reverse_map[anon_label] = ticker
        return self.ticker_map[ticker]

    def set_company_name(self, ticker: str, company_name: str):
        """Store company name for anonymization."""
        self.company_names[ticker] = company_name

    @staticmethod
    def _replace_term(text: str, term: str, replacement: str) -> str:
        """Replace whole-term, case-insensitive occurrences of *term*.

        ``(?<!\\w)`` / ``(?!\\w)`` are used instead of ``\\b`` because ``\\b``
        needs a word character on one side: a term ending in punctuation
        (``"Apple Inc."``) followed by a space would never match with ``\\b``.
        For terms that start and end with word characters both forms are
        equivalent. The term is escaped so characters such as ``.`` match
        literally, and the replacement is inserted verbatim.
        """
        if not term:
            # An empty term would match at arbitrary positions in the text.
            return text
        pattern = rf'(?<!\w){re.escape(term)}(?!\w)'
        return re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)

    def anonymize_text(self, text: str, ticker: str) -> str:
        """
        Replace all company-specific information in text.

        Args:
            text: Text to anonymize (news article, earnings report, etc.)
            ticker: Ticker symbol for context

        Returns:
            Anonymized text with ASSET_XXX labels. Falsy input (``None`` or
            an empty string) is returned unchanged.
        """
        if not text:
            return text

        anon_ticker = self.anonymize_ticker(ticker)

        # Replace ticker symbol (case-insensitive)
        text = self._replace_term(text, ticker, anon_ticker)

        # Replace company name if known
        if ticker in self.company_names:
            text = self._replace_term(
                text, self.company_names[ticker], f"Company {anon_ticker}"
            )

        # Replace product names
        for product, anon_product in self.product_map.items():
            text = self._replace_term(text, product, anon_product)

        return text

    def normalize_price_series(self, df: pd.DataFrame, base_value: float = 100.0) -> pd.DataFrame:
        """
        Normalize price series to base-100 index to prevent LLM from identifying stocks by price level.

        This prevents the "Price Scale Leak" where an LLM can identify NVDA by seeing $480 prices.

        Args:
            df: DataFrame with OHLCV columns
            base_value: Starting index value (default 100.0)

        Returns:
            A copy of ``df`` in which each of the Open/High/Low/Close columns
            that is present is divided by its own first-row value and
            multiplied by ``base_value``. A column whose first value is not
            greater than zero (including NaN) is left unchanged. An empty
            frame is returned as an unchanged copy.

        Example:
            Original: Close = [150, 153, 149, 155]
            Normalized: Close = [100.0, 102.0, 99.33, 103.33]
        """
        df_normalized = df.copy()

        # No first row to use as a baseline; nothing to rescale.
        if df.empty:
            return df_normalized

        # Get first row as baseline
        first_row = df.iloc[0]

        # Normalize OHLC columns
        price_columns = ['Open', 'High', 'Low', 'Close']
        for col in price_columns:
            if col in df.columns:
                baseline = first_row[col]
                if baseline > 0:
                    # Rebase to 100.0
                    df_normalized[col] = (df[col] / baseline) * base_value

        # Volume stays absolute (but could be normalized too if desired)
        # Keeping volume absolute for now as it's less identifying

        return df_normalized

    def normalize_price_value(self, value: float, baseline: float, base_value: float = 100.0) -> float:
        """
        Normalize a single price value.

        Args:
            value: Current price
            baseline: Reference price (e.g., first price in series)
            base_value: Target baseline (default 100.0)

        Returns:
            Normalized price, or ``value`` unchanged when ``baseline`` is
            zero or negative.
        """
        if baseline <= 0:
            return value
        return (value / baseline) * base_value

    def anonymize_csv(self, input_path: Path, output_path: Path, ticker: str):
        """
        Anonymize a CSV file containing market data.

        Preserves numerical data but removes ticker references.

        Args:
            input_path: CSV file to read.
            output_path: Destination CSV (written without the index).
            ticker: Real ticker the file belongs to.
        """
        df = pd.read_csv(input_path)

        # Replace ticker in column names if present
        anon_ticker = self.anonymize_ticker(ticker)
        df.columns = [col.replace(ticker, anon_ticker) for col in df.columns]

        # Anonymize any text columns. Text may be stored as ``object`` or as
        # pandas' dedicated string dtype depending on version/configuration,
        # so both are checked.
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                df[col] = column.apply(
                    lambda x: self.anonymize_text(str(x), ticker) if pd.notna(x) else x
                )

        df.to_csv(output_path, index=False)
        print(f"✅ Anonymized {input_path.name} → {output_path.name}")

    def save_mapping(self, output_path: Path):
        """Save ticker mapping for later de-anonymization.

        Writes ``ticker_map``, ``reverse_map`` and ``company_names`` as one
        JSON object to ``output_path``.
        """
        mapping = {
            "ticker_map": self.ticker_map,
            "reverse_map": self.reverse_map,
            "company_names": self.company_names,
        }
        with open(output_path, 'w', encoding="utf-8") as f:
            json.dump(mapping, f, indent=2)
        print(f"✅ Saved mapping to {output_path}")
|
|
|
|
|
|
def main():
    """
    Anonymize dataset for TradingAgents testing.

    For every configured ticker, looks for ``<TICKER>_prices.csv``,
    ``<TICKER>_news.csv`` and ``<TICKER>_fundamentals.csv`` under
    ``data/raw``, writes an anonymized copy of each file that exists to
    ``data/anonymized`` under the ticker's anonymous label, and finally
    saves the ticker mapping as ``ticker_mapping.json``.

    Usage:
        python scripts/anonymize_dataset.py
    """
    # Configuration: ticker → company name (dict order is processing order)
    names_by_ticker = {
        "AAPL": "Apple Inc.",
        "NVDA": "NVIDIA Corporation",
        "MSFT": "Microsoft Corporation",
        "META": "Meta Platforms Inc.",
        "GOOGL": "Alphabet Inc.",
    }
    # One CSV per kind is expected for each ticker.
    dataset_kinds = ("prices", "news", "fundamentals")

    # Paths
    raw_dir = Path("data/raw")
    anonymized_dir = Path("data/anonymized")
    anonymized_dir.mkdir(parents=True, exist_ok=True)

    # Initialize anonymizer and register the company names it should scrub
    anonymizer = TickerAnonymizer(seed="blindfire_v1")
    for symbol, company in names_by_ticker.items():
        anonymizer.set_company_name(symbol, company)

    print("🔒 BLINDFIRE PROTOCOL - Anonymizing Dataset")
    print("=" * 60)

    # Anonymize each ticker's data
    for symbol in names_by_ticker:
        label = anonymizer.anonymize_ticker(symbol)
        print(f"\n📊 Processing {symbol} → {label}")

        for kind in dataset_kinds:
            source = raw_dir / f"{symbol}_{kind}.csv"
            if not source.exists():
                # Missing inputs are skipped silently.
                continue
            anonymizer.anonymize_csv(
                source,
                anonymized_dir / f"{label}_{kind}.csv",
                symbol,
            )

    # Save mapping for de-anonymization
    anonymizer.save_mapping(anonymized_dir / "ticker_mapping.json")

    print("\n" + "=" * 60)
    print("✅ ANONYMIZATION COMPLETE")
    print(f"📁 Anonymized data saved to: {anonymized_dir}")
    print("\n🎯 Next Steps:")
    for step in (
        "1. Update TradingAgents config to use anonymized data",
        "2. Modify analyst prompts to remove {ticker} references",
        "3. Run backtests on anonymized dataset",
        "4. Compare results to original (should be similar if no contamination)",
    ):
        print(step)
|
|
|
|
|
# Run the anonymization pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|