# TradingAgents/scripts/anonymize_dataset.py

#!/usr/bin/env python3
"""
Ticker Anonymization Script - The "Blindfire Protocol"

This script anonymizes historical trading data by replacing:
- Ticker symbols (AAPL → ASSET_042)
- Company names (Apple Inc. → Company ASSET_042)
- Product names (iPhone → Product A, MacBook → Product C)

This prevents LLMs from using memorized knowledge about specific companies.
"""

import hashlib
import re
import json
from pathlib import Path
from typing import Dict, List
import pandas as pd


class TickerAnonymizer:
    """Anonymize tickers and company-specific information.

    The anonymizer keeps three pieces of state:

    - ``ticker_map``: real ticker -> anonymous ``ASSET_NNN`` label
    - ``reverse_map``: anonymous label -> real ticker (for de-anonymization)
    - ``company_names``: real ticker -> company name to scrub from text

    Labels are derived from an MD5 hash of ``seed`` and the ticker, so the
    same seed yields the same label for a ticker as long as no two tickers
    hash to the same slot. When they do, the ticker seen later is moved to
    the next free label instead of silently sharing one.
    """

    # Number of three-digit ASSET_NNN labels available before overflowing
    # into wider numbers.
    _LABEL_SPACE = 1000

    def __init__(self, seed: str = "blindfire_v1"):
        """
        Create an anonymizer.

        Args:
            seed: String mixed into the hash so different seeds produce
                different ticker -> label assignments.
        """
        self.seed = seed
        self.ticker_map: Dict[str, str] = {}
        self.reverse_map: Dict[str, str] = {}
        self.company_names: Dict[str, str] = {}
        # Product/brand name -> neutral placeholder. Matching is done
        # case-insensitively on whole words in anonymize_text().
        self.product_map: Dict[str, str] = {
            # Apple products
            "iPhone": "Product A",
            "iPad": "Product B",
            "MacBook": "Product C",
            "Apple Watch": "Product D",
            "AirPods": "Product E",
            # Nvidia products
            "GeForce": "Product X",
            "RTX": "Product Y",
            "H100": "Product Z",
            "A100": "Product W",
            # Microsoft products
            "Windows": "Software Platform A",
            "Office": "Software Platform B",
            "Azure": "Cloud Platform A",
            # Meta products
            "Facebook": "Social Platform A",
            "Instagram": "Social Platform B",
            "WhatsApp": "Messaging Platform A",
            # Google products
            "Search": "Platform Service A",
            "YouTube": "Video Platform A",
            "Android": "Mobile OS A",
        }

    def _next_free_label(self, start: int) -> str:
        """Return the first unused ASSET label, probing upward from ``start``."""
        for step in range(self._LABEL_SPACE):
            candidate = f"ASSET_{(start + step) % self._LABEL_SPACE:03d}"
            if candidate not in self.reverse_map:
                return candidate
        # Every three-digit label is taken: spill into wider numbers rather
        # than reuse a label that already belongs to another ticker.
        overflow = self._LABEL_SPACE
        while f"ASSET_{overflow}" in self.reverse_map:
            overflow += 1
        return f"ASSET_{overflow}"

    @staticmethod
    def _replace_literal(text: str, literal: str, replacement: str) -> str:
        """Replace whole-word, case-insensitive occurrences of ``literal``."""
        if not literal:
            # An empty pattern would match at every position in the text.
            return text
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so a literal ending in punctuation ("Apple Inc.") would not match
        # when followed by a space or end of string.
        pattern = rf'(?<!\w){re.escape(literal)}(?!\w)'
        # A callable replacement keeps backslashes in ``replacement`` literal.
        return re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)

    def anonymize_ticker(self, ticker: str) -> str:
        """
        Map ticker to anonymous label.

        The label is cached, so repeated calls return the same value. If the
        hashed slot is already used by a different ticker, the next free
        label is assigned; in that case the result depends on the order in
        which colliding tickers were first seen.

        Example: AAPL → ASSET_042
        """
        if ticker not in self.ticker_map:
            hash_input = f"{self.seed}_{ticker}"
            hash_val = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
            anon_label = self._next_free_label(hash_val % self._LABEL_SPACE)
            self.ticker_map[ticker] = anon_label
            self.reverse_map[anon_label] = ticker
        return self.ticker_map[ticker]

    def set_company_name(self, ticker: str, company_name: str) -> None:
        """Store company name for anonymization."""
        self.company_names[ticker] = company_name

    def anonymize_text(self, text: str, ticker: str) -> str:
        """
        Replace all company-specific information in text.

        Replacements are whole-word and case-insensitive, applied in this
        order: company name, ticker symbol, product names.

        Args:
            text: Text to anonymize (news article, earnings report, etc.)
            ticker: Ticker symbol for context

        Returns:
            Anonymized text with ASSET_XXX labels. Empty or None input is
            returned unchanged.
        """
        if not text:
            return text

        anon_ticker = self.anonymize_ticker(ticker)

        # Company name goes first: when the name contains the ticker as a
        # word (e.g. META / "Meta Platforms Inc."), replacing the ticker
        # first would break up the name so it could no longer be matched.
        if ticker in self.company_names:
            text = self._replace_literal(
                text, self.company_names[ticker], f"Company {anon_ticker}"
            )

        # Ticker symbol, escaped so symbols such as "BRK.B" match literally.
        text = self._replace_literal(text, ticker, anon_ticker)

        # Replace product names
        for product, anon_product in self.product_map.items():
            text = self._replace_literal(text, product, anon_product)

        return text

    def normalize_price_series(self, df: pd.DataFrame, base_value: float = 100.0) -> pd.DataFrame:
        """
        Normalize price series to base-100 index to prevent LLM from identifying stocks by price level.

        This prevents the "Price Scale Leak" where an LLM can identify NVDA by seeing $480 prices.

        Each of the Open/High/Low/Close columns that is present is divided
        by its own first-row value and multiplied by ``base_value``. A column
        whose first value is not greater than zero is left unchanged. The
        input frame is not modified.

        Args:
            df: DataFrame with OHLCV columns
            base_value: Starting index value (default 100.0)

        Returns:
            DataFrame with normalized prices (all rebased to start at 100.0).
            An empty frame is returned as an unchanged copy.

        Example:
            Original: Close = [150, 153, 149, 155]
            Normalized: Close = [100.0, 102.0, 99.33, 103.33]
        """
        df_normalized = df.copy()

        # No first row to use as a baseline.
        if df.empty:
            return df_normalized

        for col in ('Open', 'High', 'Low', 'Close'):
            if col not in df.columns:
                continue
            baseline = df[col].iloc[0]
            if baseline > 0:
                # Rebase so the first row equals base_value
                df_normalized[col] = (df[col] / baseline) * base_value

        # Volume stays absolute (but could be normalized too if desired)
        # Keeping volume absolute for now as it's less identifying

        return df_normalized

    def normalize_price_value(self, value: float, baseline: float, base_value: float = 100.0) -> float:
        """
        Normalize a single price value.

        Args:
            value: Current price
            baseline: Reference price (e.g., first price in series)
            base_value: Target baseline (default 100.0)

        Returns:
            Normalized price, or ``value`` unchanged when ``baseline`` is
            zero or negative.
        """
        if baseline <= 0:
            return value
        return (value / baseline) * base_value

    def anonymize_csv(self, input_path: Path, output_path: Path, ticker: str) -> None:
        """
        Anonymize a CSV file containing market data.

        Preserves numerical data but removes ticker references from column
        names and from every text column. Prices are not rebased here.

        Args:
            input_path: CSV file to read.
            output_path: Destination for the anonymized CSV (written without
                the index).
            ticker: Real ticker the file belongs to.
        """
        df = pd.read_csv(input_path)

        # Replace ticker in column names if present
        anon_ticker = self.anonymize_ticker(ticker)
        df.columns = [col.replace(ticker, anon_ticker) for col in df.columns]

        # Anonymize any text columns. Checking for string dtypes as well as
        # 'object' covers pandas configurations that load text into a
        # dedicated string dtype.
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                df[col] = column.apply(
                    lambda x: self.anonymize_text(str(x), ticker) if pd.notna(x) else x
                )

        df.to_csv(output_path, index=False)
        print(f"✅ Anonymized {input_path.name} → {output_path.name}")

    def save_mapping(self, output_path: Path) -> None:
        """Save ticker mapping for later de-anonymization.

        Writes ``ticker_map``, ``reverse_map`` and ``company_names`` as one
        JSON object. The file contains the real tickers, so it must be kept
        away from whatever consumes the anonymized data.
        """
        mapping = {
            "ticker_map": self.ticker_map,
            "reverse_map": self.reverse_map,
            "company_names": self.company_names,
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, indent=2)
        print(f"✅ Saved mapping to {output_path}")


def main():
    """
    Run the Blindfire anonymization pass over the raw TradingAgents dataset.

    For every configured ticker, any of ``<TICKER>_prices.csv``,
    ``<TICKER>_news.csv`` and ``<TICKER>_fundamentals.csv`` found under
    ``data/raw`` is anonymized into ``data/anonymized`` under the ticker's
    anonymous label. The ticker mapping is then written to
    ``data/anonymized/ticker_mapping.json``.

    Usage:
        python scripts/anonymize_dataset.py
    """
    # Configuration
    symbols = ["AAPL", "NVDA", "MSFT", "META", "GOOGL"]
    legal_names = {
        "AAPL": "Apple Inc.",
        "NVDA": "NVIDIA Corporation",
        "MSFT": "Microsoft Corporation",
        "META": "Meta Platforms Inc.",
        "GOOGL": "Alphabet Inc.",
    }
    # Processed in this order for each ticker.
    dataset_kinds = ("prices", "news", "fundamentals")

    # Paths (relative to the current working directory)
    raw_dir = Path("data/raw")
    anon_dir = Path("data/anonymized")
    anon_dir.mkdir(parents=True, exist_ok=True)

    # Build the anonymizer and register the names it should scrub
    anonymizer = TickerAnonymizer(seed="blindfire_v1")
    for symbol, legal_name in legal_names.items():
        anonymizer.set_company_name(symbol, legal_name)

    divider = "=" * 60
    print("🔒 BLINDFIRE PROTOCOL - Anonymizing Dataset")
    print(divider)

    for symbol in symbols:
        label = anonymizer.anonymize_ticker(symbol)
        print(f"\n📊 Processing {symbol} → {label}")

        for kind in dataset_kinds:
            source = raw_dir / f"{symbol}_{kind}.csv"
            if not source.exists():
                # Missing inputs are skipped silently.
                continue
            anonymizer.anonymize_csv(source, anon_dir / f"{label}_{kind}.csv", symbol)

    # Save mapping for de-anonymization
    anonymizer.save_mapping(anon_dir / "ticker_mapping.json")

    print("\n" + divider)
    print("✅ ANONYMIZATION COMPLETE")
    print(f"📁 Anonymized data saved to: {anon_dir}")
    print("\n🎯 Next Steps:")
    for step in (
        "1. Update TradingAgents config to use anonymized data",
        "2. Modify analyst prompts to remove {ticker} references",
        "3. Run backtests on anonymized dataset",
        "4. Compare results to original (should be similar if no contamination)",
    ):
        print(step)


# Run the anonymization pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()