# TradingAgents/tradingagents/utils/anonymizer.py

"""
Ticker Anonymizer - Production Implementation

Handles:
- Ticker masking (AAPL → ASSET_042)
- Company name anonymization
- Product name anonymization
- Price normalization to base-100 index
- CRITICAL: Uses Adj Close to handle dividends/splits correctly
"""

import hashlib
import re
import json
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
import numpy as np


class TickerAnonymizer:
    """
    Anonymize tickers and normalize prices to prevent LLM identification.

    CRITICAL: Uses adjusted close prices to handle dividends and splits.

    Attributes:
        seed: Salt mixed into the ticker hash so different seeds yield
            different labels.
        ticker_map: Original ticker -> anonymous label.
        reverse_map: Anonymous label -> original ticker.
        company_names: Ticker -> company name registered via
            set_company_name().
        baseline_prices: Initialised empty; no method in this class
            writes to it.
        product_map: Product name -> generic placeholder used by
            anonymize_text().
    """

    # Number of distinct three-digit labels (ASSET_000 .. ASSET_999).
    _LABEL_SPACE = 1000

    def __init__(self, seed: str = "blindfire_v1"):
        """
        Create an anonymizer with empty mappings.

        Args:
            seed: Salt for the deterministic ticker hash.
        """
        self.seed = seed
        self.ticker_map: Dict[str, str] = {}
        self.reverse_map: Dict[str, str] = {}
        self.company_names: Dict[str, str] = {}
        self.baseline_prices = {}  # Store baseline for normalization

        # Product name mappings
        self.product_map = {
            # Apple
            "iPhone": "Product A",
            "iPad": "Product B",
            "MacBook": "Product C",
            "Apple Watch": "Product D",
            "AirPods": "Product E",
            # Nvidia
            "GeForce": "Product X",
            "RTX": "Product Y",
            "H100": "Product Z",
            "A100": "Product W",
            # Microsoft
            "Windows": "Software Platform A",
            "Office": "Software Platform B",
            "Azure": "Cloud Platform A",
            # Meta
            "Facebook": "Social Platform A",
            "Instagram": "Social Platform B",
            "WhatsApp": "Messaging Platform A",
            # Google
            "Search": "Platform Service A",
            "YouTube": "Video Platform A",
            "Android": "Mobile OS A",
        }

    def _allocate_label(self, ticker: str) -> str:
        """Pick an unused label for ticker, starting at its hash slot."""
        digest = hashlib.md5(f"{self.seed}_{ticker}".encode()).hexdigest()
        start = int(digest, 16) % self._LABEL_SPACE

        # Linear probing: the hash slot is preferred, so tickers that do not
        # collide keep exactly the label the plain hash would give them.
        for offset in range(self._LABEL_SPACE):
            label = f"ASSET_{(start + offset) % self._LABEL_SPACE:03d}"
            owner = self.reverse_map.get(label)
            if owner is None or owner == ticker:
                return label

        # All three-digit labels are taken; continue with wider numbers
        # rather than handing out a duplicate.
        overflow = self._LABEL_SPACE
        while f"ASSET_{overflow:03d}" in self.reverse_map:
            overflow += 1
        return f"ASSET_{overflow:03d}"

    def anonymize_ticker(self, ticker: str) -> str:
        """
        Map ticker to anonymous label using deterministic hash.

        The label is derived from md5(seed + "_" + ticker). If that label is
        already owned by a different ticker, the next free label is used so
        that every ticker keeps a unique, reversible label. Because of this,
        a colliding ticker's label depends on registration order; persist the
        mapping with save_mapping() when labels must be stable across runs.

        Args:
            ticker: Original ticker symbol (e.g., "AAPL")

        Returns:
            Anonymous label (e.g., "ASSET_042")
        """
        if ticker in self.ticker_map:
            return self.ticker_map[ticker]

        anon_label = self._allocate_label(ticker)
        self.ticker_map[ticker] = anon_label
        self.reverse_map[anon_label] = ticker
        return anon_label

    def set_company_name(self, ticker: str, company_name: str):
        """Store company name for anonymization."""
        self.company_names[ticker] = company_name

    @staticmethod
    def _replace_token(text: str, literal: str, replacement: str) -> str:
        """Replace whole-token, case-insensitive occurrences of literal."""
        if not literal:
            # An empty pattern would match at every position.
            return text
        # Lookarounds instead of \b: \b only works when the literal starts
        # and ends with a word character, so "Apple Inc." followed by a space
        # would never match. For word-edged literals both forms are equivalent.
        pattern = rf'(?<!\w){re.escape(literal)}(?!\w)'
        # A callable replacement keeps backslashes in `replacement` literal.
        return re.sub(pattern, lambda _match: replacement, text,
                      flags=re.IGNORECASE)

    def anonymize_text(self, text: str, ticker: str) -> str:
        """
        Replace all company-specific information in text.

        Replaces, in order: the registered company name (if any), the ticker
        symbol, and every key of product_map. Matching is case-insensitive
        and restricted to whole tokens.

        Args:
            text: Text to anonymize
            ticker: Ticker symbol for context

        Returns:
            Anonymized text (the input itself when it is empty or None)
        """
        if not text:
            return text

        anon_ticker = self.anonymize_ticker(ticker)

        # Replace company name FIRST (before ticker, to avoid partial replacements)
        company_name = self.company_names.get(ticker)
        if company_name:
            text = self._replace_token(
                text, company_name, f"Company {anon_ticker}"
            )

        # Replace ticker symbol (escaped, so "BRK.B" does not match "BRKXB")
        text = self._replace_token(text, ticker, anon_ticker)

        # Replace product names
        for product, anon_product in self.product_map.items():
            text = self._replace_token(text, product, anon_product)

        return text

    def normalize_price_series(
        self,
        df: pd.DataFrame,
        base_value: float = 100.0,
        use_adjusted: bool = True
    ) -> pd.DataFrame:
        """
        Normalize price series to base-100 index.

        CRITICAL: Uses Adj Close by default to handle dividends/splits correctly.

        Every price column present (Open, High, Low, Close, Adj Close) is
        divided by the same baseline, the first row of the chosen close
        column, and multiplied by base_value. The input frame is not
        modified.

        Args:
            df: DataFrame with OHLCV columns
            base_value: Starting index value (default 100.0)
            use_adjusted: Use 'Adj Close' if available (default True)

        Returns:
            DataFrame with normalized prices

        Raises:
            ValueError: If required columns are missing, the frame is empty,
                or the baseline price is NaN or not positive
        """
        df_normalized = df.copy()

        # Determine which close column to use
        if use_adjusted and 'Adj Close' in df.columns:
            close_col = 'Adj Close'
        elif 'Close' in df.columns:
            close_col = 'Close'
        else:
            raise ValueError("DataFrame must have 'Close' or 'Adj Close' column")

        # Get baseline (first row)
        if len(df) == 0:
            raise ValueError("DataFrame is empty")

        baseline = df[close_col].iloc[0]
        if pd.isna(baseline) or baseline <= 0:
            raise ValueError(f"Invalid baseline price: {baseline}")

        # Use the same baseline for all columns so their ratios are preserved
        for col in ('Open', 'High', 'Low', 'Close', 'Adj Close'):
            if col in df.columns:
                df_normalized[col] = (df[col] / baseline) * base_value

        # Volume stays absolute (less identifying than price)
        # Could normalize if needed, but keeping raw for now

        return df_normalized

    def normalize_price_value(
        self,
        value: float,
        baseline: float,
        base_value: float = 100.0
    ) -> float:
        """
        Normalize a single price value.

        Args:
            value: Current price
            baseline: Reference price
            base_value: Target baseline (default 100.0)

        Returns:
            Normalized price

        Raises:
            ValueError: If baseline is NaN or not positive
        """
        # NaN compares False against everything, so it needs its own check
        # to be rejected the same way normalize_price_series rejects it.
        if np.isnan(baseline) or baseline <= 0:
            raise ValueError(f"Invalid baseline: {baseline}")
        return (value / baseline) * base_value

    def anonymize_csv(
        self,
        input_path: Path,
        output_path: Path,
        ticker: str,
        normalize_prices: bool = True
    ):
        """
        Anonymize a CSV file containing market data.

        Renames columns containing the ticker, optionally normalizes prices,
        runs anonymize_text() over every text column, and writes the result
        without the index.

        Args:
            input_path: Path to input CSV (Path or str)
            output_path: Path to output CSV (Path or str)
            ticker: Ticker symbol
            normalize_prices: Whether to normalize prices to base-100

        Raises:
            ValueError: Propagated from normalize_price_series() when
                normalize_prices is True and the data cannot be normalized
        """
        # Coerce up front so a str path cannot fail on `.name` after the
        # output file has already been written.
        source = Path(input_path)
        target = Path(output_path)

        df = pd.read_csv(source)

        # Anonymize ticker in column names
        anon_ticker = self.anonymize_ticker(ticker)
        if ticker:  # str.replace("") would insert the label between all chars
            df.columns = [col.replace(ticker, anon_ticker) for col in df.columns]

        # Normalize prices if requested
        if normalize_prices:
            df = self.normalize_price_series(df, base_value=100.0)

        # Anonymize text columns. Both checks are needed: string-typed
        # columns (e.g. pandas StringDtype) do not compare equal to 'object'.
        for col in df.columns:
            column = df[col]
            if (pd.api.types.is_object_dtype(column)
                    or pd.api.types.is_string_dtype(column)):
                df[col] = column.apply(
                    lambda x: self.anonymize_text(str(x), ticker) if pd.notna(x) else x
                )

        df.to_csv(target, index=False)
        print(f"✅ Anonymized {source.name} → {target.name}")

    def save_mapping(self, output_path: Path):
        """
        Save ticker mapping for de-anonymization.

        Writes ticker_map, reverse_map, company_names and seed as JSON.

        Args:
            output_path: Destination file path
        """
        mapping = {
            "ticker_map": self.ticker_map,
            "reverse_map": self.reverse_map,
            "company_names": self.company_names,
            "seed": self.seed
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, indent=2)
        print(f"✅ Saved mapping to {output_path}")

    def load_mapping(self, input_path: Path):
        """
        Load ticker mapping from file.

        Replaces the current mappings with the ones stored by save_mapping().
        A missing "reverse_map" is rebuilt by inverting "ticker_map"; a
        missing "company_names" becomes empty; a missing "seed" keeps the
        current seed.

        Args:
            input_path: JSON file written by save_mapping()

        Raises:
            KeyError: If the file has no "ticker_map" entry
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            mapping = json.load(f)

        self.ticker_map = mapping["ticker_map"]
        stored_reverse = mapping.get("reverse_map")
        if stored_reverse is None:
            stored_reverse = {
                label: ticker for ticker, label in self.ticker_map.items()
            }
        self.reverse_map = stored_reverse
        self.company_names = mapping.get("company_names", {})
        self.seed = mapping.get("seed", self.seed)
        print(f"✅ Loaded mapping from {input_path}")

    def deanonymize_ticker(self, anon_ticker: str) -> Optional[str]:
        """Reverse mapping: ASSET_042 → AAPL. Returns None for unknown labels."""
        return self.reverse_map.get(anon_ticker)


# Demo: run this module directly to see each anonymization step in action
if __name__ == "__main__":
    demo = TickerAnonymizer()

    # Ticker masking, with the company name registered for text scrubbing
    symbol = "AAPL"
    demo.set_company_name(symbol, "Apple Inc.")
    print(f"Ticker: {symbol} → {demo.anonymize_ticker(symbol)}")

    # Free-text scrubbing: company name, ticker and product names
    sample = "Apple Inc. (AAPL) reported strong iPhone sales"
    scrubbed = demo.anonymize_text(sample, symbol)
    print(f"Text: {sample}")
    print(f"Anonymized: {scrubbed}")

    # Base-100 normalization on a five-day OHLCV sample
    n_days = 5
    price_columns = {
        'Open': [150.0, 152.0, 151.0, 153.0, 155.0],
        'High': [152.0, 154.0, 153.0, 155.0, 157.0],
        'Low': [149.0, 151.0, 150.0, 152.0, 154.0],
        'Close': [151.0, 153.0, 152.0, 154.0, 156.0],
        # Slightly below Close, as a dividend adjustment would produce
        'Adj Close': [150.5, 152.5, 151.5, 153.5, 155.5],
    }
    prices = pd.DataFrame({
        'Date': pd.date_range('2024-01-01', periods=n_days),
        **price_columns,
        'Volume': [1000000] * n_days,
    })
    shown = ['Date', 'Close', 'Adj Close']

    print("\nOriginal prices:")
    print(prices[shown].head())

    indexed = demo.normalize_price_series(prices)
    print("\nNormalized prices (using Adj Close):")
    print(indexed[shown].head())