# Listing metadata: 300 lines, 9.6 KiB, Python
"""
Ticker Anonymizer - Production Implementation

Handles:
- Ticker masking (AAPL → ASSET_042)
- Company name anonymization
- Product name anonymization
- Price normalization to base-100 index
- CRITICAL: Uses Adj Close to handle dividends/splits correctly
"""

import hashlib
import json
import re
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
import pandas as pd


class TickerAnonymizer:
    """
    Anonymize tickers and normalize prices to prevent LLM identification.

    CRITICAL: Uses adjusted close prices to handle dividends and splits.

    Labels have the form ``ASSET_NNN`` and are derived from an MD5 hash of
    ``"{seed}_{ticker}"``. Because only 1000 labels exist, two tickers can
    hash to the same slot; in that case the later ticker is moved to the
    next free label so that every label maps back to exactly one ticker.
    """

    # Number of distinct ASSET_NNN labels available (000-999).
    _LABEL_SPACE = 1000

    def __init__(self, seed: str = "blindfire_v1"):
        """
        Create an anonymizer.

        Args:
            seed: Salt mixed into the ticker hash; the same seed yields the
                same preferred label for a given ticker.
        """
        self.seed = seed
        self.ticker_map: Dict[str, str] = {}    # ticker -> anonymous label
        self.reverse_map: Dict[str, str] = {}   # anonymous label -> ticker
        self.company_names: Dict[str, str] = {}  # ticker -> company name
        self.baseline_prices = {}  # Store baseline for normalization

        # Product name mappings
        self.product_map = {
            # Apple
            "iPhone": "Product A",
            "iPad": "Product B",
            "MacBook": "Product C",
            "Apple Watch": "Product D",
            "AirPods": "Product E",
            # Nvidia
            "GeForce": "Product X",
            "RTX": "Product Y",
            "H100": "Product Z",
            "A100": "Product W",
            # Microsoft
            "Windows": "Software Platform A",
            "Office": "Software Platform B",
            "Azure": "Cloud Platform A",
            # Meta
            "Facebook": "Social Platform A",
            "Instagram": "Social Platform B",
            "WhatsApp": "Messaging Platform A",
            # Google
            "Search": "Platform Service A",
            "YouTube": "Video Platform A",
            "Android": "Mobile OS A",
        }

    @staticmethod
    def _replace_term(text: str, term: str, replacement: str) -> str:
        """Replace whole-word, case-insensitive occurrences of a literal term.

        ``\\b`` is not used because it only matches next to a word character:
        a term ending in punctuation (e.g. "Apple Inc.") would never match
        when followed by a space or parenthesis. Lookarounds that forbid an
        adjacent word character behave the same for ordinary words and also
        work for terms that start or end with punctuation.
        """
        if not term:
            # An empty pattern would match at every position.
            return text
        pattern = rf'(?<!\w){re.escape(term)}(?!\w)'
        # A callable replacement keeps the replacement text literal
        # (no backslash/group-reference interpretation).
        return re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)

    def anonymize_ticker(self, ticker: str) -> str:
        """
        Map ticker to anonymous label using deterministic hash.

        The preferred label is ``hash(seed, ticker) % 1000``. If that label
        already belongs to a different ticker, the next free label is used
        instead, so the result for a colliding ticker depends on the order
        in which tickers were registered.

        Args:
            ticker: Original ticker symbol (e.g., "AAPL")

        Returns:
            Anonymous label (e.g., "ASSET_042")

        Raises:
            RuntimeError: If all labels are already assigned to other tickers.
        """
        if ticker not in self.ticker_map:
            hash_input = f"{self.seed}_{ticker}"
            hash_val = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
            start = hash_val % self._LABEL_SPACE

            for offset in range(self._LABEL_SPACE):
                anon_label = f"ASSET_{(start + offset) % self._LABEL_SPACE:03d}"
                # Accept a label that is free or already tied to this ticker.
                if self.reverse_map.get(anon_label, ticker) == ticker:
                    break
            else:
                raise RuntimeError(
                    f"No free anonymous label left for ticker {ticker!r}"
                )

            self.ticker_map[ticker] = anon_label
            self.reverse_map[anon_label] = ticker
        return self.ticker_map[ticker]

    def set_company_name(self, ticker: str, company_name: str):
        """Store company name for anonymization."""
        self.company_names[ticker] = company_name

    def anonymize_text(self, text: str, ticker: str) -> str:
        """
        Replace all company-specific information in text.

        Replacements are applied in this order: registered company name,
        ticker symbol, then every entry of ``product_map``. All matches are
        whole-word and case-insensitive.

        NOTE(review): case-insensitive ticker matching also rewrites ordinary
        words that equal a short ticker (e.g. "all", "it"); kept as-is to
        avoid leaking lowercase ticker mentions.

        Args:
            text: Text to anonymize
            ticker: Ticker symbol for context

        Returns:
            Anonymized text (falsy input is returned unchanged)
        """
        if not text:
            return text

        anon_ticker = self.anonymize_ticker(ticker)

        # Replace company name FIRST (before ticker, to avoid partial replacements)
        company_name = self.company_names.get(ticker)
        if company_name:
            text = self._replace_term(text, company_name, f"Company {anon_ticker}")

        # Replace ticker symbol (escaped, so "BRK.B" does not match "BRKXB")
        text = self._replace_term(text, ticker, anon_ticker)

        # Replace product names
        for product, anon_product in self.product_map.items():
            text = self._replace_term(text, product, anon_product)

        return text

    def normalize_price_series(
        self,
        df: pd.DataFrame,
        base_value: float = 100.0,
        use_adjusted: bool = True
    ) -> pd.DataFrame:
        """
        Normalize price series to base-100 index.

        CRITICAL: Uses Adj Close by default to handle dividends/splits correctly.

        The baseline is the first row of the chosen close column; every
        price column present (Open, High, Low, Close, Adj Close) is divided
        by that single baseline and scaled by ``base_value``. The input
        frame is not modified.

        Args:
            df: DataFrame with OHLCV columns
            base_value: Starting index value (default 100.0)
            use_adjusted: Use 'Adj Close' if available (default True)

        Returns:
            DataFrame with normalized prices

        Raises:
            ValueError: If required columns are missing, the frame is empty,
                or the baseline price is NaN or not positive
        """
        # Determine which close column to use
        if use_adjusted and 'Adj Close' in df.columns:
            close_col = 'Adj Close'
        elif 'Close' in df.columns:
            close_col = 'Close'
        else:
            raise ValueError("DataFrame must have 'Close' or 'Adj Close' column")

        # Get baseline (first row)
        if len(df) == 0:
            raise ValueError("DataFrame is empty")

        baseline = df[close_col].iloc[0]
        if pd.isna(baseline) or baseline <= 0:
            raise ValueError(f"Invalid baseline price: {baseline}")

        df_normalized = df.copy()

        # Normalize all price columns with the same baseline so their
        # relative levels are preserved.
        for col in ('Open', 'High', 'Low', 'Close', 'Adj Close'):
            if col in df.columns:
                df_normalized[col] = (df[col] / baseline) * base_value

        # Volume stays absolute (less identifying than price)
        # Could normalize if needed, but keeping raw for now

        return df_normalized

    def normalize_price_value(
        self,
        value: float,
        baseline: float,
        base_value: float = 100.0
    ) -> float:
        """
        Normalize a single price value.

        Args:
            value: Current price
            baseline: Reference price
            base_value: Target baseline (default 100.0)

        Returns:
            Normalized price

        Raises:
            ValueError: If the baseline is NaN or not positive
        """
        # NaN compares False against 0, so it must be rejected explicitly
        # (matches the check in normalize_price_series).
        if pd.isna(baseline) or baseline <= 0:
            raise ValueError(f"Invalid baseline: {baseline}")
        return (value / baseline) * base_value

    def anonymize_csv(
        self,
        input_path: Path,
        output_path: Path,
        ticker: str,
        normalize_prices: bool = True
    ):
        """
        Anonymize a CSV file containing market data.

        Column names have the ticker replaced by its label, prices are
        optionally normalized, every text column is passed through
        ``anonymize_text``, and the result is written without the index.

        Args:
            input_path: Path to input CSV
            output_path: Path to output CSV
            ticker: Ticker symbol
            normalize_prices: Whether to normalize prices to base-100
        """
        input_path = Path(input_path)
        output_path = Path(output_path)
        df = pd.read_csv(input_path)

        # Anonymize ticker in column names. The ticker must be delimited by
        # non-alphanumeric characters so that a short ticker such as "C"
        # does not corrupt standard headers like "Close"; "_" counts as a
        # delimiter so "AAPL_Close" is still rewritten.
        anon_ticker = self.anonymize_ticker(ticker)
        if ticker:
            column_pattern = re.compile(
                rf'(?<![A-Za-z0-9]){re.escape(ticker)}(?![A-Za-z0-9])'
            )
            df.columns = [
                column_pattern.sub(lambda _match: anon_ticker, col)
                for col in df.columns
            ]

        # Normalize prices if requested
        if normalize_prices:
            df = self.normalize_price_series(df, base_value=100.0)

        # Anonymize text columns (object dtype, plus pandas' string dtype)
        for col in df.columns:
            dtype = df[col].dtype
            if dtype == 'object' or pd.api.types.is_string_dtype(dtype):
                df[col] = df[col].apply(
                    lambda x: self.anonymize_text(str(x), ticker) if pd.notna(x) else x
                )

        df.to_csv(output_path, index=False)
        print(f"✅ Anonymized {input_path.name} → {output_path.name}")

    def save_mapping(self, output_path: Path):
        """Save ticker mapping for de-anonymization."""
        mapping = {
            "ticker_map": self.ticker_map,
            "reverse_map": self.reverse_map,
            "company_names": self.company_names,
            "seed": self.seed
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, indent=2)
        print(f"✅ Saved mapping to {output_path}")

    def load_mapping(self, input_path: Path):
        """Load ticker mapping from file.

        Replaces the current ticker, reverse and company-name maps; the seed
        is replaced only when the file contains one.
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            mapping = json.load(f)

        self.ticker_map = mapping["ticker_map"]
        self.reverse_map = mapping["reverse_map"]
        self.company_names = mapping["company_names"]
        self.seed = mapping.get("seed", self.seed)
        print(f"✅ Loaded mapping from {input_path}")

    def deanonymize_ticker(self, anon_ticker: str) -> Optional[str]:
        """Reverse mapping: ASSET_042 → AAPL."""
        return self.reverse_map.get(anon_ticker)


# Example usage
def main() -> None:
    """Demonstrate ticker, text and price anonymization on sample data."""
    anonymizer = TickerAnonymizer()

    # Test anonymization
    ticker = "AAPL"
    anonymizer.set_company_name(ticker, "Apple Inc.")

    anon_ticker = anonymizer.anonymize_ticker(ticker)
    print(f"Ticker: {ticker} → {anon_ticker}")

    # Test text anonymization
    text = "Apple Inc. (AAPL) reported strong iPhone sales"
    anon_text = anonymizer.anonymize_text(text, ticker)
    print(f"Text: {text}")
    print(f"Anonymized: {anon_text}")

    # Test price normalization with Adj Close
    df = pd.DataFrame({
        'Date': pd.date_range('2024-01-01', periods=5),
        'Open': [150.0, 152.0, 151.0, 153.0, 155.0],
        'High': [152.0, 154.0, 153.0, 155.0, 157.0],
        'Low': [149.0, 151.0, 150.0, 152.0, 154.0],
        'Close': [151.0, 153.0, 152.0, 154.0, 156.0],
        'Adj Close': [150.5, 152.5, 151.5, 153.5, 155.5],  # Adjusted for dividends
        'Volume': [1000000] * 5
    })

    print("\nOriginal prices:")
    print(df[['Date', 'Close', 'Adj Close']].head())

    df_normalized = anonymizer.normalize_price_series(df)
    print("\nNormalized prices (using Adj Close):")
    print(df_normalized[['Date', 'Close', 'Adj Close']].head())


# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()