TradingAgents/tradingagents/alerts/alert_manager.py

1101 lines
32 KiB
Python

"""Alert Manager for orchestration and routing.
This module provides comprehensive alert management including:
- Alert orchestration and routing
- Multiple alert channels (email, slack, sms, webhook)
- Alert priorities and severity levels
- Rate limiting to prevent alert storms
- Alert history tracking
- Template-based formatting
Issue #38: [ALERT-37] Alert manager - orchestration and routing
Design Principles:
- Flexible channel routing
- Rate limiting prevents spam
- Template-based alert formatting
- Async support for non-blocking delivery
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from decimal import Decimal
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Protocol, Set
import asyncio
import hashlib
import logging
import uuid
# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger, named after this module via ``__name__``; used by the
# channels and the manager below for diagnostics and for LOG-channel output.
logger = logging.getLogger(__name__)
# ============================================================================
# Enums
# ============================================================================
class AlertPriority(str, Enum):
    """Urgency of an alert, declared from least to most urgent.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    LOW = "low"  # Informational
    MEDIUM = "medium"  # Important but not urgent
    HIGH = "high"  # Requires attention
    CRITICAL = "critical"  # Immediate action required
class AlertCategory(str, Enum):
    """Subject area an alert belongs to.

    The category's string value is what rate limiting and the per-category
    statistics in this module are keyed by.
    """

    TRADE = "trade"  # Trade-related alerts
    RISK = "risk"  # Risk management alerts
    SYSTEM = "system"  # System status alerts
    MARKET = "market"  # Market condition alerts
    PORTFOLIO = "portfolio"  # Portfolio alerts
    EXECUTION = "execution"  # Order execution alerts
    COMPLIANCE = "compliance"  # Regulatory/compliance alerts
class AlertStatus(str, Enum):
    """Lifecycle state of an alert with respect to delivery."""

    PENDING = "pending"  # Created, not yet handed to the manager
    SENDING = "sending"  # Delivery to channels is in progress
    DELIVERED = "delivered"  # At least one channel accepted the alert
    FAILED = "failed"  # No channel accepted the alert
    RATE_LIMITED = "rate_limited"  # Dropped by the rate limiter
    SUPPRESSED = "suppressed"  # Dropped as a duplicate
class ChannelType(str, Enum):
    """Kinds of delivery channel an alert can be routed to.

    A channel type only identifies a transport; an alert is delivered
    through it only when a matching channel object has been registered
    with the manager.
    """

    EMAIL = "email"
    SLACK = "slack"
    SMS = "sms"
    WEBHOOK = "webhook"
    PUSH = "push"
    LOG = "log"
# ============================================================================
# Protocols
# ============================================================================
class AlertChannel(Protocol):
    """Structural interface every delivery channel must satisfy.

    Any object exposing these three members can be registered with the
    manager; inheriting from this class is not required.
    """

    @property
    def channel_type(self) -> ChannelType:
        """Return the ``ChannelType`` this channel is registered under."""

    @property
    def is_available(self) -> bool:
        """Return whether the channel can currently accept alerts."""

    async def send(self, alert: "Alert") -> bool:
        """Deliver ``alert`` through this channel.

        Args:
            alert: Alert to send

        Returns:
            True if sent successfully
        """
# ============================================================================
# Data Classes
# ============================================================================
@dataclass
class AlertTemplate:
    """Pair of ``str.format`` templates used to build an alert's text.

    Attributes:
        template_id: Unique template identifier (random UUID by default)
        name: Template name
        title_template: Format string for the alert title
        body_template: Format string for the alert body
        category: Alert category this applies to
        variables: Names of the variables the templates expect
    """

    template_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = ""
    title_template: str = "{category}: {title}"
    body_template: str = "{message}"
    category: Optional[AlertCategory] = None
    variables: List[str] = field(default_factory=list)

    def render(self, context: Dict[str, Any]) -> tuple[str, str]:
        """Fill both templates from ``context``.

        Args:
            context: Template variables, passed to ``str.format`` as
                keyword arguments

        Returns:
            Tuple of (title, body)

        Raises:
            KeyError: If a placeholder has no matching key in ``context``.
        """
        rendered_title = self.title_template.format(**context)
        rendered_body = self.body_template.format(**context)
        return rendered_title, rendered_body
@dataclass
class RateLimitConfig:
    """Knobs for throttling and de-duplicating alerts.

    Attributes:
        max_alerts_per_minute: Maximum alerts per minute per category
        max_alerts_per_hour: Maximum alerts per hour per category
        cooldown_seconds: Cooldown after rate limit hit
            (NOTE(review): not read by any code visible in this module)
        dedupe_window_seconds: Window, in seconds, during which an alert
            with identical content counts as a duplicate
        enable_deduplication: Enable duplicate detection
    """

    max_alerts_per_minute: int = 10
    max_alerts_per_hour: int = 100
    cooldown_seconds: int = 60
    dedupe_window_seconds: int = 300
    enable_deduplication: bool = True
@dataclass
class RoutingRule:
    """Decides which channels an alert should be delivered to.

    Attributes:
        rule_id: Unique rule identifier (random UUID by default)
        name: Rule name
        priority: Minimum priority an alert needs to match this rule
        categories: Categories this rule applies to; empty means all
        channels: Channels to route matching alerts to
        enabled: Whether rule is enabled
        conditions: Optional extra predicate called with the alert
    """

    rule_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = ""
    priority: AlertPriority = AlertPriority.LOW
    categories: List[AlertCategory] = field(default_factory=list)
    channels: List[ChannelType] = field(default_factory=list)
    enabled: bool = True
    conditions: Optional[Callable[["Alert"], bool]] = None

    def matches(self, alert: "Alert") -> bool:
        """Tell whether ``alert`` satisfies every criterion of this rule.

        Checks run in order (enabled, priority, category, custom
        predicate) and stop at the first failure, so ``conditions`` is
        only called for alerts that passed the cheaper checks.

        Args:
            alert: Alert to check

        Returns:
            True if alert matches
        """
        if not self.enabled:
            return False

        # Position in this tuple is the urgency rank. ``index`` compares
        # with ``==``, so plain string priorities are accepted as well.
        ladder = (
            AlertPriority.LOW,
            AlertPriority.MEDIUM,
            AlertPriority.HIGH,
            AlertPriority.CRITICAL,
        )
        if ladder.index(alert.priority) < ladder.index(self.priority):
            return False

        # An empty category list places no restriction on the category.
        if self.categories and alert.category not in self.categories:
            return False

        return (not self.conditions) or bool(self.conditions(alert))
@dataclass
class AlertConfig:
    """Top-level settings for the alert manager.

    NOTE(review): in the code visible in this module only
    ``rate_limit_config``, ``default_channels``, ``store_history`` and
    ``max_history_size`` are actually read; the remaining flags appear to
    be reserved for behaviour implemented elsewhere (or not yet at all).

    Attributes:
        rate_limit_config: Rate limiting configuration
        default_channels: Channels used when no routing rule matches
        log_all_alerts: Log all alerts to file
        store_history: Store alert history
        max_history_size: Maximum history entries
        retry_failed: Retry failed deliveries
        max_retries: Maximum retry attempts
        async_delivery: Use async delivery
    """

    rate_limit_config: RateLimitConfig = field(default_factory=RateLimitConfig)
    default_channels: List[ChannelType] = field(
        default_factory=lambda: [ChannelType.LOG]
    )
    log_all_alerts: bool = True
    store_history: bool = True
    max_history_size: int = 1000
    retry_failed: bool = True
    max_retries: int = 3
    async_delivery: bool = True
@dataclass
class Alert:
    """A single notification together with its delivery bookkeeping.

    Attributes:
        alert_id: Unique alert identifier (random UUID by default)
        title: Alert title
        message: Alert message body
        priority: Alert priority
        category: Alert category
        source: Source of the alert
        timestamp: When alert was created (naive local time)
        data: Additional alert data
        tags: Alert tags for filtering
        status: Current delivery status
        channels_sent: Channels that received alert
        delivery_attempts: Number of delivery attempts
        last_error: Last delivery error
        acknowledged: Whether alert was acknowledged
        acknowledged_by: Who acknowledged
        acknowledged_at: When acknowledged
    """

    # --- content ---------------------------------------------------------
    alert_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    title: str = ""
    message: str = ""
    priority: AlertPriority = AlertPriority.MEDIUM
    category: AlertCategory = AlertCategory.SYSTEM
    source: str = ""
    timestamp: datetime = field(default_factory=datetime.now)
    data: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)
    # --- delivery state --------------------------------------------------
    status: AlertStatus = AlertStatus.PENDING
    channels_sent: List[ChannelType] = field(default_factory=list)
    delivery_attempts: int = 0
    last_error: str = ""
    # --- acknowledgement -------------------------------------------------
    acknowledged: bool = False
    acknowledged_by: Optional[str] = None
    acknowledged_at: Optional[datetime] = None

    @property
    def content_hash(self) -> str:
        """Return a short fingerprint of title, message and category.

        The first 16 hex digits of a SHA-256 digest; alerts with the same
        title, message and category share the same value, which is what
        duplicate suppression keys on.
        """
        fingerprint_source = f"{self.title}:{self.message}:{self.category.value}"
        digest = hashlib.sha256(fingerprint_source.encode())
        return digest.hexdigest()[:16]
@dataclass
class DeliveryResult:
    """Outcome of one attempt to push one alert through one channel.

    Attributes:
        alert_id: Alert that was delivered
        channel: Channel used
        success: Whether delivery succeeded
        timestamp: When the result object was created (naive local time)
        error_message: Error if failed
        response_data: Channel response data
    """

    alert_id: str = ""
    channel: ChannelType = ChannelType.LOG
    success: bool = False
    timestamp: datetime = field(default_factory=datetime.now)
    error_message: str = ""
    response_data: Dict[str, Any] = field(default_factory=dict)
@dataclass
class AlertStats:
    """Running counters describing what the manager has processed.

    Attributes:
        total_sent: Total alerts sent
        total_failed: Total failed deliveries
        total_rate_limited: Total rate-limited alerts
        total_suppressed: Total suppressed (dedupe)
        by_priority: Count by priority value
        by_category: Count by category value
        by_channel: Count by channel value
        avg_delivery_time_ms: Average delivery time
            (NOTE(review): never updated by code visible in this module)
    """

    total_sent: int = 0
    total_failed: int = 0
    total_rate_limited: int = 0
    total_suppressed: int = 0
    by_priority: Dict[str, int] = field(default_factory=dict)
    by_category: Dict[str, int] = field(default_factory=dict)
    by_channel: Dict[str, int] = field(default_factory=dict)
    avg_delivery_time_ms: float = 0.0
# ============================================================================
# Channel Implementations
# ============================================================================
class LogChannel:
    """Channel that writes alerts to the module's Python logger."""

    @property
    def channel_type(self) -> ChannelType:
        """Identify this channel as ``ChannelType.LOG``."""
        return ChannelType.LOG

    @property
    def is_available(self) -> bool:
        """Logging needs no external resource, so it is always available."""
        return True

    async def send(self, alert: Alert) -> bool:
        """Emit ``alert`` as a single log record.

        The alert priority selects the log level; a priority missing from
        the table falls back to ``logging.INFO``.

        Args:
            alert: Alert to log

        Returns:
            Always True
        """
        severity_to_level = {
            AlertPriority.LOW: logging.INFO,
            AlertPriority.MEDIUM: logging.WARNING,
            AlertPriority.HIGH: logging.ERROR,
            AlertPriority.CRITICAL: logging.CRITICAL,
        }
        level = severity_to_level.get(alert.priority, logging.INFO)
        record_text = f"[{alert.category.value.upper()}] {alert.title}: {alert.message}"
        logger.log(level, record_text)
        return True
class WebhookChannel:
    """Channel that sends alerts to webhooks.

    This is a placeholder transport: ``send`` builds the payload and logs
    it, but performs no HTTP request.
    """

    def __init__(self, webhook_url: str, headers: Optional[Dict[str, str]] = None):
        """Store the webhook target.

        Args:
            webhook_url: URL to send webhooks to
            headers: Optional HTTP headers; defaults to an empty dict
        """
        self.webhook_url = webhook_url
        self.headers = headers if headers else {}
        # The channel counts as available whenever a non-empty URL is set.
        self._available = bool(webhook_url)

    @property
    def channel_type(self) -> ChannelType:
        """Identify this channel as ``ChannelType.WEBHOOK``."""
        return ChannelType.WEBHOOK

    @property
    def is_available(self) -> bool:
        """Return whether a webhook URL was configured at construction."""
        return self._available

    async def send(self, alert: Alert) -> bool:
        """Build the webhook payload for ``alert`` and log it.

        Note: Actual HTTP call would be implemented here.
        For now, returns success if URL is configured.

        Args:
            alert: Alert to serialise

        Returns:
            False when no URL is configured, True otherwise
        """
        if not self.webhook_url:
            return False

        payload = dict(
            alert_id=alert.alert_id,
            title=alert.title,
            message=alert.message,
            priority=alert.priority.value,
            category=alert.category.value,
            timestamp=alert.timestamp.isoformat(),
            data=alert.data,
        )
        # In production, would use aiohttp or similar
        logger.info(f"Webhook payload: {payload}")
        return True
# ============================================================================
# AlertManager Class
# ============================================================================
class AlertManager:
"""Orchestrates alert routing and delivery.
Manages alert channels, routing rules, rate limiting,
and delivery tracking.
Attributes:
config: Alert configuration
channels: Registered alert channels
routing_rules: Alert routing rules
templates: Alert templates
"""
def __init__(
    self,
    config: Optional[AlertConfig] = None,
):
    """Build a manager pre-populated with default channels, rules and templates.

    Args:
        config: Alert configuration; a default ``AlertConfig`` is created
            when none is given
    """
    self.config = config if config else AlertConfig()

    # Create every container up front so the setup hooks below can
    # populate them.
    self.channels: Dict[ChannelType, AlertChannel] = {}
    self.routing_rules: List[RoutingRule] = []
    self.templates: Dict[str, AlertTemplate] = {}

    # Rate limiting: accepted-alert timestamps per category value, plus
    # the content hashes used for duplicate suppression.
    self._rate_limit_state: Dict[str, List[datetime]] = {}
    self._seen_hashes: Dict[str, datetime] = {}

    # Delivery bookkeeping.
    self._history: List[Alert] = []
    self._delivery_results: List[DeliveryResult] = []
    self._stats = AlertStats()

    # Populate defaults: channels first, then rules, then templates.
    self._register_default_channels()
    self._setup_default_rules()
    self._setup_default_templates()
def _register_default_channels(self) -> None:
    """Register the channels every manager starts with (the log channel)."""
    log_channel = LogChannel()
    self.register_channel(log_channel)
def _setup_default_rules(self) -> None:
    """Install the built-in routing rules.

    All three rules route to the log channel only; they differ in the
    minimum priority and in the categories they accept.
    """
    default_rules = [
        # CRITICAL alerts of any category (empty list = all categories).
        RoutingRule(
            name="critical_all_channels",
            priority=AlertPriority.CRITICAL,
            categories=[],
            channels=[ChannelType.LOG],
            enabled=True,
        ),
        # Trade and execution alerts at MEDIUM priority or above.
        RoutingRule(
            name="trade_alerts",
            priority=AlertPriority.MEDIUM,
            categories=[AlertCategory.TRADE, AlertCategory.EXECUTION],
            channels=[ChannelType.LOG],
            enabled=True,
        ),
        # Risk alerts at HIGH priority or above.
        RoutingRule(
            name="risk_alerts",
            priority=AlertPriority.HIGH,
            categories=[AlertCategory.RISK],
            channels=[ChannelType.LOG],
            enabled=True,
        ),
    ]
    for rule in default_rules:
        self.add_routing_rule(rule)
def _setup_default_templates(self) -> None:
    """Register the built-in templates.

    The three templates ("trade_signal", "risk_breach", "order_executed")
    are the ones the ``alert_trade``, ``alert_risk`` and
    ``alert_execution`` convenience methods look up by name.
    """
    builtin_templates = (
        AlertTemplate(
            name="trade_signal",
            title_template="[TRADE] {symbol} - {action}",
            body_template="Signal: {action} {symbol}\nPrice: {price}\nReason: {reason}",
            category=AlertCategory.TRADE,
            variables=["symbol", "action", "price", "reason"],
        ),
        AlertTemplate(
            name="risk_breach",
            title_template="[RISK] {risk_type} threshold breached",
            body_template="Risk breach detected:\nType: {risk_type}\nCurrent: {current_value}\nLimit: {limit_value}",
            category=AlertCategory.RISK,
            variables=["risk_type", "current_value", "limit_value"],
        ),
        AlertTemplate(
            name="order_executed",
            title_template="[EXECUTION] Order {order_id} {status}",
            body_template="Order {order_id} for {symbol} has been {status}.\nQuantity: {quantity}\nPrice: {price}",
            category=AlertCategory.EXECUTION,
            variables=["order_id", "symbol", "status", "quantity", "price"],
        ),
    )
    for template in builtin_templates:
        self.register_template(template)
def register_channel(self, channel: AlertChannel) -> None:
    """Make ``channel`` available for delivery.

    Channels are keyed by their ``channel_type``, so registering a second
    channel of the same type replaces the first.

    Args:
        channel: Channel to register
    """
    self.channels[channel.channel_type] = channel
    registered_as = channel.channel_type.value
    logger.info(f"Registered alert channel: {registered_as}")
def unregister_channel(self, channel_type: ChannelType) -> None:
    """Remove the channel registered under ``channel_type``, if any.

    Unknown channel types are ignored silently.

    Args:
        channel_type: Type of channel to remove
    """
    if channel_type not in self.channels:
        return
    self.channels.pop(channel_type)
    logger.info(f"Unregistered alert channel: {channel_type.value}")
def add_routing_rule(self, rule: RoutingRule) -> None:
    """Append ``rule`` to the list consulted when routing alerts.

    Args:
        rule: Routing rule to add
    """
    active_rules = self.routing_rules
    active_rules.append(rule)
    logger.debug(f"Added routing rule: {rule.name}")
def remove_routing_rule(self, rule_id: str) -> bool:
    """Delete the first routing rule whose id equals ``rule_id``.

    Args:
        rule_id: ID of rule to remove

    Returns:
        True if a rule was removed, False if no rule has that id
    """
    for position, candidate in enumerate(self.routing_rules):
        if candidate.rule_id != rule_id:
            continue
        # Safe to mutate here: the loop is abandoned immediately.
        self.routing_rules.pop(position)
        return True
    return False
def register_template(self, template: AlertTemplate) -> None:
    """Store ``template`` under its ``name``, replacing any earlier one.

    Args:
        template: Template to register
    """
    lookup_key = template.name
    self.templates[lookup_key] = template
def create_alert(
    self,
    title: str,
    message: str,
    priority: AlertPriority = AlertPriority.MEDIUM,
    category: AlertCategory = AlertCategory.SYSTEM,
    source: str = "",
    data: Optional[Dict[str, Any]] = None,
    tags: Optional[List[str]] = None,
) -> Alert:
    """Build an ``Alert`` without sending it.

    Args:
        title: Alert title
        message: Alert message
        priority: Alert priority
        category: Alert category
        source: Alert source
        data: Additional data; an empty dict is used when omitted
        tags: Alert tags; an empty list is used when omitted

    Returns:
        Created alert
    """
    payload = data if data else {}
    labels = tags if tags else []
    return Alert(
        title=title,
        message=message,
        priority=priority,
        category=category,
        source=source,
        data=payload,
        tags=labels,
    )
def create_alert_from_template(
    self,
    template_name: str,
    context: Dict[str, Any],
    priority: Optional[AlertPriority] = None,
    source: str = "",
    tags: Optional[List[str]] = None,
) -> Optional[Alert]:
    """Create an alert whose title and body come from a registered template.

    The caller's ``context`` is never modified: rendering works on a
    shallow copy, and that copy (including the injected ``category`` and
    ``title`` defaults) becomes the alert's ``data``.

    Args:
        template_name: Name of template to use
        context: Template variables
        priority: Override priority; MEDIUM when omitted
        source: Alert source
        tags: Alert tags

    Returns:
        Created alert or None if template not found

    Raises:
        KeyError: If the template references a variable that is missing
            from ``context``.
    """
    template = self.templates.get(template_name)
    if template is None:
        logger.warning(f"Template not found: {template_name}")
        return None

    # Copy before adding defaults so the caller's dict is left untouched
    # and later changes to it cannot leak into the stored alert data.
    render_context = dict(context)
    # Add category and title to context for default template
    render_context.setdefault(
        "category", template.category.value if template.category else "SYSTEM"
    )
    render_context.setdefault("title", template_name)

    title, body = template.render(render_context)
    return Alert(
        title=title,
        message=body,
        priority=priority or AlertPriority.MEDIUM,
        category=template.category or AlertCategory.SYSTEM,
        source=source,
        data=render_context,
        tags=tags or [],
    )
def _check_rate_limit(self, alert: Alert) -> bool:
"""Check if alert is rate-limited.
Args:
alert: Alert to check
Returns:
True if rate-limited
"""
config = self.config.rate_limit_config
key = alert.category.value
now = datetime.now()
# Initialize if needed
if key not in self._rate_limit_state:
self._rate_limit_state[key] = []
# Clean old entries
minute_ago = now - timedelta(minutes=1)
hour_ago = now - timedelta(hours=1)
self._rate_limit_state[key] = [
ts for ts in self._rate_limit_state[key]
if ts > hour_ago
]
# Count recent alerts
minute_count = sum(1 for ts in self._rate_limit_state[key] if ts > minute_ago)
hour_count = len(self._rate_limit_state[key])
if minute_count >= config.max_alerts_per_minute:
return True
if hour_count >= config.max_alerts_per_hour:
return True
return False
def _check_duplicate(self, alert: Alert) -> bool:
"""Check if alert is a duplicate.
Args:
alert: Alert to check
Returns:
True if duplicate
"""
if not self.config.rate_limit_config.enable_deduplication:
return False
content_hash = alert.content_hash
now = datetime.now()
window = timedelta(
seconds=self.config.rate_limit_config.dedupe_window_seconds
)
# Clean old hashes
self._seen_hashes = {
h: ts for h, ts in self._seen_hashes.items()
if now - ts < window
}
if content_hash in self._seen_hashes:
return True
self._seen_hashes[content_hash] = now
return False
def _get_target_channels(self, alert: Alert) -> Set[ChannelType]:
"""Get channels to route alert to.
Args:
alert: Alert to route
Returns:
Set of channel types
"""
channels: Set[ChannelType] = set()
# Check routing rules
for rule in self.routing_rules:
if rule.matches(alert):
channels.update(rule.channels)
# Add default channels if no rules matched
if not channels:
channels.update(self.config.default_channels)
return channels
def send(self, alert: Alert) -> List[DeliveryResult]:
    """Send an alert and block until delivery has finished.

    This drives ``send_async`` with ``asyncio.run``, which starts a new
    event loop; ``asyncio.run`` raises ``RuntimeError`` when called from a
    thread that already has a running loop, so asynchronous callers
    should await ``send_async`` instead.

    Args:
        alert: Alert to send

    Returns:
        List of delivery results
    """
    delivery = self.send_async(alert)
    return asyncio.run(delivery)
async def send_async(self, alert: Alert) -> List[DeliveryResult]:
    """Send an alert asynchronously.

    Processing steps:

    1. Rate-limit check, then duplicate check. An alert dropped by either
       gets status ``RATE_LIMITED`` / ``SUPPRESSED``, is counted in the
       matching statistic, and is neither delivered nor stored in history.
    2. The alert is recorded against its category's rate-limit budget and
       offered, one channel at a time, to every target channel that is
       registered and available.
    3. The final status is ``DELIVERED`` if at least one channel accepted
       the alert and ``FAILED`` otherwise. An alert for which no channel
       could even be attempted is also ``FAILED`` (it is not left in
       ``SENDING``), with ``last_error`` explaining why.

    Args:
        alert: Alert to send; its status, ``channels_sent`` and
            ``last_error`` fields are updated in place

    Returns:
        List of delivery results, one per attempted channel (empty when
        the alert was dropped or no channel was available)
    """
    results: List[DeliveryResult] = []

    # Check rate limit
    if self._check_rate_limit(alert):
        alert.status = AlertStatus.RATE_LIMITED
        self._stats.total_rate_limited += 1
        logger.warning(f"Alert rate-limited: {alert.title}")
        return results

    # Check duplicate
    if self._check_duplicate(alert):
        alert.status = AlertStatus.SUPPRESSED
        self._stats.total_suppressed += 1
        logger.debug(f"Duplicate alert suppressed: {alert.title}")
        return results

    # Get target channels
    target_channels = self._get_target_channels(alert)

    # Record for rate limiting
    self._rate_limit_state.setdefault(alert.category.value, []).append(
        datetime.now()
    )

    # Update status
    alert.status = AlertStatus.SENDING

    # Send to each channel; unregistered or unavailable ones are skipped.
    for channel_type in target_channels:
        channel = self.channels.get(channel_type)
        if channel is None or not channel.is_available:
            continue
        result = await self._deliver_to_channel(alert, channel)
        results.append(result)
        if result.success:
            alert.channels_sent.append(channel_type)
            self._stats.by_channel[channel_type.value] = (
                self._stats.by_channel.get(channel_type.value, 0) + 1
            )

    # Update final status. Every path must leave SENDING, including the
    # one where no channel could be attempted at all.
    if any(r.success for r in results):
        alert.status = AlertStatus.DELIVERED
        self._stats.total_sent += 1
    else:
        alert.status = AlertStatus.FAILED
        self._stats.total_failed += 1
        if results:
            alert.last_error = results[-1].error_message
        else:
            alert.last_error = "No registered and available channel for alert"
            logger.warning(f"No available channel for alert: {alert.title}")

    # Update stats
    self._stats.by_priority[alert.priority.value] = (
        self._stats.by_priority.get(alert.priority.value, 0) + 1
    )
    self._stats.by_category[alert.category.value] = (
        self._stats.by_category.get(alert.category.value, 0) + 1
    )

    # Store history
    if self.config.store_history:
        self._add_to_history(alert)

    # Store results
    self._delivery_results.extend(results)
    return results
async def _deliver_to_channel(
    self,
    alert: Alert,
    channel: AlertChannel,
) -> DeliveryResult:
    """Attempt one delivery of ``alert`` through ``channel``.

    Exceptions raised by the channel are not propagated: they are logged
    and reported through the returned result, so one broken channel
    cannot abort delivery to the others.

    Args:
        alert: Alert to deliver; its ``delivery_attempts`` is incremented
        channel: Channel to use

    Returns:
        Delivery result
    """
    outcome = DeliveryResult(
        alert_id=alert.alert_id,
        channel=channel.channel_type,
    )
    try:
        alert.delivery_attempts += 1
        delivered = await channel.send(alert)
    except Exception as exc:
        outcome.success = False
        outcome.error_message = str(exc)
        logger.error(f"Error delivering to {channel.channel_type.value}: {exc}")
    else:
        outcome.success = delivered
        if not delivered:
            outcome.error_message = "Channel returned failure"
    return outcome
def _add_to_history(self, alert: Alert) -> None:
"""Add alert to history.
Args:
alert: Alert to add
"""
self._history.append(alert)
# Trim history if needed
max_size = self.config.max_history_size
if len(self._history) > max_size:
self._history = self._history[-max_size:]
def acknowledge_alert(
    self,
    alert_id: str,
    acknowledged_by: str,
) -> bool:
    """Mark the stored alert with id ``alert_id`` as acknowledged.

    Only alerts still present in the history can be acknowledged; the
    first one with a matching id is updated.

    Args:
        alert_id: ID of alert to acknowledge
        acknowledged_by: Who is acknowledging

    Returns:
        True if acknowledged, False if no such alert is in the history
    """
    target = next(
        (entry for entry in self._history if entry.alert_id == alert_id),
        None,
    )
    if target is None:
        return False

    target.acknowledged = True
    target.acknowledged_by = acknowledged_by
    target.acknowledged_at = datetime.now()
    return True
def get_history(
    self,
    category: Optional[AlertCategory] = None,
    priority: Optional[AlertPriority] = None,
    since: Optional[datetime] = None,
    limit: int = 100,
) -> List[Alert]:
    """Return stored alerts, newest first, optionally filtered.

    Filters that are omitted (or falsy) place no restriction. The stored
    history itself is not modified.

    Args:
        category: Filter by category
        priority: Filter by exact priority
        since: Keep only alerts with a timestamp at or after this value
        limit: Maximum results

    Returns:
        List of alerts
    """
    selected = [
        entry
        for entry in self._history
        if (not category or entry.category == category)
        and (not priority or entry.priority == priority)
        and (not since or entry.timestamp >= since)
    ]
    # Return most recent first
    selected.sort(key=lambda entry: entry.timestamp, reverse=True)
    return selected[:limit]
def get_unacknowledged(
    self,
    priority: Optional[AlertPriority] = None,
) -> List[Alert]:
    """Return stored alerts nobody has acknowledged yet, newest first.

    Args:
        priority: Filter by exact priority; omitted means any priority

    Returns:
        List of unacknowledged alerts
    """
    open_alerts = [
        entry
        for entry in self._history
        if not entry.acknowledged
        and (not priority or entry.priority == priority)
    ]
    open_alerts.sort(key=lambda entry: entry.timestamp, reverse=True)
    return open_alerts
def get_stats(self) -> AlertStats:
    """Return the manager's live statistics object.

    The object itself is returned, not a copy, so it keeps changing as
    further alerts are processed.

    Returns:
        Current statistics
    """
    current = self._stats
    return current
def clear_history(self) -> int:
    """Empty both the alert history and the stored delivery results.

    Returns:
        Number of alerts cleared (delivery results are not counted)
    """
    removed = len(self._history)
    for store in (self._history, self._delivery_results):
        store.clear()
    return removed
def reset_stats(self) -> None:
    """Replace the statistics with a fresh, all-zero ``AlertStats``.

    A stats object handed out earlier is left untouched and stops being
    updated.
    """
    fresh_stats = AlertStats()
    self._stats = fresh_stats
# ========================================================================
# Convenience Methods
# ========================================================================
def alert_trade(
    self,
    symbol: str,
    action: str,
    price: Decimal,
    reason: str = "",
    priority: AlertPriority = AlertPriority.MEDIUM,
) -> Alert:
    """Build a trade alert and send it synchronously.

    The alert is rendered from the "trade_signal" template. If that
    template is no longer registered, an equivalent plain alert is built
    instead; in both cases the alert is sent before it is returned.

    Args:
        symbol: Trading symbol
        action: Action (buy/sell)
        price: Trade price
        reason: Reason for trade
        priority: Alert priority

    Returns:
        Sent alert; inspect its ``status`` for the delivery outcome
    """
    alert = self.create_alert_from_template(
        "trade_signal",
        {
            "symbol": symbol,
            "action": action,
            "price": str(price),
            "reason": reason,
        },
        priority=priority,
        source="trade_alert",
    )
    if alert is None:
        # Fallback if template not found
        alert = self.create_alert(
            title=f"Trade Signal: {action} {symbol}",
            message=f"Price: {price}, Reason: {reason}",
            priority=priority,
            category=AlertCategory.TRADE,
            source="trade_alert",
        )
    self.send(alert)
    return alert
def alert_risk(
    self,
    risk_type: str,
    current_value: Any,
    limit_value: Any,
    priority: AlertPriority = AlertPriority.HIGH,
) -> Alert:
    """Build a risk-breach alert and send it synchronously.

    The alert is rendered from the "risk_breach" template. If that
    template is no longer registered, an equivalent plain alert is built
    instead; in both cases the alert is sent before it is returned.

    Args:
        risk_type: Type of risk
        current_value: Current risk value
        limit_value: Limit value
        priority: Alert priority

    Returns:
        Sent alert; inspect its ``status`` for the delivery outcome
    """
    alert = self.create_alert_from_template(
        "risk_breach",
        {
            "risk_type": risk_type,
            "current_value": str(current_value),
            "limit_value": str(limit_value),
        },
        priority=priority,
        source="risk_alert",
    )
    if alert is None:
        # Fallback if template not found
        alert = self.create_alert(
            title=f"Risk Breach: {risk_type}",
            message=f"Current: {current_value}, Limit: {limit_value}",
            priority=priority,
            category=AlertCategory.RISK,
            source="risk_alert",
        )
    self.send(alert)
    return alert
def alert_execution(
    self,
    order_id: str,
    symbol: str,
    status: str,
    quantity: Decimal,
    price: Decimal,
    priority: AlertPriority = AlertPriority.MEDIUM,
) -> Alert:
    """Build an order-execution alert and send it synchronously.

    The alert is rendered from the "order_executed" template. If that
    template is no longer registered, an equivalent plain alert is built
    instead; in both cases the alert is sent before it is returned.

    Args:
        order_id: Order ID
        symbol: Trading symbol
        status: Order status
        quantity: Order quantity
        price: Execution price
        priority: Alert priority

    Returns:
        Sent alert; inspect its ``status`` for the delivery outcome
    """
    alert = self.create_alert_from_template(
        "order_executed",
        {
            "order_id": order_id,
            "symbol": symbol,
            "status": status,
            "quantity": str(quantity),
            "price": str(price),
        },
        priority=priority,
        source="execution_alert",
    )
    if alert is None:
        # Fallback if template not found
        alert = self.create_alert(
            title=f"Order {status}: {order_id}",
            message=f"Symbol: {symbol}, Qty: {quantity}, Price: {price}",
            priority=priority,
            category=AlertCategory.EXECUTION,
            source="execution_alert",
        )
    self.send(alert)
    return alert