"""
Multi-Timeframe Aggregation Functions.

This module provides functions for aggregating OHLCV (Open, High, Low, Close,
Volume) data across different timeframes:

- Daily to Weekly aggregation with configurable week anchor (Sunday/Monday)
- Daily to Monthly aggregation with period labeling (start/end)
- Core resampling logic with proper OHLCV aggregation rules

OHLCV Aggregation Rules:
    - Open: 'first' (first value of period)
    - High: 'max' (maximum value of period)
    - Low: 'min' (minimum value of period)
    - Close: 'last' (last value of period)
    - Volume: 'sum' (total volume, NOT average)

All public functions validate their input and return either a DataFrame on
success or an error string (starting with "Error:") on failure.

Usage:
    from tradingagents.dataflows.multi_timeframe import (
        aggregate_to_weekly,
        aggregate_to_monthly,
    )

    # Aggregate daily data to weekly (week ending Sunday)
    weekly_data = aggregate_to_weekly(daily_df, anchor='SUN')

    # Aggregate daily data to monthly (month-end labels)
    monthly_data = aggregate_to_monthly(daily_df, period_end=True)

Requirements:
    - pandas package
    - Input DataFrame must have DatetimeIndex
    - Input DataFrame must contain columns: Open, High, Low, Close, Volume
"""

from typing import Optional, Tuple, Union

import pandas as pd

# Columns every input frame must provide, in canonical order.
_OHLCV_COLUMNS = ('Open', 'High', 'Low', 'Close', 'Volume')

# Per-column aggregation applied to each resampled period.
_OHLCV_AGGREGATIONS = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

# Weekday codes ordered so that the position equals pandas' ``dayofweek``
# (0 = Monday ... 6 = Sunday).
_WEEKDAY_CODES = ('MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN')


def _validate_ohlcv_dataframe(data: pd.DataFrame) -> Optional[str]:
    """
    Validate that a DataFrame contains required OHLCV data.

    Checks, in order:
        1. The object is a pandas DataFrame
        2. The DataFrame is non-empty
        3. The index is a DatetimeIndex
        4. Required OHLCV columns are present (Open, High, Low, Close, Volume)

    Args:
        data: DataFrame to validate

    Returns:
        None if valid, error string describing the first failed check otherwise

    Examples:
        >>> df = pd.DataFrame({'Open': [100], 'High': [102], 'Low': [99],
        ...                    'Close': [101], 'Volume': [1000000]},
        ...                   index=pd.date_range('2024-01-01', periods=1))
        >>> _validate_ohlcv_dataframe(df) is None
        True

        >>> error = _validate_ohlcv_dataframe(pd.DataFrame())
        >>> 'empty' in error.lower()
        True
    """
    # Guard first so that None / lists produce an error string instead of an
    # AttributeError on ``.empty``.
    if not isinstance(data, pd.DataFrame):
        return "Error: Input must be a pandas DataFrame"

    if data.empty:
        return "Error: Empty DataFrame provided"

    if not isinstance(data.index, pd.DatetimeIndex):
        return "Error: DataFrame must have DatetimeIndex as index"

    missing_columns = [col for col in _OHLCV_COLUMNS if col not in data.columns]
    if missing_columns:
        missing_str = ', '.join(missing_columns)
        return f"Error: Missing required OHLCV columns: {missing_str}"

    return None


def _strip_timezone(data: pd.DataFrame) -> Tuple[pd.DataFrame, object]:
    """
    Return a timezone-naive view of ``data`` plus the original timezone.

    The caller's frame is never mutated: when the index is timezone-aware a
    copy is made before the index is replaced. Wall-clock times are kept
    (``tz_localize(None)``), they are not converted to UTC.

    Args:
        data: DataFrame with a DatetimeIndex

    Returns:
        Tuple of (frame with naive index, original tz or None)
    """
    original_tz = data.index.tz
    if original_tz is None:
        return data, None

    naive = data.copy()
    naive.index = naive.index.tz_localize(None)
    return naive, original_tz


def _restore_timezone(result: pd.DataFrame, original_tz: object) -> pd.DataFrame:
    """
    Re-attach ``original_tz`` to the index of ``result`` (no-op when None).

    NOTE(review): ``tz_localize`` is called with pandas' default
    ambiguous/nonexistent handling, so a period label that lands exactly on a
    DST transition would raise -- confirm whether any supported timezone needs
    explicit handling.

    Args:
        result: Resampled frame with a timezone-naive index
        original_tz: Timezone captured by ``_strip_timezone`` (or None)

    Returns:
        ``result`` with its index localized to ``original_tz`` when one was given
    """
    if original_tz is not None:
        result.index = result.index.tz_localize(original_tz)
    return result


def _resample_ohlcv(
    data: pd.DataFrame,
    freq: Union[str, pd.DateOffset],
    label: str = 'right',
    closed: str = 'right'
) -> pd.DataFrame:
    """
    Resample OHLCV data to a specified frequency.

    Applies proper aggregation for each OHLCV column:
        - Open: first value of period
        - High: max value of period
        - Low: min value of period
        - Close: last value of period
        - Volume: sum of period

    Periods in which any aggregated OHLCV value is NaN (e.g. periods without
    any rows) are dropped, and the remaining values are rounded to 2 decimals.
    Only the five OHLCV columns are present in the result.

    Args:
        data: DataFrame with OHLCV columns and DatetimeIndex
        freq: Resampling frequency, either an alias string (e.g., 'W-SUN')
            or a pandas offset object (e.g., ``pd.offsets.MonthEnd()``)
        label: Which bin edge label to use ('left' or 'right')
        closed: Which side of bin interval is closed ('left' or 'right')

    Returns:
        Resampled DataFrame with OHLCV aggregations applied

    Examples:
        >>> dates = pd.date_range('2024-01-01', periods=7, freq='D')
        >>> data = pd.DataFrame({
        ...     'Open': [100, 101, 102, 103, 104, 105, 106],
        ...     'High': [102, 103, 104, 105, 106, 107, 108],
        ...     'Low': [99, 100, 101, 102, 103, 104, 105],
        ...     'Close': [101, 102, 103, 104, 105, 106, 107],
        ...     'Volume': [1000000, 1100000, 1200000, 1300000,
        ...                1400000, 1500000, 1600000]
        ... }, index=dates)
        >>> result = _resample_ohlcv(data, 'W-SUN')
        >>> int(result.iloc[0]['Open'])
        100
        >>> int(result.iloc[0]['High'])
        108
        >>> int(result.iloc[0]['Close'])
        107
    """
    # 'first' / 'last' are only meaningful in chronological order; sort
    # explicitly (stable, so duplicate timestamps keep their relative order)
    # rather than relying on resample internals.
    if not data.index.is_monotonic_increasing:
        data = data.sort_index(kind='mergesort')

    resampled = data.resample(freq, label=label, closed=closed).agg(
        dict(_OHLCV_AGGREGATIONS)
    )

    # Empty periods yield NaN for Open/High/Low/Close; drop them so that
    # non-trading periods do not appear in the output.
    resampled = resampled.dropna()

    # The frame only holds the OHLCV columns at this point, so a frame-wide
    # round is equivalent to rounding each of them.
    return resampled.round(2)


def aggregate_to_weekly(
    data: pd.DataFrame,
    anchor: str = 'SUN'
) -> Union[pd.DataFrame, str]:
    """
    Aggregate daily OHLCV data to weekly timeframe.

    Week boundaries depend on the anchor day AND on the weekday of the
    earliest row in ``data``:

    - If the data starts on the anchor day, weeks START on the anchor day
      (the day before the anchor is used as the week end; e.g. anchor='SUN'
      with data starting on a Sunday gives Sun-Sat weeks, 'W-SAT').
    - Otherwise weeks END on the anchor day (e.g. anchor='SUN' with data
      starting on a Monday gives Mon-Sun weeks, 'W-SUN').

    Rows are labeled with the last day of each week.

    Args:
        data: DataFrame with OHLCV columns and DatetimeIndex
        anchor: Week anchor day, one of 'MON', 'TUE', 'WED', 'THU', 'FRI',
            'SAT', 'SUN' (case-insensitive). Default: 'SUN'.

    Returns:
        DataFrame with weekly aggregated OHLCV data on success,
        error string on validation failure (invalid data or invalid anchor)

    Examples:
        >>> dates = pd.date_range('2024-01-01', periods=14, freq='D')
        >>> data = pd.DataFrame({
        ...     'Open': range(100, 114),
        ...     'High': range(102, 116),
        ...     'Low': range(99, 113),
        ...     'Close': range(101, 115),
        ...     'Volume': range(1000000, 1014000, 1000)
        ... }, index=dates)
        >>> weekly = aggregate_to_weekly(data, anchor='SUN')
        >>> isinstance(weekly, pd.DataFrame)
        True
        >>> len(weekly) == 2  # 14 days = 2 weeks
        True
        >>> aggregate_to_weekly(data, anchor='XYZ').startswith('Error')
        True

    Notes:
        - Timezone information is preserved if present in input data
        - Partial weeks (< 7 days) are aggregated from the rows available
        - OHLCV values are rounded to 2 decimal places
        - The input DataFrame is not modified
    """
    error = _validate_ohlcv_dataframe(data)
    if error is not None:
        return error

    # Reject unknown anchors up front: silently falling back to Sunday, or
    # letting pandas raise on an invalid 'W-XXX' alias, would break the
    # "DataFrame or error string" contract.
    anchor_code = anchor.upper() if isinstance(anchor, str) else None
    if anchor_code not in _WEEKDAY_CODES:
        return (
            f"Error: Invalid week anchor {anchor!r}. "
            f"Expected one of: {', '.join(_WEEKDAY_CODES)}"
        )

    # Resample on wall-clock (naive) timestamps, then re-attach the timezone.
    naive, original_tz = _strip_timezone(data)

    anchor_day_num = _WEEKDAY_CODES.index(anchor_code)

    # Use the earliest timestamp rather than the first row so the chosen week
    # boundary does not depend on the row order of the input.
    first_day_of_week = naive.index.min().dayofweek

    if first_day_of_week == anchor_day_num:
        # Data starts on the anchor day: end weeks on the previous day so each
        # week begins on the anchor.
        week_end_day = _WEEKDAY_CODES[(anchor_day_num - 1) % 7]
    else:
        week_end_day = anchor_code

    result = _resample_ohlcv(
        naive, f'W-{week_end_day}', label='right', closed='right'
    )
    return _restore_timezone(result, original_tz)


def aggregate_to_monthly(
    data: pd.DataFrame,
    period_end: bool = True
) -> Union[pd.DataFrame, str]:
    """
    Aggregate daily OHLCV data to monthly timeframe.

    Month boundaries and labels are controlled by period_end parameter.
    Applies proper OHLCV aggregation rules.

    Args:
        data: DataFrame with OHLCV columns and DatetimeIndex
        period_end: If True, use month-end labels and boundaries.
            If False, use month-start labels and boundaries.

    Returns:
        DataFrame with monthly aggregated OHLCV data on success,
        error string on validation failure

    Examples:
        >>> dates = pd.date_range('2024-01-01', periods=60, freq='D')
        >>> data = pd.DataFrame({
        ...     'Open': range(100, 160),
        ...     'High': range(102, 162),
        ...     'Low': range(99, 159),
        ...     'Close': range(101, 161),
        ...     'Volume': range(1000000, 1060000, 1000)
        ... }, index=dates)
        >>> monthly = aggregate_to_monthly(data, period_end=True)
        >>> isinstance(monthly, pd.DataFrame)
        True
        >>> len(monthly) == 2  # January and February
        True

    Notes:
        - Timezone information is preserved if present in input data
        - Partial months are aggregated from the rows available
        - OHLCV values are rounded to 2 decimal places
        - period_end=True: Labels represent the last day of the month
        - period_end=False: Labels represent the first day of the month
        - The input DataFrame is not modified
    """
    error = _validate_ohlcv_dataframe(data)
    if error is not None:
        return error

    # Resample on wall-clock (naive) timestamps, then re-attach the timezone.
    naive, original_tz = _strip_timezone(data)

    # Offset objects are used instead of the 'ME' / 'MS' alias strings: the
    # 'ME' alias only exists on newer pandas releases, while the offsets
    # resolve to the same bins on every version.
    if period_end:
        freq = pd.offsets.MonthEnd()
        label = 'right'
        closed = 'right'
    else:
        freq = pd.offsets.MonthBegin()
        label = 'left'
        closed = 'left'

    result = _resample_ohlcv(naive, freq, label=label, closed=closed)
    return _restore_timezone(result, original_tz)