# app/features/builder.py
from typing import Dict, List, Optional

import numpy as np
import pandas as pd


class FeatureBuilder:
    """Build the model feature matrix from OHLCV data.

    Combines the volatility, momentum, trend and volume feature groups into
    one DataFrame aligned on the index of the input data.
    """

    # Columns the feature groups read from the input frame.
    REQUIRED_COLUMNS = ('open', 'high', 'low', 'close', 'volume')

    def __init__(self, lookback_periods: Optional[List[int]] = None):
        """
        Args:
            lookback_periods: Rolling-window lengths shared by every feature
                group. Defaults to [5, 10, 20, 50] when omitted or empty.
        """
        # Imported at call time so this module can be loaded without pulling
        # in every feature-group module first.
        from .momentum import MomentumFeatures
        from .trend import TrendFeatures
        from .volatility import VolatilityFeatures
        from .volume import VolumeFeatures

        # Copy so later mutation of the caller's list cannot change the
        # builder's configuration.
        self.lookback_periods = (
            list(lookback_periods) if lookback_periods else [5, 10, 20, 50]
        )

        # Feature calculators
        self.volatility = VolatilityFeatures(self.lookback_periods)
        self.momentum = MomentumFeatures(self.lookback_periods)
        self.trend = TrendFeatures(self.lookback_periods)
        self.volume = VolumeFeatures(self.lookback_periods)

        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the complete feature matrix from OHLCV data.

        Args:
            df: DataFrame with columns
                ['open', 'high', 'low', 'close', 'volume'].

        Returns:
            DataFrame with every calculated feature. Rows in which any
            feature is NaN or infinite are removed, so the result may have
            fewer rows than ``df``.

        Raises:
            KeyError: If any required OHLCV column is missing.
        """
        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(
                f"Input data is missing required OHLCV columns: {missing}"
            )

        groups = (self.volatility, self.momentum, self.trend, self.volume)
        features = pd.concat([group.calculate(df) for group in groups], axis=1)

        # Store feature names
        self.feature_names = features.columns.tolist()

        # Divisions by zero (zero prices, zero volume, flat windows) produce
        # +/-inf, which dropna() alone would keep. Treat them as missing so
        # they are dropped together with the lookback warm-up rows.
        features = features.replace([np.inf, -np.inf], np.nan)
        return features.dropna()

    def get_feature_names(self) -> List[str]:
        """Return the feature names produced by the last build() call.

        A copy is returned so callers cannot alter the builder's state.
        """
        return list(self.feature_names)

    def get_feature_importance(self, model) -> pd.DataFrame:
        """Pair the builder's feature names with a model's importances.

        Args:
            model: Object exposing ``feature_importances_`` with one value
                per feature, in the builder's feature order.

        Returns:
            DataFrame with columns ['feature', 'importance'], sorted by
            importance in descending order.

        Raises:
            ValueError: If the number of importances differs from the number
                of feature names known to the builder (for example when
                build() has not been called yet).
        """
        importance = np.asarray(model.feature_importances_)
        if len(importance) != len(self.feature_names):
            raise ValueError(
                f"Model reports {len(importance)} importances but the builder "
                f"knows {len(self.feature_names)} features; call build() on "
                "the training data first."
            )
        return pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)


# app/features/volatility.py
class VolatilityFeatures:
    """Calculate volatility-based features from OHLC data."""

    # Window lengths used for the Bollinger Band features.
    BB_PERIODS = (20,)

    def __init__(self, periods: List[int]):
        """
        Args:
            periods: Rolling-window lengths for the return-volatility and
                ATR features.
        """
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all volatility features.

        Args:
            df: DataFrame with 'open', 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df``. Leading rows are NaN until each
            rolling window is full.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']
        prev_close = close.shift(1)

        # Standard deviation of returns
        for period in self.periods:
            features[f'volatility_{period}'] = self._std_returns(close, period)

        # Average True Range (ATR); the true range does not depend on the
        # window, so it is computed once and only the averaging is repeated.
        true_range = self._true_range(df)
        for period in self.periods:
            features[f'atr_{period}'] = true_range.rolling(period).mean()

        # ATR as percentage of price
        for period in self.periods:
            features[f'atr_pct_{period}'] = features[f'atr_{period}'] / close * 100

        # Bollinger Band width and position
        for period in self.BB_PERIODS:
            features[f'bb_width_{period}'] = self._bb_width(close, period)
            features[f'bb_position_{period}'] = self._bb_position(close, period)

        # High-Low range as percentage of the close
        features['hl_range'] = (df['high'] - df['low']) / close * 100

        # Gap between this bar's open and the previous bar's close
        features['gap'] = (df['open'] - prev_close) / prev_close * 100

        return features

    @staticmethod
    def _safe_div(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
        """Divide element-wise, giving NaN instead of +/-inf for a zero denominator."""
        return numerator / denominator.where(denominator != 0)

    @staticmethod
    def _true_range(df: pd.DataFrame) -> pd.Series:
        """Return the per-bar true range.

        The largest of high-low, |high - previous close| and
        |low - previous close|. On the first bar, where no previous close
        exists, this reduces to high-low because max() skips NaN.
        """
        prev_close = df['close'].shift(1)
        candidates = pd.concat([
            df['high'] - df['low'],
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ], axis=1)
        return candidates.max(axis=1)

    @staticmethod
    def _bollinger_bands(close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger Bands for ``close``."""
        middle = close.rolling(period).mean()
        spread = std_dev * close.rolling(period).std()
        return middle, middle + spread, middle - spread

    def _std_returns(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling standard deviation of simple returns, in percent."""
        returns = close.pct_change()
        return returns.rolling(period).std() * 100

    def _atr(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average True Range: rolling mean of the true range."""
        return self._true_range(df).rolling(period).mean()

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Bollinger Band width as a percentage of the middle band."""
        middle, upper, lower = self._bollinger_bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Position of the close within the Bollinger Bands.

        0 is the lower band and 100 the upper band; values outside that
        range mean the close is outside the bands. A zero-width band (flat
        window) yields NaN rather than +/-inf.
        """
        _, upper, lower = self._bollinger_bands(close, period, std_dev)
        return self._safe_div(close - lower, upper - lower) * 100
# app/features/momentum.py
from typing import List, Tuple

import numpy as np
import pandas as pd


class MomentumFeatures:
    """Calculate momentum-based features from OHLC data."""

    # Window lengths used for the RSI features.
    RSI_PERIODS = (7, 14, 21)

    def __init__(self, periods: List[int]):
        """
        Args:
            periods: Lookback lengths for the rate-of-change and momentum
                features.
        """
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all momentum features.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df``. Leading rows are NaN until each
            lookback window is full.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Rate of Change (ROC)
        for period in self.periods:
            features[f'roc_{period}'] = self._roc(close, period)

        # Momentum (absolute)
        for period in self.periods:
            features[f'momentum_{period}'] = self._momentum(close, period)

        # RSI
        for period in self.RSI_PERIODS:
            features[f'rsi_{period}'] = self._rsi(close, period)

        # Stochastic Oscillator (%D is the 3-period mean of %K)
        features['stoch_k'] = self._stochastic_k(df, 14)
        features['stoch_d'] = features['stoch_k'].rolling(3).mean()

        # Williams %R
        features['williams_r'] = self._williams_r(df, 14)

        # CCI (Commodity Channel Index)
        features['cci_20'] = self._cci(df, 20)

        # MACD
        macd, signal, hist = self._macd(close)
        features['macd'] = macd
        features['macd_signal'] = signal
        features['macd_histogram'] = hist

        # ADX (Average Directional Index)
        features['adx'] = self._adx(df, 14)

        return features

    @staticmethod
    def _safe_div(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
        """Divide element-wise, giving NaN instead of +/-inf for a zero denominator."""
        return numerator / denominator.where(denominator != 0)

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percentage change versus ``period`` bars ago."""
        reference = close.shift(period)
        return (close - reference) / reference * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Simple momentum: absolute change versus ``period`` bars ago."""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """Relative Strength Index using simple rolling means.

        A window with gains only yields 100. A window with no price change
        at all yields NaN (0/0).
        """
        delta = close.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position within the window's low-high range (0-100).

        A zero-width range yields NaN rather than +/-inf.
        """
        lowest = df['low'].rolling(period).min()
        highest = df['high'].rolling(period).max()
        return self._safe_div(df['close'] - lowest, highest - lowest) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R (-100 to 0).

        A zero-width range yields NaN rather than +/-inf.
        """
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()
        return self._safe_div(highest - df['close'], highest - lowest) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index.

        Computed as (typical price - its rolling mean) / (0.015 * rolling
        mean absolute deviation). A zero deviation yields NaN.
        """
        tp = (df['high'] + df['low'] + df['close']) / 3
        sma = tp.rolling(period).mean()
        # raw=True hands each window over as an ndarray, avoiding the cost of
        # building a Series per window.
        mad = tp.rolling(period).apply(
            lambda window: np.abs(window - window.mean()).mean(), raw=True
        )
        return self._safe_div(tp - sma, 0.015 * mad)

    def _macd(
        self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9
    ) -> Tuple[pd.Series, pd.Series, pd.Series]:
        """MACD (Moving Average Convergence Divergence).

        Returns:
            Tuple (macd_line, signal_line, histogram), where the histogram
            is macd_line - signal_line.
        """
        ema_fast = close.ewm(span=fast).mean()
        ema_slow = close.ewm(span=slow).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line
        return macd_line, signal_line, histogram

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average Directional Index, smoothed with simple rolling means.

        Directional movement follows Wilder's rule: on each bar only the
        larger of the up-move and the down-move counts, and only if it is
        positive. Zero ATR or zero total directional movement yields NaN
        rather than +/-inf.
        """
        high = df['high']
        low = df['low']
        close = df['close']

        up_move = high.diff()
        down_move = -low.diff()

        # +DM / -DM: keep the dominant positive move, zero the other one.
        # The trailing .where(notna) keeps the first bar (no previous bar to
        # compare with) as NaN instead of a fabricated 0.
        plus_dm = up_move.where(
            (up_move > down_move) & (up_move > 0), 0.0
        ).where(up_move.notna())
        minus_dm = down_move.where(
            (down_move > up_move) & (down_move > 0), 0.0
        ).where(down_move.notna())

        # True Range
        prev_close = close.shift(1)
        tr = pd.concat([
            high - low,
            (high - prev_close).abs(),
            (low - prev_close).abs(),
        ], axis=1).max(axis=1)

        # Smoothed averages
        atr = tr.rolling(period).mean()
        plus_di = 100 * self._safe_div(plus_dm.rolling(period).mean(), atr)
        minus_di = 100 * self._safe_div(minus_dm.rolling(period).mean(), atr)

        # ADX
        dx = self._safe_div(100 * (plus_di - minus_di).abs(), plus_di + minus_di)
        return dx.rolling(period).mean()
# app/features/trend.py
from functools import partial
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd


class TrendFeatures:
    """Calculate trend-based features from OHLC data."""

    def __init__(self, periods: List[int]):
        """
        Args:
            periods: Window lengths for the SMA, EMA and price/SMA ratio
                features.
        """
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all trend features.

        The fixed crossover and slope features (5/20, 10/50, slope of
        SMA10 and SMA20) are computed directly from the close, so they are
        available even when ``periods`` does not contain 5, 10, 20 or 50.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df``. Leading rows are NaN until each
            window is full, except 'above_cloud', which is 0 there because
            comparisons against NaN are False.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Simple Moving Averages
        for period in self.periods:
            features[f'sma_{period}'] = self._sma(close, period)

        # Exponential Moving Averages
        for period in self.periods:
            features[f'ema_{period}'] = self._ema(close, period)

        # Price vs SMA ratio
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']

        # SMA crossovers
        features['sma_5_20_ratio'] = self._sma(close, 5) / self._sma(close, 20)
        features['sma_10_50_ratio'] = self._sma(close, 10) / self._sma(close, 50)

        # EMA crossovers
        features['ema_5_20_ratio'] = self._ema(close, 5) / self._ema(close, 20)

        # Trend strength (slope of SMA over the last 5 bars)
        for period in (10, 20):
            features[f'trend_slope_{period}'] = self._slope(self._sma(close, period), 5)

        # Distance from the 20-bar high/low, as a percentage of the close
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )

        # Price position in range
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)

        # Ichimoku components
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        features['above_cloud'] = ((close > senkou_a) & (close > senkou_b)).astype(int)

        return features

    @staticmethod
    def _safe_div(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
        """Divide element-wise, giving NaN instead of +/-inf for a zero denominator."""
        return numerator / denominator.where(denominator != 0)

    @staticmethod
    def _sma(close: pd.Series, period: int) -> pd.Series:
        """Simple moving average of ``close``."""
        return close.rolling(period).mean()

    @staticmethod
    def _ema(close: pd.Series, period: int) -> pd.Series:
        """Exponential moving average of ``close`` with the given span."""
        return close.ewm(span=period).mean()

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """Rolling least-squares slope of ``series`` against bar position.

        Uses the closed form sum((x - mean(x)) * y) / sum((x - mean(x))**2)
        with x = 0..period-1. This is the degree-1 least-squares
        coefficient, without running a full polynomial fit per window. A
        one-point window has no slope and yields 0.
        """
        offsets = np.arange(period, dtype=float)
        offsets -= offsets.mean()
        spread = float(np.dot(offsets, offsets))

        if spread == 0.0:
            return series.rolling(period).apply(lambda window: 0.0, raw=True)

        return series.rolling(period).apply(
            lambda window: float(np.dot(offsets, window)) / spread, raw=True
        )

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Close position within the window's low-high range (0-100).

        A zero-width range yields NaN rather than +/-inf.
        """
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()
        return self._safe_div(df['close'] - lowest, highest - lowest) * 100

    def _ichimoku(
        self,
        df: pd.DataFrame,
        tenkan_period: int = 9,
        kijun_period: int = 26,
        senkou_b_period: int = 52,
    ) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
        """Ichimoku Cloud components.

        Returns:
            Tuple (tenkan, kijun, senkou_a, senkou_b). Each line is the
            midpoint of the rolling high and low over its period; senkou_a
            is the mean of tenkan and kijun. The spans are not shifted
            forward in time.
        """
        high = df['high']
        low = df['low']

        def midpoint(period: int) -> pd.Series:
            return (high.rolling(period).max() + low.rolling(period).min()) / 2

        tenkan = midpoint(tenkan_period)
        kijun = midpoint(kijun_period)
        senkou_a = (tenkan + kijun) / 2
        senkou_b = midpoint(senkou_b_period)

        return tenkan, kijun, senkou_a, senkou_b


# app/features/volume.py
class VolumeFeatures:
    """Calculate volume-based features from OHLCV data."""

    def __init__(self, periods: List[int]):
        """
        Args:
            periods: Window lengths for the volume moving averages and
                volume ratios.
        """
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all volume features.

        Ratios whose denominator is zero (for example after zero-volume
        bars) are NaN rather than +/-inf.

        Args:
            df: DataFrame with 'high', 'low', 'close' and 'volume' columns.

        Returns:
            DataFrame indexed like ``df``.
        """
        features = pd.DataFrame(index=df.index)
        volume = df['volume']
        close = df['close']

        # Volume Moving Averages
        for period in self.periods:
            features[f'volume_sma_{period}'] = volume.rolling(period).mean()

        # Volume ratio vs average
        for period in self.periods:
            features[f'volume_ratio_{period}'] = self._safe_div(
                volume, features[f'volume_sma_{period}']
            )

        # On-Balance Volume (OBV)
        features['obv'] = self._obv(df)
        features['obv_sma_10'] = features['obv'].rolling(10).mean()
        features['obv_trend'] = self._safe_div(features['obv'], features['obv_sma_10'])

        # Volume-Price Trend (VPT)
        features['vpt'] = self._vpt(df)

        # Money Flow Index (MFI)
        features['mfi_14'] = self._mfi(df, 14)

        # Accumulation/Distribution Line
        features['ad_line'] = self._ad_line(df)

        # Chaikin Money Flow
        features['cmf_20'] = self._cmf(df, 20)

        # Volume Weighted Average Price (VWAP) deviation
        features['vwap'] = self._vwap(df)
        features['vwap_deviation'] = (
            self._safe_div(close - features['vwap'], features['vwap']) * 100
        )

        # Volume changes; a bar following zero volume has no defined
        # percentage change, so the resulting +/-inf is treated as missing.
        features['volume_change'] = (
            volume.pct_change().replace([np.inf, -np.inf], np.nan) * 100
        )
        features['volume_std_20'] = self._safe_div(
            volume.rolling(20).std(), volume.rolling(20).mean()
        )

        return features

    @staticmethod
    def _safe_div(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
        """Divide element-wise, giving NaN instead of +/-inf for a zero denominator."""
        return numerator / denominator.where(denominator != 0)

    def _clv(self, df: pd.DataFrame) -> pd.Series:
        """Close Location Value: ((close-low) - (high-close)) / (high-low).

        Bars with a zero high-low range get 0.
        """
        high = df['high']
        low = df['low']
        close = df['close']
        return self._safe_div((close - low) - (high - close), high - low).fillna(0)

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: running sum of volume signed by the close change."""
        direction = np.sign(df['close'].diff())
        return (direction * df['volume']).cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: running sum of volume times the close's relative change."""
        close = df['close']
        vpt = df['volume'] * (close.diff() / close.shift(1))
        return vpt.cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index.

        A window with positive flow only yields 100. A window with no
        money flow in either direction yields NaN (0/0).
        """
        tp = (df['high'] + df['low'] + df['close']) / 3
        mf = tp * df['volume']
        prev_tp = tp.shift(1)

        positive_mf = mf.where(tp > prev_tp, 0)
        negative_mf = mf.where(tp < prev_tp, 0)

        positive_sum = positive_mf.rolling(period).sum()
        negative_sum = negative_mf.rolling(period).sum()

        return 100 - (100 / (1 + positive_sum / negative_sum))

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution Line: running sum of CLV * volume."""
        return (self._clv(df) * df['volume']).cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow: rolling sum of CLV * volume over rolling volume."""
        volume = df['volume']
        flow = (self._clv(df) * volume).rolling(period).sum()
        return self._safe_div(flow, volume.rolling(period).sum())

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Volume Weighted Average Price, cumulative from the first row of ``df``."""
        tp = (df['high'] + df['low'] + df['close']) / 3
        cumulative_tp_vol = (tp * df['volume']).cumsum()
        cumulative_vol = df['volume'].cumsum()
        return self._safe_div(cumulative_tp_vol, cumulative_vol)


# app/features/normalizer.py
class FeatureNormalizer:
    """Normalize features for model input."""

    # File name used inside the directory passed to save() / load().
    SCALER_FILENAME = 'scaler.joblib'

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' for StandardScaler; any other value selects
                RobustScaler.
        """
        # Imported at call time so the feature modules can be loaded without
        # scikit-learn installed.
        from sklearn.preprocessing import RobustScaler, StandardScaler

        self.scaler = StandardScaler() if method == 'standard' else RobustScaler()
        self.fitted = False

    def fit(self, X: pd.DataFrame) -> "FeatureNormalizer":
        """Fit the scaler on training data and return self."""
        self.scaler.fit(X)
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Transform features with the fitted scaler.

        Raises:
            ValueError: If the scaler has not been fitted or loaded.
        """
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        return self.scaler.transform(X)

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Fit and transform in one step."""
        self.fit(X)
        return self.transform(X)

    def save(self, path: str):
        """Save the fitted scaler to ``<path>/scaler.joblib``.

        The directory is created if it does not exist.

        Raises:
            ValueError: If the scaler has not been fitted. Persisting an
                unfitted scaler would make load() report it as fitted.
        """
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")

        import joblib

        target_dir = Path(path)
        target_dir.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.scaler, str(target_dir / self.SCALER_FILENAME))

    def load(self, path: str):
        """Load a scaler from ``<path>/scaler.joblib`` and mark it as fitted."""
        import joblib

        # NOTE(security): joblib files are pickle-based and can execute
        # arbitrary code when loaded; only load files from a trusted source.
        self.scaler = joblib.load(str(Path(path) / self.SCALER_FILENAME))
        self.fitted = True


# app/features/selector.py
class FeatureSelector:
    """Select the highest-scoring features with a univariate test."""

    def __init__(
        self,
        n_features: int = 15,
        method: str = 'mutual_info',
        random_state: Optional[int] = None,
    ):
        """
        Args:
            n_features: Maximum number of features to keep.
            method: 'mutual_info' for mutual_info_classif; any other value
                selects f_classif.
            random_state: Seed passed to mutual_info_classif so that the
                selection is reproducible. Not used by f_classif.
        """
        self.n_features = n_features
        self.method = method
        self.random_state = random_state
        self.selected_features: List[str] = []
        # None until fit() has run.
        self.feature_scores: Optional[pd.DataFrame] = None

    def fit(self, X: pd.DataFrame, y: np.ndarray) -> "FeatureSelector":
        """Fit the selector, record the top features, and return self."""
        # Imported at call time so the feature modules can be loaded without
        # scikit-learn installed.
        from sklearn.feature_selection import (
            SelectKBest,
            f_classif,
            mutual_info_classif,
        )

        if self.method == 'mutual_info':
            score_func = partial(mutual_info_classif, random_state=self.random_state)
        else:
            score_func = f_classif

        # SelectKBest rejects k larger than the number of columns, so cap it.
        k = min(self.n_features, X.shape[1])
        selector = SelectKBest(score_func=score_func, k=k)
        selector.fit(X, y)

        # Get selected feature names
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()

        # Store scores for analysis
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Filter ``X`` to the selected features.

        Raises:
            ValueError: If fit() has not been called.
        """
        self._require_fitted()
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """Return per-feature scores, sorted in descending order.

        Raises:
            ValueError: If fit() has not been called.
        """
        self._require_fitted()
        return self.feature_scores

    def _require_fitted(self) -> None:
        """Raise ValueError when the selector has not been fitted yet."""
        if self.feature_scores is None:
            raise ValueError("Selector not fitted. Call fit() first.")