---
id: "ET-ML-003"
title: "Feature Engineering"
type: "Technical Specification"
status: "Done"
priority: "Alta"
epic: "OQI-006"
project: "trading-platform"
version: "1.0.0"
created_date: "2025-12-05"
updated_date: "2026-01-04"
---

# ET-ML-003: Feature Engineering

## Metadata

| Campo | Valor |
|-------|-------|
| **ID** | ET-ML-003 |
| **Épica** | OQI-006 - Señales ML |
| **Tipo** | Especificación Técnica |
| **Versión** | 1.0.0 |
| **Estado** | Aprobado |
| **Última actualización** | 2026-01-04 |

---

## Propósito

Especificar el proceso de Feature Engineering para los modelos de ML, incluyendo la definición de features, cálculo de indicadores técnicos, normalización y selección de variables.

---

## Arquitectura de Features

### Feature Builder

```python
# app/features/builder.py
import pandas as pd
import numpy as np
from typing import List, Dict, Optional
from .volatility import VolatilityFeatures
from .momentum import MomentumFeatures
from .trend import TrendFeatures
from .volume import VolumeFeatures

class FeatureBuilder:
    """
    Assemble one feature matrix from OHLCV bars.

    The work is delegated to one calculator per feature group
    (volatility, momentum, trend, volume); their outputs are joined
    column-wise into a single DataFrame.
    """

    def __init__(self, lookback_periods: List[int] = None):
        # A missing (or empty) list falls back to the default windows.
        if not lookback_periods:
            lookback_periods = [5, 10, 20, 50]
        self.lookback_periods = lookback_periods

        # One calculator per feature group, all given the same windows.
        self.volatility = VolatilityFeatures(self.lookback_periods)
        self.momentum = MomentumFeatures(self.lookback_periods)
        self.trend = TrendFeatures(self.lookback_periods)
        self.volume = VolumeFeatures(self.lookback_periods)

        # Populated by build(); empty until then.
        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute every feature group and join the results column-wise.

        Args:
            df: DataFrame with columns ['open', 'high', 'low', 'close', 'volume']

        Returns:
            DataFrame of features indexed like ``df``, minus every row
            that contains at least one NaN.
        """
        calculators = (self.volatility, self.momentum, self.trend, self.volume)

        # Start from an empty frame so the result carries df's index.
        frames = [pd.DataFrame(index=df.index)]
        for calculator in calculators:
            frames.append(calculator.calculate(df))
        combined = pd.concat(frames, axis=1)

        # Column names are recorded before rows are dropped.
        self.feature_names = list(combined.columns)

        # Rolling windows leave NaN in the warm-up rows; remove them.
        return combined.dropna()

    def get_feature_names(self) -> List[str]:
        """Return the column names recorded by the last build() call."""
        return self.feature_names

    def get_feature_importance(self, model) -> pd.DataFrame:
        """
        Pair each recorded feature name with the model's importance.

        Args:
            model: Object exposing a ``feature_importances_`` attribute.

        Returns:
            DataFrame with columns 'feature' and 'importance', sorted by
            importance from highest to lowest.
        """
        table = pd.DataFrame({
            'feature': self.feature_names,
            'importance': model.feature_importances_,
        })
        return table.sort_values('importance', ascending=False)
```

---

## Feature Groups

### 1. Volatility Features

```python
# app/features/volatility.py
import pandas as pd
import numpy as np
from typing import List

class VolatilityFeatures:
    """Volatility feature group computed from OHLC prices."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the volatility feature columns.

        Args:
            df: DataFrame with 'open', 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df`` with, per configured period,
            ``volatility_<p>``, ``atr_<p>`` and ``atr_pct_<p>``, plus
            ``bb_width_20``, ``bb_position_20``, ``hl_range`` and ``gap``.
        """
        out = pd.DataFrame(index=df.index)
        close = df['close']

        # Rolling standard deviation of returns, in percent.
        for window in self.periods:
            out[f'volatility_{window}'] = self._std_returns(close, window)

        # Average True Range in price units.
        for window in self.periods:
            out[f'atr_{window}'] = self._atr(df, window)

        # ATR expressed as a percentage of the closing price.
        for window in self.periods:
            out[f'atr_pct_{window}'] = out[f'atr_{window}'] / close * 100

        # Bollinger features use a fixed 20-bar window.
        bb_window = 20
        out[f'bb_width_{bb_window}'] = self._bb_width(close, bb_window)
        out[f'bb_position_{bb_window}'] = self._bb_position(close, bb_window)

        # Bar range relative to the close, in percent.
        out['hl_range'] = (df['high'] - df['low']) / close * 100

        # Open versus the previous close, in percent.
        prev_close = close.shift(1)
        out['gap'] = (df['open'] - prev_close) / prev_close * 100

        return out

    def _std_returns(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling standard deviation of simple returns, times 100."""
        pct_returns = close.pct_change()
        rolling_std = pct_returns.rolling(period).std()
        return rolling_std * 100

    def _atr(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average True Range as a simple rolling mean of the true range."""
        prev_close = df['close'].shift(1)

        # True range is the largest of the three candidate ranges per bar.
        candidates = pd.concat(
            [
                df['high'] - df['low'],
                (df['high'] - prev_close).abs(),
                (df['low'] - prev_close).abs(),
            ],
            axis=1,
        )
        true_range = candidates.max(axis=1)
        return true_range.rolling(period).mean()

    def _bands(self, close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger bands for ``close``."""
        middle = close.rolling(period).mean()
        deviation = close.rolling(period).std()
        return middle, middle + std_dev * deviation, middle - std_dev * deviation

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Band width (upper minus lower) as a percentage of the middle band."""
        middle, upper, lower = self._bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Close relative to the bands: 0 at the lower band, 100 at the upper."""
        _, upper, lower = self._bands(close, period, std_dev)
        return (close - lower) / (upper - lower) * 100
```

### 2. Momentum Features

```python
# app/features/momentum.py
import pandas as pd
import numpy as np
from typing import List

class MomentumFeatures:
    """Calculate momentum-based features from OHLC data."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all momentum features.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df`` with ``roc_<p>`` and
            ``momentum_<p>`` per configured period, ``rsi_7/14/21``,
            ``stoch_k``, ``stoch_d``, ``williams_r``, ``cci_20``,
            ``macd``, ``macd_signal``, ``macd_histogram`` and ``adx``.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Rate of Change (ROC)
        for period in self.periods:
            features[f'roc_{period}'] = self._roc(close, period)

        # Momentum (absolute)
        for period in self.periods:
            features[f'momentum_{period}'] = self._momentum(close, period)

        # RSI over fixed windows, independent of self.periods
        for period in [7, 14, 21]:
            features[f'rsi_{period}'] = self._rsi(close, period)

        # Stochastic Oscillator: %D is a 3-bar mean of %K
        features['stoch_k'] = self._stochastic_k(df, 14)
        features['stoch_d'] = features['stoch_k'].rolling(3).mean()

        # Williams %R
        features['williams_r'] = self._williams_r(df, 14)

        # CCI (Commodity Channel Index)
        features['cci_20'] = self._cci(df, 20)

        # MACD
        macd, signal, hist = self._macd(close)
        features['macd'] = macd
        features['macd_signal'] = signal
        features['macd_histogram'] = hist

        # ADX (Average Directional Index)
        features['adx'] = self._adx(df, 14)

        return features

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percent change of close versus ``period`` bars ago."""
        return (close - close.shift(period)) / close.shift(period) * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Simple momentum: close minus the close ``period`` bars ago."""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """
        Relative Strength Index.

        Average gain and loss are simple rolling means over ``period``
        bars (no exponential smoothing).
        """
        delta = close.diff()

        # Split the bar-to-bar change into non-negative gain and loss parts.
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position within the rolling low/high range (x100)."""
        lowest = df['low'].rolling(period).min()
        highest = df['high'].rolling(period).max()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R: distance of close from the rolling high, scaled by -100."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (highest - df['close']) / (highest - lowest) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index of the typical price (H+L+C)/3."""
        tp = (df['high'] + df['low'] + df['close']) / 3
        sma = tp.rolling(period).mean()
        # Mean absolute deviation of the typical price inside each window.
        mad = tp.rolling(period).apply(lambda x: np.abs(x - x.mean()).mean())

        return (tp - sma) / (0.015 * mad)

    def _macd(self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        """
        MACD (Moving Average Convergence Divergence).

        Returns:
            Tuple ``(macd_line, signal_line, histogram)`` where the MACD
            line is fast EMA minus slow EMA, the signal line is an EMA of
            the MACD line, and the histogram is their difference.
        """
        ema_fast = close.ewm(span=fast).mean()
        ema_slow = close.ewm(span=slow).mean()

        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line

        return macd_line, signal_line, histogram

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """
        Average Directional Index.

        Directional movement follows the standard rule: on each bar only
        the larger of the up-move and the down-move counts, and only if
        it is positive; otherwise that bar contributes zero. Smoothing
        uses simple rolling means over ``period`` bars.
        """
        high = df['high']
        low = df['low']
        close = df['close']

        up_move = high.diff()
        down_move = -low.diff()

        # A move counts only when it is positive AND larger than the
        # opposite move, so outside bars do not feed both +DM and -DM.
        plus_wins = (up_move > down_move) & (up_move > 0)
        minus_wins = (down_move > up_move) & (down_move > 0)

        # Undefined moves (e.g. the first bar) stay NaN rather than
        # becoming 0, so the warm-up rows remain NaN downstream.
        plus_dm = up_move.where(plus_wins | up_move.isna(), 0.0)
        minus_dm = down_move.where(minus_wins | down_move.isna(), 0.0)

        # True Range
        tr = pd.concat([
            high - low,
            abs(high - close.shift(1)),
            abs(low - close.shift(1))
        ], axis=1).max(axis=1)

        # Smoothed averages
        atr = tr.rolling(period).mean()
        plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
        minus_di = 100 * (minus_dm.rolling(period).mean() / atr)

        # ADX is the rolling mean of the directional index DX
        dx = 100 * abs(plus_di - minus_di) / (plus_di + minus_di)
        return dx.rolling(period).mean()
```

### 3. Trend Features

```python
# app/features/trend.py
import pandas as pd
import numpy as np
from typing import List

class TrendFeatures:
    """Calculate trend-based features from OHLC data."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all trend features.

        The crossover and slope features need the 5/10/20/50 moving
        averages. When a window is not part of ``self.periods`` it is
        computed on the fly (without adding an extra column), so custom
        lookback lists no longer raise ``KeyError``.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame indexed like ``df`` with ``sma_<p>``, ``ema_<p>``
            and ``price_sma_ratio_<p>`` per configured period, plus the
            fixed-window crossover, slope, distance, position and
            Ichimoku columns.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        def sma(period: int) -> pd.Series:
            # Reuse the stored column when this window was requested.
            column = f'sma_{period}'
            if column in features.columns:
                return features[column]
            return close.rolling(period).mean()

        def ema(period: int) -> pd.Series:
            # Reuse the stored column when this window was requested.
            column = f'ema_{period}'
            if column in features.columns:
                return features[column]
            return close.ewm(span=period).mean()

        # Simple Moving Averages
        for period in self.periods:
            features[f'sma_{period}'] = close.rolling(period).mean()

        # Exponential Moving Averages
        for period in self.periods:
            features[f'ema_{period}'] = close.ewm(span=period).mean()

        # Price vs SMA ratio
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']

        # SMA crossovers
        features['sma_5_20_ratio'] = sma(5) / sma(20)
        features['sma_10_50_ratio'] = sma(10) / sma(50)

        # EMA crossovers
        features['ema_5_20_ratio'] = ema(5) / ema(20)

        # Trend strength (slope of SMA over the last 5 bars)
        for period in [10, 20]:
            features[f'trend_slope_{period}'] = self._slope(sma(period), 5)

        # Distance from high/low, as a percentage of the close
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )

        # Price position in range
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)

        # Ichimoku components
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        # 1 when the close is above both spans, else 0. Comparisons
        # against NaN are False, so warm-up rows yield 0.
        features['above_cloud'] = ((close > senkou_a) & (close > senkou_b)).astype(int)

        return features

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """Rolling slope: first-degree polyfit coefficient over each window."""
        def calc_slope(x):
            if len(x) < 2:
                return 0
            y = np.arange(len(x))
            return np.polyfit(y, x, 1)[0]

        return series.rolling(period).apply(calc_slope)

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Close position within the rolling low/high range, times 100."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _ichimoku(self, df: pd.DataFrame):
        """
        Ichimoku Cloud components.

        Returns:
            Tuple ``(tenkan, kijun, senkou_a, senkou_b)``. The spans are
            returned as computed on each bar; no forward shift is applied.
        """
        high = df['high']
        low = df['low']

        # Tenkan-sen (Conversion Line): (9-period high + 9-period low) / 2
        tenkan = (high.rolling(9).max() + low.rolling(9).min()) / 2

        # Kijun-sen (Base Line): (26-period high + 26-period low) / 2
        kijun = (high.rolling(26).max() + low.rolling(26).min()) / 2

        # Senkou Span A (Leading Span A): (Tenkan + Kijun) / 2
        senkou_a = (tenkan + kijun) / 2

        # Senkou Span B (Leading Span B): (52-period high + 52-period low) / 2
        senkou_b = (high.rolling(52).max() + low.rolling(52).min()) / 2

        return tenkan, kijun, senkou_a, senkou_b
```

### 4. Volume Features

```python
# app/features/volume.py
import pandas as pd
import numpy as np
from typing import List

class VolumeFeatures:
    """Volume feature group computed from OHLCV data."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the volume feature columns.

        Args:
            df: DataFrame with 'high', 'low', 'close' and 'volume' columns.

        Returns:
            DataFrame indexed like ``df`` with ``volume_sma_<p>`` and
            ``volume_ratio_<p>`` per configured period, followed by the
            OBV, VPT, MFI, A/D, CMF, VWAP and volume-change columns.
        """
        out = pd.DataFrame(index=df.index)
        vol = df['volume']
        price = df['close']

        # Rolling average volume per window.
        for window in self.periods:
            out[f'volume_sma_{window}'] = vol.rolling(window).mean()

        # Current volume relative to its rolling average.
        for window in self.periods:
            out[f'volume_ratio_{window}'] = vol / out[f'volume_sma_{window}']

        # On-Balance Volume, its 10-bar mean, and the ratio of the two.
        out['obv'] = self._obv(df)
        out['obv_sma_10'] = out['obv'].rolling(10).mean()
        out['obv_trend'] = out['obv'] / out['obv_sma_10']

        out['vpt'] = self._vpt(df)
        out['mfi_14'] = self._mfi(df, 14)
        out['ad_line'] = self._ad_line(df)
        out['cmf_20'] = self._cmf(df, 20)

        # Cumulative VWAP and the close's percentage deviation from it.
        out['vwap'] = self._vwap(df)
        out['vwap_deviation'] = (price - out['vwap']) / out['vwap'] * 100

        # Bar-to-bar volume change (percent) and 20-bar std/mean ratio.
        out['volume_change'] = vol.pct_change() * 100
        window_20 = vol.rolling(20)
        out['volume_std_20'] = window_20.std() / window_20.mean()

        return out

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: running sum of volume signed by the close change."""
        sign = np.sign(df['close'].diff())
        signed_volume = sign * df['volume']
        return signed_volume.cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: running sum of volume times fractional close change."""
        price = df['close']
        fractional_change = price.diff() / price.shift(1)
        return (df['volume'] * fractional_change).cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index over ``period`` bars, based on the typical price."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        flow = typical * df['volume']
        prev_typical = typical.shift(1)

        # Flow counts as positive on a rising typical price, negative on a falling one.
        inflow = flow.where(typical > prev_typical, 0).rolling(period).sum()
        outflow = flow.where(typical < prev_typical, 0).rolling(period).sum()

        return 100 - (100 / (1 + inflow / outflow))

    def _clv(self, df: pd.DataFrame) -> pd.Series:
        """Close location value per bar; undefined values (NaN) become 0."""
        high, low, close = df['high'], df['low'], df['close']
        location = ((close - low) - (high - close)) / (high - low)
        return location.fillna(0)

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution Line: running sum of CLV times volume."""
        return (self._clv(df) * df['volume']).cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow: rolling CLV-weighted volume over rolling volume."""
        vol = df['volume']
        weighted = (self._clv(df) * vol).rolling(period).sum()
        return weighted / vol.rolling(period).sum()

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Volume Weighted Average Price, cumulative from the first row of ``df``."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        vol = df['volume']
        return (typical * vol).cumsum() / vol.cumsum()
```

---

## Feature Summary

### Complete Feature List (21 Core Features)

| # | Feature | Category | Description |
|---|---------|----------|-------------|
| 1 | volatility_5 | Volatility | 5-period return std |
| 2 | volatility_10 | Volatility | 10-period return std |
| 3 | volatility_20 | Volatility | 20-period return std |
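| 4 | atr_14 | Volatility | 14-period ATR (requires 14 in `lookback_periods`; the defaults `[5, 10, 20, 50]` only produce `atr_5`, `atr_10`, `atr_20` and `atr_50`) |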
| 5 | bb_position_20 | Volatility | Position in Bollinger Bands |
| 6 | roc_5 | Momentum | 5-period rate of change |
| 7 | roc_10 | Momentum | 10-period rate of change |
| 8 | rsi_14 | Momentum | 14-period RSI |
| 9 | macd | Momentum | MACD line |
| 10 | macd_histogram | Momentum | MACD histogram |
| 11 | stoch_k | Momentum | Stochastic %K |
| 12 | sma_5 | Trend | 5-period SMA |
| 13 | sma_20 | Trend | 20-period SMA |
| 14 | ema_10 | Trend | 10-period EMA |
| 15 | price_sma_ratio_20 | Trend | Price vs SMA20 |
| 16 | trend_slope_20 | Trend | SMA20 slope |
| 17 | price_position_20 | Trend | Position in range |
| 18 | volume_ratio_20 | Volume | Volume vs avg |
| 19 | obv_trend | Volume | OBV trend |
| 20 | mfi_14 | Volume | Money Flow Index |
| 21 | cmf_20 | Volume | Chaikin Money Flow |

---

## Feature Normalization

```python
# app/features/normalizer.py
from sklearn.preprocessing import StandardScaler, RobustScaler
import pandas as pd
import numpy as np
import joblib

class FeatureNormalizer:
    """Normalize features for model input using a scikit-learn scaler."""

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' for StandardScaler; any other value
                selects RobustScaler.
        """
        if method == 'standard':
            self.scaler = StandardScaler()
        else:
            self.scaler = RobustScaler()

        # Set by fit() or load(); transform() refuses to run before that.
        self.fitted = False

    def fit(self, X: pd.DataFrame):
        """
        Fit the scaler on training data.

        Returns:
            This normalizer, so calls can be chained
            (``FeatureNormalizer().fit(X).transform(X)``).
        """
        self.scaler.fit(X)
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Transform features with the fitted scaler.

        Raises:
            ValueError: If neither fit() nor load() has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        return self.scaler.transform(X)

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Fit on ``X`` and return its transformed values in one step."""
        self.fit(X)
        return self.transform(X)

    def save(self, path: str):
        """
        Save the scaler to ``<path>/scaler.joblib``.

        The directory is created when it does not exist yet, so a
        fresh model directory can be passed directly.
        """
        import os

        os.makedirs(path, exist_ok=True)
        joblib.dump(self.scaler, f"{path}/scaler.joblib")

    def load(self, path: str):
        """
        Load the scaler from ``<path>/scaler.joblib`` and mark it fitted.

        NOTE(security): joblib files are pickle-based; only load
        artifacts from trusted locations.
        """
        self.scaler = joblib.load(f"{path}/scaler.joblib")
        self.fitted = True
```

---

## Feature Selection

```python
# app/features/selector.py
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import pandas as pd
import numpy as np

class FeatureSelector:
    """Select the highest-scoring features with scikit-learn's SelectKBest."""

    def __init__(self, n_features: int = 15, method: str = 'mutual_info'):
        """
        Args:
            n_features: Maximum number of features to keep.
            method: 'mutual_info' for mutual_info_classif; any other
                value selects f_classif.
        """
        self.n_features = n_features
        self.method = method
        self.selected_features = []
        # Initialised empty so get_scores() works before fit().
        self.feature_scores = pd.DataFrame(columns=['feature', 'score'])

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """
        Score every column of ``X`` against ``y`` and keep the top ones.

        Returns:
            This selector, so calls can be chained.
        """
        if self.method == 'mutual_info':
            score_func = mutual_info_classif
        else:
            score_func = f_classif

        # Never ask for more features than X provides.
        k = min(self.n_features, X.shape[1])
        selector = SelectKBest(score_func=score_func, k=k)
        selector.fit(X, y)

        # Get selected feature names
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()

        # Store scores for analysis, best first
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` restricted to the selected feature columns."""
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """
        Return the per-feature scores from the last fit().

        Before fit() this is an empty frame with 'feature' and 'score'
        columns.
        """
        return self.feature_scores
```

---

## Referencias

- [ET-ML-001: Arquitectura](./ET-ML-001-arquitectura.md)
- [ET-ML-002: Modelos XGBoost](./ET-ML-002-modelos.md)
- [Pandas Documentation](https://pandas.pydata.org/)
- [TA-Lib](https://github.com/mrjbq7/ta-lib)

---

**Autor:** Requirements-Analyst
**Fecha:** 2025-12-05