trading-platform/docs/02-definicion-modulos/OQI-006-ml-signals/especificaciones/ET-ML-003-features.md
rckrdmrd a7cca885f0 feat: Major platform documentation and architecture updates
Changes include:
- Updated architecture documentation
- Enhanced module definitions (OQI-001 to OQI-008)
- ML integration documentation updates
- Trading strategies documentation
- Orchestration and inventory updates
- Docker configuration updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 05:33:35 -06:00

21 KiB

id title type status priority epic project version created_date updated_date
ET-ML-003 Feature Engineering Technical Specification Done Alta OQI-006 trading-platform 1.0.0 2025-12-05 2026-01-04

ET-ML-003: Feature Engineering

Metadata

Campo Valor
ID ET-ML-003
Épica OQI-006 - Señales ML
Tipo Especificación Técnica
Versión 1.0.0
Estado Aprobado
Última actualización 2025-12-05

Propósito

Especificar el proceso de Feature Engineering para los modelos de ML, incluyendo la definición de features, cálculo de indicadores técnicos, normalización y selección de variables.


Arquitectura de Features

Feature Builder

# app/features/builder.py
import pandas as pd
import numpy as np
from typing import List, Dict, Optional
from .volatility import VolatilityFeatures
from .momentum import MomentumFeatures
from .trend import TrendFeatures
from .volume import VolumeFeatures

class FeatureBuilder:
    """
    Assemble one feature matrix out of raw OHLCV bars.

    Four calculators (volatility, momentum, trend, volume) are created with
    the same lookback periods; their outputs are joined column-wise.
    """

    def __init__(self, lookback_periods: List[int] = None):
        # A missing (or empty) list falls back to the default windows.
        if not lookback_periods:
            lookback_periods = [5, 10, 20, 50]
        self.lookback_periods = lookback_periods

        # One calculator per feature family, all sharing the same windows.
        periods = self.lookback_periods
        self.volatility = VolatilityFeatures(periods)
        self.momentum = MomentumFeatures(periods)
        self.trend = TrendFeatures(periods)
        self.volume = VolumeFeatures(periods)

        # Filled in by build(); empty until then.
        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute every feature group and join them into one DataFrame.

        Args:
            df: DataFrame with columns ['open', 'high', 'low', 'close', 'volume']

        Returns:
            DataFrame of features; rows containing any NaN are removed.
        """
        calculators = (self.volatility, self.momentum, self.trend, self.volume)

        # Start from an empty frame carrying the input index, then append
        # each group's output in a fixed order.
        pieces = [pd.DataFrame(index=df.index)]
        for calculator in calculators:
            pieces.append(calculator.calculate(df))
        combined = pd.concat(pieces, axis=1)

        # Column names are recorded before the NaN rows are dropped.
        self.feature_names = combined.columns.tolist()

        return combined.dropna()

    def get_feature_names(self) -> List[str]:
        """Return the column names recorded by the last build() call"""
        return self.feature_names

    def get_feature_importance(self, model) -> pd.DataFrame:
        """
        Pair the recorded feature names with ``model.feature_importances_``.

        Returns:
            DataFrame with columns 'feature' and 'importance', sorted by
            descending importance.
        """
        table = pd.DataFrame({
            'feature': self.feature_names,
            'importance': model.feature_importances_
        })
        return table.sort_values('importance', ascending=False)

Feature Groups

1. Volatility Features

# app/features/volatility.py
import pandas as pd
import numpy as np
from typing import List

class VolatilityFeatures:
    """Volatility indicators derived from OHLC prices"""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the volatility feature frame.

        Reads the 'open', 'high', 'low' and 'close' columns of ``df`` and
        returns a DataFrame on the same index.
        """
        out = pd.DataFrame(index=df.index)
        close = df['close']

        # Rolling std of percentage returns, one column per window
        for window in self.periods:
            out[f'volatility_{window}'] = self._std_returns(close, window)

        # Average True Range, one column per window
        for window in self.periods:
            out[f'atr_{window}'] = self._atr(df, window)

        # ATR expressed as a percentage of the current close
        for window in self.periods:
            out[f'atr_pct_{window}'] = out[f'atr_{window}'] / close * 100

        # Bollinger features use a fixed 20-bar window
        bb_window = 20
        out[f'bb_width_{bb_window}'] = self._bb_width(close, bb_window)
        out[f'bb_position_{bb_window}'] = self._bb_position(close, bb_window)

        # Bar range as a percentage of the close
        out['hl_range'] = (df['high'] - df['low']) / close * 100

        # Open relative to the previous bar's close, in percent
        previous_close = close.shift(1)
        out['gap'] = (df['open'] - previous_close) / previous_close * 100

        return out

    def _std_returns(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling standard deviation of percentage returns, times 100"""
        pct_returns = close.pct_change()
        rolling_std = pct_returns.rolling(period).std()
        return rolling_std * 100

    def _atr(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Rolling mean of the true range over ``period`` bars"""
        previous_close = df['close'].shift(1)

        # True range is the largest of the three candidate ranges per bar.
        candidates = pd.concat(
            [
                df['high'] - df['low'],
                (df['high'] - previous_close).abs(),
                (df['low'] - previous_close).abs(),
            ],
            axis=1,
        )
        true_range = candidates.max(axis=1)
        return true_range.rolling(period).mean()

    def _bands(self, close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger bands for ``close``"""
        middle = close.rolling(period).mean()
        spread = close.rolling(period).std()
        return middle, middle + std_dev * spread, middle - std_dev * spread

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Band width as a percentage of the middle band"""
        middle, upper, lower = self._bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Location of the close between lower (0) and upper (100) band"""
        _, upper, lower = self._bands(close, period, std_dev)
        return (close - lower) / (upper - lower) * 100

2. Momentum Features

# app/features/momentum.py
import pandas as pd
import numpy as np
from typing import List

class MomentumFeatures:
    """Calculate momentum-based features"""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all momentum features.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame on the same index with ROC and momentum columns per
            configured period, plus RSI (7/14/21), stochastic, Williams %R,
            CCI, MACD and ADX columns using fixed windows.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Rate of Change (ROC)
        for period in self.periods:
            features[f'roc_{period}'] = self._roc(close, period)

        # Momentum (absolute)
        for period in self.periods:
            features[f'momentum_{period}'] = self._momentum(close, period)

        # RSI
        for period in [7, 14, 21]:
            features[f'rsi_{period}'] = self._rsi(close, period)

        # Stochastic Oscillator (%D is a 3-bar mean of %K)
        features['stoch_k'] = self._stochastic_k(df, 14)
        features['stoch_d'] = features['stoch_k'].rolling(3).mean()

        # Williams %R
        features['williams_r'] = self._williams_r(df, 14)

        # CCI (Commodity Channel Index)
        features['cci_20'] = self._cci(df, 20)

        # MACD
        macd, signal, hist = self._macd(close)
        features['macd'] = macd
        features['macd_signal'] = signal
        features['macd_histogram'] = hist

        # ADX (Average Directional Index)
        features['adx'] = self._adx(df, 14)

        return features

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percentage change versus ``period`` bars ago"""
        previous = close.shift(period)
        return (close - previous) / previous * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Simple momentum: absolute change versus ``period`` bars ago"""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """
        Relative Strength Index using simple rolling means of gains/losses.

        A window with no losses yields 100; a window with neither gains nor
        losses yields NaN (0/0).
        """
        delta = close.diff()

        # The first difference is NaN; both comparisons are False for it, so
        # it is treated as a zero gain and a zero loss.
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position inside the rolling low/high range (0-100)"""
        lowest = df['low'].rolling(period).min()
        highest = df['high'].rolling(period).max()

        # A zero-width range (highest == lowest) divides by zero.
        return (df['close'] - lowest) / (highest - lowest) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R: distance of the close from the rolling high, scaled to -100..0"""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (highest - df['close']) / (highest - lowest) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index on the typical price (high + low + close) / 3"""
        tp = (df['high'] + df['low'] + df['close']) / 3
        sma = tp.rolling(period).mean()

        # raw=True hands each window to the lambda as a NumPy array, which
        # avoids building a Series per window; the arithmetic is unchanged.
        mad = tp.rolling(period).apply(
            lambda x: np.abs(x - x.mean()).mean(), raw=True
        )

        return (tp - sma) / (0.015 * mad)

    def _macd(self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        """
        MACD (Moving Average Convergence Divergence).

        Returns:
            Tuple (macd_line, signal_line, histogram) where the MACD line is
            the fast EMA minus the slow EMA, the signal line is an EMA of
            the MACD line, and the histogram is their difference.
        """
        ema_fast = close.ewm(span=fast).mean()
        ema_slow = close.ewm(span=slow).mean()

        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line

        return macd_line, signal_line, histogram

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """
        Average Directional Index, smoothed with simple rolling means.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.
            period: Window for the ATR, the directional indicators and the
                final DX average.

        Returns:
            Series of ADX values; NaN during the warm-up rows.
        """
        high = df['high']
        low = df['low']
        close = df['close']

        up_move = high.diff()
        down_move = -low.diff()

        # Directional movement: only the larger of the two moves counts and
        # only when it is positive, so an outside bar does not contribute to
        # both +DM and -DM. Comparisons against NaN are False, so the
        # warm-up row stays NaN instead of being forced to zero.
        plus_dm = up_move.mask((up_move <= down_move) | (up_move <= 0), 0.0)
        minus_dm = down_move.mask((down_move <= up_move) | (down_move <= 0), 0.0)

        # True Range
        previous_close = close.shift(1)
        tr = pd.concat([
            high - low,
            (high - previous_close).abs(),
            (low - previous_close).abs()
        ], axis=1).max(axis=1)

        # Smoothed averages
        atr = tr.rolling(period).mean()
        plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
        minus_di = 100 * (minus_dm.rolling(period).mean() / atr)

        # DX is 0 when there is no directional movement at all (0/0 would
        # otherwise produce NaN). NaN sums compare unequal to 0, so warm-up
        # rows keep their NaN.
        di_sum = plus_di + minus_di
        dx = (100 * (plus_di - minus_di).abs() / di_sum).where(di_sum != 0, 0.0)
        return dx.rolling(period).mean()

3. Trend Features

# app/features/trend.py
import pandas as pd
import numpy as np
from typing import List

class TrendFeatures:
    """Calculate trend-based features"""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all trend features.

        Args:
            df: DataFrame with 'high', 'low' and 'close' columns.

        Returns:
            DataFrame on the same index. SMA, EMA and price/SMA ratio columns
            are emitted per configured period; crossover, slope, range and
            Ichimoku columns use fixed windows and are produced whatever
            ``periods`` contains.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Simple Moving Averages
        for period in self.periods:
            features[f'sma_{period}'] = close.rolling(period).mean()

        # Exponential Moving Averages
        for period in self.periods:
            features[f'ema_{period}'] = close.ewm(span=period).mean()

        # Price vs SMA ratio
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']

        # The fixed-window features below need specific averages. Reuse the
        # column when the period was configured, otherwise compute it on the
        # fly so custom ``periods`` do not raise KeyError.
        def sma(period: int) -> pd.Series:
            name = f'sma_{period}'
            if name in features.columns:
                return features[name]
            return close.rolling(period).mean()

        def ema(period: int) -> pd.Series:
            name = f'ema_{period}'
            if name in features.columns:
                return features[name]
            return close.ewm(span=period).mean()

        # SMA crossovers
        features['sma_5_20_ratio'] = sma(5) / sma(20)
        features['sma_10_50_ratio'] = sma(10) / sma(50)

        # EMA crossovers
        features['ema_5_20_ratio'] = ema(5) / ema(20)

        # Trend strength (5-bar slope of the SMA)
        for period in [10, 20]:
            features[f'trend_slope_{period}'] = self._slope(sma(period), 5)

        # Distance from high/low, as a percentage of the close
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )

        # Price position in range
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)

        # Ichimoku components.
        # NOTE(review): 'ichimoku_cloud_top' / 'ichimoku_cloud_bottom' hold
        # Senkou Span A / B as returned, regardless of which one is higher.
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        # 1 when the close is above both spans, else 0 (also 0 while a span is NaN)
        features['above_cloud'] = (close > senkou_a) & (close > senkou_b)
        features['above_cloud'] = features['above_cloud'].astype(int)

        return features

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """Rolling slope: first-degree polyfit coefficient over ``period`` points"""
        def calc_slope(values):
            if len(values) < 2:
                return 0
            positions = np.arange(len(values))
            return np.polyfit(positions, values, 1)[0]

        # raw=True passes NumPy arrays to calc_slope; polyfit converts its
        # input to an array anyway, so results are unchanged.
        return series.rolling(period).apply(calc_slope, raw=True)

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Price position within period's high-low range (0-100)"""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _ichimoku(self, df: pd.DataFrame):
        """
        Ichimoku Cloud components.

        Returns:
            Tuple (tenkan, kijun, senkou_a, senkou_b). The spans are returned
            aligned to the bar they are computed on; no forward shift is
            applied here.
        """
        high = df['high']
        low = df['low']

        # Tenkan-sen (Conversion Line): (9-period high + 9-period low) / 2
        tenkan = (high.rolling(9).max() + low.rolling(9).min()) / 2

        # Kijun-sen (Base Line): (26-period high + 26-period low) / 2
        kijun = (high.rolling(26).max() + low.rolling(26).min()) / 2

        # Senkou Span A (Leading Span A): (Tenkan + Kijun) / 2
        senkou_a = (tenkan + kijun) / 2

        # Senkou Span B (Leading Span B): (52-period high + 52-period low) / 2
        senkou_b = (high.rolling(52).max() + low.rolling(52).min()) / 2

        return tenkan, kijun, senkou_a, senkou_b

4. Volume Features

# app/features/volume.py
import pandas as pd
import numpy as np
from typing import List

class VolumeFeatures:
    """Volume and money-flow indicators computed from OHLCV bars"""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the volume feature frame.

        Reads the 'high', 'low', 'close' and 'volume' columns of ``df`` and
        returns a DataFrame on the same index.
        """
        out = pd.DataFrame(index=df.index)
        vol = df['volume']
        price = df['close']

        # Rolling mean volume, one column per window
        for window in self.periods:
            out[f'volume_sma_{window}'] = vol.rolling(window).mean()

        # Current volume relative to its rolling mean
        for window in self.periods:
            out[f'volume_ratio_{window}'] = vol / out[f'volume_sma_{window}']

        # On-Balance Volume, its 10-bar mean, and the ratio between the two
        out['obv'] = self._obv(df)
        out['obv_sma_10'] = out['obv'].rolling(10).mean()
        out['obv_trend'] = out['obv'] / out['obv_sma_10']

        # Cumulative indicators and fixed-window money-flow measures
        out['vpt'] = self._vpt(df)
        out['mfi_14'] = self._mfi(df, 14)
        out['ad_line'] = self._ad_line(df)
        out['cmf_20'] = self._cmf(df, 20)

        # Cumulative VWAP and the close's percentage deviation from it
        out['vwap'] = self._vwap(df)
        out['vwap_deviation'] = (price - out['vwap']) / out['vwap'] * 100

        # Bar-to-bar volume change (percent) and 20-bar std/mean ratio
        out['volume_change'] = vol.pct_change() * 100
        rolling_20 = vol.rolling(20)
        out['volume_std_20'] = rolling_20.std() / vol.rolling(20).mean()

        return out

    def _typical_price(self, df: pd.DataFrame) -> pd.Series:
        """Typical price: (high + low + close) / 3"""
        return (df['high'] + df['low'] + df['close']) / 3

    def _clv(self, df: pd.DataFrame) -> pd.Series:
        """Close location value, with undefined (NaN) entries set to 0"""
        high, low, close = df['high'], df['low'], df['close']
        location = ((close - low) - (high - close)) / (high - low)
        return location.fillna(0)

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: running sum of volume signed by the close change"""
        sign = np.sign(df['close'].diff())
        signed_volume = sign * df['volume']
        return signed_volume.cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: running sum of volume times the close's relative change"""
        price = df['close']
        relative_change = price.diff() / price.shift(1)
        return (df['volume'] * relative_change).cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index over ``period`` bars"""
        typical = self._typical_price(df)
        flow = typical * df['volume']
        previous = typical.shift(1)

        # Flow counts as positive when typical price rose, negative when it fell.
        inflow = flow.where(typical > previous, 0).rolling(period).sum()
        outflow = flow.where(typical < previous, 0).rolling(period).sum()

        money_ratio = inflow / outflow
        return 100 - (100 / (1 + money_ratio))

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution Line: running sum of CLV times volume"""
        weighted = self._clv(df) * df['volume']
        return weighted.cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow: rolling CLV-weighted volume over rolling volume"""
        vol = df['volume']
        weighted_sum = (self._clv(df) * vol).rolling(period).sum()
        return weighted_sum / vol.rolling(period).sum()

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Volume Weighted Average Price accumulated from the first row"""
        typical = self._typical_price(df)
        running_value = (typical * df['volume']).cumsum()
        running_volume = df['volume'].cumsum()
        return running_value / running_volume

Feature Summary

Complete Feature List (21 Core Features)

# Feature Category Description
1 volatility_5 Volatility 5-period return std
2 volatility_10 Volatility 10-period return std
3 volatility_20 Volatility 20-period return std
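4 atr_14 Volatility 14-period ATR (only generated when 14 is included in lookback_periods; the default periods [5, 10, 20, 50] produce atr_5, atr_10, atr_20 and atr_50)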
5 bb_position_20 Volatility Position in Bollinger Bands
6 roc_5 Momentum 5-period rate of change
7 roc_10 Momentum 10-period rate of change
8 rsi_14 Momentum 14-period RSI
9 macd Momentum MACD line
10 macd_histogram Momentum MACD histogram
11 stoch_k Momentum Stochastic %K
12 sma_5 Trend 5-period SMA
13 sma_20 Trend 20-period SMA
14 ema_10 Trend 10-period EMA
15 price_sma_ratio_20 Trend Price vs SMA20
16 trend_slope_20 Trend SMA20 slope
17 price_position_20 Trend Position in range
18 volume_ratio_20 Volume Volume vs avg
19 obv_trend Volume OBV trend
20 mfi_14 Volume Money Flow Index
21 cmf_20 Volume Chaikin Money Flow

Feature Normalization

# app/features/normalizer.py
from sklearn.preprocessing import StandardScaler, RobustScaler
import pandas as pd
import numpy as np
import joblib

class FeatureNormalizer:
    """Scale feature matrices with a scikit-learn scaler"""

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' selects StandardScaler; any other value
                selects RobustScaler.
        """
        scaler_cls = StandardScaler if method == 'standard' else RobustScaler
        self.scaler = scaler_cls()

        # Flipped to True by fit() or load().
        self.fitted = False

    def fit(self, X: pd.DataFrame):
        """Learn scaling parameters from the training features"""
        self.scaler.fit(X)
        self.fitted = True

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Apply the fitted scaler.

        Raises:
            ValueError: if neither fit() nor load() has been called.
        """
        if self.fitted:
            return self.scaler.transform(X)
        raise ValueError("Scaler not fitted. Call fit() first.")

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Fit on ``X`` and return the scaled ``X``"""
        self.fit(X)
        scaled = self.transform(X)
        return scaled

    def save(self, path: str):
        """Write the scaler to ``scaler.joblib`` under directory ``path``"""
        target = f"{path}/scaler.joblib"
        joblib.dump(self.scaler, target)

    def load(self, path: str):
        """Read the scaler from ``scaler.joblib`` under ``path`` and mark it fitted"""
        # NOTE(review): joblib files are pickle-based; only load trusted files.
        source = f"{path}/scaler.joblib"
        self.scaler = joblib.load(source)
        self.fitted = True

Feature Selection

# app/features/selector.py
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import pandas as pd
import numpy as np

class FeatureSelector:
    """Select most important features"""

    def __init__(self, n_features: int = 15, method: str = 'mutual_info'):
        """
        Args:
            n_features: Number of top-scoring features to keep.
            method: 'mutual_info' scores with mutual_info_classif; any other
                value scores with f_classif.
        """
        self.n_features = n_features
        self.method = method
        self.selected_features = []
        # Defined up front so get_scores() works (returning an empty table)
        # before fit() has been called instead of raising AttributeError.
        self.feature_scores = pd.DataFrame(columns=['feature', 'score'])

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """
        Fit selector and identify top features.

        Populates ``selected_features`` (names in column order) and
        ``feature_scores`` (all features, sorted by descending score).
        """
        if self.method == 'mutual_info':
            score_func = mutual_info_classif
        else:
            score_func = f_classif

        # Never request more features than X provides; non-integer values
        # (e.g. 'all') are passed through to SelectKBest untouched.
        k = self.n_features
        if isinstance(k, int):
            k = min(k, X.shape[1])

        selector = SelectKBest(score_func=score_func, k=k)
        selector.fit(X, y)

        # Get selected feature names
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()

        # Store scores for analysis
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Filter to selected features (no columns if fit() has not run)"""
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """Return feature scores; empty until fit() has been called"""
        return self.feature_scores

Referencias


Autor: Requirements-Analyst Fecha: 2025-12-05