rckrdmrd a7cca885f0 feat: Major platform documentation and architecture updates

Changes include:
- Updated architecture documentation
- Enhanced module definitions (OQI-001 to OQI-008)
- ML integration documentation updates
- Trading strategies documentation
- Orchestration and inventory updates
- Docker configuration updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

2026-01-07 05:33:35 -06:00

21 KiB

Raw Blame History

id	title	type	status	priority	epic	project	version	created_date	updated_date
ET-ML-003	Feature Engineering	Technical Specification	Done	Alta	OQI-006	trading-platform	1.0.0	2025-12-05	2026-01-04

ET-ML-003: Feature Engineering

Metadata

Campo	Valor
ID	ET-ML-003
Épica	OQI-006 - Señales ML
Tipo	Especificación Técnica
Versión	1.0.0
Estado	Aprobado
Última actualización	2026-01-04

Propósito

Especificar el proceso de Feature Engineering para los modelos de ML, incluyendo la definición de features, cálculo de indicadores técnicos, normalización y selección de variables.

Arquitectura de Features

Feature Builder

# app/features/builder.py
import pandas as pd
import numpy as np
from typing import List, Dict, Optional
from .volatility import VolatilityFeatures
from .momentum import MomentumFeatures
from .trend import TrendFeatures
from .volume import VolumeFeatures

class FeatureBuilder:
    """
    Constructs feature matrix from OHLCV data.
    Combines multiple feature groups for comprehensive analysis.

    The volatility, momentum, trend and volume calculators are all created
    with the same lookback periods. ``build`` concatenates their outputs
    column-wise and keeps only the rows in which every feature is finite.
    """

    def __init__(self, lookback_periods: Optional[List[int]] = None):
        """
        Args:
            lookback_periods: Rolling-window lengths handed to every feature
                calculator. Falls back to [5, 10, 20, 50] when omitted or empty.
        """
        self.lookback_periods = lookback_periods or [5, 10, 20, 50]

        # Feature calculators
        self.volatility = VolatilityFeatures(self.lookback_periods)
        self.momentum = MomentumFeatures(self.lookback_periods)
        self.trend = TrendFeatures(self.lookback_periods)
        self.volume = VolumeFeatures(self.lookback_periods)

        # Column names of the most recent build(); empty until build() runs.
        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build complete feature matrix from OHLCV data.

        Args:
            df: DataFrame with columns ['open', 'high', 'low', 'close', 'volume']

        Returns:
            DataFrame with all calculated features, restricted to the rows
            where every feature is finite (no NaN and no +/-inf).
        """
        features = pd.DataFrame(index=df.index)

        # Calculate each feature group
        features = pd.concat([
            features,
            self.volatility.calculate(df),
            self.momentum.calculate(df),
            self.trend.calculate(df),
            self.volume.calculate(df)
        ], axis=1)

        # Store feature names
        self.feature_names = features.columns.tolist()

        # Ratio features divide by rolling values that can be zero (flat
        # prices, zero volume), which yields +/-inf. dropna() alone keeps
        # those rows, so map them to NaN first.
        features = features.replace([np.inf, -np.inf], np.nan)

        # Drop rows with NaN (due to lookback periods or undefined ratios)
        features = features.dropna()

        return features

    def get_feature_names(self) -> List[str]:
        """Return list of all feature names from the most recent build()."""
        return self.feature_names

    def get_feature_importance(self, model) -> pd.DataFrame:
        """
        Extract feature importance from trained model.

        Args:
            model: Object exposing a ``feature_importances_`` sequence with
                one entry per feature produced by the last build().

        Returns:
            DataFrame with columns ['feature', 'importance'], sorted by
            importance in descending order.

        Raises:
            ValueError: If the number of importances does not match the
                number of built features.
        """
        importance = model.feature_importances_

        # Fail with an explicit message instead of pandas' generic
        # "arrays must be of the same length" error.
        if len(importance) != len(self.feature_names):
            raise ValueError(
                f"Model reports {len(importance)} importances but "
                f"{len(self.feature_names)} features were built."
            )

        return pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

Feature Groups

1. Volatility Features

# app/features/volatility.py
import pandas as pd
import numpy as np
from typing import List

class VolatilityFeatures:
    """Volatility feature group computed from OHLC price columns."""

    def __init__(self, periods: List[int]):
        # Rolling-window lengths used for the return-volatility and ATR columns.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Assemble every volatility column for ``df``.

        Args:
            df: Frame providing 'open', 'high', 'low' and 'close' columns.

        Returns:
            Frame on ``df``'s index holding, in order: return volatility,
            ATR and ATR-percent per configured period, 20-period Bollinger
            width/position, the high-low range and the open-vs-previous-close
            gap (the last four expressed in percent).
        """
        out = pd.DataFrame(index=df.index)

        # Columns are emitted group by group so their order stays stable.
        for window in self.periods:
            out[f'volatility_{window}'] = self._std_returns(df['close'], window)

        for window in self.periods:
            out[f'atr_{window}'] = self._atr(df, window)

        for window in self.periods:
            out[f'atr_pct_{window}'] = out[f'atr_{window}'] / df['close'] * 100

        for window in (20,):
            out[f'bb_width_{window}'] = self._bb_width(df['close'], window)
            out[f'bb_position_{window}'] = self._bb_position(df['close'], window)

        out['hl_range'] = (df['high'] - df['low']) / df['close'] * 100

        prev_close = df['close'].shift(1)
        out['gap'] = (df['open'] - prev_close) / prev_close * 100

        return out

    def _std_returns(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling standard deviation of simple returns, in percent."""
        return close.pct_change().rolling(period).std() * 100

    def _atr(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Simple rolling mean of the true range over ``period`` bars."""
        prev_close = df['close'].shift(1)

        candidates = pd.concat(
            [
                df['high'] - df['low'],
                (df['high'] - prev_close).abs(),
                (df['low'] - prev_close).abs(),
            ],
            axis=1,
        )
        # Row-wise max ignores the NaN candidates on the first bar.
        true_range = candidates.max(axis=1)
        return true_range.rolling(period).mean()

    def _bands(self, close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger bands for ``close``."""
        middle = close.rolling(period).mean()
        spread = close.rolling(period).std()
        return middle, middle + std_dev * spread, middle - std_dev * spread

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Band width as a percentage of the middle band."""
        middle, upper, lower = self._bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Location of ``close`` between the bands (0 = lower, 100 = upper)."""
        _, upper, lower = self._bands(close, period, std_dev)
        return (close - lower) / (upper - lower) * 100

2. Momentum Features

# app/features/momentum.py
import pandas as pd
import numpy as np
from typing import List

class MomentumFeatures:
    """
    Calculate momentum-based features.

    All smoothing in this class uses simple rolling means (not Wilder's
    recursive smoothing), so values differ slightly from charting packages
    that use the latter.
    """

    def __init__(self, periods: List[int]):
        # Lookback lengths used for the ROC and absolute-momentum columns.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all momentum features.

        Args:
            df: Frame providing 'high', 'low' and 'close' columns.

        Returns:
            Frame on ``df``'s index with ROC and momentum per configured
            period, RSI (7/14/21), stochastic %K/%D, Williams %R, CCI(20),
            MACD line/signal/histogram and ADX(14).
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Rate of Change (ROC)
        for period in self.periods:
            features[f'roc_{period}'] = self._roc(close, period)

        # Momentum (absolute)
        for period in self.periods:
            features[f'momentum_{period}'] = self._momentum(close, period)

        # RSI
        for period in [7, 14, 21]:
            features[f'rsi_{period}'] = self._rsi(close, period)

        # Stochastic Oscillator
        features['stoch_k'] = self._stochastic_k(df, 14)
        features['stoch_d'] = features['stoch_k'].rolling(3).mean()

        # Williams %R
        features['williams_r'] = self._williams_r(df, 14)

        # CCI (Commodity Channel Index)
        features['cci_20'] = self._cci(df, 20)

        # MACD
        macd, signal, hist = self._macd(close)
        features['macd'] = macd
        features['macd_signal'] = signal
        features['macd_histogram'] = hist

        # ADX (Average Directional Index)
        features['adx'] = self._adx(df, 14)

        return features

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percent change versus ``period`` bars ago."""
        return (close - close.shift(period)) / close.shift(period) * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Simple momentum: absolute change versus ``period`` bars ago."""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """
        Relative Strength Index from simple rolling means of gains and losses.

        The first ``period`` rows are NaN: the first price change is
        undefined, and every window must hold ``period`` real changes. A
        window with no losses yields 100; a window with neither gains nor
        losses yields NaN (0/0).
        """
        delta = close.diff()

        # clip() keeps the undefined first diff as NaN. Replacing it with 0
        # would make the first RSI value average in a change that never
        # happened.
        gain = delta.clip(lower=0)
        loss = (-delta).clip(lower=0)

        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position within the rolling high-low range (0-100)."""
        lowest = df['low'].rolling(period).min()
        highest = df['high'].rolling(period).max()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R: distance of close from the rolling high (-100 to 0)."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (highest - df['close']) / (highest - lowest) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index over the typical price (H+L+C)/3."""
        tp = (df['high'] + df['low'] + df['close']) / 3
        sma = tp.rolling(period).mean()
        # raw=True passes each window as an ndarray rather than building a
        # Series per window; the result is the same, only faster.
        mad = tp.rolling(period).apply(
            lambda x: np.abs(x - x.mean()).mean(), raw=True
        )

        return (tp - sma) / (0.015 * mad)

    def _macd(self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        """
        MACD (Moving Average Convergence Divergence).

        Returns:
            Tuple ``(macd_line, signal_line, histogram)`` where the histogram
            is the MACD line minus its signal line.
        """
        ema_fast = close.ewm(span=fast).mean()
        ema_slow = close.ewm(span=slow).mean()

        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line

        return macd_line, signal_line, histogram

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """
        Average Directional Index.

        Directional movement follows Wilder's rule: on each bar only the
        larger of the up-move (high - previous high) and the down-move
        (previous low - low) counts, and only if it is positive. +DI, -DI
        and the final ADX are simple rolling means over ``period`` bars.
        """
        high = df['high']
        low = df['low']
        prev_close = df['close'].shift(1)

        up_move = high.diff()
        down_move = -low.diff()

        # +DM and -DM are mutually exclusive per bar. Merely zeroing the
        # negative moves would count an outside bar in both directions and
        # understate DX.
        plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
        minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)

        # The first bar has no predecessor, so its movement stays undefined.
        plus_dm = plus_dm.where(up_move.notna())
        minus_dm = minus_dm.where(down_move.notna())

        # True Range
        tr = pd.concat([
            high - low,
            (high - prev_close).abs(),
            (low - prev_close).abs()
        ], axis=1).max(axis=1)

        # Smoothed averages
        atr = tr.rolling(period).mean()
        plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
        minus_di = 100 * (minus_dm.rolling(period).mean() / atr)

        # ADX
        dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
        return dx.rolling(period).mean()

3. Trend Features

# app/features/trend.py
import pandas as pd
import numpy as np
from typing import List

class TrendFeatures:
    """Calculate trend-based features"""

    def __init__(self, periods: List[int]):
        # Window lengths used for the per-period SMA, EMA and price/SMA columns.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate all trend features.

        The per-period columns (sma_*, ema_*, price_sma_ratio_*) follow
        ``self.periods``. The crossover ratios and slopes use fixed windows
        (5/10/20/50) and are computed straight from the close, so they are
        available whatever periods were configured.

        Args:
            df: Frame providing 'high', 'low' and 'close' columns.

        Returns:
            Frame on ``df``'s index with all trend columns.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Simple Moving Averages
        for period in self.periods:
            features[f'sma_{period}'] = self._sma(close, period)

        # Exponential Moving Averages
        for period in self.periods:
            features[f'ema_{period}'] = self._ema(close, period)

        # Price vs SMA ratio
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']

        # SMA crossovers. Computed from close rather than read back from
        # the sma_* columns, which would raise KeyError whenever 5/10/20/50
        # are not all in self.periods.
        features['sma_5_20_ratio'] = self._sma(close, 5) / self._sma(close, 20)
        features['sma_10_50_ratio'] = self._sma(close, 10) / self._sma(close, 50)

        # EMA crossovers
        features['ema_5_20_ratio'] = self._ema(close, 5) / self._ema(close, 20)

        # Trend strength (slope of SMA)
        for period in [10, 20]:
            features[f'trend_slope_{period}'] = self._slope(self._sma(close, period), 5)

        # Distance from high/low
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )

        # Price position in range
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)

        # Ichimoku components
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        # 1 when close is above both spans. Comparisons against NaN spans
        # (warm-up rows) evaluate to False and therefore become 0.
        features['above_cloud'] = (close > senkou_a) & (close > senkou_b)
        features['above_cloud'] = features['above_cloud'].astype(int)

        return features

    def _sma(self, close: pd.Series, period: int) -> pd.Series:
        """Simple moving average of ``close`` over ``period`` bars."""
        return close.rolling(period).mean()

    def _ema(self, close: pd.Series, period: int) -> pd.Series:
        """Exponential moving average of ``close`` with span ``period``."""
        return close.ewm(span=period).mean()

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """
        Calculate slope (linear regression coefficient).

        Fits a degree-1 polynomial to each rolling window against the bar
        positions 0..period-1 and returns its leading coefficient.
        """
        def calc_slope(window):
            if len(window) < 2:
                return 0
            positions = np.arange(len(window))
            return np.polyfit(positions, window, 1)[0]

        # raw=True passes each window as an ndarray, avoiding a Series
        # construction per window; polyfit sees the same values either way.
        return series.rolling(period).apply(calc_slope, raw=True)

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Price position within period's high-low range (0-100)"""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _ichimoku(self, df: pd.DataFrame):
        """
        Ichimoku Cloud components.

        Returns:
            Tuple ``(tenkan, kijun, senkou_a, senkou_b)``. The spans are
            returned at the bar where they are computed; no forward shift is
            applied here.
        """
        high = df['high']
        low = df['low']

        # Tenkan-sen (Conversion Line): (9-period high + 9-period low) / 2
        tenkan = (high.rolling(9).max() + low.rolling(9).min()) / 2

        # Kijun-sen (Base Line): (26-period high + 26-period low) / 2
        kijun = (high.rolling(26).max() + low.rolling(26).min()) / 2

        # Senkou Span A (Leading Span A): (Tenkan + Kijun) / 2
        senkou_a = (tenkan + kijun) / 2

        # Senkou Span B (Leading Span B): (52-period high + 52-period low) / 2
        senkou_b = (high.rolling(52).max() + low.rolling(52).min()) / 2

        return tenkan, kijun, senkou_a, senkou_b

4. Volume Features

# app/features/volume.py
import pandas as pd
import numpy as np
from typing import List

class VolumeFeatures:
    """Volume feature group derived from OHLCV columns."""

    def __init__(self, periods: List[int]):
        # Window lengths used for the volume SMA and volume-ratio columns.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Assemble every volume column for ``df``.

        Args:
            df: Frame providing 'high', 'low', 'close' and 'volume' columns.

        Returns:
            Frame on ``df``'s index with volume averages and ratios per
            configured period, OBV and its 10-bar trend ratio, VPT, MFI(14),
            the A/D line, CMF(20), cumulative VWAP with its percent
            deviation, and volume change / 20-bar relative dispersion.
        """
        out = pd.DataFrame(index=df.index)
        vol = df['volume']
        price = df['close']

        # Averages first, then ratios, so the column order stays grouped.
        for window in self.periods:
            out[f'volume_sma_{window}'] = vol.rolling(window).mean()

        for window in self.periods:
            out[f'volume_ratio_{window}'] = vol / out[f'volume_sma_{window}']

        # On-balance volume and its position relative to a 10-bar mean.
        obv = self._obv(df)
        out['obv'] = obv
        out['obv_sma_10'] = obv.rolling(10).mean()
        out['obv_trend'] = out['obv'] / out['obv_sma_10']

        out['vpt'] = self._vpt(df)
        out['mfi_14'] = self._mfi(df, 14)
        out['ad_line'] = self._ad_line(df)
        out['cmf_20'] = self._cmf(df, 20)

        # Cumulative VWAP and how far close sits from it, in percent.
        vwap = self._vwap(df)
        out['vwap'] = vwap
        out['vwap_deviation'] = (price - vwap) / vwap * 100

        out['volume_change'] = vol.pct_change() * 100
        window_20 = vol.rolling(20)
        out['volume_std_20'] = window_20.std() / window_20.mean()

        return out

    def _typical_price(self, df: pd.DataFrame) -> pd.Series:
        """Typical price: mean of high, low and close."""
        return (df['high'] + df['low'] + df['close']) / 3

    def _close_location(self, df: pd.DataFrame) -> pd.Series:
        """Close location value in [-1, 1]; undefined bars (high == low) become 0."""
        span = df['high'] - df['low']
        location = ((df['close'] - df['low']) - (df['high'] - df['close'])) / span
        return location.fillna(0)

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: running sum of volume signed by the close change."""
        signed_volume = np.sign(df['close'].diff()) * df['volume']
        return signed_volume.cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: running sum of volume times fractional close change."""
        price = df['close']
        fractional_change = price.diff() / price.shift(1)
        return (df['volume'] * fractional_change).cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index over ``period`` bars of typical-price money flow."""
        typical = self._typical_price(df)
        previous = typical.shift(1)
        flow = typical * df['volume']

        # Flow counts as inflow when typical price rose, outflow when it fell.
        inflow = flow.where(typical > previous, 0).rolling(period).sum()
        outflow = flow.where(typical < previous, 0).rolling(period).sum()

        return 100 - (100 / (1 + inflow / outflow))

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution Line: running sum of CLV-weighted volume."""
        weighted = self._close_location(df) * df['volume']
        return weighted.cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow: CLV-weighted volume over total volume per window."""
        vol = df['volume']
        weighted = self._close_location(df) * vol
        return weighted.rolling(period).sum() / vol.rolling(period).sum()

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Volume Weighted Average Price, cumulative from the first row."""
        vol = df['volume']
        traded_value = (self._typical_price(df) * vol).cumsum()
        return traded_value / vol.cumsum()

Feature Summary

Core Feature List (21 Core Features)

#	Feature	Category	Description
1	volatility_5	Volatility	5-period return std
2	volatility_10	Volatility	10-period return std
3	volatility_20	Volatility	20-period return std
4	atr_14	Volatility	14-period ATR (requires 14 in lookback_periods; the default [5, 10, 20, 50] yields atr_5/10/20/50)
5	bb_position_20	Volatility	Position in Bollinger Bands
6	roc_5	Momentum	5-period rate of change
7	roc_10	Momentum	10-period rate of change
8	rsi_14	Momentum	14-period RSI
9	macd	Momentum	MACD line
10	macd_histogram	Momentum	MACD histogram
11	stoch_k	Momentum	Stochastic %K
12	sma_5	Trend	5-period SMA
13	sma_20	Trend	20-period SMA
14	ema_10	Trend	10-period EMA
15	price_sma_ratio_20	Trend	Price vs SMA20
16	trend_slope_20	Trend	SMA20 slope
17	price_position_20	Trend	Position in range
18	volume_ratio_20	Volume	Volume vs avg
19	obv_trend	Volume	OBV trend
20	mfi_14	Volume	Money Flow Index
21	cmf_20	Volume	Chaikin Money Flow

Feature Normalization

# app/features/normalizer.py
from sklearn.preprocessing import StandardScaler, RobustScaler
import pandas as pd
import numpy as np
import joblib

class FeatureNormalizer:
    """
    Normalize features for model input.

    Wraps a scikit-learn scaler and tracks whether it has been fitted, so
    that transforming or persisting an unfitted scaler fails early.
    """

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' for StandardScaler, 'robust' for RobustScaler

        Raises:
            ValueError: If ``method`` is neither 'standard' nor 'robust'.
        """
        # Reject unknown names explicitly; silently falling back to
        # RobustScaler would hide typos such as 'standart'.
        if method not in ('standard', 'robust'):
            raise ValueError(
                f"Unknown normalization method {method!r}; "
                "expected 'standard' or 'robust'."
            )

        if method == 'standard':
            self.scaler = StandardScaler()
        else:
            self.scaler = RobustScaler()

        self.fitted = False

    def fit(self, X: pd.DataFrame):
        """Fit scaler on training data"""
        self.scaler.fit(X)
        self.fitted = True

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Transform features with the fitted scaler.

        Raises:
            ValueError: If the scaler has not been fitted or loaded yet.
        """
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        return self.scaler.transform(X)

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Fit and transform in one step"""
        self.fit(X)
        return self.transform(X)

    def save(self, path: str):
        """
        Save scaler to ``<path>/scaler.joblib``, creating ``path`` if needed.

        Raises:
            ValueError: If the scaler has not been fitted. Persisting it
                would let load() mark an unfitted scaler as fitted.
        """
        import os

        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")

        os.makedirs(path, exist_ok=True)
        joblib.dump(self.scaler, os.path.join(path, "scaler.joblib"))

    def load(self, path: str):
        """Load scaler from ``<path>/scaler.joblib`` and mark it as fitted."""
        import os

        self.scaler = joblib.load(os.path.join(path, "scaler.joblib"))
        self.fitted = True

Feature Selection

# app/features/selector.py
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import pandas as pd
import numpy as np

class FeatureSelector:
    """
    Select most important features.

    Scores every column with a univariate scoring function and keeps the
    ``n_features`` best ones.
    """

    def __init__(self, n_features: int = 15, method: str = 'mutual_info'):
        """
        Args:
            n_features: Maximum number of features to keep.
            method: 'mutual_info' scores with mutual_info_classif; any other
                value scores with f_classif.
        """
        self.n_features = n_features
        self.method = method
        self.selected_features = []
        # Populated by fit(); None marks the selector as not yet fitted.
        self.feature_scores = None

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """
        Fit selector and identify top features.

        Args:
            X: Feature frame; its column names identify the features.
            y: Class labels aligned with the rows of ``X``.
        """
        if self.method == 'mutual_info':
            score_func = mutual_info_classif
        else:
            score_func = f_classif

        # Never ask for more features than X provides. Depending on the
        # scikit-learn version, an oversized k raises or only warns.
        k = min(self.n_features, X.shape[1])

        selector = SelectKBest(score_func=score_func, k=k)
        selector.fit(X, y)

        # Get selected feature names
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()

        # Store scores for analysis
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Filter to selected features.

        Raises:
            ValueError: If called before fit(). Without this check an
                unfitted selector would silently return a frame with no
                columns.
        """
        if self.feature_scores is None:
            raise ValueError("Selector not fitted. Call fit() first.")
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """
        Return feature scores, sorted by score in descending order.

        Raises:
            ValueError: If called before fit().
        """
        if self.feature_scores is None:
            raise ValueError("Selector not fitted. Call fit() first.")
        return self.feature_scores

Referencias

Autor: Requirements-Analyst Fecha: 2025-12-05

21 KiB Raw Blame History