Changes include: - Updated architecture documentation - Enhanced module definitions (OQI-001 to OQI-008) - ML integration documentation updates - Trading strategies documentation - Orchestration and inventory updates - Docker configuration updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
694 lines
21 KiB
Markdown
694 lines
21 KiB
Markdown
---
|
|
id: "ET-ML-003"
|
|
title: "Feature Engineering"
|
|
type: "Technical Specification"
|
|
status: "Done"
|
|
priority: "Alta"
|
|
epic: "OQI-006"
|
|
project: "trading-platform"
|
|
version: "1.0.0"
|
|
created_date: "2025-12-05"
|
|
updated_date: "2026-01-04"
|
|
---
|
|
|
|
# ET-ML-003: Feature Engineering
|
|
|
|
## Metadata
|
|
|
|
| Campo | Valor |
|
|
|-------|-------|
|
|
| **ID** | ET-ML-003 |
|
|
| **Épica** | OQI-006 - Señales ML |
|
|
| **Tipo** | Especificación Técnica |
|
|
| **Versión** | 1.0.0 |
|
|
| **Estado** | Aprobado |
|
|
| **Última actualización** | 2026-01-04 |
|
|
|
|
---
|
|
|
|
## Propósito
|
|
|
|
Especificar el proceso de Feature Engineering para los modelos de ML, incluyendo la definición de features, cálculo de indicadores técnicos, normalización y selección de variables.
|
|
|
|
---
|
|
|
|
## Arquitectura de Features
|
|
|
|
### Feature Builder
|
|
|
|
```python
|
|
# app/features/builder.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import List, Dict, Optional
|
|
from .volatility import VolatilityFeatures
|
|
from .momentum import MomentumFeatures
|
|
from .trend import TrendFeatures
|
|
from .volume import VolumeFeatures
|
|
|
|
class FeatureBuilder:
    """
    Constructs the feature matrix from OHLCV data.

    Combines the volatility, momentum, trend and volume feature groups into
    a single DataFrame suitable for model training or inference.
    """

    def __init__(self, lookback_periods: Optional[List[int]] = None):
        """
        Args:
            lookback_periods: Rolling-window lengths shared by all feature
                groups. Defaults to [5, 10, 20, 50].
        """
        self.lookback_periods = lookback_periods or [5, 10, 20, 50]

        # One calculator per feature family; all share the same lookbacks.
        self.volatility = VolatilityFeatures(self.lookback_periods)
        self.momentum = MomentumFeatures(self.lookback_periods)
        self.trend = TrendFeatures(self.lookback_periods)
        self.volume = VolumeFeatures(self.lookback_periods)

        # Populated by build(); empty until then.
        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the complete feature matrix from OHLCV data.

        Args:
            df: DataFrame with columns ['open', 'high', 'low', 'close', 'volume']

        Returns:
            DataFrame with all calculated features. Leading rows that are
            NaN because of the rolling lookbacks are dropped.
        """
        features = pd.DataFrame(index=df.index)

        # Each group returns a frame indexed like df; concatenate column-wise.
        features = pd.concat([
            features,
            self.volatility.calculate(df),
            self.momentum.calculate(df),
            self.trend.calculate(df),
            self.volume.calculate(df)
        ], axis=1)

        # Record the names before dropping rows so the list stays complete.
        self.feature_names = features.columns.tolist()

        # Rolling windows leave NaN in the first (period - 1) rows.
        features = features.dropna()

        return features

    def get_feature_names(self) -> List[str]:
        """Return the feature names produced by the last build() call."""
        return self.feature_names

    def get_feature_importance(self, model) -> pd.DataFrame:
        """
        Extract feature importance from a trained model.

        Args:
            model: Fitted estimator exposing `feature_importances_`
                (e.g. tree ensembles), trained on the exact columns
                produced by build().

        Returns:
            DataFrame with 'feature' and 'importance' columns, sorted by
            importance descending.

        Raises:
            ValueError: If build() has not been called yet, or the model's
                importance vector length does not match the feature count.
        """
        if not self.feature_names:
            raise ValueError("No features built yet. Call build() first.")

        importance = model.feature_importances_
        if len(importance) != len(self.feature_names):
            raise ValueError(
                "Model importances length does not match feature count."
            )

        return pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)
|
|
```
|
|
|
|
---
|
|
|
|
## Feature Groups
|
|
|
|
### 1. Volatility Features
|
|
|
|
```python
|
|
# app/features/volatility.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import List
|
|
|
|
class VolatilityFeatures:
    """Volatility features: return std-dev, ATR, Bollinger bands, ranges, gaps."""

    def __init__(self, periods: List[int]):
        # Rolling-window lengths for the per-period features.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame (indexed like df) with every volatility feature."""
        out = pd.DataFrame(index=df.index)
        close = df['close']

        # Rolling standard deviation of percentage returns.
        for p in self.periods:
            out[f'volatility_{p}'] = self._return_std(close, p)

        # Average True Range, absolute...
        for p in self.periods:
            out[f'atr_{p}'] = self._true_range_mean(df, p)

        # ...and relative to price, in percent.
        for p in self.periods:
            out[f'atr_pct_{p}'] = out[f'atr_{p}'] / close * 100

        # Bollinger band width and position, fixed 20-period window.
        bb_period = 20
        out[f'bb_width_{bb_period}'] = self._bb_width(close, bb_period)
        out[f'bb_position_{bb_period}'] = self._bb_position(close, bb_period)

        # Intrabar high-low range relative to close, in percent.
        out['hl_range'] = (df['high'] - df['low']) / close * 100

        # Overnight gap: open vs previous close, in percent.
        prev_close = close.shift(1)
        out['gap'] = (df['open'] - prev_close) / prev_close * 100

        return out

    def _return_std(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling std-dev of percentage returns, expressed in percent."""
        pct = close.pct_change()
        return pct.rolling(period).std() * 100

    def _true_range_mean(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average True Range: rolling mean of the true range."""
        prev_close = df['close'].shift(1)

        # True range = max of intrabar range and the two gap-adjusted ranges.
        candidates = pd.concat([
            df['high'] - df['low'],
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ], axis=1)

        return candidates.max(axis=1).rolling(period).mean()

    def _bands(self, close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger band components."""
        middle = close.rolling(period).mean()
        offset = std_dev * close.rolling(period).std()
        return middle, middle + offset, middle - offset

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Bollinger band width as a percentage of the middle band."""
        middle, upper, lower = self._bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Close position inside the Bollinger bands, scaled 0-100."""
        _, upper, lower = self._bands(close, period, std_dev)
        return (close - lower) / (upper - lower) * 100
|
|
```
|
|
|
|
### 2. Momentum Features
|
|
|
|
```python
|
|
# app/features/momentum.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import List
|
|
|
|
class MomentumFeatures:
    """Momentum features: ROC, RSI, stochastic, Williams %R, CCI, MACD, ADX."""

    def __init__(self, periods: List[int]):
        # Rolling-window lengths for the per-period ROC/momentum features.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame (indexed like df) with every momentum feature."""
        out = pd.DataFrame(index=df.index)
        close = df['close']

        # Rate of Change, in percent.
        for p in self.periods:
            out[f'roc_{p}'] = self._roc(close, p)

        # Absolute price momentum.
        for p in self.periods:
            out[f'momentum_{p}'] = self._momentum(close, p)

        # RSI at three conventional windows.
        for p in (7, 14, 21):
            out[f'rsi_{p}'] = self._rsi(close, p)

        # Stochastic oscillator: raw %K plus its 3-period smoothing (%D).
        out['stoch_k'] = self._stochastic_k(df, 14)
        out['stoch_d'] = out['stoch_k'].rolling(3).mean()

        # Williams %R (scaled -100..0).
        out['williams_r'] = self._williams_r(df, 14)

        # Commodity Channel Index.
        out['cci_20'] = self._cci(df, 20)

        # MACD line, signal line and histogram.
        macd_line, signal_line, histogram = self._macd(close)
        out['macd'] = macd_line
        out['macd_signal'] = signal_line
        out['macd_histogram'] = histogram

        # Trend-strength index.
        out['adx'] = self._adx(df, 14)

        return out

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percentage move over `period` bars."""
        base = close.shift(period)
        return (close - base) / base * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Absolute price change over `period` bars."""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """Relative Strength Index using simple rolling averages."""
        delta = close.diff()

        # Split each move into a gain and a loss (both non-negative).
        gains = delta.where(delta > 0, 0)
        losses = -delta.where(delta < 0, 0)

        mean_gain = gains.rolling(period).mean()
        mean_loss = losses.rolling(period).mean()

        strength = mean_gain / mean_loss
        return 100 - (100 / (1 + strength))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position inside the rolling range, 0-100."""
        lo = df['low'].rolling(period).min()
        hi = df['high'].rolling(period).max()

        return (df['close'] - lo) / (hi - lo) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R: inverted stochastic, scaled -100..0."""
        hi = df['high'].rolling(period).max()
        lo = df['low'].rolling(period).min()

        return (hi - df['close']) / (hi - lo) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index based on the typical price."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        center = typical.rolling(period).mean()

        def mean_abs_dev(window):
            # Mean absolute deviation of the window from its own mean.
            return np.abs(window - window.mean()).mean()

        mad = typical.rolling(period).apply(mean_abs_dev)

        return (typical - center) / (0.015 * mad)

    def _macd(self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        """MACD: return (macd_line, signal_line, histogram)."""
        fast_ema = close.ewm(span=fast).mean()
        slow_ema = close.ewm(span=slow).mean()

        line = fast_ema - slow_ema
        sig = line.ewm(span=signal).mean()

        return line, sig, line - sig

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average Directional Index (simplified).

        NOTE(review): +DM and -DM are clipped to zero independently here, so
        both can be non-zero on the same bar; standard Wilder ADX zeroes the
        smaller of the two and uses Wilder smoothing — confirm this
        simplification is intended.
        """
        high = df['high']
        low = df['low']
        close = df['close']

        # Raw directional moves; negatives clipped to zero (NaN preserved).
        plus_dm = high.diff()
        minus_dm = -low.diff()
        plus_dm = plus_dm.mask(plus_dm < 0, 0)
        minus_dm = minus_dm.mask(minus_dm < 0, 0)

        # True Range.
        prev_close = close.shift(1)
        true_range = pd.concat([
            high - low,
            (high - prev_close).abs(),
            (low - prev_close).abs(),
        ], axis=1).max(axis=1)

        # Directional indicators, normalized by ATR.
        atr = true_range.rolling(period).mean()
        plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
        minus_di = 100 * (minus_dm.rolling(period).mean() / atr)

        # DX and its smoothing (ADX).
        dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
        return dx.rolling(period).mean()
|
|
```
|
|
|
|
### 3. Trend Features
|
|
|
|
```python
|
|
# app/features/trend.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import List
|
|
|
|
class TrendFeatures:
    """Trend features: moving averages, crossovers, slopes, range position, Ichimoku."""

    def __init__(self, periods: List[int]):
        """
        Args:
            periods: Rolling-window lengths for the per-period SMA/EMA
                features (the crossover/slope features use fixed windows).
        """
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all trend features.

        The crossover and slope features always use the fixed windows
        (5, 10, 20, 50), computed directly from `close`, so the method
        works for any configured `periods` list (previously it raised
        KeyError unless 5/10/20/50 were all configured). Values are
        unchanged for the default configuration.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']

        # Simple Moving Averages for the configured windows.
        for period in self.periods:
            features[f'sma_{period}'] = close.rolling(period).mean()

        # Exponential Moving Averages for the configured windows.
        for period in self.periods:
            features[f'ema_{period}'] = close.ewm(span=period).mean()

        # Price vs SMA ratio per configured window.
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']

        # Fixed-window averages for the crossover/slope features, computed
        # from `close` so they do not depend on the configured periods.
        sma = {p: close.rolling(p).mean() for p in (5, 10, 20, 50)}
        ema = {p: close.ewm(span=p).mean() for p in (5, 20)}

        # SMA crossovers
        features['sma_5_20_ratio'] = sma[5] / sma[20]
        features['sma_10_50_ratio'] = sma[10] / sma[50]

        # EMA crossover
        features['ema_5_20_ratio'] = ema[5] / ema[20]

        # Trend strength: slope of the SMA over the last 5 observations.
        for period in [10, 20]:
            features[f'trend_slope_{period}'] = self._slope(sma[period], 5)

        # Distance from rolling extremes, in percent of price.
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )

        # Price position within the rolling high-low range (0-100).
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)

        # Ichimoku components.
        # NOTE(review): the Senkou spans are not shifted 26 periods ahead as
        # in standard Ichimoku charting — confirm the unshifted variant is
        # intended for feature use.
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        # Binary flag: close above both cloud edges.
        above = (close > senkou_a) & (close > senkou_b)
        features['above_cloud'] = above.astype(int)

        return features

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """Rolling slope: linear-regression coefficient over `period` points."""
        def calc_slope(x):
            if len(x) < 2:
                return 0
            y = np.arange(len(x))
            # First polyfit coefficient of a degree-1 fit is the slope.
            return np.polyfit(y, x, 1)[0]

        return series.rolling(period).apply(calc_slope)

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Close position within the period's high-low range (0-100)."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()

        return (df['close'] - lowest) / (highest - lowest) * 100

    def _ichimoku(self, df: pd.DataFrame):
        """Return (tenkan, kijun, senkou_a, senkou_b) Ichimoku components."""
        high = df['high']
        low = df['low']

        # Tenkan-sen (Conversion Line): (9-period high + 9-period low) / 2
        tenkan = (high.rolling(9).max() + low.rolling(9).min()) / 2

        # Kijun-sen (Base Line): (26-period high + 26-period low) / 2
        kijun = (high.rolling(26).max() + low.rolling(26).min()) / 2

        # Senkou Span A (Leading Span A): (Tenkan + Kijun) / 2
        senkou_a = (tenkan + kijun) / 2

        # Senkou Span B (Leading Span B): (52-period high + 52-period low) / 2
        senkou_b = (high.rolling(52).max() + low.rolling(52).min()) / 2

        return tenkan, kijun, senkou_a, senkou_b
|
|
```
|
|
|
|
### 4. Volume Features
|
|
|
|
```python
|
|
# app/features/volume.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import List
|
|
|
|
class VolumeFeatures:
    """Volume features: moving averages, OBV, VPT, MFI, A/D line, CMF, VWAP."""

    def __init__(self, periods: List[int]):
        # Rolling-window lengths for the per-period volume features.
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame (indexed like df) with every volume feature."""
        out = pd.DataFrame(index=df.index)
        vol = df['volume']
        close = df['close']

        # Rolling average volume per configured window.
        for p in self.periods:
            out[f'volume_sma_{p}'] = vol.rolling(p).mean()

        # Current volume relative to its rolling average.
        for p in self.periods:
            out[f'volume_ratio_{p}'] = vol / out[f'volume_sma_{p}']

        # On-Balance Volume, plus a smoothed trend ratio.
        out['obv'] = self._obv(df)
        out['obv_sma_10'] = out['obv'].rolling(10).mean()
        out['obv_trend'] = out['obv'] / out['obv_sma_10']

        # Volume-Price Trend.
        out['vpt'] = self._vpt(df)

        # Money Flow Index.
        out['mfi_14'] = self._mfi(df, 14)

        # Accumulation/Distribution line.
        out['ad_line'] = self._ad_line(df)

        # Chaikin Money Flow.
        out['cmf_20'] = self._cmf(df, 20)

        # Cumulative VWAP and the close's deviation from it, in percent.
        out['vwap'] = self._vwap(df)
        out['vwap_deviation'] = (close - out['vwap']) / out['vwap'] * 100

        # Raw volume dynamics.
        out['volume_change'] = vol.pct_change() * 100
        out['volume_std_20'] = vol.rolling(20).std() / vol.rolling(20).mean()

        return out

    def _clv(self, df: pd.DataFrame) -> pd.Series:
        """Close Location Value, with 0/0 bars (high == low) mapped to 0."""
        spread = df['high'] - df['low']
        raw = ((df['close'] - df['low']) - (df['high'] - df['close'])) / spread
        return raw.fillna(0)

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: cumulative volume signed by the price change."""
        sign = np.sign(df['close'].diff())
        return (sign * df['volume']).cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: cumulative return-weighted volume."""
        close = df['close']
        step = df['volume'] * (close.diff() / close.shift(1))
        return step.cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index, scaled 0-100."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        flow = typical * df['volume']
        prev_typical = typical.shift(1)

        # Money flow split by whether the typical price rose or fell.
        inflow = flow.where(typical > prev_typical, 0)
        outflow = flow.where(typical < prev_typical, 0)

        inflow_sum = inflow.rolling(period).sum()
        outflow_sum = outflow.rolling(period).sum()

        return 100 - (100 / (1 + inflow_sum / outflow_sum))

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution line: cumulative CLV-weighted volume."""
        return (self._clv(df) * df['volume']).cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow over `period` bars."""
        weighted = self._clv(df) * df['volume']
        return weighted.rolling(period).sum() / df['volume'].rolling(period).sum()

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Cumulative Volume-Weighted Average Price."""
        typical = (df['high'] + df['low'] + df['close']) / 3

        cum_value = (typical * df['volume']).cumsum()
        cum_volume = df['volume'].cumsum()

        return cum_value / cum_volume
|
|
```
|
|
|
|
---
|
|
|
|
## Feature Summary
|
|
|
|
### Complete Feature List (21 Core Features)
|
|
|
|
| # | Feature | Category | Description |
|
|
|---|---------|----------|-------------|
|
|
| 1 | volatility_5 | Volatility | 5-period return std |
|
|
| 2 | volatility_10 | Volatility | 10-period return std |
|
|
| 3 | volatility_20 | Volatility | 20-period return std |
|
|
| 4 | atr_20 | Volatility | 20-period ATR |
|
|
| 5 | bb_position_20 | Volatility | Position in Bollinger Bands |
|
|
| 6 | roc_5 | Momentum | 5-period rate of change |
|
|
| 7 | roc_10 | Momentum | 10-period rate of change |
|
|
| 8 | rsi_14 | Momentum | 14-period RSI |
|
|
| 9 | macd | Momentum | MACD line |
|
|
| 10 | macd_histogram | Momentum | MACD histogram |
|
|
| 11 | stoch_k | Momentum | Stochastic %K |
|
|
| 12 | sma_5 | Trend | 5-period SMA |
|
|
| 13 | sma_20 | Trend | 20-period SMA |
|
|
| 14 | ema_10 | Trend | 10-period EMA |
|
|
| 15 | price_sma_ratio_20 | Trend | Price vs SMA20 |
|
|
| 16 | trend_slope_20 | Trend | SMA20 slope |
|
|
| 17 | price_position_20 | Trend | Position in range |
|
|
| 18 | volume_ratio_20 | Volume | Volume vs avg |
|
|
| 19 | obv_trend | Volume | OBV trend |
|
|
| 20 | mfi_14 | Volume | Money Flow Index |
|
|
| 21 | cmf_20 | Volume | Chaikin Money Flow |
|
|
|
|
---
|
|
|
|
## Feature Normalization
|
|
|
|
```python
|
|
# app/features/normalizer.py
|
|
from sklearn.preprocessing import StandardScaler, RobustScaler
|
|
import pandas as pd
|
|
import numpy as np
|
|
import joblib
|
|
|
|
class FeatureNormalizer:
    """Scale feature matrices before feeding them to a model."""

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' selects StandardScaler; any other value
                (including the default 'robust') selects RobustScaler.
        """
        self.scaler = StandardScaler() if method == 'standard' else RobustScaler()
        self.fitted = False

    def fit(self, X: pd.DataFrame):
        """Learn the scaling parameters from the training data."""
        self.scaler.fit(X)
        self.fitted = True

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Scale `X` using the previously fitted parameters."""
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        return self.scaler.transform(X)

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Convenience wrapper: fit on `X`, then return the scaled values."""
        self.fit(X)
        return self.transform(X)

    def save(self, path: str):
        """Persist the fitted scaler to `path`/scaler.joblib."""
        joblib.dump(self.scaler, f"{path}/scaler.joblib")

    def load(self, path: str):
        """Restore a scaler previously written by save() and mark it fitted."""
        self.scaler = joblib.load(f"{path}/scaler.joblib")
        self.fitted = True
|
|
```
|
|
|
|
---
|
|
|
|
## Feature Selection
|
|
|
|
```python
|
|
# app/features/selector.py
|
|
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
class FeatureSelector:
    """Select the most predictive features via univariate scoring."""

    def __init__(self, n_features: int = 15, method: str = 'mutual_info'):
        """
        Args:
            n_features: Number of top-scoring features to keep.
            method: 'mutual_info' for mutual information; any other value
                falls back to the ANOVA F-test.
        """
        self.n_features = n_features
        self.method = method
        self.selected_features = []
        # Populated by fit(); None until then so misuse is detected explicitly
        # (previously get_scores() before fit() raised AttributeError).
        self.feature_scores = None

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """Fit the selector and record the top `n_features` feature names."""
        if self.method == 'mutual_info':
            score_func = mutual_info_classif
        else:
            score_func = f_classif

        selector = SelectKBest(score_func=score_func, k=self.n_features)
        selector.fit(X, y)

        # Names of the columns the selector kept.
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()

        # Full score table, best first, for offline analysis.
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Filter `X` down to the features chosen by fit().

        Raises:
            ValueError: If fit() has not been called yet (previously this
                silently returned an empty DataFrame).
        """
        if not self.selected_features:
            raise ValueError("Selector not fitted. Call fit() first.")
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """Return the per-feature score table computed by fit().

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.feature_scores is None:
            raise ValueError("Selector not fitted. Call fit() first.")
        return self.feature_scores
|
|
```
|
|
|
|
---
|
|
|
|
## Referencias
|
|
|
|
- [ET-ML-001: Arquitectura](./ET-ML-001-arquitectura.md)
|
|
- [ET-ML-002: Modelos XGBoost](./ET-ML-002-modelos.md)
|
|
- [Pandas Documentation](https://pandas.pydata.org/)
|
|
- [TA-Lib](https://github.com/mrjbq7/ta-lib)
|
|
|
|
---
|
|
|
|
**Autor:** Requirements-Analyst
|
|
**Fecha:** 2025-12-05
|