Changes include: - Updated architecture documentation - Enhanced module definitions (OQI-001 to OQI-008) - ML integration documentation updates - Trading strategies documentation - Orchestration and inventory updates - Docker configuration updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
21 KiB
21 KiB
| id | title | type | status | priority | epic | project | version | created_date | updated_date |
|---|---|---|---|---|---|---|---|---|---|
| ET-ML-003 | Feature Engineering | Technical Specification | Done | Alta | OQI-006 | trading-platform | 1.0.0 | 2025-12-05 | 2026-01-04 |
ET-ML-003: Feature Engineering
Metadata
| Campo | Valor |
|---|---|
| ID | ET-ML-003 |
| Épica | OQI-006 - Señales ML |
| Tipo | Especificación Técnica |
| Versión | 1.0.0 |
| Estado | Aprobado |
| Última actualización | 2026-01-04 |
Propósito
Especificar el proceso de Feature Engineering para los modelos de ML, incluyendo la definición de features, cálculo de indicadores técnicos, normalización y selección de variables.
Arquitectura de Features
Feature Builder
# app/features/builder.py
import pandas as pd
import numpy as np
from typing import List, Dict, Optional
from .volatility import VolatilityFeatures
from .momentum import MomentumFeatures
from .trend import TrendFeatures
from .volume import VolumeFeatures
class FeatureBuilder:
    """
    Constructs the feature matrix from OHLCV data.

    Combines the volatility, momentum, trend and volume feature groups
    into a single DataFrame and drops warm-up rows containing NaN.
    """

    def __init__(self, lookback_periods: Optional[List[int]] = None):
        """
        Args:
            lookback_periods: Rolling windows shared by all feature groups.
                Defaults to [5, 10, 20, 50].
        """
        self.lookback_periods = lookback_periods or [5, 10, 20, 50]
        # One calculator per feature group; all share the same windows.
        self.volatility = VolatilityFeatures(self.lookback_periods)
        self.momentum = MomentumFeatures(self.lookback_periods)
        self.trend = TrendFeatures(self.lookback_periods)
        self.volume = VolumeFeatures(self.lookback_periods)
        # Populated by build(); consumed by get_feature_importance().
        self.feature_names: List[str] = []

    def build(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the complete feature matrix from OHLCV data.

        Args:
            df: DataFrame with columns ['open', 'high', 'low', 'close', 'volume'].

        Returns:
            DataFrame with all calculated features; rows with NaN
            (warm-up of the longest lookback window) are dropped.
        """
        # Each group returns a DataFrame aligned on df.index.
        features = pd.concat([
            self.volatility.calculate(df),
            self.momentum.calculate(df),
            self.trend.calculate(df),
            self.volume.calculate(df),
        ], axis=1)
        # Record names BEFORE dropna so they match the model input columns.
        self.feature_names = features.columns.tolist()
        # Drop rows with NaN (due to lookback periods)
        return features.dropna()

    def get_feature_names(self) -> List[str]:
        """Return the list of feature names produced by the last build()."""
        return self.feature_names

    def get_feature_importance(self, model) -> pd.DataFrame:
        """
        Extract feature importance from a trained model.

        Args:
            model: Fitted estimator exposing ``feature_importances_``.

        Returns:
            DataFrame with 'feature' and 'importance' columns, sorted
            by importance descending.

        Raises:
            ValueError: If the model exposes a different number of
                importances than this builder produced features —
                previously the mismatch silently built a wrong table.
        """
        importance = model.feature_importances_
        if len(importance) != len(self.feature_names):
            raise ValueError(
                f"Model has {len(importance)} importances but builder "
                f"produced {len(self.feature_names)} features"
            )
        return pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance,
        }).sort_values('importance', ascending=False)
Feature Groups
1. Volatility Features
# app/features/volatility.py
import pandas as pd
import numpy as np
from typing import List
class VolatilityFeatures:
    """Volatility feature group: return dispersion, ATR, Bollinger Band
    statistics, intrabar range and overnight gap."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame of volatility features aligned on df.index."""
        out = pd.DataFrame(index=df.index)
        close = df['close']
        # Rolling standard deviation of percentage returns.
        for window in self.periods:
            out[f'volatility_{window}'] = self._std_returns(close, window)
        # Average True Range.
        for window in self.periods:
            out[f'atr_{window}'] = self._atr(df, window)
        # ATR normalised by price, in percent.
        for window in self.periods:
            out[f'atr_pct_{window}'] = out[f'atr_{window}'] / close * 100
        # Bollinger Band width and position (standard 20-period setting).
        for window in [20]:
            out[f'bb_width_{window}'] = self._bb_width(close, window)
            out[f'bb_position_{window}'] = self._bb_position(close, window)
        # Intrabar high-low range as % of close.
        out['hl_range'] = (df['high'] - df['low']) / close * 100
        # Overnight gap: open vs previous close, in percent.
        prev_close = close.shift(1)
        out['gap'] = (df['open'] - prev_close) / prev_close * 100
        return out

    def _std_returns(self, close: pd.Series, period: int) -> pd.Series:
        """Rolling std of percentage returns, in percent."""
        return close.pct_change().rolling(period).std() * 100

    def _atr(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average True Range over `period` bars."""
        prev_close = df['close'].shift(1)
        # True range = widest of the three candidate spans per bar.
        candidates = pd.concat(
            [
                df['high'] - df['low'],
                (df['high'] - prev_close).abs(),
                (df['low'] - prev_close).abs(),
            ],
            axis=1,
        )
        return candidates.max(axis=1).rolling(period).mean()

    def _bands(self, close: pd.Series, period: int, std_dev: float):
        """Return (middle, upper, lower) Bollinger bands."""
        middle = close.rolling(period).mean()
        offset = std_dev * close.rolling(period).std()
        return middle, middle + offset, middle - offset

    def _bb_width(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Band width as a percentage of the middle band."""
        middle, upper, lower = self._bands(close, period, std_dev)
        return (upper - lower) / middle * 100

    def _bb_position(self, close: pd.Series, period: int, std_dev: float = 2.0) -> pd.Series:
        """Position of close within the bands, scaled 0-100."""
        _, upper, lower = self._bands(close, period, std_dev)
        return (close - lower) / (upper - lower) * 100
2. Momentum Features
# app/features/momentum.py
import pandas as pd
import numpy as np
from typing import List
class MomentumFeatures:
    """Calculate momentum-based features (ROC, RSI, stochastics, MACD, ADX)."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all momentum features.

        Args:
            df: OHLCV DataFrame with 'open', 'high', 'low', 'close', 'volume'.

        Returns:
            DataFrame of momentum features aligned on df.index.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']
        # Rate of Change (ROC)
        for period in self.periods:
            features[f'roc_{period}'] = self._roc(close, period)
        # Momentum (absolute price change)
        for period in self.periods:
            features[f'momentum_{period}'] = self._momentum(close, period)
        # RSI over the standard short/medium/long windows
        for period in [7, 14, 21]:
            features[f'rsi_{period}'] = self._rsi(close, period)
        # Stochastic Oscillator (%K plus 3-period %D smoothing)
        features['stoch_k'] = self._stochastic_k(df, 14)
        features['stoch_d'] = features['stoch_k'].rolling(3).mean()
        # Williams %R
        features['williams_r'] = self._williams_r(df, 14)
        # CCI (Commodity Channel Index)
        features['cci_20'] = self._cci(df, 20)
        # MACD
        macd, signal, hist = self._macd(close)
        features['macd'] = macd
        features['macd_signal'] = signal
        features['macd_histogram'] = hist
        # ADX (Average Directional Index)
        features['adx'] = self._adx(df, 14)
        return features

    def _roc(self, close: pd.Series, period: int) -> pd.Series:
        """Rate of Change: percentage move over `period` bars."""
        return (close - close.shift(period)) / close.shift(period) * 100

    def _momentum(self, close: pd.Series, period: int) -> pd.Series:
        """Simple momentum: absolute price change over `period` bars."""
        return close - close.shift(period)

    def _rsi(self, close: pd.Series, period: int) -> pd.Series:
        """Relative Strength Index.

        Uses simple rolling means of gains/losses (Cutler's RSI) rather
        than Wilder's exponential smoothing.
        """
        delta = close.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)
        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()
        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    def _stochastic_k(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Stochastic %K: close position within the period's range, 0-100."""
        lowest = df['low'].rolling(period).min()
        highest = df['high'].rolling(period).max()
        return (df['close'] - lowest) / (highest - lowest) * 100

    def _williams_r(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Williams %R: like %K but scaled -100..0."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()
        return (highest - df['close']) / (highest - lowest) * -100

    def _cci(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Commodity Channel Index over the typical price."""
        tp = (df['high'] + df['low'] + df['close']) / 3
        sma = tp.rolling(period).mean()
        # Mean absolute deviation of the typical price within the window.
        mad = tp.rolling(period).apply(lambda x: np.abs(x - x.mean()).mean())
        return (tp - sma) / (0.015 * mad)

    def _macd(self, close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        """MACD line, signal line and histogram."""
        ema_fast = close.ewm(span=fast).mean()
        ema_slow = close.ewm(span=slow).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal).mean()
        histogram = macd_line - signal_line
        return macd_line, signal_line, histogram

    def _adx(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Average Directional Index.

        Fix: the standard +DM/-DM definition counts only the LARGER of the
        up-move and down-move on each bar (and only when positive). The
        previous version kept both moves whenever positive, inflating both
        DIs. Smoothing uses simple rolling means (a common simplification
        of Wilder's exponential smoothing).
        """
        high = df['high']
        low = df['low']
        close = df['close']
        # Directional movement: zero out the smaller / non-positive move.
        up_move = high.diff()
        down_move = -low.diff()
        plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
        minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)
        # True Range
        tr = pd.concat([
            high - low,
            (high - close.shift(1)).abs(),
            (low - close.shift(1)).abs(),
        ], axis=1).max(axis=1)
        # Smoothed averages -> directional indicators
        atr = tr.rolling(period).mean()
        plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
        minus_di = 100 * (minus_dm.rolling(period).mean() / atr)
        # ADX = smoothed DX
        dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
        return dx.rolling(period).mean()
3. Trend Features
# app/features/trend.py
import pandas as pd
import numpy as np
from typing import List
class TrendFeatures:
    """Calculate trend-based features (moving averages, slopes, Ichimoku)."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate all trend features.

        Fix: crossover-ratio and slope features are only emitted when the
        required windows are present in ``self.periods``. Previously they
        hard-coded sma_5/10/20/50 and ema_5/20 and raised KeyError for any
        non-default period list. Behaviour with the default [5, 10, 20, 50]
        is unchanged.
        """
        features = pd.DataFrame(index=df.index)
        close = df['close']
        available = set(self.periods)
        # Simple Moving Averages
        for period in self.periods:
            features[f'sma_{period}'] = close.rolling(period).mean()
        # Exponential Moving Averages
        for period in self.periods:
            features[f'ema_{period}'] = close.ewm(span=period).mean()
        # Price vs SMA ratio
        for period in self.periods:
            features[f'price_sma_ratio_{period}'] = close / features[f'sma_{period}']
        # SMA crossovers (only when both windows were computed)
        if {5, 20} <= available:
            features['sma_5_20_ratio'] = features['sma_5'] / features['sma_20']
        if {10, 50} <= available:
            features['sma_10_50_ratio'] = features['sma_10'] / features['sma_50']
        # EMA crossovers
        if {5, 20} <= available:
            features['ema_5_20_ratio'] = features['ema_5'] / features['ema_20']
        # Trend strength: slope of the SMA over the last 5 bars
        for period in [10, 20]:
            if period in available:
                features[f'trend_slope_{period}'] = self._slope(features[f'sma_{period}'], 5)
        # Distance from rolling high/low, as % of close
        features['dist_from_high_20'] = (
            (df['high'].rolling(20).max() - close) / close * 100
        )
        features['dist_from_low_20'] = (
            (close - df['low'].rolling(20).min()) / close * 100
        )
        # Price position within the rolling range
        features['price_position_20'] = self._price_position(df, 20)
        features['price_position_50'] = self._price_position(df, 50)
        # Ichimoku components. NOTE(review): the senkou spans are not
        # shifted forward 26 bars here, so they act as same-bar features
        # rather than the plotted cloud lines — confirm this is intended.
        tenkan, kijun, senkou_a, senkou_b = self._ichimoku(df)
        features['ichimoku_tenkan'] = tenkan
        features['ichimoku_kijun'] = kijun
        features['ichimoku_cloud_top'] = senkou_a
        features['ichimoku_cloud_bottom'] = senkou_b
        above = (close > senkou_a) & (close > senkou_b)
        features['above_cloud'] = above.astype(int)
        return features

    def _slope(self, series: pd.Series, period: int) -> pd.Series:
        """Rolling slope: linear-regression coefficient over `period` points."""
        def calc_slope(x):
            if len(x) < 2:
                return 0
            y = np.arange(len(x))
            return np.polyfit(y, x, 1)[0]
        return series.rolling(period).apply(calc_slope)

    def _price_position(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Close position within the period's high-low range, scaled 0-100."""
        highest = df['high'].rolling(period).max()
        lowest = df['low'].rolling(period).min()
        return (df['close'] - lowest) / (highest - lowest) * 100

    def _ichimoku(self, df: pd.DataFrame):
        """Ichimoku Cloud components: (tenkan, kijun, senkou_a, senkou_b)."""
        high = df['high']
        low = df['low']
        # Tenkan-sen (Conversion Line): midpoint of the 9-period range
        tenkan = (high.rolling(9).max() + low.rolling(9).min()) / 2
        # Kijun-sen (Base Line): midpoint of the 26-period range
        kijun = (high.rolling(26).max() + low.rolling(26).min()) / 2
        # Senkou Span A: average of tenkan and kijun
        senkou_a = (tenkan + kijun) / 2
        # Senkou Span B: midpoint of the 52-period range
        senkou_b = (high.rolling(52).max() + low.rolling(52).min()) / 2
        return tenkan, kijun, senkou_a, senkou_b
4. Volume Features
# app/features/volume.py
import pandas as pd
import numpy as np
from typing import List
class VolumeFeatures:
    """Volume feature group: volume averages, OBV, VPT, MFI, A/D line,
    Chaikin Money Flow and VWAP deviation."""

    def __init__(self, periods: List[int]):
        self.periods = periods

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame of volume features aligned on df.index."""
        out = pd.DataFrame(index=df.index)
        vol = df['volume']
        px = df['close']
        # Rolling volume averages.
        for window in self.periods:
            out[f'volume_sma_{window}'] = vol.rolling(window).mean()
        # Current volume relative to its rolling average.
        for window in self.periods:
            out[f'volume_ratio_{window}'] = vol / out[f'volume_sma_{window}']
        # On-Balance Volume plus a short trend measure.
        obv = self._obv(df)
        out['obv'] = obv
        out['obv_sma_10'] = obv.rolling(10).mean()
        out['obv_trend'] = out['obv'] / out['obv_sma_10']
        # Volume-Price Trend.
        out['vpt'] = self._vpt(df)
        # Money Flow Index.
        out['mfi_14'] = self._mfi(df, 14)
        # Accumulation/Distribution line.
        out['ad_line'] = self._ad_line(df)
        # Chaikin Money Flow.
        out['cmf_20'] = self._cmf(df, 20)
        # Deviation from cumulative VWAP, in percent.
        vwap = self._vwap(df)
        out['vwap'] = vwap
        out['vwap_deviation'] = (px - vwap) / vwap * 100
        # Volume dynamics: bar-to-bar change and dispersion.
        out['volume_change'] = vol.pct_change() * 100
        out['volume_std_20'] = vol.rolling(20).std() / vol.rolling(20).mean()
        return out

    def _obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume: cumulative volume signed by close direction."""
        signed_volume = np.sign(df['close'].diff()) * df['volume']
        return signed_volume.cumsum()

    def _vpt(self, df: pd.DataFrame) -> pd.Series:
        """Volume-Price Trend: cumulative volume-weighted return."""
        bar_return = df['close'].diff() / df['close'].shift(1)
        return (df['volume'] * bar_return).cumsum()

    def _mfi(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Money Flow Index over `period` bars, scaled 0-100."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        raw_flow = typical * df['volume']
        up_flow = raw_flow.where(typical > typical.shift(1), 0)
        down_flow = raw_flow.where(typical < typical.shift(1), 0)
        flow_ratio = up_flow.rolling(period).sum() / down_flow.rolling(period).sum()
        return 100 - (100 / (1 + flow_ratio))

    def _clv(self, df: pd.DataFrame) -> pd.Series:
        """Close Location Value in [-1, 1]; 0 where high == low."""
        numerator = (df['close'] - df['low']) - (df['high'] - df['close'])
        return (numerator / (df['high'] - df['low'])).fillna(0)

    def _ad_line(self, df: pd.DataFrame) -> pd.Series:
        """Accumulation/Distribution line: cumulative CLV-weighted volume."""
        return (self._clv(df) * df['volume']).cumsum()

    def _cmf(self, df: pd.DataFrame, period: int) -> pd.Series:
        """Chaikin Money Flow: rolling CLV-volume over rolling volume."""
        money_flow = self._clv(df) * df['volume']
        return money_flow.rolling(period).sum() / df['volume'].rolling(period).sum()

    def _vwap(self, df: pd.DataFrame) -> pd.Series:
        """Cumulative Volume Weighted Average Price."""
        typical = (df['high'] + df['low'] + df['close']) / 3
        return (typical * df['volume']).cumsum() / df['volume'].cumsum()
Feature Summary
Complete Feature List (21 Core Features)
| # | Feature | Category | Description |
|---|---|---|---|
| 1 | volatility_5 | Volatility | 5-period return std |
| 2 | volatility_10 | Volatility | 10-period return std |
| 3 | volatility_20 | Volatility | 20-period return std |
| 4 | atr_14 | Volatility | 14-period ATR |
| 5 | bb_position_20 | Volatility | Position in Bollinger Bands |
| 6 | roc_5 | Momentum | 5-period rate of change |
| 7 | roc_10 | Momentum | 10-period rate of change |
| 8 | rsi_14 | Momentum | 14-period RSI |
| 9 | macd | Momentum | MACD line |
| 10 | macd_histogram | Momentum | MACD histogram |
| 11 | stoch_k | Momentum | Stochastic %K |
| 12 | sma_5 | Trend | 5-period SMA |
| 13 | sma_20 | Trend | 20-period SMA |
| 14 | ema_10 | Trend | 10-period EMA |
| 15 | price_sma_ratio_20 | Trend | Price vs SMA20 |
| 16 | trend_slope_20 | Trend | SMA20 slope |
| 17 | price_position_20 | Trend | Position in range |
| 18 | volume_ratio_20 | Volume | Volume vs avg |
| 19 | obv_trend | Volume | OBV trend |
| 20 | mfi_14 | Volume | Money Flow Index |
| 21 | cmf_20 | Volume | Chaikin Money Flow |
Feature Normalization
# app/features/normalizer.py
from sklearn.preprocessing import StandardScaler, RobustScaler
import pandas as pd
import numpy as np
import joblib
class FeatureNormalizer:
    """Scale features before feeding them to a model.

    Wraps a scikit-learn scaler and tracks whether it has been fitted.
    """

    def __init__(self, method: str = 'robust'):
        """
        Args:
            method: 'standard' selects StandardScaler; any other value
                falls back to RobustScaler (outlier-resistant).
        """
        self.scaler = StandardScaler() if method == 'standard' else RobustScaler()
        self.fitted = False

    def fit(self, X: pd.DataFrame):
        """Fit the underlying scaler on training data."""
        self.scaler.fit(X)
        self.fitted = True

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Apply the fitted scaler to X; raises if fit() was never called."""
        if not self.fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        return self.scaler.transform(X)

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Convenience wrapper: fit on X, then transform it."""
        self.fit(X)
        return self.transform(X)

    def save(self, path: str):
        """Persist the scaler as <path>/scaler.joblib."""
        joblib.dump(self.scaler, f"{path}/scaler.joblib")

    def load(self, path: str):
        """Restore a scaler previously written by save(); marks it fitted."""
        self.scaler = joblib.load(f"{path}/scaler.joblib")
        self.fitted = True
Feature Selection
# app/features/selector.py
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import pandas as pd
import numpy as np
class FeatureSelector:
    """Select the most informative features via univariate scoring."""

    def __init__(self, n_features: int = 15, method: str = 'mutual_info'):
        """
        Args:
            n_features: Number of features to keep.
            method: 'mutual_info' for mutual information; any other value
                uses the ANOVA F-test.
        """
        self.n_features = n_features
        self.method = method
        self.selected_features = []
        # Populated by fit(); None until then.
        self.feature_scores = None

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """Fit the selector and record the top-scoring feature names."""
        if self.method == 'mutual_info':
            score_func = mutual_info_classif
        else:
            score_func = f_classif
        selector = SelectKBest(score_func=score_func, k=self.n_features)
        selector.fit(X, y)
        # Names of the k selected columns.
        mask = selector.get_support()
        self.selected_features = X.columns[mask].tolist()
        # Keep all per-feature scores for offline analysis.
        self.feature_scores = pd.DataFrame({
            'feature': X.columns,
            'score': selector.scores_
        }).sort_values('score', ascending=False)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Filter X down to the selected features.

        Raises:
            ValueError: If fit() has not been called. Previously this
                silently returned an empty DataFrame.
        """
        if not self.selected_features:
            raise ValueError("Selector not fitted. Call fit() first.")
        return X[self.selected_features]

    def get_scores(self) -> pd.DataFrame:
        """Return the per-feature scores computed by fit().

        Raises:
            ValueError: If fit() has not been called. Previously this
                raised an opaque AttributeError.
        """
        if self.feature_scores is None:
            raise ValueError("Selector not fitted. Call fit() first.")
        return self.feature_scores
Referencias
Autor: Requirements-Analyst Fecha: 2025-12-05