trading-platform-ml-engine-v2/src/models/strategies/vbp/feature_engineering.py
Adrian Flores Cortes d015e2b0f3 feat(ml-engine): Phase 4 - PostgreSQL migration, dynamic OOS, data pipeline
- Fix database.py: Add DatabaseConnection alias for backward compat
- Fix train_symbol_timeframe_models.py: Use PostgreSQLConnection + native queries
- Fix run_oos_backtest.py: Fix broken import + add dynamic OOS support
- Update data_splitter.py: split_dynamic_oos() method (from previous session)
- Update validation_oos.yaml: Dynamic OOS config + all 6 symbols enabled
- Create ingest_ohlcv_polygon.py: Standalone Polygon→PostgreSQL ingestion script
- Fix .gitignore: /data/ instead of data/ to not ignore src/data/
- Add untracked src/ modules: backtesting, data, llm, models (attention/metamodel/strategies)
- Add aiohttp, sqlalchemy, psycopg2-binary to requirements.txt

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 04:39:05 -06:00

756 lines
26 KiB
Python

#!/usr/bin/env python3
"""
VBP Feature Engineering - Volatility Breakout Predictor Features
=================================================================
Comprehensive feature engineering for volatility-based breakout prediction.
Features include:
- ATR (Average True Range) at multiple periods
- Bollinger Bands (width, squeeze detection)
- Keltner Channels (for squeeze indicator)
- Compression Score (range contraction detection)
- Historical Volatility metrics
- Breakout labeling for training
Key Concept:
Breakouts often occur after periods of low volatility (squeeze/compression).
This module extracts features that capture volatility compression and
expansion patterns to predict upcoming breakouts.
Author: ML-Specialist (NEXUS v4.0)
Version: 1.0.0
Created: 2026-01-25
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, field
from loguru import logger
@dataclass
class VBPFeatureConfig:
    """
    Tunable parameters for VBP feature engineering.

    Attributes:
        atr_periods: Look-back lengths for which an ATR is produced.
        bb_period: Rolling window of the Bollinger Band mean/std.
        bb_std: Number of standard deviations between the middle band
            and each outer Bollinger Band.
        keltner_period: Span used for the Keltner EMA and its ATR.
        keltner_mult: ATR multiple between the Keltner middle line and
            each outer channel.
        compression_lookback: Window over which the multi-bar range is
            compared with its own history.
        hv_windows: Rolling windows for historical volatility.
        breakout_atr_mult: A forward move must exceed this many ATRs to
            be labelled a breakout.
        forward_periods: Number of future bars inspected when labelling.
        min_periods_ratio: Fraction of a window that must be populated
            before a rolling/EWM statistic is emitted.
    """

    # --- ATR ---------------------------------------------------------
    atr_periods: List[int] = field(default_factory=lambda: [5, 10, 20, 50])

    # --- Bollinger Bands ---------------------------------------------
    bb_period: int = 20
    bb_std: float = 2.0

    # --- Keltner Channels --------------------------------------------
    keltner_period: int = 20
    keltner_mult: float = 1.5

    # --- Range compression -------------------------------------------
    compression_lookback: int = 50

    # --- Historical volatility ---------------------------------------
    hv_windows: List[int] = field(default_factory=lambda: [10, 20, 50])

    # --- Breakout labelling ------------------------------------------
    breakout_atr_mult: float = 2.0
    forward_periods: int = 5

    # --- Warm-up handling for rolling statistics ---------------------
    min_periods_ratio: float = 0.5
class VBPFeatureEngineer:
    """
    Feature engineering for Volatility Breakout Prediction.

    Extracts volatility-based features designed to predict breakouts:
    - ATR-based volatility at multiple timeframes
    - Bollinger Band squeeze indicators
    - Keltner Channel squeeze (BB inside KC)
    - Range compression scores
    - Historical volatility metrics

    Input frames must carry high/low/close columns, either capitalised
    (``High``) or lower-case (``high``).

    Usage:
        engineer = VBPFeatureEngineer(VBPFeatureConfig())
        features = engineer.compute_all_features(df)
        labels = engineer.label_breakouts(df)
    """

    def __init__(self, config: Optional["VBPFeatureConfig"] = None):
        """
        Initialize VBP Feature Engineer.

        Args:
            config: Feature engineering configuration. A default
                ``VBPFeatureConfig`` is created when omitted.
        """
        self.config = config or VBPFeatureConfig()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _get_price_columns(self, df: pd.DataFrame) -> Tuple[str, str, str, str]:
        """
        Resolve the OHLC column names used by ``df``.

        For each of open/high/low/close the capitalised spelling is
        returned when that column exists, otherwise the lower-case
        spelling is returned (without checking that it exists).

        Args:
            df: DataFrame with OHLC data

        Returns:
            Tuple of (open, high, low, close) column names
        """
        def resolve(lower_name: str) -> str:
            capitalised = lower_name.capitalize()
            return capitalised if capitalised in df.columns else lower_name

        return resolve('open'), resolve('high'), resolve('low'), resolve('close')

    @staticmethod
    def _consecutive_run_length(flags: pd.Series) -> pd.Series:
        """
        Count consecutive non-zero entries, resetting to 0 at every zero.

        ``flags`` is expected to hold only 0/1 values. Every zero starts a
        new group, so a cumulative sum inside each group yields the
        running length of the current streak of ones. The dtype of
        ``flags`` is preserved.

        Args:
            flags: 0/1 indicator series

        Returns:
            Series (same index) with the running streak length
        """
        # A plain array is used as the grouping key so that duplicate
        # index labels cannot interfere with alignment.
        run_id = (flags == 0).cumsum().to_numpy()
        return flags.groupby(run_id).cumsum()

    @staticmethod
    def _rolling_percentile(series: pd.Series, window: int) -> pd.Series:
        """
        Fraction of the previous ``window`` values strictly below the current one.

        The first ``window`` positions are NaN. Comparisons involving NaN
        count as "not below", so a NaN current value yields 0.0 once enough
        history exists.

        Args:
            series: Input values
            window: Number of preceding observations to compare against

        Returns:
            Series (same index) of fractions in [0, 1]
        """
        values = series.to_numpy(dtype=float)
        ranks = np.full(values.shape[0], np.nan)
        if window > 0 and values.shape[0] > window:
            # Each row holds `window` historical values followed by the
            # current value; the boolean intermediate is (n - window) x window.
            windows = np.lib.stride_tricks.sliding_window_view(values, window + 1)
            below = windows[:, :-1] < windows[:, -1:]
            ranks[window:] = below.sum(axis=1) / window
        return pd.Series(ranks, index=series.index)

    @staticmethod
    def _finalize_features(features: pd.DataFrame) -> pd.DataFrame:
        """
        Make a feature frame free of NaN and infinite values.

        Infinite values (e.g. from a percentage change against zero) are
        treated as missing. Gaps are then forward-filled, remaining
        leading gaps back-filled, and anything still missing set to 0.

        NOTE(review): the back-fill copies the first valid value into the
        warm-up rows, i.e. those rows see information from later bars.
        Callers that care about strict causality should drop the warm-up
        period.

        Args:
            features: Raw feature frame

        Returns:
            Cleaned copy of ``features``
        """
        without_inf = features.replace([np.inf, -np.inf], np.nan)
        # DataFrame.ffill/bfill replace the deprecated fillna(method=...).
        return without_inf.ffill().bfill().fillna(0)

    # ------------------------------------------------------------------
    # Feature groups
    # ------------------------------------------------------------------
    def compute_true_range(self, df: pd.DataFrame) -> pd.Series:
        """
        Compute True Range.

        TR = max(H-L, |H-C_prev|, |L-C_prev|)

        On the first row there is no previous close, so the row-wise max
        (which skips NaN) reduces to H-L.

        Args:
            df: DataFrame with OHLC data

        Returns:
            Series with True Range values
        """
        _, high_col, low_col, close_col = self._get_price_columns(df)
        high = df[high_col]
        low = df[low_col]
        prev_close = df[close_col].shift(1)

        candidates = pd.concat(
            [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
            axis=1,
        )
        return candidates.max(axis=1)

    def compute_atr(
        self,
        df: pd.DataFrame,
        periods: Optional[List[int]] = None
    ) -> pd.DataFrame:
        """
        Compute ATR (Average True Range) at multiple periods.

        ATR is an exponentially smoothed True Range. Multiple periods
        capture short-term vs long-term volatility.

        Produces, per period ``p``: ``atr_p`` and ``atr_p_pct`` (ATR as a
        percentage of close); plus ``atr_ratio_short_long`` (first period
        over last period, when at least two periods are given) and
        ``atr_change`` (5-bar percentage change of ``atr_20``, or of the
        first period's ATR when 20 is not among the periods).

        Args:
            df: DataFrame with OHLC data
            periods: List of ATR periods (default: config.atr_periods)

        Returns:
            DataFrame with ATR columns for each period
        """
        periods = periods or self.config.atr_periods
        _, _, _, close_col = self._get_price_columns(df)
        close = df[close_col]
        true_range = self.compute_true_range(df)

        atr_features = pd.DataFrame(index=df.index)
        for period in periods:
            min_periods = max(1, int(period * self.config.min_periods_ratio))
            # Standard EMA with span=period (alpha = 2 / (period + 1)).
            # Note this is not Wilder's smoothing, which uses alpha = 1 / period.
            atr = true_range.ewm(
                span=period, min_periods=min_periods, adjust=False
            ).mean()
            atr_features[f'atr_{period}'] = atr
            # The small epsilon guards against a zero close.
            atr_features[f'atr_{period}_pct'] = atr / (close + 1e-8) * 100

        # ATR ratio (short vs long term): first listed period over last.
        if len(periods) >= 2:
            short_atr = atr_features[f'atr_{periods[0]}']
            long_atr = atr_features[f'atr_{periods[-1]}']
            atr_features['atr_ratio_short_long'] = short_atr / (long_atr + 1e-8)

        # ATR expansion/contraction over the last 5 bars.
        if 'atr_20' in atr_features:
            reference_atr = atr_features['atr_20']
        else:
            reference_atr = atr_features[f'atr_{periods[0]}']
        atr_features['atr_change'] = reference_atr.pct_change(5)

        return atr_features

    def compute_bollinger_bands(
        self,
        df: pd.DataFrame,
        period: Optional[int] = None,
        std: Optional[float] = None
    ) -> pd.DataFrame:
        """
        Compute Bollinger Bands and derived features.

        BB = SMA +/- (std_mult * rolling_std)

        Key features:
        - bb_upper, bb_lower, bb_middle: Band boundaries and centre
        - bb_width: Band width normalized by the middle band
        - bb_squeeze: Width relative to its 50-bar mean (low = compressed)
        - bb_position / bb_percent_b: Price position within the bands
        - bb_dist_upper / bb_dist_lower: Distance to each band over close

        Args:
            df: DataFrame with OHLC data
            period: SMA period (default: config.bb_period)
            std: Standard deviation multiplier (default: config.bb_std)

        Returns:
            DataFrame with Bollinger Band features
        """
        period = period or self.config.bb_period
        std_mult = std or self.config.bb_std
        _, _, _, close_col = self._get_price_columns(df)
        close = df[close_col]
        min_periods = max(1, int(period * self.config.min_periods_ratio))

        rolling_close = close.rolling(window=period, min_periods=min_periods)
        sma = rolling_close.mean()
        band_offset = std_mult * rolling_close.std()

        bb_features = pd.DataFrame(index=df.index)
        upper = sma + band_offset
        lower = sma - band_offset
        bb_features['bb_upper'] = upper
        bb_features['bb_lower'] = lower
        bb_features['bb_middle'] = sma

        # Band width (normalized by the middle band).
        band_span = upper - lower
        bb_features['bb_width'] = band_span / (sma + 1e-8)

        # Width relative to its own recent history (squeeze indicator);
        # low values indicate compression.
        width_ma = bb_features['bb_width'].rolling(window=50, min_periods=10).mean()
        bb_features['bb_squeeze'] = bb_features['bb_width'] / (width_ma + 1e-8)

        # Price position within bands (0 = at lower, 1 = at upper).
        # NOTE(review): this is not clipped, so it is numerically identical
        # to bb_percent_b below; kept as-is so existing feature values do
        # not change.
        relative_position = (close - lower) / (band_span + 1e-8)
        bb_features['bb_position'] = relative_position

        # Distance from bands (useful for breakout detection).
        bb_features['bb_dist_upper'] = (upper - close) / (close + 1e-8)
        bb_features['bb_dist_lower'] = (close - lower) / (close + 1e-8)

        # Percent B (standardized position, can be < 0 or > 1).
        bb_features['bb_percent_b'] = relative_position

        return bb_features

    def compute_keltner_channels(
        self,
        df: pd.DataFrame,
        period: Optional[int] = None,
        mult: Optional[float] = None
    ) -> pd.DataFrame:
        """
        Compute Keltner Channels.

        KC = EMA +/- (mult * ATR)

        Used with Bollinger Bands to detect squeeze:
        when BB is inside KC, volatility is compressed (squeeze).

        Args:
            df: DataFrame with OHLC data
            period: EMA/ATR span (default: config.keltner_period)
            mult: ATR multiplier (default: config.keltner_mult)

        Returns:
            DataFrame with kc_upper, kc_lower, kc_middle, kc_width and
            kc_position
        """
        period = period or self.config.keltner_period
        mult = mult or self.config.keltner_mult
        _, _, _, close_col = self._get_price_columns(df)
        close = df[close_col]
        min_periods = max(1, int(period * self.config.min_periods_ratio))
        ewm_kwargs = dict(span=period, min_periods=min_periods, adjust=False)

        # Centre line and channel half-width share the same smoothing.
        ema = close.ewm(**ewm_kwargs).mean()
        atr = self.compute_true_range(df).ewm(**ewm_kwargs).mean()

        upper = ema + (mult * atr)
        lower = ema - (mult * atr)
        channel_span = upper - lower

        kc_features = pd.DataFrame(index=df.index)
        kc_features['kc_upper'] = upper
        kc_features['kc_lower'] = lower
        kc_features['kc_middle'] = ema
        # Channel width normalized by the centre line.
        kc_features['kc_width'] = channel_span / (ema + 1e-8)
        # Price position within channels (0 = lower, 1 = upper; unclipped).
        kc_features['kc_position'] = (close - lower) / (channel_span + 1e-8)

        return kc_features

    def compute_squeeze_indicator(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute Squeeze Indicator (TTM Squeeze concept).

        Squeeze occurs when Bollinger Bands are inside Keltner Channels.
        This indicates low volatility and potential upcoming breakout.

        Features:
        - squeeze_on: Binary indicator (1 = squeeze active)
        - squeeze_strength: 1 - BB width / KC width, clipped to [0, 1]
        - squeeze_duration: Consecutive bars in squeeze
        - squeeze_release: 1 on the first bar after a squeeze ends
        - squeeze_momentum: (close - 12-bar mean close) / close
        - squeeze_momentum_direction: sign of that momentum

        Args:
            df: DataFrame with OHLC data

        Returns:
            DataFrame with squeeze indicator features
        """
        bb = self.compute_bollinger_bands(df)
        kc = self.compute_keltner_channels(df)

        squeeze_features = pd.DataFrame(index=df.index)

        # Squeeze is ON when BB sits strictly inside KC on both sides.
        # Warm-up rows (NaN bands) compare False and are therefore OFF.
        squeeze_on = (
            (bb['bb_lower'] > kc['kc_lower']) &
            (bb['bb_upper'] < kc['kc_upper'])
        ).astype(float)
        squeeze_features['squeeze_on'] = squeeze_on

        # Squeeze strength: how far BB is inside KC (higher = tighter).
        bb_span = bb['bb_upper'] - bb['bb_lower']
        kc_span = kc['kc_upper'] - kc['kc_lower']
        squeeze_features['squeeze_strength'] = (
            1 - (bb_span / (kc_span + 1e-8))
        ).clip(0, 1)

        # Squeeze duration (consecutive bars in squeeze), vectorised.
        squeeze_features['squeeze_duration'] = self._consecutive_run_length(squeeze_on)

        # Squeeze release (transition from squeeze to no squeeze).
        squeeze_features['squeeze_release'] = (
            (squeeze_on.shift(1) == 1) & (squeeze_on == 0)
        ).astype(float)

        # Momentum during squeeze: close relative to its 12-bar mean.
        _, _, _, close_col = self._get_price_columns(df)
        close = df[close_col]
        momentum = close - close.rolling(12).mean()
        squeeze_features['squeeze_momentum'] = momentum / (close + 1e-8)

        # Momentum direction (positive = bullish breakout likely).
        squeeze_features['squeeze_momentum_direction'] = np.sign(momentum)

        return squeeze_features

    def compute_compression_score(
        self,
        df: pd.DataFrame,
        lookback: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Compute range compression score.

        Measures how compressed the current 5-bar range is versus its own
        history. Low score = high compression = potential breakout setup.

        Formula: rolling_range / max(rolling_range over lookback)

        Features:
        - compression_score: 5-bar range over its lookback maximum
        - compression_normalized: min-max scaled 5-bar range
        - range_percentile: share of the previous ``lookback`` 5-bar
          ranges strictly below the current one
        - range_pct: single-bar range as a percentage of close
        - range_change_5 / range_change_10: percentage change of the
          5-bar range over 5 and 10 bars
        - inside_bar_count: consecutive bars whose single-bar range is
          smaller than the previous bar's range

        Args:
            df: DataFrame with OHLC data
            lookback: Lookback period for historical range
                (default: config.compression_lookback)

        Returns:
            DataFrame with compression score features
        """
        lookback = lookback or self.config.compression_lookback
        _, high_col, low_col, close_col = self._get_price_columns(df)
        high = df[high_col]
        low = df[low_col]
        close = df[close_col]

        compression_features = pd.DataFrame(index=df.index)

        # Single-bar range and 5-bar (multi-bar) range.
        bar_range = high - low
        rolling_range = high.rolling(5).max() - low.rolling(5).min()

        # Historical extremes of the multi-bar range.
        max_range = rolling_range.rolling(lookback).max()
        min_range = rolling_range.rolling(lookback).min()

        # Compression score: current / historical max (lower = more compressed).
        compression_features['compression_score'] = (
            rolling_range / (max_range + 1e-8)
        )

        # Normalized compression (0-1 scale within the lookback window).
        compression_features['compression_normalized'] = (
            (rolling_range - min_range) / (max_range - min_range + 1e-8)
        )

        # Range percentile (how the current range ranks historically).
        compression_features['range_percentile'] = self._rolling_percentile(
            rolling_range, lookback
        )

        # Range as percentage of price.
        compression_features['range_pct'] = bar_range / (close + 1e-8) * 100

        # Range change (expansion/contraction trend).
        compression_features['range_change_5'] = rolling_range.pct_change(5)
        compression_features['range_change_10'] = rolling_range.pct_change(10)

        # Consecutive bars with a shrinking single-bar range, vectorised.
        shrinking = (bar_range < bar_range.shift(1)).astype(int)
        compression_features['inside_bar_count'] = self._consecutive_run_length(shrinking)

        return compression_features

    def compute_historical_volatility(
        self,
        df: pd.DataFrame,
        windows: Optional[List[int]] = None
    ) -> pd.DataFrame:
        """
        Compute historical volatility metrics.

        Uses log returns of the close. Per window ``w`` this yields
        ``hv_w`` (rolling std of log returns) and ``hv_w_annual`` (scaled
        by sqrt(252 * 24), i.e. assuming hourly bars). Also produced:
        ``hv_ratio`` (first window over last), ``hv_percentile`` (min-max
        position of the reference HV within its last 100 values),
        ``hv_regime`` (0/1/2 bucket of that percentile), ``hv_trend`` and
        ``parkinson_vol`` (20-bar Parkinson estimator from high/low).

        Args:
            df: DataFrame with OHLC data
            windows: List of volatility windows (default: config.hv_windows)

        Returns:
            DataFrame with historical volatility features
        """
        windows = windows or self.config.hv_windows
        _, high_col, low_col, close_col = self._get_price_columns(df)
        close = df[close_col]

        log_returns = np.log(close / close.shift(1))

        hv_features = pd.DataFrame(index=df.index)
        # Scaling factor hard-codes hourly bars (24 bars x 252 days).
        annualization = np.sqrt(252 * 24)
        for window in windows:
            min_periods = max(1, int(window * self.config.min_periods_ratio))
            # Per-bar standard deviation of log returns (not annualized).
            hv = log_returns.rolling(window=window, min_periods=min_periods).std()
            hv_features[f'hv_{window}'] = hv
            hv_features[f'hv_{window}_annual'] = hv * annualization

        # Volatility ratio: first listed window over last.
        if len(windows) >= 2:
            hv_features['hv_ratio'] = (
                hv_features[f'hv_{windows[0]}'] /
                (hv_features[f'hv_{windows[-1]}'] + 1e-8)
            )

        # Reference series: hv_20 when available, else the first window.
        if 'hv_20' in hv_features:
            reference_hv = hv_features['hv_20']
        else:
            reference_hv = hv_features[f'hv_{windows[0]}']

        # Min-max position of current volatility within its last 100 values.
        hv_high = reference_hv.rolling(100).max()
        hv_low = reference_hv.rolling(100).min()
        hv_features['hv_percentile'] = (
            (reference_hv - hv_low) / (hv_high - hv_low + 1e-8)
        )

        # Volatility regime (0 = low, 1 = medium, 2 = high).
        hv_features['hv_regime'] = pd.cut(
            hv_features['hv_percentile'],
            bins=[-np.inf, 0.33, 0.66, np.inf],
            labels=[0, 1, 2]
        ).astype(float)

        # Volatility trend: deviation from its own 10-bar mean.
        hv_features['hv_trend'] = reference_hv - reference_hv.rolling(10).mean()

        # Parkinson volatility (using high-low range).
        log_hl = np.log(df[high_col] / df[low_col])
        parkinson = log_hl.pow(2) / (4 * np.log(2))
        hv_features['parkinson_vol'] = parkinson.rolling(20).mean().pow(0.5)

        return hv_features

    # ------------------------------------------------------------------
    # Labels
    # ------------------------------------------------------------------
    def label_breakouts(
        self,
        df: pd.DataFrame,
        atr_mult: Optional[float] = None,
        forward_periods: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Label breakouts for training.

        A breakout is a move from the current close exceeding
        ``atr_mult * ATR`` within the next ``forward_periods`` bars, where
        ATR here is the 20-bar simple mean of True Range.

        Labels:
        - 0: No breakout
        - 1: Bullish breakout (upward)
        - 2: Bearish breakout (downward)

        When both directions trigger, the larger move wins (ties go
        bullish). Direction, magnitude (in ATRs), signed magnitude and the
        forward close-to-close return are also returned.

        Rows where the forward window or the ATR is not yet available
        (the last ``forward_periods`` rows and the ATR warm-up) cannot be
        evaluated: they receive label 0 and a NaN ``breakout_magnitude``.
        Filter on the magnitude being non-NaN to exclude them from
        training.

        Args:
            df: DataFrame with OHLC data
            atr_mult: ATR multiplier threshold
                (default: config.breakout_atr_mult)
            forward_periods: Forward look period
                (default: config.forward_periods)

        Returns:
            DataFrame with breakout labels and targets
        """
        atr_mult = atr_mult or self.config.breakout_atr_mult
        forward_periods = forward_periods or self.config.forward_periods
        _, high_col, low_col, close_col = self._get_price_columns(df)
        close = df[close_col]
        high = df[high_col]
        low = df[low_col]

        # ATR used for the threshold (simple 20-bar mean of True Range).
        atr = self.compute_true_range(df).rolling(20).mean()

        labels = pd.DataFrame(index=df.index)

        # Extremes over bars t+1 .. t+forward_periods: a trailing rolling
        # window shifted back by its own length.
        forward_high = high.rolling(forward_periods).max().shift(-forward_periods)
        forward_low = low.rolling(forward_periods).min().shift(-forward_periods)

        # Forward moves measured from the current close.
        upward_move = forward_high - close
        downward_move = close - forward_low

        threshold = atr_mult * atr
        bullish_breakout = upward_move > threshold
        bearish_breakout = downward_move > threshold

        # Labels: 0 = no breakout, 1 = bullish, 2 = bearish.
        labels['breakout_label'] = 0
        labels.loc[bullish_breakout, 'breakout_label'] = 1
        labels.loc[bearish_breakout, 'breakout_label'] = 2

        # Both directions triggered: keep the stronger one.
        both_mask = bullish_breakout & bearish_breakout
        labels.loc[both_mask & (upward_move >= downward_move), 'breakout_label'] = 1
        labels.loc[both_mask & (downward_move > upward_move), 'breakout_label'] = 2

        # Binary breakout (any direction).
        labels['breakout_binary'] = (labels['breakout_label'] > 0).astype(int)

        # Direction: 1 = bullish, -1 = bearish, 0 = no breakout.
        labels['breakout_direction'] = 0
        labels.loc[labels['breakout_label'] == 1, 'breakout_direction'] = 1
        labels.loc[labels['breakout_label'] == 2, 'breakout_direction'] = -1

        # Magnitude in ATR units (for regression).
        labels['breakout_magnitude'] = (
            np.maximum(upward_move, downward_move) / (atr + 1e-8)
        )

        # Signed magnitude (positive for bullish, negative for bearish).
        labels['breakout_signed_magnitude'] = (
            labels['breakout_magnitude'] * labels['breakout_direction']
        )

        # Forward close-to-close return (additional target).
        labels['forward_return'] = close.shift(-forward_periods) / close - 1

        # Log breakout statistics; guard the percentage against an empty frame.
        total = len(labels)
        denom = total if total else 1
        no_breakout = (labels['breakout_label'] == 0).sum()
        bullish = (labels['breakout_label'] == 1).sum()
        bearish = (labels['breakout_label'] == 2).sum()

        logger.info("Breakout labeling complete:")
        logger.info(f" No breakout: {no_breakout} ({no_breakout/denom*100:.1f}%)")
        logger.info(f" Bullish: {bullish} ({bullish/denom*100:.1f}%)")
        logger.info(f" Bearish: {bearish} ({bearish/denom*100:.1f}%)")

        return labels

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------
    def compute_all_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute all VBP features.

        Combines every feature group into a single DataFrame, drops
        duplicated column names and removes NaN/inf values (forward fill,
        then backward fill, then 0).

        Args:
            df: DataFrame with OHLC data

        Returns:
            DataFrame with all VBP features, free of NaN and inf
        """
        logger.info(f"Computing VBP features for {len(df)} samples...")

        feature_groups = [
            self.compute_atr(df),
            self.compute_bollinger_bands(df),
            self.compute_keltner_channels(df),
            self.compute_squeeze_indicator(df),
            self.compute_compression_score(df),
            self.compute_historical_volatility(df),
        ]
        all_features = pd.concat(feature_groups, axis=1)

        # Remove duplicate columns (if any), keeping the first occurrence.
        all_features = all_features.loc[:, ~all_features.columns.duplicated()]

        all_features = self._finalize_features(all_features)

        logger.info(f"Computed {len(all_features.columns)} features")
        return all_features

    def get_feature_names(self) -> List[str]:
        """
        Get list of all feature names.

        The names are obtained by running the full pipeline on a small
        synthetic frame. A locally seeded generator is used so that the
        caller's global NumPy random state is left untouched.

        Returns:
            Column names produced by ``compute_all_features``
        """
        n_rows = 200
        rng = np.random.default_rng(0)

        def random_walk(start: float) -> np.ndarray:
            return rng.standard_normal(n_rows).cumsum() + start

        dummy_df = pd.DataFrame({
            'open': random_walk(100),
            'high': random_walk(101),
            'low': random_walk(99),
            'close': random_walk(100),
        })
        # Enforce OHLC consistency: high/low must bracket open and close.
        dummy_df['high'] = dummy_df[['open', 'high', 'close']].max(axis=1)
        dummy_df['low'] = dummy_df[['open', 'low', 'close']].min(axis=1)

        return self.compute_all_features(dummy_df).columns.tolist()
if __name__ == "__main__":
    # Smoke test: run every feature group on synthetic hourly data and
    # print a short summary of what each one produced.
    print("Testing VBP Feature Engineering")
    print("=" * 60)

    # Create sample OHLCV data with some volatility patterns.
    np.random.seed(42)
    n = 500
    # 'h' is the current hourly alias; the upper-case 'H' is deprecated
    # in recent pandas releases.
    dates = pd.date_range('2025-01-01', periods=n, freq='h')

    # Simulate price with varying volatility: the last 29 bars of every
    # 100-bar cycle are high-vol, the rest low-vol (compression).
    volatility = np.where(
        (np.arange(n) % 100 > 70),  # High vol periods
        0.02,
        0.005  # Low vol (compression) periods
    )
    returns = np.random.randn(n) * volatility
    price = 2000 * (1 + returns).cumprod()

    df = pd.DataFrame({
        'open': price,
        'high': price * (1 + np.abs(np.random.randn(n)) * volatility),
        'low': price * (1 - np.abs(np.random.randn(n)) * volatility),
        'close': price * (1 + np.random.randn(n) * volatility * 0.5),
        'volume': np.random.randint(100, 1000, n)
    }, index=dates)

    # Ensure OHLC consistency: high/low must bracket open and close.
    df['high'] = df[['open', 'high', 'close']].max(axis=1)
    df['low'] = df[['open', 'low', 'close']].min(axis=1)

    # Initialize engineer
    config = VBPFeatureConfig()
    engineer = VBPFeatureEngineer(config)

    # Test individual feature groups
    print("\n1. Testing ATR features...")
    atr_features = engineer.compute_atr(df)
    print(f" ATR features: {len(atr_features.columns)}")
    print(f" Columns: {list(atr_features.columns)}")

    print("\n2. Testing Bollinger Bands features...")
    bb_features = engineer.compute_bollinger_bands(df)
    print(f" BB features: {len(bb_features.columns)}")
    print(f" Columns: {list(bb_features.columns)}")

    print("\n3. Testing Keltner Channels features...")
    kc_features = engineer.compute_keltner_channels(df)
    print(f" KC features: {len(kc_features.columns)}")

    print("\n4. Testing Squeeze Indicator...")
    squeeze_features = engineer.compute_squeeze_indicator(df)
    print(f" Squeeze features: {len(squeeze_features.columns)}")
    print(f" Squeeze ON periods: {squeeze_features['squeeze_on'].sum()}")

    print("\n5. Testing Compression Score...")
    compression_features = engineer.compute_compression_score(df)
    print(f" Compression features: {len(compression_features.columns)}")
    print(f" Avg compression score: {compression_features['compression_score'].mean():.4f}")

    print("\n6. Testing Historical Volatility...")
    hv_features = engineer.compute_historical_volatility(df)
    print(f" HV features: {len(hv_features.columns)}")

    print("\n7. Testing Breakout Labels...")
    labels = engineer.label_breakouts(df)
    print(f" Label columns: {list(labels.columns)}")

    print("\n8. Testing All Features Combined...")
    all_features = engineer.compute_all_features(df)
    print(f" Total features: {len(all_features.columns)}")
    print(f" Sample shape: {all_features.shape}")
    print(f" NaN count: {all_features.isna().sum().sum()}")

    print("\n" + "=" * 60)
    print("All feature engineering tests passed!")