Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
510 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Range-Based Backtest
|
|
====================
|
|
Uses RangePredictorV2 predictions directly for adaptive TP/SL.
|
|
|
|
Strategy:
|
|
- Predict high_delta and low_delta for each bar
|
|
- Direction: If predicted_high > predicted_low * factor -> Long
|
|
- TP: Set at fraction of predicted favorable range
|
|
- SL: Set at multiple of predicted adverse range
|
|
|
|
Author: ML-Specialist (NEXUS v4.0)
|
|
Date: 2026-01-04
|
|
"""
|
|
|
|
import sys
|
|
sys.path.insert(0, 'src')
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import yaml
|
|
import json
|
|
from loguru import logger
|
|
import argparse
|
|
import joblib
|
|
|
|
from data.database import MySQLConnection
|
|
from data.features import FeatureEngineer
|
|
from training.data_splitter import TemporalDataSplitter
|
|
|
|
|
|
def load_range_predictor(model_path: str):
    """Load a trained RangePredictorV2 ensemble from disk.

    Scans ``model_path`` for ``*.joblib`` files: the file whose stem is
    ``metadata`` is loaded as the metadata dict; every other file is
    loaded as an individual XGBoost model keyed by its file stem.

    Args:
        model_path: Directory containing the serialized models.

    Returns:
        Tuple ``(models, metadata)`` where ``models`` maps model name to
        a fitted estimator and ``metadata`` is the (possibly empty)
        training-metadata dict.
    """
    # NOTE(review): this import is otherwise unused here; presumably kept
    # so the class module is importable before unpickling — confirm and
    # drop if joblib resolves it on its own.
    from models.range_predictor_v2 import RangePredictorV2

    models = {}
    metadata = {}

    for model_file in Path(model_path).glob("*.joblib"):
        name = model_file.stem
        if name == 'metadata':
            metadata = joblib.load(model_file)
            # Plain string: the original used an f-string with no placeholders.
            logger.info("Loaded metadata")
        else:
            models[name] = joblib.load(model_file)
            logger.info(f"Loaded model: {name}")

    return models, metadata
|
|
|
|
|
|
def prepare_features(df: pd.DataFrame, feature_cols: list = None) -> pd.DataFrame:
    """Build the model feature set for *df*, mirroring training-time features.

    Runs the FeatureEngineer pipeline (price, volume, time, rolling
    features), then backfills OBV/VPT and session flags when the
    pipeline did not produce them. When *feature_cols* is given, any
    column the model expects but the pipeline did not create is
    zero-filled so downstream indexing cannot fail.

    Args:
        df: Raw OHLCV frame.
        feature_cols: Optional list of required feature names.

    Returns:
        Feature frame with NaN rows dropped.
    """
    engineer = FeatureEngineer()

    out = df.copy()
    out = engineer.create_price_features(out)
    out = engineer.create_volume_features(out)
    out = engineer.create_time_features(out)
    out = engineer.create_rolling_features(
        out,
        columns=['close', 'volume', 'high', 'low'],
        windows=[5, 10, 20]
    )

    # On-balance volume / volume-price trend, only if the engineer did
    # not already emit them.
    if 'obv' not in out.columns:
        out['obv'] = (np.sign(out['close'].diff()) * out['volume']).cumsum()

    if 'vpt' not in out.columns:
        out['vpt'] = (out['close'].pct_change() * out['volume']).cumsum()

    # Trading-session flags. Assumes the index is a DatetimeIndex and
    # the session hour boundaries are in the index's timezone — TODO confirm.
    if 'is_london' not in out.columns:
        hour = out.index.hour
        out['is_london'] = ((hour >= 8) & (hour < 16)).astype(int)
        out['is_newyork'] = ((hour >= 13) & (hour < 21)).astype(int)
        out['is_tokyo'] = ((hour >= 0) & (hour < 8)).astype(int)

    # Zero-fill any expected feature the pipeline failed to produce.
    if feature_cols:
        for col in feature_cols:
            if col not in out.columns:
                out[col] = 0
                logger.warning(f"Missing feature {col}, filled with 0")

    return out.dropna()
|
|
|
|
|
|
def get_feature_columns(df: pd.DataFrame) -> list:
    """Return usable feature column names from *df*.

    Excludes the raw OHLCV/vwap columns, every ``target_*`` column, and
    any column whose dtype is not float64/float32/int64, preserving the
    frame's column order.
    """
    blocked = {'open', 'high', 'low', 'close', 'volume', 'vwap'}
    blocked.update(c for c in df.columns if c.startswith('target_'))

    numeric_dtypes = ('float64', 'float32', 'int64')
    return [c for c in df.columns
            if c not in blocked and df[c].dtype in numeric_dtypes]
|
|
|
|
|
|
def predict_ranges(models: dict, X: np.ndarray) -> dict:
    """Run every range/direction model over the feature matrix.

    Only models whose name contains ``'high'``, ``'low'`` or
    ``'direction'`` are invoked; any other entry in *models* is skipped.

    Args:
        models: Mapping of model name -> fitted estimator exposing ``predict``.
        X: Feature matrix passed straight through to each model.

    Returns:
        Mapping of model name -> prediction array for the selected models.
    """
    predictions = {}

    for name, model in models.items():
        # The original branched identically for each keyword; a single
        # membership test performs the same selection without duplication.
        if any(tag in name for tag in ('high', 'low', 'direction')):
            predictions[name] = model.predict(X)

    return predictions
|
|
|
|
|
|
def simulate_trade(
    entry_price: float,
    tp_price: float,
    sl_price: float,
    direction: str,
    future_highs: np.ndarray,
    future_lows: np.ndarray,
    max_bars: int = 50
) -> tuple:
    """Walk forward bar by bar and resolve a trade against TP/SL levels.

    Within a bar the SL is always checked before the TP (conservative:
    when both levels fall inside one bar's range, the loss is assumed
    to fire first).

    Args:
        entry_price: Fill price at signal time; also used as the exit
            price when no future bars are available at all.
        tp_price: Take-profit level.
        sl_price: Stop-loss level.
        direction: ``'long'`` or ``'short'`` (anything not ``'long'`` is
            treated as short).
        future_highs: Bar highs after entry, oldest first.
        future_lows: Bar lows after entry, oldest first.
        max_bars: Maximum number of bars to hold before timing out.

    Returns:
        ``(result, exit_price, bars_held)`` where ``result`` is ``'tp'``,
        ``'sl'`` or ``'timeout'``. On timeout the exit price is the
        midpoint of the last simulated bar's high/low.
    """
    n_bars = min(len(future_highs), max_bars)
    for i in range(n_bars):
        high = future_highs[i]
        low = future_lows[i]

        if direction == 'long':
            # Check SL first (conservative)
            if low <= sl_price:
                return 'sl', sl_price, i + 1
            # Check TP
            if high >= tp_price:
                return 'tp', tp_price, i + 1
        else:  # short
            # Check SL first
            if high >= sl_price:
                return 'sl', sl_price, i + 1
            # Check TP
            if low <= tp_price:
                return 'tp', tp_price, i + 1

    # No future data at all: nothing to simulate against — exit flat.
    # (The original indexed future_highs[-1] and raised IndexError here.)
    if n_bars == 0:
        return 'timeout', entry_price, 0

    # Timeout: report the bars actually simulated. The original always
    # reported max_bars even when fewer future bars were available.
    return 'timeout', (future_highs[n_bars - 1] + future_lows[n_bars - 1]) / 2, n_bars
|
|
|
|
|
|
def run_range_based_backtest(
    symbol: str = "XAUUSD",
    timeframe: str = "15m",
    horizon: str = "scalping",
    tp_factor: float = 0.4,       # TP at 40% of predicted range
    sl_factor: float = 2.0,       # SL at 200% of opposite range
    min_range_pct: float = 0.0001,  # Minimum 0.01% range to trade
    direction_bias: float = 1.3,  # Require 30% higher favorable range
    signal_every_n: int = 4       # Only trade every N bars
):
    """
    Run backtest using range predictions for TP/SL.

    Loads the serialized range-predictor models for (symbol, timeframe),
    predicts high/low deltas over the out-of-sample split, derives a
    direction plus adaptive TP/SL per signal bar, simulates each trade
    with simulate_trade(), prints a summary, and writes a JSON report
    under reports/range_backtest/.

    Returns:
        dict with 'config', 'metrics' and 'trades' keys, or None when
        the model directory is missing, no models match *horizon*, or
        no trades were executed.
    """
    logger.info("=" * 60)
    logger.info("RANGE-BASED BACKTEST")
    logger.info(f"Symbol: {symbol}")
    logger.info(f"TP Factor: {tp_factor}, SL Factor: {sl_factor}")
    logger.info("=" * 60)

    # Load model (path layout fixed by the training scripts).
    model_path = f"models/ml_first/{symbol}/range_predictor/{timeframe}"
    if not Path(model_path).exists():
        logger.error(f"Model not found: {model_path}")
        return None

    models, metadata = load_range_predictor(model_path)
    logger.info(f"Loaded {len(models)} models")

    # Get expected feature columns from metadata. The feature names are
    # recovered from the first model's feature-importance dict; assumes
    # all models were trained on the same feature set — TODO confirm.
    fi = metadata.get('feature_importance', {})
    if fi:
        first_key = list(fi.keys())[0]
        expected_features = list(fi[first_key].keys())
        logger.info(f"Model expects {len(expected_features)} features")
    else:
        expected_features = None

    # Load data
    db = MySQLConnection('config/database.yaml')
    df_raw = db.get_ticker_data(symbol, limit=100000)
    logger.info(f"Loaded {len(df_raw)} records")

    # Split data - use OOS (test) partition only, to avoid lookahead.
    splitter = TemporalDataSplitter()
    split = splitter.split_temporal(df_raw)
    df_test = split.test_data
    logger.info(f"Using OOS data: {len(df_test)} records ({df_test.index.min()} to {df_test.index.max()})")

    # Prepare features
    df = prepare_features(df_test, expected_features)

    # Use expected features in exact order (column order must match the
    # order the models were trained with).
    if expected_features:
        feature_cols = expected_features
    else:
        feature_cols = get_feature_columns(df)

    X = df[feature_cols].values
    logger.info(f"Features prepared: {X.shape}")

    # Get predictions for every high/low/direction model at once.
    predictions = predict_ranges(models, X)

    # Find the high and low prediction models for the requested horizon
    # by substring match on the model name.
    high_model_key = None
    low_model_key = None
    for key in models.keys():
        if f'{horizon}_high' in key:
            high_model_key = key
        elif f'{horizon}_low' in key:
            low_model_key = key

    if not high_model_key or not low_model_key:
        logger.error(f"Could not find models for horizon: {horizon}")
        logger.info(f"Available models: {list(models.keys())}")
        return None

    pred_high = predictions[high_model_key]
    pred_low = predictions[low_model_key]

    logger.info(f"Using predictions: {high_model_key}, {low_model_key}")
    logger.info(f"Pred High - mean: {pred_high.mean():.6f}, std: {pred_high.std():.6f}")
    logger.info(f"Pred Low - mean: {pred_low.mean():.6f}, std: {pred_low.std():.6f}")

    # If predictions have (near) no variance, the model output is useless
    # for direction — fall back to price-action momentum below.
    use_price_action_direction = pred_high.std() < 1e-6 or abs(pred_low).std() < 1e-6
    if use_price_action_direction:
        logger.warning("Predictions have no variance - using price action for direction")

    # Run backtest: fixed-fractional risk sizing on a single account.
    trades = []
    capital = 10000.0
    risk_per_trade = 0.01   # risk 1% of current capital per trade
    equity_curve = [capital]

    prices = df[['open', 'high', 'low', 'close']].values
    close_prices = df['close'].values
    high_prices = df['high'].values
    low_prices = df['low'].values

    n_signals = 0
    n_long = 0
    n_short = 0
    n_skipped = 0

    # 5-bar momentum, used for direction in the fallback path only.
    momentum = pd.Series(close_prices).pct_change(5).values

    # Simple 14-bar ATR proxy (high-low range mean) for range estimation.
    atr = (pd.Series(high_prices) - pd.Series(low_prices)).rolling(14).mean().values
    atr_pct = atr / close_prices  # ATR as percentage of price

    # Use mean predicted range if predictions are constant.
    mean_high_delta = pred_high.mean()
    mean_low_delta = abs(pred_low.mean())

    for i in range(len(df) - 50):  # Leave room for simulation
        # Only signal every N bars
        if i % signal_every_n != 0:
            continue

        current_price = close_prices[i]

        # Use predicted ranges or fall back to dynamic ATR.
        if use_price_action_direction:
            # Use dynamic ATR for range estimation where available,
            # otherwise fall back to the mean predicted delta.
            if i >= 14 and not np.isnan(atr_pct[i]):
                current_atr = atr_pct[i]
                predicted_high_delta = current_atr * 0.8  # ~80% of ATR for high
                predicted_low_delta = current_atr * 0.8   # ~80% of ATR for low
            else:
                predicted_high_delta = mean_high_delta
                predicted_low_delta = mean_low_delta
                current_atr = mean_high_delta

            # Use price momentum for direction with stronger filter.
            # Require momentum to exceed a significant threshold
            # (0.2% move in 5 bars).
            mom_threshold = 0.002  # 0.2% momentum threshold
            if i >= 5 and momentum[i] > mom_threshold:
                direction = 'long'
                high_range = predicted_high_delta * current_price
                low_range = predicted_low_delta * current_price
                n_long += 1
            elif i >= 5 and momentum[i] < -mom_threshold:
                direction = 'short'
                high_range = predicted_high_delta * current_price
                low_range = predicted_low_delta * current_price
                n_short += 1
            else:
                n_skipped += 1
                continue
        else:
            predicted_high_delta = pred_high[i]      # Delta as percentage
            predicted_low_delta = abs(pred_low[i])   # Make positive

            # Convert percentage deltas to absolute price ranges.
            high_range = predicted_high_delta * current_price
            low_range = predicted_low_delta * current_price

            # Determine direction: trade only when one side's predicted
            # range dominates the other by the bias factor.
            if high_range > low_range * direction_bias:
                direction = 'long'
                n_long += 1
            elif low_range > high_range * direction_bias:
                direction = 'short'
                n_short += 1
            else:
                n_skipped += 1
                continue  # No clear direction

        # TP on the favorable range, SL on the adverse one.
        if direction == 'long':
            tp_distance = high_range * tp_factor
            sl_distance = low_range * sl_factor
        else:
            tp_distance = low_range * tp_factor
            sl_distance = high_range * sl_factor

        # Skip trades whose TP would be below the minimum range filter.
        if tp_distance / current_price < min_range_pct:
            n_skipped += 1
            continue

        # Calculate TP/SL prices
        if direction == 'long':
            tp_price = current_price + tp_distance
            sl_price = current_price - sl_distance
        else:
            tp_price = current_price - tp_distance
            sl_price = current_price + sl_distance

        # Next 50 bars of highs/lows for the walk-forward simulation.
        future_highs = high_prices[i+1:i+51]
        future_lows = low_prices[i+1:i+51]

        # Simulate trade
        result, exit_price, bars_held = simulate_trade(
            entry_price=current_price,
            tp_price=tp_price,
            sl_price=sl_price,
            direction=direction,
            future_highs=future_highs,
            future_lows=future_lows,
            max_bars=50
        )

        # P&L: size the position so a full SL hit loses risk_per_trade
        # of current capital.
        risk_amount = capital * risk_per_trade
        position_size = risk_amount / sl_distance if sl_distance > 0 else 0

        if direction == 'long':
            pnl = (exit_price - current_price) * position_size
        else:
            pnl = (current_price - exit_price) * position_size

        capital += pnl
        equity_curve.append(capital)

        trades.append({
            'bar': i,
            'time': df.index[i],
            'direction': direction,
            'entry': current_price,
            'tp': tp_price,
            'sl': sl_price,
            'exit': exit_price,
            'result': result,
            'pnl': pnl,
            'bars_held': bars_held,
            'pred_high': predicted_high_delta,
            'pred_low': predicted_low_delta
        })

        n_signals += 1

    # Calculate metrics
    if not trades:
        logger.warning("No trades executed")
        return None

    trades_df = pd.DataFrame(trades)
    n_wins = (trades_df['result'] == 'tp').sum()
    n_losses = (trades_df['result'] == 'sl').sum()
    n_timeouts = (trades_df['result'] == 'timeout').sum()
    total_trades = len(trades_df)

    win_rate = n_wins / total_trades if total_trades > 0 else 0
    total_pnl = trades_df['pnl'].sum()
    # NOTE(review): avg_win/avg_loss are gated on TP/SL counts but
    # averaged over pnl sign, so timeout trades contribute to the
    # averages — confirm this is intended.
    avg_win = trades_df[trades_df['pnl'] > 0]['pnl'].mean() if n_wins > 0 else 0
    avg_loss = trades_df[trades_df['pnl'] < 0]['pnl'].mean() if n_losses > 0 else 0

    # Max drawdown from the running equity peak.
    equity_curve = np.array(equity_curve)
    max_equity = np.maximum.accumulate(equity_curve)
    drawdown = (max_equity - equity_curve) / max_equity
    max_drawdown = drawdown.max()

    # Print results
    print("\n" + "=" * 60)
    print("RANGE-BASED BACKTEST RESULTS")
    print("=" * 60)
    print(f"Strategy: TP={tp_factor*100:.0f}% range, SL={sl_factor*100:.0f}% opposite")
    print(f"Direction Bias: {direction_bias}")
    print(f"Signal Frequency: Every {signal_every_n} bars")
    print("-" * 60)
    print(f"Total Signals Analyzed: {n_long + n_short + n_skipped}")
    print(f"  Long Signals: {n_long}")
    print(f"  Short Signals: {n_short}")
    print(f"  Skipped (no bias): {n_skipped}")
    print("-" * 60)
    print(f"Trades Executed: {total_trades}")
    print(f"  Wins (TP hit): {n_wins} ({n_wins/total_trades*100:.1f}%)")
    print(f"  Losses (SL hit): {n_losses} ({n_losses/total_trades*100:.1f}%)")
    print(f"  Timeouts: {n_timeouts} ({n_timeouts/total_trades*100:.1f}%)")
    print("-" * 60)
    print(f"WIN RATE: {win_rate*100:.2f}%")
    print(f"Net P&L: ${total_pnl:,.2f}")
    print(f"Avg Win: ${avg_win:,.2f}")
    print(f"Avg Loss: ${avg_loss:,.2f}")
    print(f"Final Capital: ${capital:,.2f}")
    print(f"Max Drawdown: {max_drawdown*100:.2f}%")

    if win_rate >= 0.80:
        print("\n*** 80% WIN RATE TARGET ACHIEVED! ***")
    elif win_rate >= 0.75:
        print("\n*** Close to target: 75%+ achieved ***")
    else:
        print(f"\n*** Below target. Need to adjust parameters ***")

    # Save results as JSON (default=str handles timestamps/numpy types).
    output_dir = Path("reports/range_backtest")
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results = {
        'config': {
            'symbol': symbol,
            'timeframe': timeframe,
            'horizon': horizon,
            'tp_factor': tp_factor,
            'sl_factor': sl_factor,
            'min_range_pct': min_range_pct,
            'direction_bias': direction_bias,
            'signal_every_n': signal_every_n
        },
        'metrics': {
            'total_trades': total_trades,
            'win_rate': win_rate,
            'n_wins': n_wins,
            'n_losses': n_losses,
            'n_timeouts': n_timeouts,
            'total_pnl': total_pnl,
            'final_capital': capital,
            'max_drawdown': max_drawdown
        },
        'trades': trades
    }

    filepath = output_dir / f"{symbol}_{horizon}_{timestamp}.json"
    with open(filepath, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    logger.info(f"Results saved to {filepath}")

    return results
|
|
|
|
|
|
def main():
    """Parse CLI flags and launch the range-based backtest."""
    cli = argparse.ArgumentParser(description='Run Range-Based Backtest')
    cli.add_argument('--symbol', default='XAUUSD', help='Trading symbol')
    cli.add_argument('--timeframe', default='15m', help='Timeframe')
    cli.add_argument('--horizon', default='scalping', help='Prediction horizon')
    cli.add_argument('--tp-factor', type=float, default=0.3, help='TP as fraction of predicted range')
    cli.add_argument('--sl-factor', type=float, default=3.0, help='SL as multiple of opposite range')
    cli.add_argument('--bias', type=float, default=1.2, help='Direction bias factor')
    cli.add_argument('--signal-freq', type=int, default=4, help='Signal every N bars')

    opts = cli.parse_args()

    run_range_based_backtest(
        symbol=opts.symbol,
        timeframe=opts.timeframe,
        horizon=opts.horizon,
        tp_factor=opts.tp_factor,
        sl_factor=opts.sl_factor,
        direction_bias=opts.bias,
        signal_every_n=opts.signal_freq
    )
|
|
|
|
|
|
# Script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()
|