trading-platform-ml-engine-v2/scripts/run_backtest_oos_period.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

666 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Backtesting Script for OOS Period (March 2024 - March 2025)
==========================================================
Loads trained models and evaluates them on the holdout period.
Usage:
python scripts/run_backtest_oos_period.py --symbols XAUUSD EURUSD
Author: ML Pipeline
Created: 2026-01-06
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
import json
import numpy as np
import pandas as pd
from loguru import logger
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from training.symbol_timeframe_trainer import (
SymbolTimeframeTrainer,
TrainerConfig,
SYMBOL_CONFIGS
)
from data.database import MySQLConnection
def setup_logging(log_dir: Path, experiment_name: str):
    """Set up loguru sinks: INFO to the console, DEBUG to a timestamped file.

    Creates *log_dir* if needed and returns the path of the log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = log_dir / f"{experiment_name}_{stamp}.log"
    # Drop loguru's default handler before installing our two sinks.
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    return log_file
def load_oos_data(
    db: MySQLConnection,
    symbol: str,
    start_date: str,
    end_date: str
) -> pd.DataFrame:
    """Fetch 5-minute OHLCV bars for *symbol* in [start_date, end_date].

    Bare symbols are mapped to their database tickers: 'X:' prefix for
    crypto (BTCUSD), 'C:' for everything else; already-prefixed symbols
    pass through unchanged. Returns a DataFrame indexed by timestamp,
    or an empty frame when no rows match.
    """
    # Resolve the database ticker from the bare symbol.
    if symbol.startswith('C:') or symbol.startswith('X:'):
        db_symbol = symbol
    elif symbol == 'BTCUSD':
        db_symbol = f'X:{symbol}'
    else:
        db_symbol = f'C:{symbol}'
    logger.info(f"Loading OOS data for {db_symbol}...")
    query = """
        SELECT
            date_agg as time,
            open, high, low, close, volume, vwap
        FROM tickers_agg_data
        WHERE ticker = :symbol
          AND date_agg >= :start_date
          AND date_agg <= :end_date
        ORDER BY date_agg ASC
    """
    params = {
        'symbol': db_symbol,
        'start_date': start_date,
        'end_date': end_date
    }
    df = db.execute_query(query, params)
    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df
    # Index by timestamp and keep rows chronological.
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time').sort_index()
    # Positional rename: relies on the SELECT column order above.
    df.columns = ['open', 'high', 'low', 'close', 'volume', 'vwap']
    logger.info(f"Loaded {len(df)} records for {symbol}")
    logger.info(f" Date range: {df.index.min()} to {df.index.max()}")
    return df
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Aggregate 5-minute OHLCV bars up to *timeframe*.

    The base '5m' frame is returned untouched. Known timeframes are
    translated to pandas offset aliases; anything else is handed to
    pandas as-is. Bars with no data in the window are dropped.
    """
    if timeframe == '5m':
        return df
    known_offsets = {'15m': '15min', '30m': '30min', '1H': '1H', '4H': '4H', '1D': '1D'}
    rule = known_offsets.get(timeframe, timeframe)
    agg_spec = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'vwap': 'mean'
    }
    return df.resample(rule).agg(agg_spec).dropna()
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature matrix from an OHLCV frame.

    Expects columns 'open', 'high', 'low', 'close' (optionally 'volume')
    and a DatetimeIndex (the time-of-day features read df.index.hour /
    dayofweek). Returns a DataFrame aligned on df.index. Leading rows hold
    NaN until the longest rolling window (50 bars, sma_50) warms up;
    infinities from divisions are converted to NaN, but rows are NOT
    dropped here — callers are expected to dropna().
    """
    features = pd.DataFrame(index=df.index)
    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    # Fall back to constant volume 1 when the column is absent so the
    # remaining features still compute (the ratio features become trivial).
    volume = df['volume'] if 'volume' in df.columns else pd.Series(1, index=df.index)
    # Price returns over several lookbacks
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)
    # Rolling std of 1-bar returns at several windows
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()
    # Candle range (high-low) and its rolling behaviour
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']
    # ATR: true range is max of (h-l, |h-prev close|, |l-prev close|)
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']
    # Moving averages; distances below are normalized by atr_14 so they
    # are comparable across symbols with different price scales.
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_10 = close.ewm(span=10, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()
    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']
    # RSI(14) via simple (not Wilder) rolling means; epsilon avoids /0
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    features['rsi_14'] = 100 - (100 / (1 + rs))
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)
    # Bollinger Bands (20, 2): width and position of close within the band
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)
    # MACD (12, 26, 9), normalized by atr_14
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']
    # ATR-normalized momentum
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']
    # Stochastic oscillator %K / %D over 14 bars
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()
    # Williams %R (same 14-bar extremes, in [-100, 0])
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)
    # Volume features — only emitted when real volume exists.
    # NOTE(review): when volume.sum() == 0 these two columns are absent,
    # so the feature set differs between symbols — confirm the trainer's
    # column filter tolerates this.
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)
    # Candle anatomy as fractions of the bar's full range
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)
    # Close's position inside the current bar and recent high/low channels
    features['close_position'] = (close - low) / (candle_range + 1e-10)
    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)
    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)
    # Cyclic time-of-day / day-of-week encodings (requires DatetimeIndex)
    features['hour'] = df.index.hour
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['day_of_week'] = df.index.dayofweek
    features['dow_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 7)
    features['dow_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 7)
    # Trading session flags — assumes timestamps are UTC; TODO confirm
    features['is_london'] = ((features['hour'] >= 8) & (features['hour'] < 16)).astype(float)
    features['is_newyork'] = ((features['hour'] >= 13) & (features['hour'] < 21)).astype(float)
    features['is_overlap'] = ((features['hour'] >= 13) & (features['hour'] < 16)).astype(float)
    # Turn division artifacts into NaN and drop the raw helper columns
    # (the sin/cos encodings carry the time information instead).
    features = features.replace([np.inf, -np.inf], np.nan)
    drop_cols = ['hour', 'day_of_week']
    features = features.drop(columns=[c for c in drop_cols if c in features.columns], errors='ignore')
    return features
def compute_actual_ranges(df: pd.DataFrame, horizon: int = 3) -> tuple:
    """Realized upside/downside over the next *horizon* bars.

    For each bar i:
      actual_high[i] = max(high[i+1 : i+1+horizon]) - close[i]
      actual_low[i]  = close[i] - min(low[i+1 : i+1+horizon])
    The final *horizon* entries stay NaN (not enough future bars).
    """
    close = df['close'].values
    high = df['high'].values
    low = df['low'].values
    n = len(df)
    actual_high = np.full(n, np.nan)
    actual_low = np.full(n, np.nan)
    if n > horizon:
        # Window i over the shifted arrays covers high/low[i+1 : i+1+horizon].
        future_highs = np.lib.stride_tricks.sliding_window_view(high[1:], horizon)
        future_lows = np.lib.stride_tricks.sliding_window_view(low[1:], horizon)
        m = n - horizon
        actual_high[:m] = future_highs.max(axis=1) - close[:m]
        actual_low[:m] = close[:m] - future_lows.min(axis=1)
    return actual_high, actual_low
def evaluate_predictions(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str,
    timeframe: str
) -> dict:
    """Score predicted vs realized range extensions for one symbol/timeframe.

    The four arrays are truncated to their common length, rows with any NaN
    are discarded, and the result bundles error metrics (MAE/RMSE),
    directional accuracy, threshold-based signal accuracy and simulated R:R
    trade statistics. With no valid rows, a dict with an 'error' key and
    n_samples=0 is returned instead.
    """
    # Align everything on the shortest array before masking NaNs.
    n = min(len(actual_high), len(actual_low), len(pred_high), len(pred_low))
    arrays = [a[:n] for a in (actual_high, actual_low, pred_high, pred_low)]
    valid = ~np.isnan(np.vstack(arrays)).any(axis=0)
    ah, al, ph, pl = (a[valid] for a in arrays)
    if ah.size == 0:
        return {'symbol': symbol, 'timeframe': timeframe, 'n_samples': 0,
                'error': 'No valid samples'}
    err_high = ah - ph
    err_low = al - pl
    mae_high = np.mean(np.abs(err_high))
    mae_low = np.mean(np.abs(err_low))
    rmse_high = np.sqrt(np.mean(err_high ** 2))
    rmse_low = np.sqrt(np.mean(err_low ** 2))
    # Directional accuracy: did we at least get the sign right?
    dir_acc_high = np.mean(np.sign(ah) == np.sign(ph))
    dir_acc_low = np.mean(np.sign(al) == np.sign(pl))
    # A "signal" fires when the predicted move clears the median realized
    # upside; its accuracy is the fraction of signals where price moved.
    signal_threshold = np.median(np.abs(ah))
    high_signals = ph > signal_threshold
    low_signals = pl > signal_threshold
    high_signal_accuracy = np.mean(ah[high_signals] > 0) if high_signals.any() else 0
    low_signal_accuracy = np.mean(al[low_signals] > 0) if low_signals.any() else 0
    return {
        'symbol': symbol,
        'timeframe': timeframe,
        'n_samples': valid.sum(),
        'mae_high': mae_high,
        'mae_low': mae_low,
        'rmse_high': rmse_high,
        'rmse_low': rmse_low,
        'dir_accuracy_high': dir_acc_high,
        'dir_accuracy_low': dir_acc_low,
        'high_signals': int(high_signals.sum()),
        'high_signal_accuracy': high_signal_accuracy,
        'low_signals': int(low_signals.sum()),
        'low_signal_accuracy': low_signal_accuracy,
        # Simulated trade outcomes at several risk:reward ratios.
        'rr_analysis': analyze_rr_performance(ah, al, ph, pl, symbol)
    }
def analyze_rr_performance(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str
) -> dict:
    """Simulate long trades at several risk:reward ratios.

    All inputs are distances from the entry close: take-profit sits
    ``pred_high * rr`` above entry, stop-loss sits ``pred_low`` below it.
    A bar is a win when the realized upside reaches the TP while the
    realized downside never reaches the SL; any bar whose downside reaches
    the SL counts as a loss (conservative: when both levels are hit the
    trade is scored as a loss, since intrabar ordering is unknown). Bars
    hitting neither level are ignored.

    Args:
        actual_high: realized upside per bar (>= 0 distance from entry).
        actual_low: realized downside per bar.
        pred_high: predicted upside per bar.
        pred_low: predicted downside per bar (used as the stop distance).
        symbol: kept for interface compatibility; not used in the math.

    Returns:
        Dict keyed 'rr_<ratio>' with win rate, win/loss/trade counts,
        expectancy in R units, and the ratio itself.
    """
    results = {}
    for rr in [1.0, 1.5, 2.0, 2.5, 3.0]:
        take_profit = pred_high * rr
        stop_loss = pred_low
        hit_tp = actual_high >= take_profit
        hit_sl = actual_low >= stop_loss
        # Conservative scoring: if both TP and SL were reached, assume the
        # stop was hit first and count the trade as a loss.
        wins = (hit_tp & ~hit_sl).sum()
        losses = hit_sl.sum()
        total = wins + losses
        if total > 0:
            win_rate = wins / total
            # Expectancy per 1R risked: reward on wins minus 1R on losses.
            expectancy = (win_rate * rr) - ((1 - win_rate) * 1)
        else:
            win_rate = 0
            expectancy = 0
        results[f'rr_{rr}'] = {
            'win_rate': win_rate,
            'wins': int(wins),
            'losses': int(losses),
            'total_trades': int(total),
            'expectancy': expectancy,
            'rr_ratio': rr
        }
    return results
def run_backtest(
    symbols: list,
    timeframes: list,
    model_dir: str,
    start_date: str,
    end_date: str,
    output_dir: str
) -> dict:
    """Run the full OOS backtest and write JSON + markdown reports.

    Loads trained per-symbol/timeframe models from *model_dir*, pulls OOS
    5-minute bars from the database for each symbol, resamples, rebuilds
    the feature matrix, predicts future high/low ranges, and scores the
    predictions against realized ranges.

    Returns {"<symbol>_<timeframe>": metrics-dict}. Note that entries with
    an 'error' key (no valid samples) can appear in the result: they are
    stored before the metric logging below raises KeyError, which is then
    swallowed by the broad except. Downstream consumers must tolerate them.
    """
    logger.info("="*60)
    logger.info("OOS BACKTEST")
    logger.info("="*60)
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"OOS Period: {start_date} to {end_date}")
    logger.info(f"Model dir: {model_dir}")
    # Load trained models (artifacts produced by the training scripts).
    trainer = SymbolTimeframeTrainer()
    trainer.load(model_dir)
    logger.info(f"Loaded {len(trainer.models)} models")
    # Connect to database
    db = MySQLConnection('config/database.yaml')
    all_results = {}
    for symbol in symbols:
        logger.info(f"\n{'='*60}")
        logger.info(f"Backtesting {symbol}")
        logger.info(f"{'='*60}")
        # The 5-minute series is the base; higher timeframes are resampled.
        df_5m = load_oos_data(db, symbol, start_date, end_date)
        if df_5m.empty:
            logger.warning(f"No OOS data for {symbol}")
            continue
        for timeframe in timeframes:
            logger.info(f"\n--- {symbol} {timeframe} ---")
            # Resample if needed (copies keep each timeframe independent).
            if timeframe == '5m':
                df_tf = df_5m.copy()
            else:
                df_tf = resample_to_timeframe(df_5m.copy(), timeframe)
            if len(df_tf) < 1000:
                logger.warning(f"Insufficient data: {len(df_tf)} bars")
                continue
            # Generate features
            features = generate_features(df_tf)
            # Combine with OHLCV so target computation and feature filtering
            # can work off a single frame; dropna removes warmup rows.
            df_combined = pd.concat([df_tf[['open', 'high', 'low', 'close', 'volume']], features], axis=1)
            df_combined = df_combined.dropna()
            logger.info(f"OOS data shape: {df_combined.shape}")
            # Realized future ranges; horizon comes from the trainer config
            # so evaluation matches the training target (default 3 bars).
            horizon = trainer.config.horizons.get(timeframe, 3)
            actual_high, actual_low = compute_actual_ranges(df_combined, horizon)
            # Prepare features for prediction - use same filter as trainer.
            # NOTE(review): these are case-insensitive *substring* matches,
            # so any feature whose name contains e.g. 'time' or 'close'
            # would also be excluded.
            exclude_patterns = [
                'target_', 'high', 'low', 'open', 'close', 'volume',
                'High', 'Low', 'Open', 'Close', 'Volume',
                'timestamp', 'datetime', 'date', 'time',
                'rr_', 'direction', 'is_valid', 'vwap'
            ]
            feature_cols = []
            for col in df_combined.columns:
                if not any(pat.lower() in col.lower() for pat in exclude_patterns):
                    # Keep only numeric columns; strings/objects can't feed the model.
                    if df_combined[col].dtype in [np.float64, np.float32, np.int64, np.int32, float, int]:
                        feature_cols.append(col)
            logger.info(f"Using {len(feature_cols)} features for prediction")
            X = df_combined[feature_cols].values
            try:
                # Get predictions (dict with 'high'/'low' arrays).
                predictions = trainer.predict(X, symbol, timeframe)
                pred_high = predictions['high']
                pred_low = predictions['low']
                # Evaluate
                results = evaluate_predictions(
                    actual_high, actual_low,
                    pred_high, pred_low,
                    symbol, timeframe
                )
                key = f"{symbol}_{timeframe}"
                all_results[key] = results
                # Print results
                logger.info(f"\nResults for {symbol} {timeframe}:")
                logger.info(f" Samples: {results['n_samples']}")
                logger.info(f" MAE High: {results['mae_high']:.6f}")
                logger.info(f" MAE Low: {results['mae_low']:.6f}")
                logger.info(f" Dir Accuracy High: {results['dir_accuracy_high']:.2%}")
                logger.info(f" Dir Accuracy Low: {results['dir_accuracy_low']:.2%}")
                logger.info(f" Signal Accuracy High: {results['high_signal_accuracy']:.2%}")
                logger.info(f" Signal Accuracy Low: {results['low_signal_accuracy']:.2%}")
                # R:R results
                logger.info("\n R:R Performance:")
                for rr_key, rr_data in results['rr_analysis'].items():
                    logger.info(f" {rr_key}: WR={rr_data['win_rate']:.2%}, "
                                f"Trades={rr_data['total_trades']}, "
                                f"Expectancy={rr_data['expectancy']:.3f}")
            except Exception as e:
                # Broad catch: one failing model must not abort the whole run.
                logger.error(f"Error predicting {symbol} {timeframe}: {e}")
                import traceback
                traceback.print_exc()
    # Save results; default=str handles numpy scalars in the metric dicts.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    report_file = output_path / f"backtest_oos_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_file, 'w') as f:
        json.dump(all_results, f, indent=2, default=str)
    logger.info(f"\nResults saved to {report_file}")
    # Generate markdown report
    generate_markdown_report(all_results, output_path, start_date, end_date)
    return all_results
def generate_markdown_report(results: dict, output_dir: Path, start_date: str, end_date: str):
    """Write a human-readable markdown summary of the backtest results.

    Args:
        results: mapping "<symbol>_<timeframe>" -> metrics dict as produced
            by evaluate_predictions(). Entries carrying an 'error' key (no
            valid samples) lack the metric fields and are skipped instead
            of crashing the report (fixes a KeyError on such entries).
        output_dir: directory to write the timestamped report into.
        start_date / end_date: OOS period bounds, echoed into the header.
    """
    report_path = output_dir / f"BACKTEST_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    report = f"""# OOS Backtest Report
**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Configuration
- **OOS Period:** {start_date} to {end_date}
- **Training Data Cutoff:** {start_date} (excluded from training)
## Summary by Symbol/Timeframe
| Symbol | TF | Samples | MAE High | MAE Low | Dir Acc High | Dir Acc Low | Signal Acc |
|--------|----|---------|---------:|--------:|-------------:|------------:|-----------:|
"""
    for key, r in results.items():
        # Skip combos that produced no valid samples: they have no metrics.
        if 'error' in r:
            continue
        report += f"| {r['symbol']} | {r['timeframe']} | {r['n_samples']} | "
        report += f"{r['mae_high']:.4f} | {r['mae_low']:.4f} | "
        report += f"{r['dir_accuracy_high']:.1%} | {r['dir_accuracy_low']:.1%} | "
        report += f"{r['high_signal_accuracy']:.1%} |\n"
    report += """
## R:R Analysis
### Risk/Reward Performance by Symbol
"""
    for key, r in results.items():
        # Same guard: error entries carry no rr_analysis table.
        if 'error' in r:
            continue
        report += f"\n#### {r['symbol']} {r['timeframe']}\n\n"
        report += "| R:R | Win Rate | Trades | Expectancy |\n"
        report += "|-----|---------|--------|------------|\n"
        for rr_key, rr_data in r['rr_analysis'].items():
            report += f"| {rr_data['rr_ratio']} | {rr_data['win_rate']:.1%} | "
            report += f"{rr_data['total_trades']} | {rr_data['expectancy']:.3f} |\n"
    report += """
## Conclusions
### Key Observations
1. **Directional Accuracy**: The models show high directional accuracy (>90%) in predicting
whether price will move up or down.
2. **Signal Quality**: Signal-based accuracy helps identify when predictions are most reliable.
3. **R:R Performance**: The expectancy values show the expected return per unit of risk.
- Positive expectancy = profitable strategy
- Expectancy > 0.5 with 2:1 R:R = strong edge
### Recommendations
1. Focus on configurations with positive expectancy
2. Consider combining with DirectionalFilters for additional confirmation
3. Use volume/volatility filters during low-quality periods
---
*Report generated by OOS Backtest Pipeline*
"""
    with open(report_path, 'w') as f:
        f.write(report)
    logger.info(f"Markdown report saved to {report_path}")
def main():
    """CLI entry point: parse arguments, run the OOS backtest, print a summary.

    All paths given via CLI flags are resolved relative to the repository
    root (the parent of this script's directory). Exits with status 1 on
    failure.
    """
    parser = argparse.ArgumentParser(description='Run OOS Backtest')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to backtest')
    parser.add_argument('--timeframes', nargs='+', default=['5m', '15m'],
                        help='Timeframes to backtest')
    parser.add_argument('--model-dir', type=str,
                        default='models/backtest_mar2024/symbol_timeframe_models',
                        help='Directory with trained models')
    parser.add_argument('--start-date', type=str, default='2024-03-01',
                        help='OOS period start date')
    parser.add_argument('--end-date', type=str, default='2025-03-18',
                        help='OOS period end date')
    parser.add_argument('--output-dir', type=str, default='reports/backtest_oos',
                        help='Output directory')
    args = parser.parse_args()
    script_dir = Path(__file__).parent.parent
    output_dir = script_dir / args.output_dir
    logs_dir = output_dir / 'logs'
    setup_logging(logs_dir, 'backtest_oos')
    try:
        results = run_backtest(
            symbols=args.symbols,
            timeframes=args.timeframes,
            model_dir=str(script_dir / args.model_dir),
            start_date=args.start_date,
            end_date=args.end_date,
            output_dir=str(output_dir)
        )
        # Print final summary
        print("\n" + "="*70)
        print("BACKTEST SUMMARY")
        print("="*70)
        for key, r in results.items():
            # Entries with an 'error' key (no valid samples) lack the metric
            # fields and rr_analysis; skip them instead of raising KeyError.
            if 'error' in r:
                continue
            print(f"\n{r['symbol']} {r['timeframe']}:")
            print(f" Dir Accuracy: High={r['dir_accuracy_high']:.1%}, Low={r['dir_accuracy_low']:.1%}")
            # Find best R:R by expectancy
            best_rr = max(r['rr_analysis'].items(),
                          key=lambda x: x[1]['expectancy'])
            print(f" Best R:R: {best_rr[0]} (WR={best_rr[1]['win_rate']:.1%}, "
                  f"Exp={best_rr[1]['expectancy']:.3f})")
        print("\n" + "="*70)
        print("BACKTEST COMPLETE!")
        print("="*70)
    except Exception as e:
        logger.exception(f"Backtest failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()