Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
256 lines
8.4 KiB
Python
256 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Train Movement Magnitude Predictor
|
|
==================================
|
|
Trains the new MovementMagnitudePredictor model for asymmetric trading opportunities.
|
|
|
|
Horizons:
|
|
- 5m candles -> 15 min prediction (3 bars)
|
|
- 15m candles -> 60 min prediction (4 bars)
|
|
|
|
Author: ML-Specialist (NEXUS v4.0)
|
|
Date: 2026-01-04
|
|
"""
|
|
|
|
import sys
|
|
sys.path.insert(0, 'src')
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import yaml
|
|
import json
|
|
from loguru import logger
|
|
import argparse
|
|
|
|
from data.database import MySQLConnection
|
|
from data.features import FeatureEngineer
|
|
from training.data_splitter import TemporalDataSplitter
|
|
from models.movement_magnitude_predictor import (
|
|
MovementMagnitudePredictor,
|
|
calculate_standard_variance
|
|
)
|
|
|
|
|
|
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Resample OHLCV rows to a coarser bar size.

    Args:
        df: Frame with ``open``, ``high``, ``low``, ``close`` and ``volume``
            columns. The index must be a ``DatetimeIndex`` or something
            ``pd.to_datetime`` can parse. The caller's frame is never
            modified.
        timeframe: One of ``'5m'``, ``'15m'`` or ``'1H'``.

    Returns:
        A new DataFrame with one row per resampled bar (first open, max high,
        min low, last close, summed volume). Bars containing any missing
        value, such as periods with no source rows, are dropped.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported labels.
    """
    # Map the script's timeframe labels to pandas offset aliases.
    resample_rules = {
        '5m': '5min',
        '15m': '15min',
        '1H': '1h',
    }
    try:
        rule = resample_rules[timeframe]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown timeframe: {timeframe}") from None

    # Ensure datetime index. set_axis returns a new object, so the caller's
    # DataFrame keeps its original index (assigning df.index would have
    # silently changed the argument in place).
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_axis(pd.to_datetime(df.index), axis=0)

    ohlcv = df.resample(rule).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }).dropna()

    return ohlcv
|
|
|
|
|
|
def _json_default(value):
    """Convert NumPy/datetime objects that ``json`` cannot encode natively.

    Used as the ``default=`` hook when writing the results file so that a
    stray NumPy scalar or array in the metrics does not abort the run after
    all models have already been trained.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, (datetime, pd.Timestamp)):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _log_baseline_stats(heading: str, stats: dict) -> None:
    """Log the range/move summary produced by ``calculate_standard_variance``."""
    logger.info(heading)
    logger.info(f" Mean range: ${stats['mean_range']:.2f}")
    logger.info(f" Std range: ${stats['std_range']:.2f}")
    logger.info(f" P75 range: ${stats['p75_range']:.2f}")
    logger.info(f" P90 range: ${stats['p90_range']:.2f}")
    logger.info(f" Mean high move: ${stats['mean_high_move']:.2f}")
    logger.info(f" Mean low move: ${stats['mean_low_move']:.2f}")


def _timeframe_for_horizon(horizon: str) -> str:
    """Return the candle timeframe ('5m' or '15m') a horizon is trained on."""
    if horizon.startswith('5m'):
        return '5m'
    if not horizon.startswith('15m'):
        # Historical behaviour: anything that is not a 5m horizon is trained
        # on 15m candles. Keep the fallback, but make it visible.
        logger.warning(
            f"Unrecognized horizon prefix in '{horizon}'; "
            "falling back to 15m candles"
        )
    return '15m'


def train_movement_predictor(
    symbol: str = "XAUUSD",
    horizons: list = None,
    asymmetry_threshold: float = 1.5,
    min_move_usd: float = 3.0
):
    """
    Train the MovementMagnitudePredictor model.

    Loads ticker data from the database, splits it into train and
    out-of-sample (OOS) sets, logs baseline 5m/15m movement statistics, then
    trains, evaluates and saves one predictor per horizon. A combined
    ``training_results.json`` is written to
    ``models/ml_first/<symbol>/movement_predictor``.

    Args:
        symbol: Trading symbol
        horizons: Which horizons to train (defaults to
            ``['5m_15min', '15m_60min']`` when empty or None)
        asymmetry_threshold: Threshold for opportunity detection
        min_move_usd: Minimum USD move to consider

    Returns:
        Dict keyed by horizon with ``train_metrics``, ``oos_metrics`` and
        ``baseline_stats`` entries, or ``None`` when no data could be loaded.
    """
    horizons = horizons or ['5m_15min', '15m_60min']

    logger.info("=" * 60)
    logger.info("MOVEMENT MAGNITUDE PREDICTOR TRAINING")
    logger.info(f"Symbol: {symbol}")
    logger.info(f"Horizons: {horizons}")
    logger.info(f"Asymmetry Threshold: {asymmetry_threshold}")
    logger.info(f"Min Move USD: ${min_move_usd}")
    logger.info("=" * 60)

    # Load data from database
    logger.info("\nLoading data from database...")
    db = MySQLConnection('config/database.yaml')
    df_raw = db.get_ticker_data(symbol, limit=150000)

    if df_raw.empty:
        logger.error("No data loaded")
        return None

    logger.info(f"Loaded {len(df_raw)} records")
    logger.info(f"Date range: {df_raw.index.min()} to {df_raw.index.max()}")

    # Split data temporally. Per the original author's note, 2025 is held out
    # as OOS; the actual cut-off lives in TemporalDataSplitter (not visible
    # here).
    splitter = TemporalDataSplitter()
    split = splitter.split_temporal(df_raw)

    df_train = split.train_data
    df_test = split.test_data

    logger.info(f"\nTrain data: {len(df_train)} records ({df_train.index.min()} to {df_train.index.max()})")
    logger.info(f"Test OOS: {len(df_test)} records ({df_test.index.min()} to {df_test.index.max()})")

    # Calculate baseline statistics for Gold
    logger.info("\n" + "=" * 60)
    logger.info("BASELINE MOVEMENT STATISTICS")
    logger.info("=" * 60)

    # Stats for 5m timeframe
    df_5m = resample_to_timeframe(df_train, '5m')
    stats_5m = calculate_standard_variance(df_5m, '5m', lookback_periods=1000)
    _log_baseline_stats("\n5-minute bars (Gold):", stats_5m)

    # Stats for 15m timeframe
    df_15m = resample_to_timeframe(df_train, '15m')
    stats_15m = calculate_standard_variance(df_15m, '15m', lookback_periods=1000)
    _log_baseline_stats("\n15-minute bars (Gold):", stats_15m)

    # Train models for each horizon
    all_results = {}

    for horizon in horizons:
        logger.info("\n" + "=" * 60)
        logger.info(f"TRAINING: {horizon}")
        logger.info("=" * 60)

        # Get correct timeframe data. Frames are resampled per horizon so
        # every predictor receives its own DataFrame objects.
        timeframe = _timeframe_for_horizon(horizon)
        df_train_tf = resample_to_timeframe(df_train, timeframe)
        df_test_tf = resample_to_timeframe(df_test, timeframe)

        logger.info(f"Train samples: {len(df_train_tf)}")
        logger.info(f"Test samples: {len(df_test_tf)}")

        # Initialize predictor for this horizon
        predictor = MovementMagnitudePredictor(
            horizons=[horizon],
            use_gpu=True,
            asymmetry_threshold=asymmetry_threshold,
            min_move_usd=min_move_usd
        )

        # Train
        train_metrics = predictor.fit(df_train_tf)

        # Evaluate OOS
        logger.info("\nEvaluating on OOS data (2025)...")
        oos_metrics = predictor.evaluate_oos(df_test_tf)

        # Store results
        all_results[horizon] = {
            'train_metrics': {k: v.to_dict() for k, v in train_metrics.items()},
            'oos_metrics': {k: v.to_dict() for k, v in oos_metrics.items()},
            'baseline_stats': predictor.baseline_stats
        }

        # Save model
        model_path = f"models/ml_first/{symbol}/movement_predictor/{horizon}"
        predictor.save(model_path)
        logger.info(f"Model saved to {model_path}")

        # Generate sample predictions: the last 20 OOS bars are fed to the
        # predictor, and only the final 10 predictions are logged.
        logger.info("\nSample predictions (last 10 bars of OOS):")
        predictions = predictor.predict(df_test_tf.tail(20))
        for pred in predictions[-10:]:
            logger.info(
                f" {pred.timestamp}: High=${pred.predicted_high_usd:.2f}, "
                f"Low=${pred.predicted_low_usd:.2f}, "
                f"Asymmetry={pred.asymmetry_ratio:.2f}, "
                f"Direction={pred.suggested_direction}, "
                f"RR={pred.suggested_rr:.1f}"
            )

    # Print summary
    print("\n" + "=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)

    for horizon, results in all_results.items():
        print(f"\n{horizon}:")
        print("-" * 40)

        # An empty oos_metrics dict simply prints nothing for this horizon.
        for key, metrics in results['oos_metrics'].items():
            print(f" {key}:")
            print(f" MAE: ${metrics['mae_usd']:.2f}")
            print(f" R²: {metrics['r2']:.4f}")
            print(f" Asymmetry Accuracy: {metrics['asymmetry_accuracy']:.2%}")

    # Save combined results
    output_path = Path(f"models/ml_first/{symbol}/movement_predictor")
    output_path.mkdir(parents=True, exist_ok=True)

    results_file = output_path / "training_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'symbol': symbol,
            'horizons': horizons,
            'asymmetry_threshold': asymmetry_threshold,
            'min_move_usd': min_move_usd,
            'baseline_5m': stats_5m,
            'baseline_15m': stats_15m,
            'results': all_results
        }, f, indent=2, default=_json_default)

    logger.info(f"\nResults saved to {results_file}")

    return all_results
|
|
|
|
|
|
def main():
    """Parse command-line options and run the training routine.

    Returns:
        Whatever ``train_movement_predictor`` returns for the parsed options.
    """
    cli = argparse.ArgumentParser(description='Train Movement Magnitude Predictor')

    # (flag, argparse keyword arguments) for every supported option.
    option_table = (
        ('--symbol', {'default': 'XAUUSD', 'help': 'Trading symbol'}),
        ('--horizons', {
            'nargs': '+',
            'default': ['5m_15min', '15m_60min'],
            'help': 'Horizons to train',
        }),
        ('--asymmetry-threshold', {
            'type': float,
            'default': 1.5,
            'help': 'Asymmetry threshold for opportunities',
        }),
        ('--min-move', {
            'type': float,
            'default': 3.0,
            'help': 'Minimum move in USD to consider',
        }),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    return train_movement_predictor(
        symbol=opts.symbol,
        horizons=opts.horizons,
        asymmetry_threshold=opts.asymmetry_threshold,
        min_move_usd=opts.min_move,
    )
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script; importing the module does not start a training run.
if __name__ == "__main__":
    main()
|