trading-platform-ml-engine-v2/scripts/train_movement_predictor.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

256 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Train Movement Magnitude Predictor
==================================
Trains the new MovementMagnitudePredictor model for asymmetric trading opportunities.
Horizons:
- 5m candles -> 15 min prediction (3 bars)
- 15m candles -> 60 min prediction (4 bars)
Author: ML-Specialist (NEXUS v4.0)
Date: 2026-01-04
"""
import sys
sys.path.insert(0, 'src')
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import yaml
import json
from loguru import logger
import argparse
from data.database import MySQLConnection
from data.features import FeatureEngineer
from training.data_splitter import TemporalDataSplitter
from models.movement_magnitude_predictor import (
MovementMagnitudePredictor,
calculate_standard_variance
)
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Resample OHLCV data to the requested bar size.

    Args:
        df: Frame with ``open``, ``high``, ``low``, ``close`` and ``volume``
            columns. If its index is not already a ``DatetimeIndex`` it is
            parsed with ``pd.to_datetime``; the caller's frame is left
            untouched.
        timeframe: One of ``'5m'``, ``'15m'`` or ``'1H'``.

    Returns:
        A new DataFrame with one row per bar (first open, max high, min low,
        last close, summed volume). Bars containing any NaN after
        aggregation are dropped.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported values.
    """
    # Script-level timeframe label -> pandas offset alias.
    rules = {
        '5m': '5min',
        '15m': '15min',
        '1H': '1h',
    }
    try:
        rule = rules[timeframe]
    except KeyError:
        raise ValueError(f"Unknown timeframe: {timeframe}") from None

    # Ensure datetime index. Work on a shallow copy so that replacing the
    # index does not mutate the DataFrame object owned by the caller.
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.copy(deep=False)
        df.index = pd.to_datetime(df.index)

    ohlcv = df.resample(rule).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }).dropna()
    return ohlcv
def _json_default(value):
    """Encode NumPy and datetime values that ``json`` cannot handle natively."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, datetime):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _log_baseline_stats(label: str, symbol: str, stats: dict) -> None:
    """Log the range/move summary returned by ``calculate_standard_variance``."""
    logger.info(f"\n{label} bars ({symbol}):")
    logger.info(f" Mean range: ${stats['mean_range']:.2f}")
    logger.info(f" Std range: ${stats['std_range']:.2f}")
    logger.info(f" P75 range: ${stats['p75_range']:.2f}")
    logger.info(f" P90 range: ${stats['p90_range']:.2f}")
    logger.info(f" Mean high move: ${stats['mean_high_move']:.2f}")
    logger.info(f" Mean low move: ${stats['mean_low_move']:.2f}")


def _timeframe_for_horizon(horizon: str) -> str:
    """Map a horizon name such as ``'5m_15min'`` to the candle timeframe to use."""
    if horizon.startswith('5m'):
        return '5m'
    if not horizon.startswith('15m'):
        # Historical behaviour: anything that is not a 5m horizon is trained
        # on 15m candles. Keep that, but make the fallback visible.
        logger.warning(
            f"Unrecognized horizon prefix in '{horizon}'; falling back to 15m candles"
        )
    return '15m'


def train_movement_predictor(
    symbol: str = "XAUUSD",
    horizons: list = None,
    asymmetry_threshold: float = 1.5,
    min_move_usd: float = 3.0
):
    """
    Train the MovementMagnitudePredictor model.

    Loads ticker data for ``symbol``, splits it into train and out-of-sample
    sets, logs baseline 5m/15m movement statistics, then fits, evaluates and
    saves one predictor per horizon. A combined ``training_results.json`` is
    written under ``models/ml_first/<symbol>/movement_predictor``.

    Args:
        symbol: Trading symbol
        horizons: Which horizons to train; defaults to
            ``['5m_15min', '15m_60min']`` when empty or None. Horizons whose
            name starts with ``5m`` use 5-minute candles; every other horizon
            uses 15-minute candles (with a warning if the prefix is not
            ``15m``).
        asymmetry_threshold: Threshold for opportunity detection
        min_move_usd: Minimum USD move to consider

    Returns:
        Dict keyed by horizon with ``train_metrics``, ``oos_metrics`` and
        ``baseline_stats`` entries, or None when no data could be loaded.
    """
    horizons = horizons or ['5m_15min', '15m_60min']
    logger.info("=" * 60)
    logger.info("MOVEMENT MAGNITUDE PREDICTOR TRAINING")
    logger.info(f"Symbol: {symbol}")
    logger.info(f"Horizons: {horizons}")
    logger.info(f"Asymmetry Threshold: {asymmetry_threshold}")
    logger.info(f"Min Move USD: ${min_move_usd}")
    logger.info("=" * 60)

    # Load data from database
    logger.info("\nLoading data from database...")
    db = MySQLConnection('config/database.yaml')
    df_raw = db.get_ticker_data(symbol, limit=150000)
    if df_raw.empty:
        logger.error("No data loaded")
        return None
    logger.info(f"Loaded {len(df_raw)} records")
    logger.info(f"Date range: {df_raw.index.min()} to {df_raw.index.max()}")

    # Split data temporally.
    # NOTE(review): the original comment says 2025 is held out for OOS; that
    # depends on TemporalDataSplitter's defaults -- confirm there.
    splitter = TemporalDataSplitter()
    split = splitter.split_temporal(df_raw)
    df_train = split.train_data
    df_test = split.test_data
    logger.info(f"\nTrain data: {len(df_train)} records ({df_train.index.min()} to {df_train.index.max()})")
    logger.info(f"Test OOS: {len(df_test)} records ({df_test.index.min()} to {df_test.index.max()})")

    # Calculate baseline statistics for the requested symbol
    logger.info("\n" + "=" * 60)
    logger.info("BASELINE MOVEMENT STATISTICS")
    logger.info("=" * 60)

    # Stats for 5m timeframe
    df_5m = resample_to_timeframe(df_train, '5m')
    stats_5m = calculate_standard_variance(df_5m, '5m', lookback_periods=1000)
    _log_baseline_stats("5-minute", symbol, stats_5m)

    # Stats for 15m timeframe
    df_15m = resample_to_timeframe(df_train, '15m')
    stats_15m = calculate_standard_variance(df_15m, '15m', lookback_periods=1000)
    _log_baseline_stats("15-minute", symbol, stats_15m)

    # Train models for each horizon
    all_results = {}
    for horizon in horizons:
        logger.info("\n" + "=" * 60)
        logger.info(f"TRAINING: {horizon}")
        logger.info("=" * 60)

        # Get correct timeframe data. Frames are resampled afresh for every
        # horizon so each predictor receives its own DataFrame objects.
        timeframe = _timeframe_for_horizon(horizon)
        df_train_tf = resample_to_timeframe(df_train, timeframe)
        df_test_tf = resample_to_timeframe(df_test, timeframe)
        logger.info(f"Train samples: {len(df_train_tf)}")
        logger.info(f"Test samples: {len(df_test_tf)}")

        # Initialize predictor for this horizon
        predictor = MovementMagnitudePredictor(
            horizons=[horizon],
            use_gpu=True,
            asymmetry_threshold=asymmetry_threshold,
            min_move_usd=min_move_usd
        )

        # Train
        train_metrics = predictor.fit(df_train_tf)

        # Evaluate OOS
        logger.info("\nEvaluating on OOS data (2025)...")
        oos_metrics = predictor.evaluate_oos(df_test_tf)

        # Store results
        all_results[horizon] = {
            'train_metrics': {k: v.to_dict() for k, v in train_metrics.items()},
            'oos_metrics': {k: v.to_dict() for k, v in oos_metrics.items()},
            'baseline_stats': predictor.baseline_stats
        }

        # Save model
        model_path = f"models/ml_first/{symbol}/movement_predictor/{horizon}"
        predictor.save(model_path)
        logger.info(f"Model saved to {model_path}")

        # Generate sample predictions; 20 bars are passed in and only the
        # last 10 resulting predictions are logged.
        logger.info("\nSample predictions (last 10 bars of OOS):")
        predictions = predictor.predict(df_test_tf.tail(20))
        for pred in predictions[-10:]:
            logger.info(
                f" {pred.timestamp}: High=${pred.predicted_high_usd:.2f}, "
                f"Low=${pred.predicted_low_usd:.2f}, "
                f"Asymmetry={pred.asymmetry_ratio:.2f}, "
                f"Direction={pred.suggested_direction}, "
                f"RR={pred.suggested_rr:.1f}"
            )

    # Print summary
    print("\n" + "=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)
    for horizon, results in all_results.items():
        print(f"\n{horizon}:")
        print("-" * 40)
        if results['oos_metrics']:
            for key, metrics in results['oos_metrics'].items():
                print(f" {key}:")
                print(f" MAE: ${metrics['mae_usd']:.2f}")
                print(f" R²: {metrics['r2']:.4f}")
                print(f" Asymmetry Accuracy: {metrics['asymmetry_accuracy']:.2%}")

    # Save combined results
    output_path = Path(f"models/ml_first/{symbol}/movement_predictor")
    output_path.mkdir(parents=True, exist_ok=True)
    results_file = output_path / "training_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        # default= keeps a stray NumPy scalar/array in the stats or metrics
        # from aborting the dump after all training has already finished.
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'symbol': symbol,
            'horizons': horizons,
            'asymmetry_threshold': asymmetry_threshold,
            'min_move_usd': min_move_usd,
            'baseline_5m': stats_5m,
            'baseline_15m': stats_15m,
            'results': all_results
        }, f, indent=2, default=_json_default)
    logger.info(f"\nResults saved to {results_file}")
    return all_results
def main():
    """Parse command-line options and run ``train_movement_predictor``.

    Returns:
        Whatever ``train_movement_predictor`` returns for the parsed options.
    """
    cli = argparse.ArgumentParser(description='Train Movement Magnitude Predictor')

    # (flag, add_argument keyword arguments), registered in display order.
    option_specs = (
        ('--symbol', {
            'default': 'XAUUSD',
            'help': 'Trading symbol',
        }),
        ('--horizons', {
            'nargs': '+',
            'default': ['5m_15min', '15m_60min'],
            'help': 'Horizons to train',
        }),
        ('--asymmetry-threshold', {
            'type': float,
            'default': 1.5,
            'help': 'Asymmetry threshold for opportunities',
        }),
        ('--min-move', {
            'type': float,
            'default': 3.0,
            'help': 'Minimum move in USD to consider',
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()
    return train_movement_predictor(
        symbol=options.symbol,
        horizons=options.horizons,
        asymmetry_threshold=options.asymmetry_threshold,
        min_move_usd=options.min_move,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()