#!/usr/bin/env python3
"""
Train Movement Magnitude Predictor
==================================

Trains the new MovementMagnitudePredictor model for asymmetric trading
opportunities.

Horizons:
- 5m candles -> 15 min prediction (3 bars)
- 15m candles -> 60 min prediction (4 bars)

Author: ML-Specialist (NEXUS v4.0)
Date: 2026-01-04
"""

import sys
# 'src' is resolved relative to the current working directory; it has to be on
# sys.path before the project-local imports further down can succeed.
sys.path.insert(0, 'src')

import numpy as np
import pandas as pd
from pathlib import Path
from datetime import date, datetime
from typing import List, Optional
import yaml
import json
from loguru import logger
import argparse

from data.database import MySQLConnection
from data.features import FeatureEngineer
from training.data_splitter import TemporalDataSplitter
from models.movement_magnitude_predictor import (
    MovementMagnitudePredictor,
    calculate_standard_variance
)

# Supported timeframe labels -> pandas resample rule.
_RESAMPLE_RULES = {
    '5m': '5min',
    '15m': '15min',
    '1H': '1h',
}

# How each OHLCV column is folded when several bars are merged into one.
_OHLCV_AGGREGATION = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}


def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Resample minute data to desired timeframe.

    Args:
        df: Frame with 'open', 'high', 'low', 'close' and 'volume' columns.
            If its index is not already a DatetimeIndex it is parsed with
            ``pd.to_datetime``; the caller's frame is left untouched.
        timeframe: One of '5m', '15m' or '1H'.

    Returns:
        A new OHLCV frame at the requested timeframe, with rows containing
        missing values dropped.

    Raises:
        ValueError: If ``timeframe`` is not a supported label.
    """
    rule = _RESAMPLE_RULES.get(timeframe)
    if rule is None:
        raise ValueError(f"Unknown timeframe: {timeframe}")

    # Ensure datetime index. set_axis returns a new object, so the frame the
    # caller passed in keeps its original index (the previous implementation
    # assigned to df.index and thereby mutated the argument).
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_axis(pd.to_datetime(df.index), axis=0)

    return df.resample(rule).agg(_OHLCV_AGGREGATION).dropna()


def _timeframe_for_horizon(horizon: str) -> str:
    """Return the candle timeframe ('5m' or '15m') used to train a horizon."""
    if horizon.startswith('5m'):
        return '5m'
    if not horizon.startswith('15m'):
        # Historical behaviour: everything that is not a 5m horizon is trained
        # on 15m bars. Keep that, but stop doing it silently.
        logger.warning(
            f"Unrecognised horizon prefix in '{horizon}'; falling back to 15m bars"
        )
    return '15m'


def _log_baseline_stats(heading: str, stats: dict) -> None:
    """Log one block of baseline movement statistics under the given heading."""
    logger.info(heading)
    logger.info(f" Mean range: ${stats['mean_range']:.2f}")
    logger.info(f" Std range: ${stats['std_range']:.2f}")
    logger.info(f" P75 range: ${stats['p75_range']:.2f}")
    logger.info(f" P90 range: ${stats['p90_range']:.2f}")
    logger.info(f" Mean high move: ${stats['mean_high_move']:.2f}")
    logger.info(f" Mean low move: ${stats['mean_low_move']:.2f}")


def _json_default(value):
    """Convert values the json module cannot encode natively.

    NumPy scalars/arrays and date-like objects are converted explicitly; any
    other unknown type still raises TypeError so that nothing gets stringified
    by accident.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, (datetime, date)):  # also covers pd.Timestamp
        return value.isoformat()
    if isinstance(value, Path):
        return str(value)
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def train_movement_predictor(
    symbol: str = "XAUUSD",
    horizons: Optional[List[str]] = None,
    asymmetry_threshold: float = 1.5,
    min_move_usd: float = 3.0
):
    """
    Train the MovementMagnitudePredictor model.

    For every requested horizon the training split is resampled to the
    matching candle timeframe, a predictor is fitted, evaluated on the test
    split, saved under ``models/ml_first/<symbol>/movement_predictor/<horizon>``
    and a few sample predictions are logged. A combined
    ``training_results.json`` is written next to the per-horizon models.

    Args:
        symbol: Trading symbol
        horizons: Which horizons to train (defaults to
            ['5m_15min', '15m_60min'] when None or empty)
        asymmetry_threshold: Threshold for opportunity detection
        min_move_usd: Minimum USD move to consider

    Returns:
        Dict keyed by horizon with 'train_metrics', 'oos_metrics' and
        'baseline_stats', or None when no data could be loaded.
    """
    horizons = horizons or ['5m_15min', '15m_60min']

    logger.info("=" * 60)
    logger.info("MOVEMENT MAGNITUDE PREDICTOR TRAINING")
    logger.info(f"Symbol: {symbol}")
    logger.info(f"Horizons: {horizons}")
    logger.info(f"Asymmetry Threshold: {asymmetry_threshold}")
    logger.info(f"Min Move USD: ${min_move_usd}")
    logger.info("=" * 60)

    # Load data from database
    logger.info("\nLoading data from database...")
    db = MySQLConnection('config/database.yaml')
    df_raw = db.get_ticker_data(symbol, limit=150000)

    # Guard against a loader that signals failure with None as well as with an
    # empty frame.
    if df_raw is None or df_raw.empty:
        logger.error("No data loaded")
        return None

    logger.info(f"Loaded {len(df_raw)} records")
    logger.info(f"Date range: {df_raw.index.min()} to {df_raw.index.max()}")

    # Split data temporally. NOTE(review): the original comment says 2025 is
    # held out for OOS; the actual cut is decided by TemporalDataSplitter.
    splitter = TemporalDataSplitter()
    split = splitter.split_temporal(df_raw)

    df_train = split.train_data
    df_test = split.test_data

    logger.info(f"\nTrain data: {len(df_train)} records ({df_train.index.min()} to {df_train.index.max()})")
    logger.info(f"Test OOS: {len(df_test)} records ({df_test.index.min()} to {df_test.index.max()})")

    # Calculate baseline statistics for Gold
    logger.info("\n" + "=" * 60)
    logger.info("BASELINE MOVEMENT STATISTICS")
    logger.info("=" * 60)

    # Stats for 5m timeframe
    df_5m = resample_to_timeframe(df_train, '5m')
    stats_5m = calculate_standard_variance(df_5m, '5m', lookback_periods=1000)
    _log_baseline_stats("\n5-minute bars (Gold):", stats_5m)

    # Stats for 15m timeframe
    df_15m = resample_to_timeframe(df_train, '15m')
    stats_15m = calculate_standard_variance(df_15m, '15m', lookback_periods=1000)
    _log_baseline_stats("\n15-minute bars (Gold):", stats_15m)

    # Train models for each horizon
    all_results = {}

    for horizon in horizons:
        logger.info("\n" + "=" * 60)
        logger.info(f"TRAINING: {horizon}")
        logger.info("=" * 60)

        # Get correct timeframe data. Frames are resampled afresh for every
        # horizon so each predictor works on its own objects.
        timeframe = _timeframe_for_horizon(horizon)
        df_train_tf = resample_to_timeframe(df_train, timeframe)
        df_test_tf = resample_to_timeframe(df_test, timeframe)

        logger.info(f"Train samples: {len(df_train_tf)}")
        logger.info(f"Test samples: {len(df_test_tf)}")

        # Initialize predictor for this horizon
        predictor = MovementMagnitudePredictor(
            horizons=[horizon],
            use_gpu=True,
            asymmetry_threshold=asymmetry_threshold,
            min_move_usd=min_move_usd
        )

        # Train
        train_metrics = predictor.fit(df_train_tf)

        # Evaluate OOS
        logger.info("\nEvaluating on OOS data (2025)...")
        oos_metrics = predictor.evaluate_oos(df_test_tf)

        # Store results
        all_results[horizon] = {
            'train_metrics': {k: v.to_dict() for k, v in train_metrics.items()},
            'oos_metrics': {k: v.to_dict() for k, v in oos_metrics.items()},
            'baseline_stats': predictor.baseline_stats
        }

        # Save model
        model_path = f"models/ml_first/{symbol}/movement_predictor/{horizon}"
        predictor.save(model_path)
        logger.info(f"Model saved to {model_path}")

        # Generate sample predictions: the last 20 bars are fed to the model
        # and the final 10 predictions it returns are logged.
        logger.info("\nSample predictions (last 10 bars of OOS):")
        predictions = predictor.predict(df_test_tf.tail(20))

        for pred in predictions[-10:]:
            logger.info(
                f" {pred.timestamp}: High=${pred.predicted_high_usd:.2f}, "
                f"Low=${pred.predicted_low_usd:.2f}, "
                f"Asymmetry={pred.asymmetry_ratio:.2f}, "
                f"Direction={pred.suggested_direction}, "
                f"RR={pred.suggested_rr:.1f}"
            )

    # Print summary
    print("\n" + "=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)

    for horizon, results in all_results.items():
        print(f"\n{horizon}:")
        print("-" * 40)

        if results['oos_metrics']:
            for key, metrics in results['oos_metrics'].items():
                print(f" {key}:")
                print(f" MAE: ${metrics['mae_usd']:.2f}")
                print(f" R²: {metrics['r2']:.4f}")
                print(f" Asymmetry Accuracy: {metrics['asymmetry_accuracy']:.2%}")

    # Save combined results
    output_path = Path(f"models/ml_first/{symbol}/movement_predictor")
    output_path.mkdir(parents=True, exist_ok=True)

    results_file = output_path / "training_results.json"
    payload = {
        'timestamp': datetime.now().isoformat(),
        'symbol': symbol,
        'horizons': horizons,
        'asymmetry_threshold': asymmetry_threshold,
        'min_move_usd': min_move_usd,
        'baseline_5m': stats_5m,
        'baseline_15m': stats_15m,
        'results': all_results
    }
    # The statistics and metrics may hold NumPy scalars/arrays, which the json
    # module rejects; without the converter the dump would fail only after all
    # the training work is done and leave a truncated file behind.
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, default=_json_default)

    logger.info(f"\nResults saved to {results_file}")

    return all_results


def main():
    """Parse command-line options and run training.

    Returns:
        Whatever ``train_movement_predictor`` returns (None when no data
        could be loaded).
    """
    parser = argparse.ArgumentParser(description='Train Movement Magnitude Predictor')
    parser.add_argument('--symbol', default='XAUUSD', help='Trading symbol')
    parser.add_argument('--horizons', nargs='+', default=['5m_15min', '15m_60min'],
                        help='Horizons to train')
    parser.add_argument('--asymmetry-threshold', type=float, default=1.5,
                        help='Asymmetry threshold for opportunities')
    parser.add_argument('--min-move', type=float, default=3.0,
                        help='Minimum move in USD to consider')

    args = parser.parse_args()

    results = train_movement_predictor(
        symbol=args.symbol,
        horizons=args.horizons,
        asymmetry_threshold=args.asymmetry_threshold,
        min_move_usd=args.min_move
    )

    return results


if __name__ == "__main__":
    # Report failure to the shell when training aborted (e.g. no data loaded)
    # instead of always exiting with status 0.
    sys.exit(0 if main() is not None else 1)