#!/usr/bin/env python3
"""
ML-First Training Pipeline
===========================

Complete training pipeline for ML-First strategy.
Trains all models with proper temporal validation:
- RangePredictorV2 (multi-timeframe)
- AMDDetectorML (phase detection)
- Walk-forward validation
- OOS evaluation (2025 data never seen in training)

Usage:
    python scripts/train_ml_first.py --symbol XAUUSD --timeframes 15m,1H
    python scripts/train_ml_first.py --full-training

Author: ML-Specialist (NEXUS v4.0)
Created: 2026-01-04
"""

import os
import sys
import argparse
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple

import pandas as pd
import numpy as np
import yaml
import json
from loguru import logger

# Add src to path so project-local packages resolve when run as a script
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from data.database import DatabaseManager
from data.pipeline import DataPipeline
from training.data_splitter import TemporalDataSplitter, create_ml_first_splits
from training.walk_forward import WalkForwardValidator
from models.range_predictor_v2 import RangePredictorV2, RangeMetricsV2
from models.amd_detector_ml import AMDDetectorML, AMDMetrics

# Configure logging: concise console sink + verbose daily-rotated file sink
logger.remove()
logger.add(
    sys.stdout,
    format="{time:HH:mm:ss} | {level: <8} | {message}",
    level="INFO"
)
logger.add(
    "logs/training_{time}.log",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
    level="DEBUG",
    rotation="1 day"
)


class MLFirstTrainer:
    """
    Complete ML training pipeline for ML-First strategy.

    Handles:
    - Data loading and preparation
    - Temporal splitting (2025 excluded for OOS)
    - Multi-timeframe model training
    - Walk-forward validation
    - OOS evaluation
    - Model saving and reporting
    """

    # Raw OHLCV / identifier columns that must never be used as model features.
    _NON_FEATURE_COLS = ('open', 'high', 'low', 'close', 'volume', 'ticker')

    def __init__(
        self,
        output_dir: str = "models/ml_first",
        config_path: str = "config/validation_oos.yaml"
    ):
        """
        Initialize the ML trainer.

        Args:
            output_dir: Directory for saved models
            config_path: Path to validation configuration
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.config_path = config_path

        self.db_manager = DatabaseManager()
        self.data_pipeline = DataPipeline()
        self.splitter = TemporalDataSplitter(config_path)

        # Load validation/OOS configuration (thresholds, split dates, ...)
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)

        # Accumulated training results, serialized to JSON at the end
        self.results: Dict[str, Any] = {
            'timestamp': datetime.now().isoformat(),
            'models': {},
            'oos_results': {},
            'summary': {}
        }

    @classmethod
    def _split_columns(cls, df: pd.DataFrame) -> Tuple[List[str], List[str]]:
        """
        Partition DataFrame columns into (feature_cols, target_cols).

        Targets are identified by the 'target_' prefix; raw OHLCV/identifier
        columns are excluded from features. Shared by the range-predictor
        training and the walk-forward validation paths so the two stay
        consistent.
        """
        target_cols = [c for c in df.columns if c.startswith('target_')]
        feature_cols = [
            c for c in df.columns
            if not c.startswith('target_') and c not in cls._NON_FEATURE_COLS
        ]
        return feature_cols, target_cols

    def load_data(
        self,
        symbol: str,
        limit: int = 500000
    ) -> pd.DataFrame:
        """
        Load raw data from database.

        Args:
            symbol: Trading symbol to load
            limit: Maximum number of records to fetch

        Returns:
            Raw OHLCV DataFrame (datetime-indexed)

        Raises:
            ValueError: If no data exists for the symbol.
        """
        logger.info(f"Loading data for {symbol}...")
        df = self.db_manager.db.get_ticker_data(symbol, limit=limit)
        if df.empty:
            raise ValueError(f"No data found for {symbol}")

        logger.info(f"Loaded {len(df):,} records ({df.index.min()} to {df.index.max()})")
        # Show year distribution (helps verify the OOS year is present)
        self.splitter.print_data_summary(df)
        return df

    def prepare_features_targets(
        self,
        df: pd.DataFrame,
        timeframe: str = '15m'
    ) -> pd.DataFrame:
        """
        Prepare features and targets for training.

        Targets are built from the horizon configuration of the given
        timeframe (RangePredictorV2.TIMEFRAME_CONFIGS), so a DataFrame
        prepared for one timeframe is NOT valid training data for another.

        Args:
            df: Raw OHLCV DataFrame
            timeframe: Target timeframe

        Returns:
            DataFrame with features and targets (NaN rows dropped)
        """
        logger.info(f"Preparing features for timeframe {timeframe}...")

        # Process features using FeatureEngineer
        from data.features import FeatureEngineer
        feature_eng = FeatureEngineer()

        df_processed = df.copy()
        df_processed = feature_eng.create_price_features(df_processed)
        df_processed = feature_eng.create_volume_features(df_processed)
        df_processed = feature_eng.create_time_features(df_processed)
        df_processed = feature_eng.create_rolling_features(
            df_processed,
            columns=['close', 'volume', 'high', 'low'],
            windows=[5, 10, 20]
        )

        # Create targets based on timeframe config
        horizons = RangePredictorV2.TIMEFRAME_CONFIGS.get(timeframe)
        if horizons:
            for horizon_name, periods in horizons.horizons.items():
                # Max future high / min future low over the horizon window,
                # expressed as a relative delta vs. the current close
                future_highs = [df_processed['high'].shift(-i) for i in range(1, periods + 1)]
                future_lows = [df_processed['low'].shift(-i) for i in range(1, periods + 1)]
                df_processed[f'target_delta_high_{horizon_name}'] = (
                    pd.concat(future_highs, axis=1).max(axis=1) / df_processed['close'] - 1
                )
                df_processed[f'target_delta_low_{horizon_name}'] = (
                    pd.concat(future_lows, axis=1).min(axis=1) / df_processed['close'] - 1
                )
                # Binary direction target: close after `periods` bars vs. now
                df_processed[f'target_direction_{horizon_name}'] = (
                    df_processed['close'].shift(-periods) > df_processed['close']
                ).astype(int)

        # Drop NaN (rolling warm-up at the start, shifted targets at the end)
        df_processed = df_processed.dropna()
        logger.info(f"Prepared {len(df_processed):,} samples with {len(df_processed.columns)} columns")
        return df_processed

    def train_range_predictor(
        self,
        df: pd.DataFrame,
        timeframes: List[str],
        symbol: str
    ) -> Dict[str, Any]:
        """
        Train RangePredictorV2 for specified timeframes.

        Note: `df` must have been prepared (features + targets) for the
        timeframes being trained — see prepare_features_targets().

        Args:
            df: Prepared DataFrame with features and targets
            timeframes: List of timeframes to train
            symbol: Trading symbol

        Returns:
            Per-timeframe training results (metrics, model paths, sizes)
        """
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING RANGE PREDICTOR V2")
        logger.info(f"{'='*60}")

        # The temporal split depends only on `df`, not on the timeframe being
        # trained — compute it once instead of once per loop iteration.
        splits = create_ml_first_splits(df, self.config_path)
        train_df = splits['train']
        val_df = splits['val']
        test_df = splits['test_oos']

        feature_cols, target_cols = self._split_columns(train_df)

        X_train = train_df[feature_cols]
        y_train = train_df[target_cols]
        X_val = val_df[feature_cols]
        y_val = val_df[target_cols]
        X_test = test_df[feature_cols]
        y_test = test_df[target_cols]

        results = {}
        for tf in timeframes:
            logger.info(f"\n--- Timeframe: {tf} ---")
            logger.info(f"Train: {len(X_train):,}, Val: {len(X_val):,}, Test OOS: {len(X_test):,}")
            logger.info(f"Features: {len(feature_cols)}, Targets: {len(target_cols)}")

            # Initialize and train
            predictor = RangePredictorV2(timeframes=[tf], use_gpu=True)
            train_metrics = predictor.train(X_train, y_train, X_val, y_val, timeframe=tf)

            # Evaluate on OOS (data never seen during training/validation)
            logger.info("\n--- OOS Evaluation ---")
            oos_metrics = predictor.evaluate(X_test, y_test, timeframe=tf)

            # Save model
            model_path = self.output_dir / symbol / 'range_predictor' / tf
            predictor.save(str(model_path))

            # Store results (metric dataclasses flattened to plain dicts)
            results[tf] = {
                'train_metrics': {k: vars(v) for k, v in train_metrics.items()},
                'oos_metrics': {k: vars(v) for k, v in oos_metrics.items()},
                'model_path': str(model_path),
                'train_size': len(X_train),
                'val_size': len(X_val),
                'test_size': len(X_test)
            }

            # Print OOS summary
            logger.info("\nOOS Results:")
            for key, m in oos_metrics.items():
                logger.info(f" {key}: MAE={m.mae:.4f}, R2={m.r2:.4f}, DirAcc={m.directional_accuracy:.2%}")

        return results

    def train_amd_detector(
        self,
        df: pd.DataFrame,
        symbol: str
    ) -> Dict[str, Any]:
        """
        Train AMDDetectorML.

        Args:
            df: Raw OHLCV DataFrame
            symbol: Trading symbol

        Returns:
            Training results (metrics, model path, split sizes)
        """
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING AMD DETECTOR ML")
        logger.info(f"{'='*60}")

        # Create temporal split
        splits = create_ml_first_splits(df, self.config_path)
        train_df = splits['train']
        val_df = splits['val']
        test_df = splits['test_oos']
        logger.info(f"Train: {len(train_df):,}, Val: {len(val_df):,}, Test OOS: {len(test_df):,}")

        # Initialize and train
        detector = AMDDetectorML(use_gpu=True)
        train_metrics = detector.train(train_df, val_df)

        # Evaluate on OOS
        logger.info("\n--- OOS Evaluation ---")
        X_test, y_test = detector.prepare_training_data(test_df)
        y_pred = detector.model.predict(X_test.values)

        from sklearn.metrics import accuracy_score, f1_score
        oos_accuracy = accuracy_score(y_test.values, y_pred)
        # Weighted F1 because AMD phase classes are typically imbalanced
        oos_f1 = f1_score(y_test.values, y_pred, average='weighted')
        logger.info(f"OOS Accuracy: {oos_accuracy:.2%}")
        logger.info(f"OOS Weighted F1: {oos_f1:.4f}")

        # Save model
        model_path = self.output_dir / symbol / 'amd_detector'
        detector.save(str(model_path))

        return {
            'train_metrics': vars(train_metrics),
            'oos_metrics': {
                'accuracy': oos_accuracy,
                'weighted_f1': oos_f1
            },
            'model_path': str(model_path),
            'train_size': len(train_df),
            'test_size': len(test_df)
        }

    def run_walk_forward_validation(
        self,
        df: pd.DataFrame,
        timeframe: str,
        n_splits: int = 5
    ) -> Dict[str, Any]:
        """
        Run walk-forward validation for robustness testing.

        Args:
            df: Prepared DataFrame (features + targets for `timeframe`)
            timeframe: Timeframe to validate
            n_splits: Number of walk-forward splits

        Returns:
            Walk-forward results (per-split metrics + averages)
        """
        logger.info(f"\n{'='*60}")
        logger.info(f"WALK-FORWARD VALIDATION ({n_splits} splits)")
        logger.info(f"{'='*60}")

        # Get training data only (exclude OOS so 2025 stays unseen)
        train_data = self.splitter.get_training_data(df)

        validator = WalkForwardValidator(
            n_splits=n_splits,
            test_size=0.2,
            gap=0,
            expanding_window=False,
            min_train_size=5000
        )
        splits = validator.split(train_data)

        results_per_split = []
        for split in splits:
            logger.info(f"\n--- {split} ---")

            feature_cols, target_cols = self._split_columns(split.train_data)
            X_train = split.train_data[feature_cols]
            y_train = split.train_data[target_cols]
            X_val = split.val_data[feature_cols]
            y_val = split.val_data[target_cols]

            # Train predictor on this fold.
            # NOTE(review): timeframe= is now passed explicitly, matching the
            # call in train_range_predictor; previously it was omitted here.
            predictor = RangePredictorV2(timeframes=[timeframe], use_gpu=True)
            predictor.train(X_train, y_train, verbose=False, timeframe=timeframe)

            # Evaluate on the fold's validation slice
            val_metrics = predictor.evaluate(X_val, y_val, timeframe=timeframe)

            results_per_split.append({
                'split_id': split.split_id,
                'train_size': split.train_size,
                'val_size': split.val_size,
                'metrics': {k: vars(v) for k, v in val_metrics.items()}
            })

            for key, m in val_metrics.items():
                if hasattr(m, 'mae'):
                    logger.info(f" {key}: MAE={m.mae:.4f}, DirAcc={m.directional_accuracy:.2%}")

        # Aggregate metrics across all folds
        all_maes = []
        all_dir_accs = []
        for result in results_per_split:
            for key, metrics in result['metrics'].items():
                if 'mae' in metrics:
                    all_maes.append(metrics['mae'])
                if 'directional_accuracy' in metrics:
                    all_dir_accs.append(metrics['directional_accuracy'])

        avg_metrics = {
            'avg_mae': np.mean(all_maes) if all_maes else 0,
            'std_mae': np.std(all_maes) if all_maes else 0,
            'avg_dir_acc': np.mean(all_dir_accs) if all_dir_accs else 0,
            'std_dir_acc': np.std(all_dir_accs) if all_dir_accs else 0
        }
        logger.info(f"\n--- Walk-Forward Summary ---")
        logger.info(f"Avg MAE: {avg_metrics['avg_mae']:.4f} (+/- {avg_metrics['std_mae']:.4f})")
        logger.info(f"Avg DirAcc: {avg_metrics['avg_dir_acc']:.2%} (+/- {avg_metrics['std_dir_acc']:.2%})")

        return {
            'n_splits': n_splits,
            'splits': results_per_split,
            'avg_metrics': avg_metrics
        }

    def train_full_pipeline(
        self,
        symbol: str,
        timeframes: List[str],
        run_walk_forward: bool = True
    ) -> Dict[str, Any]:
        """
        Run complete training pipeline.

        Args:
            symbol: Trading symbol
            timeframes: List of timeframes to train
            run_walk_forward: Whether to run walk-forward validation

        Returns:
            Complete training results
        """
        logger.info(f"\n{'#'*70}")
        logger.info(f"ML-FIRST TRAINING PIPELINE")
        logger.info(f"Symbol: {symbol}")
        logger.info(f"Timeframes: {timeframes}")
        logger.info(f"{'#'*70}\n")

        # Load raw data
        df_raw = self.load_data(symbol)

        # Train Range Predictor. Targets depend on each timeframe's horizon
        # config, so features/targets are prepared per timeframe (previously
        # only timeframes[0]'s targets were built and reused for every
        # timeframe, training the others against the wrong horizons).
        range_results: Dict[str, Any] = {}
        df_first: Optional[pd.DataFrame] = None
        for tf in timeframes:
            df_tf = self.prepare_features_targets(df_raw, tf)
            if df_first is None:
                df_first = df_tf  # reused below for walk-forward
            range_results.update(self.train_range_predictor(df_tf, [tf], symbol))
        self.results['models']['range_predictor'] = range_results

        # Train AMD Detector (works on raw OHLCV, builds its own features)
        amd_results = self.train_amd_detector(df_raw, symbol)
        self.results['models']['amd_detector'] = amd_results

        # Walk-forward validation on the first timeframe
        if run_walk_forward and df_first is not None:
            wf_results = self.run_walk_forward_validation(df_first, timeframes[0])
            self.results['walk_forward'] = wf_results

        self._generate_summary()
        self._save_results(symbol)
        return self.results

    def _generate_summary(self):
        """Generate training summary and pass/fail verdict in self.results."""
        summary = {
            'total_models_trained': 0,
            'range_predictor': {},
            'amd_detector': {},
            'validation_passed': False
        }

        # Count models
        for tf_results in self.results['models'].get('range_predictor', {}).values():
            summary['total_models_trained'] += len(tf_results.get('train_metrics', {}))
        if self.results['models'].get('amd_detector'):
            summary['total_models_trained'] += 1

        # Get best directional accuracy from OOS
        best_dir_acc = 0
        for tf, data in self.results['models'].get('range_predictor', {}).items():
            for key, metrics in data.get('oos_metrics', {}).items():
                if 'directional_accuracy' in metrics:
                    best_dir_acc = max(best_dir_acc, metrics['directional_accuracy'])

        summary['best_oos_directional_accuracy'] = best_dir_acc
        # At least 60% for initial target
        # TODO(review): consider sourcing this threshold from
        # config['metrics_thresholds'] instead of hard-coding it.
        summary['validation_passed'] = best_dir_acc >= 0.60
        self.results['summary'] = summary

    def _save_results(self, symbol: str):
        """Save training results to JSON and log the final summary."""
        results_path = self.output_dir / symbol / 'training_results.json'
        results_path.parent.mkdir(parents=True, exist_ok=True)
        # default=str stringifies non-JSON types (Timestamps, numpy scalars)
        with open(results_path, 'w') as f:
            json.dump(self.results, f, indent=2, default=str)
        logger.info(f"\nResults saved to: {results_path}")

        # Print final summary
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING COMPLETE")
        logger.info(f"{'='*60}")
        logger.info(f"Total models trained: {self.results['summary']['total_models_trained']}")
        logger.info(f"Best OOS Directional Accuracy: {self.results['summary']['best_oos_directional_accuracy']:.2%}")
        logger.info(f"Validation passed: {self.results['summary']['validation_passed']}")


def main():
    """Main entry point: parse CLI arguments and run the training pipeline."""
    parser = argparse.ArgumentParser(
        description="ML-First Training Pipeline"
    )
    parser.add_argument(
        '--symbol', type=str, default='XAUUSD',
        help='Trading symbol (default: XAUUSD)'
    )
    parser.add_argument(
        '--timeframes', type=str, default='15m,1H',
        help='Comma-separated timeframes (default: 15m,1H)'
    )
    parser.add_argument(
        '--output-dir', type=str, default='models/ml_first',
        help='Output directory for models'
    )
    parser.add_argument(
        '--config', type=str, default='config/validation_oos.yaml',
        help='Validation config path'
    )
    parser.add_argument(
        '--skip-walk-forward', action='store_true',
        help='Skip walk-forward validation'
    )
    parser.add_argument(
        '--full-training', action='store_true',
        help='Train all timeframes'
    )
    args = parser.parse_args()

    # Parse timeframes (--full-training overrides --timeframes)
    if args.full_training:
        timeframes = ['5m', '15m', '1H', '4H', 'D']
    else:
        timeframes = args.timeframes.split(',')

    trainer = MLFirstTrainer(
        output_dir=args.output_dir,
        config_path=args.config
    )

    # Run training; re-raise so the process exits non-zero on failure
    try:
        trainer.train_full_pipeline(
            symbol=args.symbol,
            timeframes=timeframes,
            run_walk_forward=not args.skip_walk_forward
        )
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise


if __name__ == "__main__":
    main()