trading-platform-ml-engine-v2/scripts/train_ml_first.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

581 lines
18 KiB
Python

#!/usr/bin/env python3
"""
ML-First Training Pipeline
===========================
Complete training pipeline for ML-First strategy.
Trains all models with proper temporal validation:
- RangePredictorV2 (multi-timeframe)
- AMDDetectorML (phase detection)
- Walk-forward validation
- OOS evaluation (2025 data never seen in training)
Usage:
python scripts/train_ml_first.py --symbol XAUUSD --timeframes 15m,1H
python scripts/train_ml_first.py --full-training
Author: ML-Specialist (NEXUS v4.0)
Created: 2026-01-04
"""
import os
import sys
import argparse
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any
import pandas as pd
import numpy as np
import yaml
import json
from loguru import logger
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from data.database import DatabaseManager
from data.pipeline import DataPipeline
from training.data_splitter import TemporalDataSplitter, create_ml_first_splits
from training.walk_forward import WalkForwardValidator
from models.range_predictor_v2 import RangePredictorV2, RangeMetricsV2
from models.amd_detector_ml import AMDDetectorML, AMDMetrics
# Configure logging: drop loguru's default handler, then install two sinks —
# a colorized INFO-level console sink and a DEBUG-level file sink that
# rotates daily (one log file per training run timestamp).
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
    level="INFO"
)
logger.add(
    # {time} is expanded by loguru, giving each run its own log file.
    "logs/training_{time}.log",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
    level="DEBUG",
    rotation="1 day"
)
class MLFirstTrainer:
    """
    Complete ML training pipeline for the ML-First strategy.

    Handles:
    - Data loading and preparation
    - Temporal splitting (2025 excluded for OOS)
    - Multi-timeframe model training
    - Walk-forward validation
    - OOS evaluation
    - Model saving and reporting
    """

    # Raw OHLCV / identifier columns that must never be fed to the models
    # as features. Shared by train_range_predictor and
    # run_walk_forward_validation so the two code paths cannot drift apart
    # (the original duplicated this list inline in both methods).
    NON_FEATURE_COLS = ('open', 'high', 'low', 'close', 'volume', 'ticker')

    def __init__(
        self,
        output_dir: str = "models/ml_first",
        config_path: str = "config/validation_oos.yaml"
    ):
        """
        Initialize the ML trainer.

        Args:
            output_dir: Directory for saved models (created if missing)
            config_path: Path to validation configuration (YAML)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.config_path = config_path
        self.db_manager = DatabaseManager()
        self.data_pipeline = DataPipeline()
        self.splitter = TemporalDataSplitter(config_path)
        # Load validation config (split boundaries, metric thresholds).
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)
        # Accumulated training results; serialized by _save_results().
        self.results: Dict[str, Any] = {
            'timestamp': datetime.now().isoformat(),
            'models': {},
            'oos_results': {},
            'summary': {}
        }

    @classmethod
    def _split_features_targets(cls, df: pd.DataFrame):
        """Split a prepared DataFrame into (features, targets).

        Targets are all 'target_*' columns; features are every remaining
        column except the raw OHLCV/ticker columns in NON_FEATURE_COLS.

        Returns:
            Tuple of (X, y) DataFrames.
        """
        target_cols = [c for c in df.columns if c.startswith('target_')]
        feature_cols = [
            c for c in df.columns
            if not c.startswith('target_') and c not in cls.NON_FEATURE_COLS
        ]
        return df[feature_cols], df[target_cols]

    def load_data(
        self,
        symbol: str,
        limit: int = 500000
    ) -> pd.DataFrame:
        """Load raw ticker data from the database.

        Args:
            symbol: Trading symbol (e.g. 'XAUUSD')
            limit: Maximum number of records to load

        Returns:
            Raw OHLCV DataFrame.

        Raises:
            ValueError: If no data exists for the symbol.
        """
        logger.info(f"Loading data for {symbol}...")
        df = self.db_manager.db.get_ticker_data(symbol, limit=limit)
        if df.empty:
            raise ValueError(f"No data found for {symbol}")
        logger.info(f"Loaded {len(df):,} records ({df.index.min()} to {df.index.max()})")
        # Show year distribution
        self.splitter.print_data_summary(df)
        return df

    def prepare_features_targets(
        self,
        df: pd.DataFrame,
        timeframe: str = '15m'
    ) -> pd.DataFrame:
        """
        Prepare features and targets for training.

        Args:
            df: Raw OHLCV DataFrame
            timeframe: Target timeframe (keys RangePredictorV2.TIMEFRAME_CONFIGS)

        Returns:
            DataFrame with engineered features and 'target_*' columns;
            rows with NaNs (rolling warm-up / future-shift tail) dropped.
        """
        logger.info(f"Preparing features for timeframe {timeframe}...")
        # Process features using FeatureEngineer (project-local import kept
        # at function scope, mirroring the original).
        from data.features import FeatureEngineer
        feature_eng = FeatureEngineer()
        df_processed = df.copy()
        df_processed = feature_eng.create_price_features(df_processed)
        df_processed = feature_eng.create_volume_features(df_processed)
        df_processed = feature_eng.create_time_features(df_processed)
        df_processed = feature_eng.create_rolling_features(
            df_processed,
            columns=['close', 'volume', 'high', 'low'],
            windows=[5, 10, 20]
        )
        # Create targets based on the timeframe's horizon config.
        horizons = RangePredictorV2.TIMEFRAME_CONFIGS.get(timeframe)
        if horizons:
            for horizon_name, periods in horizons.horizons.items():
                # Max future high / min future low over the horizon window,
                # expressed as a relative delta from the current close.
                future_highs = [df_processed['high'].shift(-i) for i in range(1, periods + 1)]
                future_lows = [df_processed['low'].shift(-i) for i in range(1, periods + 1)]
                df_processed[f'target_delta_high_{horizon_name}'] = (
                    pd.concat(future_highs, axis=1).max(axis=1) / df_processed['close'] - 1
                )
                df_processed[f'target_delta_low_{horizon_name}'] = (
                    pd.concat(future_lows, axis=1).min(axis=1) / df_processed['close'] - 1
                )
                # Binary direction: did close rise over the full horizon?
                df_processed[f'target_direction_{horizon_name}'] = (
                    df_processed['close'].shift(-periods) > df_processed['close']
                ).astype(int)
        # Drop NaN rows introduced by rolling windows and future shifts.
        df_processed = df_processed.dropna()
        logger.info(f"Prepared {len(df_processed):,} samples with {len(df_processed.columns)} columns")
        return df_processed

    def train_range_predictor(
        self,
        df: pd.DataFrame,
        timeframes: List[str],
        symbol: str
    ) -> Dict[str, Any]:
        """
        Train RangePredictorV2 for specified timeframes.

        Args:
            df: Prepared DataFrame with features and targets
            timeframes: List of timeframes to train
            symbol: Trading symbol

        Returns:
            Per-timeframe training results (metrics, paths, sizes).
        """
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING RANGE PREDICTOR V2")
        logger.info(f"{'='*60}")
        results = {}
        # The temporal split depends only on df, so compute it once instead
        # of once per timeframe as the original did inside the loop.
        splits = create_ml_first_splits(df, self.config_path)
        X_train, y_train = self._split_features_targets(splits['train'])
        X_val, y_val = self._split_features_targets(splits['val'])
        X_test, y_test = self._split_features_targets(splits['test_oos'])
        for tf in timeframes:
            logger.info(f"\n--- Timeframe: {tf} ---")
            logger.info(f"Train: {len(X_train):,}, Val: {len(X_val):,}, Test OOS: {len(X_test):,}")
            logger.info(f"Features: {len(X_train.columns)}, Targets: {len(y_train.columns)}")
            # Initialize and train one predictor per timeframe.
            predictor = RangePredictorV2(timeframes=[tf], use_gpu=True)
            train_metrics = predictor.train(X_train, y_train, X_val, y_val, timeframe=tf)
            # Evaluate on the held-out OOS period never seen in training.
            logger.info("\n--- OOS Evaluation ---")
            oos_metrics = predictor.evaluate(X_test, y_test, timeframe=tf)
            # Persist the trained model under <output>/<symbol>/range_predictor/<tf>.
            model_path = self.output_dir / symbol / 'range_predictor' / tf
            predictor.save(str(model_path))
            # Store results (vars() flattens the metrics dataclasses to dicts).
            results[tf] = {
                'train_metrics': {k: vars(v) for k, v in train_metrics.items()},
                'oos_metrics': {k: vars(v) for k, v in oos_metrics.items()},
                'model_path': str(model_path),
                'train_size': len(X_train),
                'val_size': len(X_val),
                'test_size': len(X_test)
            }
            # Print OOS summary
            logger.info("\nOOS Results:")
            for key, m in oos_metrics.items():
                logger.info(f" {key}: MAE={m.mae:.4f}, R2={m.r2:.4f}, DirAcc={m.directional_accuracy:.2%}")
        return results

    def train_amd_detector(
        self,
        df: pd.DataFrame,
        symbol: str
    ) -> Dict[str, Any]:
        """
        Train AMDDetectorML (phase detection).

        Args:
            df: Raw OHLCV DataFrame (the detector engineers its own features)
            symbol: Trading symbol

        Returns:
            Training results with OOS accuracy / weighted F1.
        """
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING AMD DETECTOR ML")
        logger.info(f"{'='*60}")
        # Temporal split: the OOS period is excluded from training entirely.
        splits = create_ml_first_splits(df, self.config_path)
        train_df = splits['train']
        val_df = splits['val']
        test_df = splits['test_oos']
        logger.info(f"Train: {len(train_df):,}, Val: {len(val_df):,}, Test OOS: {len(test_df):,}")
        # Initialize and train
        detector = AMDDetectorML(use_gpu=True)
        train_metrics = detector.train(train_df, val_df)
        # Evaluate on OOS
        logger.info("\n--- OOS Evaluation ---")
        X_test, y_test = detector.prepare_training_data(test_df)
        y_pred = detector.model.predict(X_test.values)
        from sklearn.metrics import accuracy_score, f1_score
        oos_accuracy = accuracy_score(y_test.values, y_pred)
        # Weighted F1 accounts for class imbalance across phases.
        oos_f1 = f1_score(y_test.values, y_pred, average='weighted')
        logger.info(f"OOS Accuracy: {oos_accuracy:.2%}")
        logger.info(f"OOS Weighted F1: {oos_f1:.4f}")
        # Save model
        model_path = self.output_dir / symbol / 'amd_detector'
        detector.save(str(model_path))
        return {
            'train_metrics': vars(train_metrics),
            'oos_metrics': {
                'accuracy': oos_accuracy,
                'weighted_f1': oos_f1
            },
            'model_path': str(model_path),
            'train_size': len(train_df),
            'test_size': len(test_df)
        }

    def run_walk_forward_validation(
        self,
        df: pd.DataFrame,
        timeframe: str,
        n_splits: int = 5
    ) -> Dict[str, Any]:
        """
        Run walk-forward validation for robustness testing.

        Args:
            df: Prepared DataFrame (features + targets)
            timeframe: Timeframe to validate
            n_splits: Number of walk-forward splits

        Returns:
            Dict with per-split results and averaged metrics.
        """
        logger.info(f"\n{'='*60}")
        logger.info(f"WALK-FORWARD VALIDATION ({n_splits} splits)")
        logger.info(f"{'='*60}")
        # Validate only on training-era data (OOS period excluded).
        train_data = self.splitter.get_training_data(df)
        # Rolling (non-expanding) windows with no gap between train and val.
        validator = WalkForwardValidator(
            n_splits=n_splits,
            test_size=0.2,
            gap=0,
            expanding_window=False,
            min_train_size=5000
        )
        splits = validator.split(train_data)
        results_per_split = []
        for split in splits:
            logger.info(f"\n--- {split} ---")
            X_train, y_train = self._split_features_targets(split.train_data)
            X_val, y_val = self._split_features_targets(split.val_data)
            # Train a fresh predictor per split; only validation metrics
            # matter here, so the training metrics are discarded.
            predictor = RangePredictorV2(timeframes=[timeframe], use_gpu=True)
            predictor.train(X_train, y_train, verbose=False)
            val_metrics = predictor.evaluate(X_val, y_val, timeframe=timeframe)
            results_per_split.append({
                'split_id': split.split_id,
                'train_size': split.train_size,
                'val_size': split.val_size,
                'metrics': {k: vars(v) for k, v in val_metrics.items()}
            })
            # Log summary
            for key, m in val_metrics.items():
                if hasattr(m, 'mae'):
                    logger.info(f" {key}: MAE={m.mae:.4f}, DirAcc={m.directional_accuracy:.2%}")
        # Aggregate MAE / directional accuracy across all splits.
        all_maes = []
        all_dir_accs = []
        for result in results_per_split:
            for key, metrics in result['metrics'].items():
                if 'mae' in metrics:
                    all_maes.append(metrics['mae'])
                if 'directional_accuracy' in metrics:
                    all_dir_accs.append(metrics['directional_accuracy'])
        avg_metrics = {
            'avg_mae': np.mean(all_maes) if all_maes else 0,
            'std_mae': np.std(all_maes) if all_maes else 0,
            'avg_dir_acc': np.mean(all_dir_accs) if all_dir_accs else 0,
            'std_dir_acc': np.std(all_dir_accs) if all_dir_accs else 0
        }
        logger.info("\n--- Walk-Forward Summary ---")
        logger.info(f"Avg MAE: {avg_metrics['avg_mae']:.4f} (+/- {avg_metrics['std_mae']:.4f})")
        logger.info(f"Avg DirAcc: {avg_metrics['avg_dir_acc']:.2%} (+/- {avg_metrics['std_dir_acc']:.2%})")
        return {
            'n_splits': n_splits,
            'splits': results_per_split,
            'avg_metrics': avg_metrics
        }

    def train_full_pipeline(
        self,
        symbol: str,
        timeframes: List[str],
        run_walk_forward: bool = True
    ) -> Dict[str, Any]:
        """
        Run the complete training pipeline end to end.

        Args:
            symbol: Trading symbol
            timeframes: List of timeframes to train
            run_walk_forward: Whether to run walk-forward validation

        Returns:
            Complete training results (also kept in self.results).
        """
        logger.info(f"\n{'#'*70}")
        logger.info("ML-FIRST TRAINING PIPELINE")
        logger.info(f"Symbol: {symbol}")
        logger.info(f"Timeframes: {timeframes}")
        logger.info(f"{'#'*70}\n")
        # Load raw data
        df_raw = self.load_data(symbol)
        # NOTE(review): targets are built only for timeframes[0]; subsequent
        # timeframes are then trained against those same target columns —
        # confirm this is intended or build targets per timeframe.
        df = self.prepare_features_targets(df_raw, timeframes[0])
        # Train Range Predictor
        range_results = self.train_range_predictor(df, timeframes, symbol)
        self.results['models']['range_predictor'] = range_results
        # Train AMD Detector on the raw frame (it does its own prep).
        amd_results = self.train_amd_detector(df_raw, symbol)
        self.results['models']['amd_detector'] = amd_results
        # Walk-forward validation
        if run_walk_forward:
            wf_results = self.run_walk_forward_validation(df, timeframes[0])
            self.results['walk_forward'] = wf_results
        # Generate summary
        self._generate_summary()
        # Save results
        self._save_results(symbol)
        return self.results

    def _generate_summary(self):
        """Generate the training summary and the pass/fail verdict."""
        summary = {
            'total_models_trained': 0,
            'range_predictor': {},
            'amd_detector': {},
            'validation_passed': False
        }
        # Count models: one per trained metric entry per timeframe, plus
        # one for the AMD detector if it was trained.
        for tf_results in self.results['models'].get('range_predictor', {}).values():
            summary['total_models_trained'] += len(tf_results.get('train_metrics', {}))
        if self.results['models'].get('amd_detector'):
            summary['total_models_trained'] += 1
        # Best OOS directional accuracy across all timeframes and horizons.
        best_dir_acc = 0
        for tf, data in self.results['models'].get('range_predictor', {}).items():
            for key, metrics in data.get('oos_metrics', {}).items():
                if 'directional_accuracy' in metrics:
                    best_dir_acc = max(best_dir_acc, metrics['directional_accuracy'])
        summary['best_oos_directional_accuracy'] = best_dir_acc
        # NOTE(review): the original read metrics_thresholds.win_rate_target
        # from config but never used it; the pass criterion is the hard-coded
        # 60% initial target below. Wire the config threshold in when agreed.
        summary['validation_passed'] = best_dir_acc >= 0.60  # At least 60% for initial target
        self.results['summary'] = summary

    def _save_results(self, symbol: str):
        """Serialize self.results to JSON and log a final summary.

        Args:
            symbol: Trading symbol (determines the output subdirectory)
        """
        results_path = self.output_dir / symbol / 'training_results.json'
        results_path.parent.mkdir(parents=True, exist_ok=True)
        # default=str makes non-JSON types (timestamps, paths) serializable.
        with open(results_path, 'w') as f:
            json.dump(self.results, f, indent=2, default=str)
        logger.info(f"\nResults saved to: {results_path}")
        # Print final summary
        logger.info(f"\n{'='*60}")
        logger.info("TRAINING COMPLETE")
        logger.info(f"{'='*60}")
        logger.info(f"Total models trained: {self.results['summary']['total_models_trained']}")
        logger.info(f"Best OOS Directional Accuracy: {self.results['summary']['best_oos_directional_accuracy']:.2%}")
        logger.info(f"Validation passed: {self.results['summary']['validation_passed']}")
def main():
    """CLI entry point: parse arguments and run the full training pipeline.

    Returns:
        The training-results dict from MLFirstTrainer.train_full_pipeline
        (the original assigned it to an unused local; returning it lets
        programmatic callers inspect the outcome without changing CLI use).
    """
    parser = argparse.ArgumentParser(
        description="ML-First Training Pipeline"
    )
    parser.add_argument(
        '--symbol',
        type=str,
        default='XAUUSD',
        help='Trading symbol (default: XAUUSD)'
    )
    parser.add_argument(
        '--timeframes',
        type=str,
        default='15m,1H',
        help='Comma-separated timeframes (default: 15m,1H)'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='models/ml_first',
        help='Output directory for models'
    )
    parser.add_argument(
        '--config',
        type=str,
        default='config/validation_oos.yaml',
        help='Validation config path'
    )
    parser.add_argument(
        '--skip-walk-forward',
        action='store_true',
        help='Skip walk-forward validation'
    )
    parser.add_argument(
        '--full-training',
        action='store_true',
        help='Train all timeframes'
    )
    args = parser.parse_args()
    # --full-training overrides --timeframes with the full ladder.
    if args.full_training:
        timeframes = ['5m', '15m', '1H', '4H', 'D']
    else:
        timeframes = args.timeframes.split(',')
    # Initialize trainer
    trainer = MLFirstTrainer(
        output_dir=args.output_dir,
        config_path=args.config
    )
    try:
        return trainer.train_full_pipeline(
            symbol=args.symbol,
            timeframes=timeframes,
            run_walk_forward=not args.skip_walk_forward
        )
    except Exception as e:
        # logger.exception records the full traceback, not just the message;
        # re-raise so the process exits non-zero on failure.
        logger.exception(f"Training failed: {e}")
        raise


if __name__ == "__main__":
    main()