Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
581 lines · 18 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
ML-First Training Pipeline
|
|
===========================
|
|
Complete training pipeline for ML-First strategy.
|
|
|
|
Trains all models with proper temporal validation:
|
|
- RangePredictorV2 (multi-timeframe)
|
|
- AMDDetectorML (phase detection)
|
|
- Walk-forward validation
|
|
- OOS evaluation (2025 data never seen in training)
|
|
|
|
Usage:
|
|
python scripts/train_ml_first.py --symbol XAUUSD --timeframes 15m,1H
|
|
python scripts/train_ml_first.py --full-training
|
|
|
|
Author: ML-Specialist (NEXUS v4.0)
|
|
Created: 2026-01-04
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Any
|
|
import pandas as pd
|
|
import numpy as np
|
|
import yaml
|
|
import json
|
|
from loguru import logger
|
|
|
|
# Add src to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from data.database import DatabaseManager
|
|
from data.pipeline import DataPipeline
|
|
from training.data_splitter import TemporalDataSplitter, create_ml_first_splits
|
|
from training.walk_forward import WalkForwardValidator
|
|
from models.range_predictor_v2 import RangePredictorV2, RangeMetricsV2
|
|
from models.amd_detector_ml import AMDDetectorML, AMDMetrics
|
|
|
|
|
|
# Configure logging
|
|
logger.remove()
|
|
logger.add(
|
|
sys.stdout,
|
|
format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
|
|
level="INFO"
|
|
)
|
|
logger.add(
|
|
"logs/training_{time}.log",
|
|
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
|
|
level="DEBUG",
|
|
rotation="1 day"
|
|
)
|
|
|
|
|
|
class MLFirstTrainer:
|
|
"""
|
|
Complete ML training pipeline for ML-First strategy.
|
|
|
|
Handles:
|
|
- Data loading and preparation
|
|
- Temporal splitting (2025 excluded for OOS)
|
|
- Multi-timeframe model training
|
|
- Walk-forward validation
|
|
- OOS evaluation
|
|
- Model saving and reporting
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
output_dir: str = "models/ml_first",
|
|
config_path: str = "config/validation_oos.yaml"
|
|
):
|
|
"""
|
|
Initialize the ML trainer.
|
|
|
|
Args:
|
|
output_dir: Directory for saved models
|
|
config_path: Path to validation configuration
|
|
"""
|
|
self.output_dir = Path(output_dir)
|
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
self.config_path = config_path
|
|
self.db_manager = DatabaseManager()
|
|
self.data_pipeline = DataPipeline()
|
|
self.splitter = TemporalDataSplitter(config_path)
|
|
|
|
# Load config
|
|
with open(config_path, 'r') as f:
|
|
self.config = yaml.safe_load(f)
|
|
|
|
# Training results
|
|
self.results: Dict[str, Any] = {
|
|
'timestamp': datetime.now().isoformat(),
|
|
'models': {},
|
|
'oos_results': {},
|
|
'summary': {}
|
|
}
|
|
|
|
def load_data(
|
|
self,
|
|
symbol: str,
|
|
limit: int = 500000
|
|
) -> pd.DataFrame:
|
|
"""Load raw data from database"""
|
|
logger.info(f"Loading data for {symbol}...")
|
|
df = self.db_manager.db.get_ticker_data(symbol, limit=limit)
|
|
|
|
if df.empty:
|
|
raise ValueError(f"No data found for {symbol}")
|
|
|
|
logger.info(f"Loaded {len(df):,} records ({df.index.min()} to {df.index.max()})")
|
|
|
|
# Show year distribution
|
|
self.splitter.print_data_summary(df)
|
|
|
|
return df
|
|
|
|
def prepare_features_targets(
|
|
self,
|
|
df: pd.DataFrame,
|
|
timeframe: str = '15m'
|
|
) -> pd.DataFrame:
|
|
"""
|
|
Prepare features and targets for training.
|
|
|
|
Args:
|
|
df: Raw OHLCV DataFrame
|
|
timeframe: Target timeframe
|
|
|
|
Returns:
|
|
DataFrame with features and targets
|
|
"""
|
|
logger.info(f"Preparing features for timeframe {timeframe}...")
|
|
|
|
# Process features using FeatureEngineer
|
|
from data.features import FeatureEngineer
|
|
feature_eng = FeatureEngineer()
|
|
|
|
df_processed = df.copy()
|
|
df_processed = feature_eng.create_price_features(df_processed)
|
|
df_processed = feature_eng.create_volume_features(df_processed)
|
|
df_processed = feature_eng.create_time_features(df_processed)
|
|
df_processed = feature_eng.create_rolling_features(
|
|
df_processed,
|
|
columns=['close', 'volume', 'high', 'low'],
|
|
windows=[5, 10, 20]
|
|
)
|
|
|
|
# Create targets based on timeframe config
|
|
horizons = RangePredictorV2.TIMEFRAME_CONFIGS.get(timeframe)
|
|
if horizons:
|
|
for horizon_name, periods in horizons.horizons.items():
|
|
# Future high/low
|
|
future_highs = [df_processed['high'].shift(-i) for i in range(1, periods + 1)]
|
|
future_lows = [df_processed['low'].shift(-i) for i in range(1, periods + 1)]
|
|
|
|
df_processed[f'target_delta_high_{horizon_name}'] = (
|
|
pd.concat(future_highs, axis=1).max(axis=1) / df_processed['close'] - 1
|
|
)
|
|
df_processed[f'target_delta_low_{horizon_name}'] = (
|
|
pd.concat(future_lows, axis=1).min(axis=1) / df_processed['close'] - 1
|
|
)
|
|
df_processed[f'target_direction_{horizon_name}'] = (
|
|
df_processed['close'].shift(-periods) > df_processed['close']
|
|
).astype(int)
|
|
|
|
# Drop NaN
|
|
df_processed = df_processed.dropna()
|
|
|
|
logger.info(f"Prepared {len(df_processed):,} samples with {len(df_processed.columns)} columns")
|
|
|
|
return df_processed
|
|
|
|
def train_range_predictor(
|
|
self,
|
|
df: pd.DataFrame,
|
|
timeframes: List[str],
|
|
symbol: str
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Train RangePredictorV2 for specified timeframes.
|
|
|
|
Args:
|
|
df: Prepared DataFrame with features and targets
|
|
timeframes: List of timeframes to train
|
|
symbol: Trading symbol
|
|
|
|
Returns:
|
|
Training results
|
|
"""
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info("TRAINING RANGE PREDICTOR V2")
|
|
logger.info(f"{'='*60}")
|
|
|
|
results = {}
|
|
|
|
for tf in timeframes:
|
|
logger.info(f"\n--- Timeframe: {tf} ---")
|
|
|
|
# Create temporal split
|
|
splits = create_ml_first_splits(df, self.config_path)
|
|
|
|
train_df = splits['train']
|
|
val_df = splits['val']
|
|
test_df = splits['test_oos']
|
|
|
|
# Separate features and targets
|
|
target_cols = [c for c in train_df.columns if c.startswith('target_')]
|
|
feature_cols = [c for c in train_df.columns if not c.startswith('target_') and c not in ['open', 'high', 'low', 'close', 'volume', 'ticker']]
|
|
|
|
X_train = train_df[feature_cols]
|
|
y_train = train_df[target_cols]
|
|
X_val = val_df[feature_cols]
|
|
y_val = val_df[target_cols]
|
|
X_test = test_df[feature_cols]
|
|
y_test = test_df[target_cols]
|
|
|
|
logger.info(f"Train: {len(X_train):,}, Val: {len(X_val):,}, Test OOS: {len(X_test):,}")
|
|
logger.info(f"Features: {len(feature_cols)}, Targets: {len(target_cols)}")
|
|
|
|
# Initialize and train
|
|
predictor = RangePredictorV2(timeframes=[tf], use_gpu=True)
|
|
train_metrics = predictor.train(X_train, y_train, X_val, y_val, timeframe=tf)
|
|
|
|
# Evaluate on OOS
|
|
logger.info("\n--- OOS Evaluation ---")
|
|
oos_metrics = predictor.evaluate(X_test, y_test, timeframe=tf)
|
|
|
|
# Save model
|
|
model_path = self.output_dir / symbol / 'range_predictor' / tf
|
|
predictor.save(str(model_path))
|
|
|
|
# Store results
|
|
results[tf] = {
|
|
'train_metrics': {k: vars(v) for k, v in train_metrics.items()},
|
|
'oos_metrics': {k: vars(v) for k, v in oos_metrics.items()},
|
|
'model_path': str(model_path),
|
|
'train_size': len(X_train),
|
|
'val_size': len(X_val),
|
|
'test_size': len(X_test)
|
|
}
|
|
|
|
# Print OOS summary
|
|
logger.info("\nOOS Results:")
|
|
for key, m in oos_metrics.items():
|
|
logger.info(f" {key}: MAE={m.mae:.4f}, R2={m.r2:.4f}, DirAcc={m.directional_accuracy:.2%}")
|
|
|
|
return results
|
|
|
|
def train_amd_detector(
|
|
self,
|
|
df: pd.DataFrame,
|
|
symbol: str
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Train AMDDetectorML.
|
|
|
|
Args:
|
|
df: Raw OHLCV DataFrame
|
|
symbol: Trading symbol
|
|
|
|
Returns:
|
|
Training results
|
|
"""
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info("TRAINING AMD DETECTOR ML")
|
|
logger.info(f"{'='*60}")
|
|
|
|
# Create temporal split
|
|
splits = create_ml_first_splits(df, self.config_path)
|
|
|
|
train_df = splits['train']
|
|
val_df = splits['val']
|
|
test_df = splits['test_oos']
|
|
|
|
logger.info(f"Train: {len(train_df):,}, Val: {len(val_df):,}, Test OOS: {len(test_df):,}")
|
|
|
|
# Initialize and train
|
|
detector = AMDDetectorML(use_gpu=True)
|
|
train_metrics = detector.train(train_df, val_df)
|
|
|
|
# Evaluate on OOS
|
|
logger.info("\n--- OOS Evaluation ---")
|
|
X_test, y_test = detector.prepare_training_data(test_df)
|
|
y_pred = detector.model.predict(X_test.values)
|
|
|
|
from sklearn.metrics import accuracy_score, f1_score
|
|
oos_accuracy = accuracy_score(y_test.values, y_pred)
|
|
oos_f1 = f1_score(y_test.values, y_pred, average='weighted')
|
|
|
|
logger.info(f"OOS Accuracy: {oos_accuracy:.2%}")
|
|
logger.info(f"OOS Weighted F1: {oos_f1:.4f}")
|
|
|
|
# Save model
|
|
model_path = self.output_dir / symbol / 'amd_detector'
|
|
detector.save(str(model_path))
|
|
|
|
return {
|
|
'train_metrics': vars(train_metrics),
|
|
'oos_metrics': {
|
|
'accuracy': oos_accuracy,
|
|
'weighted_f1': oos_f1
|
|
},
|
|
'model_path': str(model_path),
|
|
'train_size': len(train_df),
|
|
'test_size': len(test_df)
|
|
}
|
|
|
|
def run_walk_forward_validation(
|
|
self,
|
|
df: pd.DataFrame,
|
|
timeframe: str,
|
|
n_splits: int = 5
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Run walk-forward validation for robustness testing.
|
|
|
|
Args:
|
|
df: Prepared DataFrame
|
|
timeframe: Timeframe to validate
|
|
n_splits: Number of walk-forward splits
|
|
|
|
Returns:
|
|
Walk-forward results
|
|
"""
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info(f"WALK-FORWARD VALIDATION ({n_splits} splits)")
|
|
logger.info(f"{'='*60}")
|
|
|
|
# Get training data only (exclude OOS)
|
|
train_data = self.splitter.get_training_data(df)
|
|
|
|
# Initialize walk-forward validator
|
|
validator = WalkForwardValidator(
|
|
n_splits=n_splits,
|
|
test_size=0.2,
|
|
gap=0,
|
|
expanding_window=False,
|
|
min_train_size=5000
|
|
)
|
|
|
|
# Create splits
|
|
splits = validator.split(train_data)
|
|
|
|
results_per_split = []
|
|
|
|
for split in splits:
|
|
logger.info(f"\n--- {split} ---")
|
|
|
|
# Separate features and targets
|
|
target_cols = [c for c in split.train_data.columns if c.startswith('target_')]
|
|
feature_cols = [c for c in split.train_data.columns if not c.startswith('target_') and c not in ['open', 'high', 'low', 'close', 'volume', 'ticker']]
|
|
|
|
X_train = split.train_data[feature_cols]
|
|
y_train = split.train_data[target_cols]
|
|
X_val = split.val_data[feature_cols]
|
|
y_val = split.val_data[target_cols]
|
|
|
|
# Train predictor
|
|
predictor = RangePredictorV2(timeframes=[timeframe], use_gpu=True)
|
|
train_metrics = predictor.train(X_train, y_train, verbose=False)
|
|
|
|
# Evaluate
|
|
val_metrics = predictor.evaluate(X_val, y_val, timeframe=timeframe)
|
|
|
|
# Store metrics
|
|
split_result = {
|
|
'split_id': split.split_id,
|
|
'train_size': split.train_size,
|
|
'val_size': split.val_size,
|
|
'metrics': {k: vars(v) for k, v in val_metrics.items()}
|
|
}
|
|
results_per_split.append(split_result)
|
|
|
|
# Log summary
|
|
for key, m in val_metrics.items():
|
|
if hasattr(m, 'mae'):
|
|
logger.info(f" {key}: MAE={m.mae:.4f}, DirAcc={m.directional_accuracy:.2%}")
|
|
|
|
# Calculate average metrics
|
|
all_maes = []
|
|
all_dir_accs = []
|
|
for result in results_per_split:
|
|
for key, metrics in result['metrics'].items():
|
|
if 'mae' in metrics:
|
|
all_maes.append(metrics['mae'])
|
|
if 'directional_accuracy' in metrics:
|
|
all_dir_accs.append(metrics['directional_accuracy'])
|
|
|
|
avg_metrics = {
|
|
'avg_mae': np.mean(all_maes) if all_maes else 0,
|
|
'std_mae': np.std(all_maes) if all_maes else 0,
|
|
'avg_dir_acc': np.mean(all_dir_accs) if all_dir_accs else 0,
|
|
'std_dir_acc': np.std(all_dir_accs) if all_dir_accs else 0
|
|
}
|
|
|
|
logger.info(f"\n--- Walk-Forward Summary ---")
|
|
logger.info(f"Avg MAE: {avg_metrics['avg_mae']:.4f} (+/- {avg_metrics['std_mae']:.4f})")
|
|
logger.info(f"Avg DirAcc: {avg_metrics['avg_dir_acc']:.2%} (+/- {avg_metrics['std_dir_acc']:.2%})")
|
|
|
|
return {
|
|
'n_splits': n_splits,
|
|
'splits': results_per_split,
|
|
'avg_metrics': avg_metrics
|
|
}
|
|
|
|
def train_full_pipeline(
|
|
self,
|
|
symbol: str,
|
|
timeframes: List[str],
|
|
run_walk_forward: bool = True
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Run complete training pipeline.
|
|
|
|
Args:
|
|
symbol: Trading symbol
|
|
timeframes: List of timeframes to train
|
|
run_walk_forward: Whether to run walk-forward validation
|
|
|
|
Returns:
|
|
Complete training results
|
|
"""
|
|
logger.info(f"\n{'#'*70}")
|
|
logger.info(f"ML-FIRST TRAINING PIPELINE")
|
|
logger.info(f"Symbol: {symbol}")
|
|
logger.info(f"Timeframes: {timeframes}")
|
|
logger.info(f"{'#'*70}\n")
|
|
|
|
# Load raw data
|
|
df_raw = self.load_data(symbol)
|
|
|
|
# Prepare features and targets
|
|
df = self.prepare_features_targets(df_raw, timeframes[0])
|
|
|
|
# Train Range Predictor
|
|
range_results = self.train_range_predictor(df, timeframes, symbol)
|
|
self.results['models']['range_predictor'] = range_results
|
|
|
|
# Train AMD Detector
|
|
amd_results = self.train_amd_detector(df_raw, symbol)
|
|
self.results['models']['amd_detector'] = amd_results
|
|
|
|
# Walk-forward validation
|
|
if run_walk_forward:
|
|
wf_results = self.run_walk_forward_validation(df, timeframes[0])
|
|
self.results['walk_forward'] = wf_results
|
|
|
|
# Generate summary
|
|
self._generate_summary()
|
|
|
|
# Save results
|
|
self._save_results(symbol)
|
|
|
|
return self.results
|
|
|
|
def _generate_summary(self):
|
|
"""Generate training summary"""
|
|
summary = {
|
|
'total_models_trained': 0,
|
|
'range_predictor': {},
|
|
'amd_detector': {},
|
|
'validation_passed': False
|
|
}
|
|
|
|
# Count models
|
|
for tf_results in self.results['models'].get('range_predictor', {}).values():
|
|
summary['total_models_trained'] += len(tf_results.get('train_metrics', {}))
|
|
|
|
if self.results['models'].get('amd_detector'):
|
|
summary['total_models_trained'] += 1
|
|
|
|
# Check if validation passed (based on config thresholds)
|
|
thresholds = self.config.get('metrics_thresholds', {})
|
|
win_rate_target = thresholds.get('win_rate_target', 0.80)
|
|
|
|
# Get best directional accuracy from OOS
|
|
best_dir_acc = 0
|
|
for tf, data in self.results['models'].get('range_predictor', {}).items():
|
|
for key, metrics in data.get('oos_metrics', {}).items():
|
|
if 'directional_accuracy' in metrics:
|
|
best_dir_acc = max(best_dir_acc, metrics['directional_accuracy'])
|
|
|
|
summary['best_oos_directional_accuracy'] = best_dir_acc
|
|
summary['validation_passed'] = best_dir_acc >= 0.60 # At least 60% for initial target
|
|
|
|
self.results['summary'] = summary
|
|
|
|
def _save_results(self, symbol: str):
|
|
"""Save training results to file"""
|
|
results_path = self.output_dir / symbol / 'training_results.json'
|
|
results_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(results_path, 'w') as f:
|
|
json.dump(self.results, f, indent=2, default=str)
|
|
|
|
logger.info(f"\nResults saved to: {results_path}")
|
|
|
|
# Print final summary
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info("TRAINING COMPLETE")
|
|
logger.info(f"{'='*60}")
|
|
logger.info(f"Total models trained: {self.results['summary']['total_models_trained']}")
|
|
logger.info(f"Best OOS Directional Accuracy: {self.results['summary']['best_oos_directional_accuracy']:.2%}")
|
|
logger.info(f"Validation passed: {self.results['summary']['validation_passed']}")
|
|
|
|
|
|
def main():
|
|
"""Main entry point"""
|
|
parser = argparse.ArgumentParser(
|
|
description="ML-First Training Pipeline"
|
|
)
|
|
parser.add_argument(
|
|
'--symbol',
|
|
type=str,
|
|
default='XAUUSD',
|
|
help='Trading symbol (default: XAUUSD)'
|
|
)
|
|
parser.add_argument(
|
|
'--timeframes',
|
|
type=str,
|
|
default='15m,1H',
|
|
help='Comma-separated timeframes (default: 15m,1H)'
|
|
)
|
|
parser.add_argument(
|
|
'--output-dir',
|
|
type=str,
|
|
default='models/ml_first',
|
|
help='Output directory for models'
|
|
)
|
|
parser.add_argument(
|
|
'--config',
|
|
type=str,
|
|
default='config/validation_oos.yaml',
|
|
help='Validation config path'
|
|
)
|
|
parser.add_argument(
|
|
'--skip-walk-forward',
|
|
action='store_true',
|
|
help='Skip walk-forward validation'
|
|
)
|
|
parser.add_argument(
|
|
'--full-training',
|
|
action='store_true',
|
|
help='Train all timeframes'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Parse timeframes
|
|
if args.full_training:
|
|
timeframes = ['5m', '15m', '1H', '4H', 'D']
|
|
else:
|
|
timeframes = args.timeframes.split(',')
|
|
|
|
# Initialize trainer
|
|
trainer = MLFirstTrainer(
|
|
output_dir=args.output_dir,
|
|
config_path=args.config
|
|
)
|
|
|
|
# Run training
|
|
try:
|
|
results = trainer.train_full_pipeline(
|
|
symbol=args.symbol,
|
|
timeframes=timeframes,
|
|
run_walk_forward=not args.skip_walk_forward
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Training failed: {e}")
|
|
raise
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|