Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
314 lines · 11 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Simple Neural Gating Training Script.
|
|
|
|
Uses the existing HierarchicalPipeline to generate training data
|
|
and trains the Neural Gating metamodel as an alternative to XGBoost.
|
|
|
|
Usage:
|
|
python scripts/train_neural_gating_simple.py --symbol XAUUSD
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Add both root and src directories to path
|
|
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
sys.path.insert(0, root_dir)
|
|
sys.path.insert(0, os.path.join(root_dir, 'src'))
|
|
|
|
import argparse
|
|
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from loguru import logger
|
|
import joblib
|
|
|
|
# Configure logging
|
|
logger.remove()
|
|
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
|
|
|
|
|
|
def load_ohlcv_data(symbol: str, start_date: str, end_date: str, timeframe: str = '15m'):
|
|
"""Load OHLCV data from database."""
|
|
from data.database import MySQLConnection
|
|
|
|
# Map symbol to database ticker format
|
|
ticker_map = {
|
|
'XAUUSD': 'C:XAUUSD',
|
|
'EURUSD': 'C:EURUSD',
|
|
'GBPUSD': 'C:GBPUSD',
|
|
'USDJPY': 'C:USDJPY',
|
|
'BTCUSD': 'X:BTCUSD'
|
|
}
|
|
ticker = ticker_map.get(symbol, f'C:{symbol}')
|
|
|
|
db = MySQLConnection()
|
|
|
|
query = f"""
|
|
SELECT date_agg as timestamp, open, high, low, close, volume
|
|
FROM tickers_agg_data
|
|
WHERE ticker = '{ticker}'
|
|
AND date_agg >= '{start_date}' AND date_agg <= '{end_date}'
|
|
ORDER BY date_agg ASC
|
|
"""
|
|
|
|
df = pd.read_sql(query, db.engine)
|
|
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
|
df.set_index('timestamp', inplace=True)
|
|
|
|
# Resample
|
|
agg_dict = {
|
|
'open': 'first',
|
|
'high': 'max',
|
|
'low': 'min',
|
|
'close': 'last',
|
|
'volume': 'sum'
|
|
}
|
|
|
|
if timeframe == '5m':
|
|
df = df.resample('5min').agg(agg_dict).dropna()
|
|
elif timeframe == '15m':
|
|
df = df.resample('15min').agg(agg_dict).dropna()
|
|
|
|
return df.reset_index()
|
|
|
|
|
|
def generate_training_data(symbol: str):
|
|
"""Generate training data using HierarchicalPipeline."""
|
|
from src.pipelines.hierarchical_pipeline import HierarchicalPipeline, PipelineConfig
|
|
|
|
logger.info(f"Generating training data for {symbol}...")
|
|
|
|
# Initialize pipeline
|
|
config = PipelineConfig(
|
|
attention_model_path='models/attention',
|
|
base_model_path='models/symbol_timeframe_models',
|
|
metamodel_path='models/metamodels'
|
|
)
|
|
|
|
pipeline = HierarchicalPipeline(config)
|
|
|
|
if not pipeline.load_models(symbol):
|
|
raise ValueError(f"Failed to load models for {symbol}")
|
|
|
|
# Load OOS data (Jan 2024 - Aug 2024)
|
|
df_5m = load_ohlcv_data(symbol, '2024-01-01', '2024-08-31', '5m')
|
|
df_15m = load_ohlcv_data(symbol, '2024-01-01', '2024-08-31', '15m')
|
|
|
|
logger.info(f"Loaded data: 5m={len(df_5m)}, 15m={len(df_15m)}")
|
|
|
|
# Generate predictions and extract meta features
|
|
meta_features_list = []
|
|
targets_high = []
|
|
targets_low = []
|
|
|
|
# Process in batches to avoid memory issues
|
|
batch_size = 100
|
|
lookback = 200 # Features require lookback
|
|
|
|
for i in range(lookback, len(df_15m) - 1, batch_size):
|
|
batch_end = min(i + batch_size, len(df_15m) - 1)
|
|
|
|
for j in range(i, batch_end):
|
|
# Get feature windows
|
|
df_15m_window = df_15m.iloc[j-lookback:j+1].copy()
|
|
df_5m_idx = j * 3 # Approximate 5m index
|
|
|
|
if df_5m_idx + 1 >= len(df_5m):
|
|
continue
|
|
|
|
df_5m_window = df_5m.iloc[max(0, df_5m_idx-lookback*3):df_5m_idx+1].copy()
|
|
|
|
if len(df_5m_window) < 50 or len(df_15m_window) < 50:
|
|
continue
|
|
|
|
try:
|
|
# Generate features using pipeline's internal method
|
|
features_5m = pipeline._generate_features(df_5m_window)
|
|
features_15m = pipeline._generate_features(df_15m_window)
|
|
|
|
if features_5m is None or features_15m is None:
|
|
continue
|
|
|
|
# Get attention scores
|
|
att_5m, att_class_5m = pipeline.attention_models[f'{symbol}_5m'].predict_single(features_5m)
|
|
att_15m, att_class_15m = pipeline.attention_models[f'{symbol}_15m'].predict_single(features_15m)
|
|
|
|
# Get base predictions
|
|
base_feat_5m = np.concatenate([features_5m, [att_5m, att_class_5m]])
|
|
base_feat_15m = np.concatenate([features_15m, [att_15m, att_class_15m]])
|
|
|
|
pred_high_5m = pipeline.base_models[f'{symbol}_5m_high'].predict(base_feat_5m.reshape(1, -1))[0]
|
|
pred_low_5m = pipeline.base_models[f'{symbol}_5m_low'].predict(base_feat_5m.reshape(1, -1))[0]
|
|
pred_high_15m = pipeline.base_models[f'{symbol}_15m_high'].predict(base_feat_15m.reshape(1, -1))[0]
|
|
pred_low_15m = pipeline.base_models[f'{symbol}_15m_low'].predict(base_feat_15m.reshape(1, -1))[0]
|
|
|
|
# Context features
|
|
atr = df_15m_window['high'].iloc[-50:].values - df_15m_window['low'].iloc[-50:].values
|
|
atr_ratio = atr[-1] / np.median(atr) if np.median(atr) > 0 else 1.0
|
|
vol = df_15m_window['volume'].iloc[-20:].values
|
|
volume_z = (vol[-1] - np.mean(vol)) / (np.std(vol) + 1e-8)
|
|
|
|
# Meta features
|
|
meta_features_list.append({
|
|
'pred_high_5m': pred_high_5m,
|
|
'pred_low_5m': pred_low_5m,
|
|
'pred_high_15m': pred_high_15m,
|
|
'pred_low_15m': pred_low_15m,
|
|
'attention_5m': att_5m,
|
|
'attention_15m': att_15m,
|
|
'attention_class_5m': att_class_5m,
|
|
'attention_class_15m': att_class_15m,
|
|
'ATR_ratio': atr_ratio,
|
|
'volume_z': volume_z
|
|
})
|
|
|
|
# Targets (actual movement in next bar)
|
|
if j + 1 < len(df_15m):
|
|
next_bar = df_15m.iloc[j + 1]
|
|
current_close = df_15m.iloc[j]['close']
|
|
targets_high.append(next_bar['high'] - current_close)
|
|
targets_low.append(current_close - next_bar['low'])
|
|
else:
|
|
targets_high.append(np.nan)
|
|
targets_low.append(np.nan)
|
|
|
|
except Exception as e:
|
|
continue
|
|
|
|
if len(meta_features_list) % 500 == 0:
|
|
logger.info(f" Processed {len(meta_features_list)} samples...")
|
|
|
|
# Convert to arrays
|
|
meta_features = pd.DataFrame(meta_features_list)
|
|
target_high = np.array(targets_high[:len(meta_features)])
|
|
target_low = np.array(targets_low[:len(meta_features)])
|
|
|
|
# Remove NaN
|
|
valid_mask = ~np.isnan(target_high) & ~np.isnan(target_low)
|
|
meta_features = meta_features[valid_mask]
|
|
target_high = target_high[valid_mask]
|
|
target_low = target_low[valid_mask]
|
|
|
|
# Ensure non-negative targets
|
|
target_high = np.maximum(target_high, 0)
|
|
target_low = np.maximum(target_low, 0)
|
|
|
|
logger.info(f"Generated {len(meta_features)} training samples")
|
|
|
|
return meta_features, target_high, target_low
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Train Neural Gating Metamodel')
|
|
parser.add_argument('--symbol', type=str, default='XAUUSD', help='Symbol to train')
|
|
parser.add_argument('--epochs', type=int, default=50, help='Training epochs')
|
|
parser.add_argument('--compare', action='store_true', help='Compare with XGBoost')
|
|
args = parser.parse_args()
|
|
|
|
symbol = args.symbol
|
|
output_dir = Path('models/metamodels_neural')
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
logger.info("=" * 60)
|
|
logger.info(f"NEURAL GATING TRAINING - {symbol}")
|
|
logger.info("=" * 60)
|
|
|
|
# Check PyTorch
|
|
try:
|
|
import torch
|
|
logger.info(f"PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}")
|
|
except ImportError:
|
|
logger.error("PyTorch required!")
|
|
return
|
|
|
|
from src.models.neural_gating_metamodel import (
|
|
NeuralGatingMetamodelWrapper,
|
|
NeuralGatingConfig
|
|
)
|
|
|
|
# Check for cached training data
|
|
cache_path = output_dir / f'{symbol}_training_cache.joblib'
|
|
|
|
if cache_path.exists():
|
|
logger.info(f"Loading cached training data from {cache_path}")
|
|
cache = joblib.load(cache_path)
|
|
meta_features = cache['meta_features']
|
|
target_high = cache['target_high']
|
|
target_low = cache['target_low']
|
|
else:
|
|
# Generate training data
|
|
meta_features, target_high, target_low = generate_training_data(symbol)
|
|
|
|
# Cache for future use
|
|
joblib.dump({
|
|
'meta_features': meta_features,
|
|
'target_high': target_high,
|
|
'target_low': target_low
|
|
}, cache_path)
|
|
logger.info(f"Cached training data to {cache_path}")
|
|
|
|
logger.info(f"Training samples: {len(meta_features)}")
|
|
|
|
# Configure and train
|
|
config = NeuralGatingConfig(
|
|
epochs=args.epochs,
|
|
early_stopping_patience=10,
|
|
learning_rate=0.001,
|
|
batch_size=256,
|
|
gating_hidden_dims=[32, 16],
|
|
residual_hidden_dims=[64, 32],
|
|
confidence_hidden_dims=[32, 16],
|
|
dropout=0.2
|
|
)
|
|
|
|
model = NeuralGatingMetamodelWrapper(symbol, config)
|
|
model.fit(meta_features, target_high, target_low)
|
|
|
|
# Save
|
|
model_path = output_dir / symbol
|
|
model.save(str(model_path))
|
|
|
|
summary = model.get_training_summary()
|
|
|
|
# Compare with XGBoost
|
|
if args.compare:
|
|
from src.models.asset_metamodel import AssetMetamodel
|
|
|
|
xgb_path = Path(f'models/metamodels/{symbol}')
|
|
if xgb_path.exists():
|
|
xgb_model = AssetMetamodel.load(str(xgb_path))
|
|
xgb_summary = xgb_model.get_training_summary()
|
|
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info("COMPARISON: Neural Gating vs XGBoost")
|
|
logger.info(f"{'='*60}")
|
|
|
|
neural = summary['metrics']
|
|
xgb = xgb_summary['metrics']
|
|
|
|
logger.info(f"{'Metric':<25} {'Neural':<15} {'XGBoost':<15}")
|
|
logger.info("-" * 55)
|
|
|
|
neural_mae = (neural['mae_high'] + neural['mae_low']) / 2
|
|
xgb_mae = (xgb['mae_high'] + xgb['mae_low']) / 2
|
|
logger.info(f"{'MAE (avg)':<25} {neural_mae:<15.4f} {xgb_mae:<15.4f}")
|
|
|
|
neural_r2 = (neural['r2_high'] + neural['r2_low']) / 2
|
|
xgb_r2 = (xgb['r2_high'] + xgb['r2_low']) / 2
|
|
logger.info(f"{'R2 (avg)':<25} {neural_r2:<15.4f} {xgb_r2:<15.4f}")
|
|
|
|
logger.info(f"{'Alpha HIGH mean':<25} {neural['alpha_high_mean']:<15.3f} {'N/A':<15}")
|
|
logger.info(f"{'Alpha LOW mean':<25} {neural['alpha_low_mean']:<15.3f} {'N/A':<15}")
|
|
|
|
logger.info(f"\n{'='*60}")
|
|
logger.info("TRAINING COMPLETE")
|
|
logger.info(f"Model saved to: {model_path}")
|
|
logger.info(f"{'='*60}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|