trading-platform-ml-engine-v2/scripts/train_reduced_features_models.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

882 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Reduced Features Model Training Script
=======================================
Trains ML models using the reduced 14-feature set with volatility-biased weighting.
Features Used (14 total):
- OHLCV: open, high, low, close, volume
- Indicators: ATR, SAR, RSI, MFI, OBV, AD, CMF
- Volume derived: volume_z, volume_anomaly
Key Improvements:
1. Reduced feature set (14 vs 50+) for better generalization
2. Volatility-biased sample weighting (step + smooth options)
3. Separate models per symbol and timeframe
4. Training data excludes 2025 (reserved for backtesting)
Usage:
python scripts/train_reduced_features_models.py
python scripts/train_reduced_features_models.py --symbols XAUUSD EURUSD BTCUSD
python scripts/train_reduced_features_models.py --timeframes 5m 15m
Author: ML-Specialist (NEXUS v4.0)
Version: 2.0.0
Created: 2026-01-05
"""
import argparse
import sys
import os
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional, Any
import json
import numpy as np
import pandas as pd
import joblib
from loguru import logger
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
# Local imports
from config.reduced_features import (
COLUMNS_TO_TRAIN,
ReducedFeatureConfig,
generate_reduced_features,
get_feature_columns_without_ohlcv
)
from models.volatility_attention import (
VolatilityAttentionConfig,
compute_factor_median_range,
compute_move_multiplier,
weight_smooth,
weight_step,
compute_attention_weights
)
# XGBoost is a hard dependency: every model trained below is an XGBRegressor,
# so abort immediately with an install hint rather than failing mid-pipeline.
try:
    from xgboost import XGBRegressor
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False
    logger.error("XGBoost not available - install with: pip install xgboost")
    sys.exit(1)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# ==============================================================================
# Configuration
# ==============================================================================
# Symbols to train
SUPPORTED_SYMBOLS = ['XAUUSD', 'EURUSD', 'BTCUSD']
# Symbol-specific configurations.
# base_factor: typical 5-minute bar range in quote units (volatility baseline);
# db_prefix: ticker prefix in the aggregates table (presumably 'C:' = currency
# feed, 'X:' = crypto feed -- TODO confirm against the data provider).
SYMBOL_CONFIGS = {
    'XAUUSD': {
        'base_factor': 5.0,  # ~5 USD typical 5m range
        'pip_value': 0.01,
        'typical_spread': 0.30,
        'db_prefix': 'C:'
    },
    'BTCUSD': {
        'base_factor': 100.0,  # ~100 USD typical 5m range
        'pip_value': 0.01,
        'typical_spread': 10.0,
        'db_prefix': 'X:'
    },
    'EURUSD': {
        'base_factor': 0.0005,  # ~5 pips typical 5m range
        'pip_value': 0.0001,
        'typical_spread': 0.0001,
        'db_prefix': 'C:'
    }
}
# Timeframes to train
TIMEFRAMES = ['5m', '15m']
# Horizon in bars (how far ahead to predict): 3 bars of each timeframe.
HORIZONS = {
    '5m': 3,   # 15 minutes ahead
    '15m': 3   # 45 minutes ahead
}
# Training configuration consumed by train_symbol_timeframe / train_model.
TRAINING_CONFIG = {
    # Data split
    'train_years': 5.0,
    'holdout_years': 1.0,  # 2025 excluded for backtesting
    'val_split': 0.15,  # last 15% of the training window used for validation
    'min_train_samples': 5000,
    # XGBoost hyperparameters
    'xgb_params': {
        'n_estimators': 300,
        'max_depth': 6,
        'learning_rate': 0.03,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 10,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'tree_method': 'hist',  # CPU histogram method (GPU forced off in train_model)
        'random_state': 42
    },
    # Volatility weighting (see compute_sample_weights)
    'use_volatility_weighting': True,
    'factor_window': 200,  # rolling window for the median-range factor
    'softplus_beta': 4.0,
    'softplus_w_max': 3.0,  # maximum sample weight
    'use_smooth_weights': True  # True for softplus, False for step
}
# ==============================================================================
# Database Connection
# ==============================================================================
def load_data_from_db(
    symbol: str,
    start_date: str = None,
    end_date: str = None,
    limit: int = None,
    db_config_path: str = 'config/database.yaml'
) -> pd.DataFrame:
    """
    Load OHLCV data from MySQL database.

    Falls back to synthetic data (create_sample_data) whenever the DB
    connection or the query fails, so the pipeline can still be exercised
    without database access.

    Args:
        symbol: Trading symbol (e.g., 'XAUUSD')
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)
        limit: Maximum records to fetch
        db_config_path: Path to database config

    Returns:
        DataFrame indexed by datetime with columns
        ['open', 'high', 'low', 'close', 'volume', 'vwap'].
    """
    try:
        from data.database import MySQLConnection
        db = MySQLConnection(db_config_path)
    except Exception as e:
        logger.error(f"Database connection failed: {e}")
        logger.info("Attempting to create sample data for testing...")
        return create_sample_data(symbol, start_date, end_date)
    # Get DB prefix for symbol (e.g. 'C:XAUUSD'); unknown symbols default to 'C:'.
    config = SYMBOL_CONFIGS.get(symbol, {'db_prefix': 'C:'})
    db_symbol = f"{config['db_prefix']}{symbol}"
    logger.info(f"Loading data for {db_symbol}...")
    query = """
    SELECT
    date_agg as time,
    open,
    high,
    low,
    close,
    volume,
    vwap
    FROM tickers_agg_data
    WHERE ticker = :symbol
    """
    params = {'symbol': db_symbol}
    # Date filters are optional; bound parameters keep them injection-safe.
    if start_date:
        query += " AND date_agg >= :start_date"
        params['start_date'] = start_date
    if end_date:
        query += " AND date_agg <= :end_date"
        params['end_date'] = end_date
    query += " ORDER BY date_agg ASC"
    if limit:
        # NOTE(review): limit is interpolated directly into the SQL text.
        # Safe only while callers pass an int -- confirm, or bind it as well.
        query += f" LIMIT {limit}"
    try:
        df = db.execute_query(query, params)
    except Exception as e:
        logger.error(f"Query failed: {e}")
        return create_sample_data(symbol, start_date, end_date)
    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df
    # Set datetime index
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df = df.sort_index()
    # Normalize column names (positional: relies on the SELECT column order above).
    df.columns = ['open', 'high', 'low', 'close', 'volume', 'vwap']
    logger.info(f"Loaded {len(df)} records for {symbol}")
    logger.info(f" Date range: {df.index.min()} to {df.index.max()}")
    return df
def create_sample_data(
    symbol: str,
    start_date: str = None,
    end_date: str = None,
    n_records: int = 100000
) -> pd.DataFrame:
    """Generate a synthetic 5-minute OHLCV frame for offline testing.

    Stand-in for the database when it is unreachable. The global NumPy
    seed is fixed, so repeated calls yield identical data.
    """
    logger.info(f"Creating sample data for {symbol}...")
    np.random.seed(42)
    idx = pd.date_range(
        start=pd.Timestamp(start_date or '2020-01-01'),
        end=pd.Timestamp(end_date or '2024-12-31'),
        freq='5min'
    )[:n_records]
    n = len(idx)
    # Per-symbol price level and bar-scale volatility.
    if symbol == 'XAUUSD':
        level, vol = 1800, 3.0
    elif symbol == 'BTCUSD':
        level, vol = 30000, 200.0
    else:
        level, vol = 1.10, 0.001
    # Random-walk mid price; OHLC scatter around it. The draws below happen
    # in the same order as the original implementation so the fixed seed
    # produces byte-identical frames.
    mid = level + np.cumsum(np.random.randn(n) * vol * 0.1)
    frame = pd.DataFrame(index=idx)
    frame['open'] = mid + np.random.randn(n) * vol * 0.2
    frame['high'] = mid + np.abs(np.random.randn(n)) * vol
    frame['low'] = mid - np.abs(np.random.randn(n)) * vol
    frame['close'] = mid + np.random.randn(n) * vol * 0.2
    frame['volume'] = np.random.randint(100, 10000, n)
    frame['vwap'] = mid
    # Enforce OHLC consistency: high/low must bound both open and close.
    frame['high'] = frame[['open', 'high', 'close']].max(axis=1)
    frame['low'] = frame[['open', 'low', 'close']].min(axis=1)
    logger.info(f"Created {len(frame)} sample records")
    return frame
# ==============================================================================
# Data Preparation
# ==============================================================================
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Aggregate 5-minute OHLCV bars up to a coarser timeframe.

    '5m' input is returned unchanged. Labels missing from the alias map
    are passed straight to pandas as the resample rule.
    """
    if timeframe == '5m':
        return df
    # Project timeframe label -> pandas offset alias.
    pandas_rule = {
        '15m': '15min',
        '30m': '30min',
        '1H': '1H',
        '4H': '4H',
        '1D': '1D'
    }.get(timeframe, timeframe)
    agg_spec = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }
    # Drop bars with no source data in the window (gaps/weekends).
    bars = df.resample(pandas_rule).agg(agg_spec).dropna()
    logger.info(f"Resampled to {timeframe}: {len(bars)} bars")
    return bars
def split_train_holdout(
    df: pd.DataFrame,
    holdout_years: float = 1.0,
    train_years: float = 5.0
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split data into training and holdout sets by time.

    The holdout is the most recent holdout_years of data (reserved for
    backtesting); training covers at most train_years immediately before
    it, clipped to the start of the available history.

    Args:
        df: DataFrame with datetime index
        holdout_years: Years to reserve for backtesting
        train_years: Maximum years for training

    Returns:
        Tuple of (train_df, holdout_df)
    """
    last = df.index.max()
    first = df.index.min()
    # Trailing window for holdout (365-day "years").
    holdout_start = last - timedelta(days=holdout_years * 365)
    # Training window immediately preceding the holdout, clipped to the data.
    train_start = max(holdout_start - timedelta(days=train_years * 365), first)
    train_df = df[(df.index >= train_start) & (df.index < holdout_start)].copy()
    holdout_df = df[df.index >= holdout_start].copy()
    logger.info(f"Data split:")
    logger.info(f" Training: {train_start.strftime('%Y-%m-%d')} to {holdout_start.strftime('%Y-%m-%d')} "
                f"({len(train_df)} samples)")
    logger.info(f" Holdout: {holdout_start.strftime('%Y-%m-%d')} to {last.strftime('%Y-%m-%d')} "
                f"({len(holdout_df)} samples)")
    return train_df, holdout_df
def compute_targets(
    df: pd.DataFrame,
    horizon_bars: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute range-prediction targets relative to the current close.

    Formula (for each bar t):
    - target_high = MAX(high[t+1 : t+horizon]) - close[t]
    - target_low  = close[t] - MIN(low[t+1 : t+horizon])

    The last horizon_bars entries have no complete future window and are
    left as NaN; callers mask them out before training.

    Args:
        df: DataFrame with 'high', 'low' and 'close' columns
        horizon_bars: Number of bars to look ahead (must be >= 1)

    Returns:
        Tuple of (target_high, target_low) float arrays, NaN-padded at the end.
    """
    close = df['close'].to_numpy(dtype=float)
    # Vectorized replacement for the per-row Python loop: a rolling max/min of
    # length horizon_bars ending at index i+horizon_bars covers exactly the
    # future window [i+1, i+horizon_bars]; shift(-horizon_bars) aligns that
    # value back to row i and leaves the incomplete tail as NaN.
    future_high = (
        df['high'].rolling(horizon_bars).max().shift(-horizon_bars).to_numpy()
    )
    future_low = (
        df['low'].rolling(horizon_bars).min().shift(-horizon_bars).to_numpy()
    )
    target_high = future_high - close
    target_low = close - future_low
    return target_high, target_low
def compute_sample_weights(
    df: pd.DataFrame,
    target_high: np.ndarray,
    target_low: np.ndarray,
    config: dict
) -> np.ndarray:
    """
    Compute volatility-biased sample weights for training.

    Samples whose total future move (|target_high| + |target_low|) is large
    relative to the local median range receive higher weight, biasing the
    model toward high-movement bars. Weights are mean-normalized over the
    valid (non-NaN) samples so the effective sample count is preserved.

    Args:
        df: DataFrame with OHLC data
        target_high: Target high values (NaN where unavailable)
        target_low: Target low values (NaN where unavailable)
        config: Training configuration (see TRAINING_CONFIG)

    Returns:
        Array of sample weights, same length as df; invalid samples get 1.0.
    """
    if not config.get('use_volatility_weighting', True):
        return np.ones(len(df))
    # Local volatility baseline: rolling median bar range.
    factor = compute_factor_median_range(
        df,
        window=config.get('factor_window', 200)
    )
    # Move multiplier: total future movement in units of the local baseline.
    total_target = np.abs(target_high) + np.abs(target_low)
    m = total_target / (factor.values + 1e-12)
    # Both weighting modes share the configured cap. Fix: the step branch
    # previously hard-coded w_max=3, ignoring 'softplus_w_max' in config.
    w_max = config.get('softplus_w_max', 3.0)
    if config.get('use_smooth_weights', True):
        weights = weight_smooth(
            m,
            w_max=w_max,
            beta=config.get('softplus_beta', 4.0)
        )
    else:
        weights = weight_step(m, w_max=w_max)
    # Neutral weight where the factor or weight is undefined (warm-up/NaN tail).
    nan_mask = np.isnan(weights) | np.isnan(factor.values)
    weights[nan_mask] = 1.0
    # Normalize valid weights to mean 1.0.
    valid_mask = ~nan_mask
    if valid_mask.sum() > 0 and weights[valid_mask].mean() > 0:
        weights[valid_mask] = weights[valid_mask] / weights[valid_mask].mean()
    logger.info(f"Sample weights computed:")
    logger.info(f" Mean multiplier: {np.nanmean(m):.2f}")
    logger.info(f" High attention (w>1.5): {(weights > 1.5).sum()} samples")
    return weights
# ==============================================================================
# Training Functions
# ==============================================================================
def train_model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    sample_weights: np.ndarray,
    config: dict
) -> Tuple[Any, dict]:
    """
    Fit one weighted XGBoost regressor and score it on the validation set.

    Args:
        X_train: Training features
        y_train: Training targets
        X_val: Validation features
        y_val: Validation targets
        sample_weights: Per-sample training weights
        config: Training configuration ('xgb_params' key)

    Returns:
        Tuple of (fitted model, metrics dict with mae/rmse/r2/
        directional_accuracy/n_train/n_val)
    """
    params = dict(config.get('xgb_params', {}))
    # Force CPU training (GPU requires special build)
    if params.get('tree_method') == 'gpu_hist':
        params['tree_method'] = 'hist'
    if 'device' in params:
        del params['device']
    regressor = XGBRegressor(**params)
    # Fit with sample weights; validation set is tracked during fitting.
    regressor.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    # Validation metrics. Directional accuracy = sign agreement between
    # prediction and target.
    preds = regressor.predict(X_val)
    metrics = {
        'mae': mean_absolute_error(y_val, preds),
        'rmse': np.sqrt(mean_squared_error(y_val, preds)),
        'r2': r2_score(y_val, preds),
        'directional_accuracy': np.mean(np.sign(y_val) == np.sign(preds)),
        'n_train': len(X_train),
        'n_val': len(X_val)
    }
    return regressor, metrics
def train_symbol_timeframe(
    symbol: str,
    timeframe: str,
    config: dict,
    db_config_path: str
) -> Dict[str, Any]:
    """
    Train HIGH and LOW range models for one symbol/timeframe combination.

    Pipeline: load 5m data (cut off at 2024-12-31) -> resample -> generate
    the reduced feature set -> drop the holdout period -> compute targets
    and volatility weights -> time-ordered train/val split -> fit one
    XGBoost model per target side (high / low).

    Args:
        symbol: Trading symbol
        timeframe: Timeframe label ('5m', '15m', ...)
        config: Training configuration (see TRAINING_CONFIG)
        db_config_path: Database config path

    Returns:
        Dict keyed by '{symbol}_{timeframe}_{side}_h{horizon}' with
        'model', 'metrics' and 'feature_columns' entries; empty dict when
        there is no/insufficient data.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"Training {symbol} {timeframe}")
    logger.info(f"{'='*60}")
    # Load raw data (5m)
    df_raw = load_data_from_db(
        symbol,
        end_date='2024-12-31',  # Exclude 2025
        db_config_path=db_config_path
    )
    if df_raw.empty:
        logger.warning(f"No data for {symbol}")
        return {}
    # Resample if needed (all coarser timeframes are built from the 5m base)
    if timeframe == '5m':
        df_tf = df_raw[['open', 'high', 'low', 'close', 'volume']].copy()
    else:
        df_tf = resample_to_timeframe(
            df_raw[['open', 'high', 'low', 'close', 'volume']],
            timeframe
        )
    if len(df_tf) < config.get('min_train_samples', 5000):
        logger.warning(f"Insufficient {timeframe} data: {len(df_tf)} rows")
        return {}
    # Generate reduced features
    logger.info("Generating reduced features (14 total)...")
    df_features = generate_reduced_features(df_tf)
    logger.info(f"Features shape: {df_features.shape}")
    logger.info(f"Features: {list(df_features.columns)}")
    # Split train/holdout; the holdout is discarded here (consumed by the
    # separate backtesting scripts instead).
    train_df, _ = split_train_holdout(
        df_features,
        holdout_years=config.get('holdout_years', 1.0),
        train_years=config.get('train_years', 5.0)
    )
    # Get horizon
    horizon = HORIZONS.get(timeframe, 3)
    # Compute targets
    target_high, target_low = compute_targets(train_df, horizon)
    # Compute sample weights
    sample_weights = compute_sample_weights(train_df, target_high, target_low, config)
    # Prepare features (excluding OHLCV for prediction, but keep for context)
    feature_cols = get_feature_columns_without_ohlcv()
    available_features = [c for c in feature_cols if c in train_df.columns]
    logger.info(f"Training features ({len(available_features)}): {available_features}")
    X = train_df[available_features].values
    # Remove invalid samples (NaN targets at the end of the series)
    valid_mask = ~(np.isnan(target_high) | np.isnan(target_low))
    X_valid = X[valid_mask]
    y_high_valid = target_high[valid_mask]
    y_low_valid = target_low[valid_mask]
    weights_valid = sample_weights[valid_mask]
    # Train/val split (time-based: last val_split fraction is validation,
    # never shuffled, to avoid look-ahead leakage)
    val_split = config.get('val_split', 0.15)
    split_idx = int(len(X_valid) * (1 - val_split))
    X_train, X_val = X_valid[:split_idx], X_valid[split_idx:]
    y_high_train, y_high_val = y_high_valid[:split_idx], y_high_valid[split_idx:]
    y_low_train, y_low_val = y_low_valid[:split_idx], y_low_valid[split_idx:]
    weights_train = weights_valid[:split_idx]
    results = {}
    # Train HIGH model (upward range extension beyond current close)
    logger.info(f"\nTraining HIGH model...")
    model_high, metrics_high = train_model(
        X_train, y_high_train, X_val, y_high_val,
        weights_train, config
    )
    key_high = f"{symbol}_{timeframe}_high_h{horizon}"
    results[key_high] = {
        'model': model_high,
        'metrics': metrics_high,
        'feature_columns': available_features
    }
    logger.info(f"HIGH: MAE={metrics_high['mae']:.4f}, R2={metrics_high['r2']:.4f}, "
                f"DirAcc={metrics_high['directional_accuracy']:.2%}")
    # Train LOW model (downward range extension below current close)
    logger.info(f"\nTraining LOW model...")
    model_low, metrics_low = train_model(
        X_train, y_low_train, X_val, y_low_val,
        weights_train, config
    )
    key_low = f"{symbol}_{timeframe}_low_h{horizon}"
    results[key_low] = {
        'model': model_low,
        'metrics': metrics_low,
        'feature_columns': available_features
    }
    logger.info(f"LOW: MAE={metrics_low['mae']:.4f}, R2={metrics_low['r2']:.4f}, "
                f"DirAcc={metrics_low['directional_accuracy']:.2%}")
    return results
# ==============================================================================
# Main Training Pipeline
# ==============================================================================
def train_all_models(
    symbols: List[str],
    timeframes: List[str],
    output_dir: Path,
    db_config_path: str
) -> Dict[str, Any]:
    """
    Train models for all symbol/timeframe combinations.

    Failures in one combination are logged and skipped so the rest still
    train. Models are persisted as individual joblib files, plus a joblib
    metadata bundle and a human-readable JSON summary.

    Args:
        symbols: List of symbols to train
        timeframes: List of timeframes
        output_dir: Output directory for models
        db_config_path: Database config path

    Returns:
        Dict of metrics (augmented with 'feature_columns') per model key.
    """
    logger.info("=" * 60)
    logger.info("Reduced Features Model Training")
    logger.info("=" * 60)
    logger.info(f"Features: {COLUMNS_TO_TRAIN}")
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"Cutoff: 2024-12-31 (2025 reserved for backtesting)")
    all_results = {}
    all_models = {}
    for symbol in symbols:
        for timeframe in timeframes:
            try:
                results = train_symbol_timeframe(
                    symbol, timeframe,
                    TRAINING_CONFIG,
                    db_config_path
                )
                if results:
                    for key, data in results.items():
                        all_models[key] = data['model']
                        all_results[key] = data['metrics']
                        all_results[key]['feature_columns'] = data['feature_columns']
            except Exception as e:
                # Keep going: one bad symbol/timeframe must not abort the run.
                logger.error(f"Training failed for {symbol} {timeframe}: {e}")
                import traceback
                traceback.print_exc()
    # Save models, one joblib file per model key.
    model_dir = output_dir / 'reduced_features_models'
    model_dir.mkdir(parents=True, exist_ok=True)
    for key, model in all_models.items():
        model_path = model_dir / f"{key}.joblib"
        joblib.dump(model, model_path)
        logger.info(f"Saved model: {model_path}")
    # Save metadata bundle (features, config, metrics) alongside the models.
    metadata = {
        'features': COLUMNS_TO_TRAIN,
        'training_config': TRAINING_CONFIG,
        'results': all_results,
        'model_keys': list(all_models.keys()),
        'trained_at': datetime.now().isoformat()
    }
    metadata_path = model_dir / 'metadata.joblib'
    joblib.dump(metadata, metadata_path)
    # Save summary JSON. The 'model' filter is a defensive no-op here:
    # all_results holds metrics dicts, which never contain a 'model' key.
    summary_path = model_dir / 'training_summary.json'
    with open(summary_path, 'w') as f:
        json.dump({
            'features': COLUMNS_TO_TRAIN,
            'symbols': symbols,
            'timeframes': timeframes,
            'results': {k: {kk: vv for kk, vv in v.items() if kk != 'model'}
                        for k, v in all_results.items()},
            'trained_at': datetime.now().isoformat()
        }, f, indent=2, default=str)
    logger.info(f"\nModels saved to {model_dir}")
    return all_results
def generate_training_report(results: Dict, output_dir: Path) -> Path:
    """Generate a Markdown training report.

    Args:
        results: Mapping of model key -> metrics dict, as returned by
            train_all_models (must contain mae/rmse/r2/directional_accuracy/
            n_train/n_val per key).
        output_dir: Base output directory; the report is written under
            output_dir / 'reduced_features_models'.

    Returns:
        Path to the written report file.
    """
    report_dir = output_dir / 'reduced_features_models'
    # Robustness fix: create the target directory so the report can be
    # written even when this runs before/without train_all_models.
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / f"TRAINING_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    report = f"""# Reduced Features Model Training Report
**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Feature Set (14 Features)
| Category | Features |
|----------|----------|
| OHLCV | open, high, low, close, volume |
| Volatility | ATR |
| Trend | SAR |
| Momentum | RSI, MFI |
| Volume Flow | OBV, AD, CMF |
| Volume Derived | volume_z, volume_anomaly |
## Training Configuration
- **Training Data Cutoff:** 2024-12-31 (2025 reserved for backtesting)
- **Volatility Weighting:** Enabled (softplus, beta=4.0, w_max=3.0)
- **XGBoost:** n_estimators=300, max_depth=6, lr=0.03
## Results Summary
| Model | MAE | RMSE | R2 | Dir Accuracy | Train | Val |
|-------|-----|------|----|--------------| ----- | --- |
"""
    # One table row per trained model.
    for key, metrics in results.items():
        report += f"| {key} | {metrics['mae']:.6f} | {metrics['rmse']:.6f} | "
        report += f"{metrics['r2']:.4f} | {metrics['directional_accuracy']:.2%} | "
        report += f"{metrics['n_train']} | {metrics['n_val']} |\n"
    report += f"""
## Usage Example
```python
import joblib
from config.reduced_features import generate_reduced_features
# Load model
model_high = joblib.load('models/reduced_features_models/XAUUSD_15m_high_h3.joblib')
model_low = joblib.load('models/reduced_features_models/XAUUSD_15m_low_h3.joblib')
# Prepare features
features = generate_reduced_features(df_ohlcv)
feature_cols = ['ATR', 'SAR', 'RSI', 'MFI', 'OBV', 'AD', 'CMF', 'volume_z', 'volume_anomaly']
X = features[feature_cols].values
# Predict
pred_high = model_high.predict(X)
pred_low = model_low.predict(X)
```
## Notes
1. Models trained on data up to 2024-12-31
2. 2025 data reserved for out-of-sample backtesting
3. Volatility-biased weighting emphasizes high-movement samples
4. Reduced feature set (14) for better generalization
---
*Report generated by Reduced Features Training Pipeline*
"""
    with open(report_path, 'w') as f:
        f.write(report)
    logger.info(f"Report saved to {report_path}")
    return report_path
# ==============================================================================
# CLI Entry Point
# ==============================================================================
def main():
    """CLI entry point: parse arguments, configure logging, run training,
    and write the Markdown report. Exits with status 1 on failure."""
    parser = argparse.ArgumentParser(
        description='Train ML models with reduced 14-feature set'
    )
    parser.add_argument(
        '--symbols', nargs='+',
        default=['XAUUSD', 'EURUSD', 'BTCUSD'],
        help='Symbols to train (default: XAUUSD EURUSD BTCUSD)'
    )
    parser.add_argument(
        '--timeframes', nargs='+',
        default=['5m', '15m'],
        help='Timeframes to train (default: 5m 15m)'
    )
    parser.add_argument(
        '--output-dir', type=str,
        default='models/',
        help='Output directory for models'
    )
    parser.add_argument(
        '--db-config', type=str,
        default='config/database.yaml',
        help='Database configuration file'
    )
    args = parser.parse_args()
    # Setup paths relative to the repository root (parent of scripts/).
    script_dir = Path(__file__).parent.parent
    output_dir = script_dir / args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    # Setup logging: INFO to console, DEBUG to a timestamped rotating file.
    log_dir = output_dir / 'logs'
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"reduced_features_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    logger.info(f"Logging to {log_file}")
    # Run training
    try:
        results = train_all_models(
            symbols=args.symbols,
            timeframes=args.timeframes,
            output_dir=output_dir,
            db_config_path=str(script_dir / args.db_config)
        )
        # Generate report
        generate_training_report(results, output_dir)
        # Print summary
        logger.info("\n" + "=" * 60)
        logger.info("TRAINING COMPLETE!")
        logger.info("=" * 60)
        for key, metrics in results.items():
            logger.info(f"{key}:")
            logger.info(f" MAE={metrics['mae']:.6f}, R2={metrics['r2']:.4f}, "
                        f"DirAcc={metrics['directional_accuracy']:.2%}")
    except Exception as e:
        # Non-zero exit so schedulers/CI can detect the failure.
        logger.exception(f"Training failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()