Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
882 lines · 25 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Reduced Features Model Training Script
|
|
=======================================
|
|
Trains ML models using the reduced 14-feature set with volatility-biased weighting.
|
|
|
|
Features Used (14 total):
|
|
- OHLCV: open, high, low, close, volume
|
|
- Indicators: ATR, SAR, RSI, MFI, OBV, AD, CMF
|
|
- Volume derived: volume_z, volume_anomaly
|
|
|
|
Key Improvements:
|
|
1. Reduced feature set (14 vs 50+) for better generalization
|
|
2. Volatility-biased sample weighting (step + smooth options)
|
|
3. Separate models per symbol and timeframe
|
|
4. Training data excludes 2025 (reserved for backtesting)
|
|
|
|
Usage:
|
|
python scripts/train_reduced_features_models.py
|
|
python scripts/train_reduced_features_models.py --symbols XAUUSD EURUSD BTCUSD
|
|
python scripts/train_reduced_features_models.py --timeframes 5m 15m
|
|
|
|
Author: ML-Specialist (NEXUS v4.0)
|
|
Version: 2.0.0
|
|
Created: 2026-01-05
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Tuple, Optional, Any
|
|
import json
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import joblib
|
|
from loguru import logger
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
# Local imports
|
|
from config.reduced_features import (
|
|
COLUMNS_TO_TRAIN,
|
|
ReducedFeatureConfig,
|
|
generate_reduced_features,
|
|
get_feature_columns_without_ohlcv
|
|
)
|
|
from models.volatility_attention import (
|
|
VolatilityAttentionConfig,
|
|
compute_factor_median_range,
|
|
compute_move_multiplier,
|
|
weight_smooth,
|
|
weight_step,
|
|
compute_attention_weights
|
|
)
|
|
|
|
# XGBoost
|
|
try:
|
|
from xgboost import XGBRegressor
|
|
HAS_XGBOOST = True
|
|
except ImportError:
|
|
HAS_XGBOOST = False
|
|
logger.error("XGBoost not available - install with: pip install xgboost")
|
|
sys.exit(1)
|
|
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
|
|
|
|
# ==============================================================================
# Configuration
# ==============================================================================

# Symbols to train
SUPPORTED_SYMBOLS = ['XAUUSD', 'EURUSD', 'BTCUSD']

# Symbol-specific configurations.
#   base_factor: typical 5m bar range, used as the volatility scale in
#     sample weighting (see compute_sample_weights).
#   db_prefix: ticker prefix in the tickers_agg_data table
#     ('C:' vs 'X:' -- presumably currency vs crypto namespaces; verify
#     against the database's ticker naming scheme).
SYMBOL_CONFIGS = {
    'XAUUSD': {
        'base_factor': 5.0,  # ~5 USD typical 5m range
        'pip_value': 0.01,
        'typical_spread': 0.30,
        'db_prefix': 'C:'
    },
    'BTCUSD': {
        'base_factor': 100.0,  # ~100 USD typical 5m range
        'pip_value': 0.01,
        'typical_spread': 10.0,
        'db_prefix': 'X:'
    },
    'EURUSD': {
        'base_factor': 0.0005,  # ~5 pips typical 5m range
        'pip_value': 0.0001,
        'typical_spread': 0.0001,
        'db_prefix': 'C:'
    }
}

# Timeframes to train
TIMEFRAMES = ['5m', '15m']

# Horizon in bars (how far ahead to predict)
HORIZONS = {
    '5m': 3,   # 15 minutes ahead
    '15m': 3   # 45 minutes ahead
}

# Training configuration
TRAINING_CONFIG = {
    # Data split
    'train_years': 5.0,
    'holdout_years': 1.0,  # 2025 excluded for backtesting
    'val_split': 0.15,     # fraction of training data used for validation
    'min_train_samples': 5000,  # skip symbol/timeframe below this row count

    # XGBoost hyperparameters (passed verbatim to XGBRegressor)
    'xgb_params': {
        'n_estimators': 300,
        'max_depth': 6,
        'learning_rate': 0.03,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 10,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'tree_method': 'hist',
        'random_state': 42
    },

    # Volatility weighting (see compute_sample_weights)
    'factor_window': 200,   # rolling window for the median-range factor
    'use_volatility_weighting': True,
    'softplus_beta': 4.0,
    'softplus_w_max': 3.0,
    'use_smooth_weights': True  # True for softplus, False for step
}
|
|
|
|
|
|
# ==============================================================================
|
|
# Database Connection
|
|
# ==============================================================================
|
|
|
|
def load_data_from_db(
    symbol: str,
    start_date: str = None,
    end_date: str = None,
    limit: int = None,
    db_config_path: str = 'config/database.yaml'
) -> pd.DataFrame:
    """
    Load OHLCV data from MySQL database.

    Falls back to synthetic data (create_sample_data) when the database
    connection or the query fails, so the pipeline stays runnable
    without a live database.

    Args:
        symbol: Trading symbol (e.g., 'XAUUSD')
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)
        limit: Maximum records to fetch
        db_config_path: Path to database config

    Returns:
        DataFrame indexed by timestamp with columns
        ['open', 'high', 'low', 'close', 'volume', 'vwap'];
        empty DataFrame if the query returned no rows
    """
    try:
        from data.database import MySQLConnection
        db = MySQLConnection(db_config_path)
    except Exception as e:
        logger.error(f"Database connection failed: {e}")
        logger.info("Attempting to create sample data for testing...")
        return create_sample_data(symbol, start_date, end_date)

    # Get DB prefix for symbol (unknown symbols default to 'C:')
    config = SYMBOL_CONFIGS.get(symbol, {'db_prefix': 'C:'})
    db_symbol = f"{config['db_prefix']}{symbol}"

    logger.info(f"Loading data for {db_symbol}...")

    # NOTE(review): uses :name placeholders -- assumes
    # MySQLConnection.execute_query supports SQLAlchemy-style named
    # parameter binding; confirm against data.database.
    query = """
    SELECT
        date_agg as time,
        open,
        high,
        low,
        close,
        volume,
        vwap
    FROM tickers_agg_data
    WHERE ticker = :symbol
    """

    params = {'symbol': db_symbol}

    # Optional date-range filters are appended with bound parameters
    if start_date:
        query += " AND date_agg >= :start_date"
        params['start_date'] = start_date
    if end_date:
        query += " AND date_agg <= :end_date"
        params['end_date'] = end_date

    query += " ORDER BY date_agg ASC"

    # LIMIT is string-interpolated; only safe because callers pass an int
    if limit:
        query += f" LIMIT {limit}"

    try:
        df = db.execute_query(query, params)
    except Exception as e:
        logger.error(f"Query failed: {e}")
        return create_sample_data(symbol, start_date, end_date)

    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df

    # Set datetime index
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df = df.sort_index()

    # Normalize column names (positional rename -- relies on the SELECT
    # column order above staying in sync)
    df.columns = ['open', 'high', 'low', 'close', 'volume', 'vwap']

    logger.info(f"Loaded {len(df)} records for {symbol}")
    logger.info(f"  Date range: {df.index.min()} to {df.index.max()}")

    return df
|
|
|
|
|
|
def create_sample_data(
    symbol: str,
    start_date: str = None,
    end_date: str = None,
    n_records: int = 100000
) -> pd.DataFrame:
    """Create sample data for testing when database is unavailable.

    Produces a seeded random-walk OHLCV + vwap frame at 5-minute
    frequency; output is reproducible because the numpy RNG is seeded.
    """
    logger.info(f"Creating sample data for {symbol}...")

    np.random.seed(42)

    index = pd.date_range(
        start=pd.Timestamp(start_date or '2020-01-01'),
        end=pd.Timestamp(end_date or '2024-12-31'),
        freq='5min'
    )[:n_records]
    n = len(index)

    # Symbol-dependent price level and per-bar volatility scale
    level_and_vol = {
        'XAUUSD': (1800, 3.0),
        'BTCUSD': (30000, 200.0),
    }
    base_price, volatility = level_and_vol.get(symbol, (1.10, 0.001))

    # Random walk around the base price
    price = base_price + np.cumsum(np.random.randn(n) * volatility * 0.1)

    frame = pd.DataFrame({
        'open': price + np.random.randn(n) * volatility * 0.2,
        'high': price + np.abs(np.random.randn(n)) * volatility,
        'low': price - np.abs(np.random.randn(n)) * volatility,
        'close': price + np.random.randn(n) * volatility * 0.2,
        'volume': np.random.randint(100, 10000, n),
        'vwap': price
    }, index=index)

    # Enforce OHLC consistency: high is the bar max, low the bar min
    frame['high'] = frame[['open', 'high', 'close']].max(axis=1)
    frame['low'] = frame[['open', 'low', 'close']].min(axis=1)

    logger.info(f"Created {len(frame)} sample records")
    return frame
|
|
|
|
|
|
# ==============================================================================
|
|
# Data Preparation
|
|
# ==============================================================================
|
|
|
|
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Resample 5-minute data to a coarser timeframe (no-op for '5m')."""
    if timeframe == '5m':
        return df

    # Translate project timeframe labels to pandas offset aliases;
    # unknown labels are passed through to pandas unchanged.
    pandas_offset = {
        '15m': '15min',
        '30m': '30min',
        '1H': '1H',
        '4H': '4H',
        '1D': '1D'
    }.get(timeframe, timeframe)

    # Standard OHLCV aggregation; bars with no source rows are dropped
    agg_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }
    resampled = df.resample(pandas_offset).agg(agg_rules).dropna()

    logger.info(f"Resampled to {timeframe}: {len(resampled)} bars")
    return resampled
|
|
|
|
|
|
def split_train_holdout(
    df: pd.DataFrame,
    holdout_years: float = 1.0,
    train_years: float = 5.0
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split data into training and holdout sets.

    The holdout set is the final `holdout_years` of data (reserved for
    backtesting); the training set is at most `train_years` immediately
    before it, clamped to the available history.

    Args:
        df: DataFrame with datetime index
        holdout_years: Years to reserve for backtesting
        train_years: Maximum years for training

    Returns:
        Tuple of (train_df, holdout_df) copies
    """
    newest = df.index.max()
    oldest = df.index.min()

    # Holdout window covers the last `holdout_years` of data
    cutoff = newest - timedelta(days=holdout_years * 365)

    # Training window ends where the holdout begins and reaches back at
    # most `train_years`, never before the earliest available bar
    window_start = max(cutoff - timedelta(days=train_years * 365), oldest)

    train_df = df[(df.index >= window_start) & (df.index < cutoff)].copy()
    holdout_df = df[df.index >= cutoff].copy()

    logger.info(f"Data split:")
    logger.info(f"  Training: {window_start.strftime('%Y-%m-%d')} to {cutoff.strftime('%Y-%m-%d')} "
                f"({len(train_df)} samples)")
    logger.info(f"  Holdout: {cutoff.strftime('%Y-%m-%d')} to {newest.strftime('%Y-%m-%d')} "
                f"({len(holdout_df)} samples)")

    return train_df, holdout_df
|
|
|
|
|
|
def compute_targets(
    df: pd.DataFrame,
    horizon_bars: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute targets for range prediction over a future window.

    Formula:
        - target_high[t] = MAX(high[t+1 : t+horizon]) - close[t]
        - target_low[t]  = close[t] - MIN(low[t+1 : t+horizon])

    The last `horizon_bars` positions have no complete future window and
    are left as NaN. Non-positive horizons and horizons that do not fit
    in the data yield all-NaN targets (the previous loop version raised
    ValueError on horizon_bars == 0 via np.max of an empty slice).

    Args:
        df: DataFrame with 'high', 'low', 'close' columns
        horizon_bars: Number of bars to look ahead

    Returns:
        Tuple of (target_high, target_low) float arrays, each len(df)
    """
    close = df['close'].values
    high = df['high'].values
    low = df['low'].values
    n = len(df)

    target_high = np.full(n, np.nan)
    target_low = np.full(n, np.nan)

    # Guard: no complete future window fits anywhere
    if horizon_bars <= 0 or horizon_bars >= n:
        return target_high, target_low

    # Vectorized rolling extrema. sliding_window_view(x, h)[k] == x[k:k+h],
    # so row i+1 is the future window [t+1, t+horizon] for anchor bar i.
    win_high = np.lib.stride_tricks.sliding_window_view(high, horizon_bars)
    win_low = np.lib.stride_tricks.sliding_window_view(low, horizon_bars)

    valid = n - horizon_bars  # number of bars with a full future window
    target_high[:valid] = win_high[1:].max(axis=1) - close[:valid]
    target_low[:valid] = close[:valid] - win_low[1:].min(axis=1)

    return target_high, target_low
|
|
|
|
|
|
def compute_sample_weights(
    df: pd.DataFrame,
    target_high: np.ndarray,
    target_low: np.ndarray,
    config: dict
) -> np.ndarray:
    """
    Compute sample weights using volatility-based approach.

    Samples whose total future movement (|target_high| + |target_low|)
    is large relative to the rolling median bar range receive weights
    above 1, biasing training toward high-volatility periods.

    Args:
        df: DataFrame with OHLC data
        target_high: Target high values
        target_low: Target low values
        config: Training configuration

    Returns:
        Array of sample weights, mean-normalized over valid samples;
        NaN positions fall back to a neutral weight of 1.0
    """
    if not config.get('use_volatility_weighting', True):
        return np.ones(len(df))

    # Rolling volatility scale (presumably median bar range over the
    # window -- see models.volatility_attention.compute_factor_median_range)
    factor = compute_factor_median_range(
        df,
        window=config.get('factor_window', 200)
    )

    # Move multiplier: total target movement in units of the factor;
    # epsilon guards against division by a zero factor
    total_target = np.abs(target_high) + np.abs(target_low)
    m = total_target / (factor.values + 1e-12)

    # Map multiplier -> weight, smoothly (softplus) or stepwise
    if config.get('use_smooth_weights', True):
        weights = weight_smooth(
            m,
            w_max=config.get('softplus_w_max', 3.0),
            beta=config.get('softplus_beta', 4.0)
        )
    else:
        # NOTE(review): step path hard-codes w_max=3 instead of reading
        # config['softplus_w_max'] -- confirm this is intentional
        weights = weight_step(m, w_max=3)

    # Neutral weight wherever the weight or factor is undefined
    nan_mask = np.isnan(weights) | np.isnan(factor.values)
    weights[nan_mask] = 1.0

    # Normalize valid weights to mean 1 so the effective sample count is
    # unchanged by the weighting
    valid_mask = ~nan_mask
    if valid_mask.sum() > 0 and weights[valid_mask].mean() > 0:
        weights[valid_mask] = weights[valid_mask] / weights[valid_mask].mean()

    logger.info(f"Sample weights computed:")
    logger.info(f"  Mean multiplier: {np.nanmean(m):.2f}")
    logger.info(f"  High attention (w>1.5): {(weights > 1.5).sum()} samples")

    return weights
|
|
|
|
|
|
# ==============================================================================
|
|
# Training Functions
|
|
# ==============================================================================
|
|
|
|
def train_model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    sample_weights: np.ndarray,
    config: dict
) -> Tuple[Any, dict]:
    """
    Train a single XGBoost model.

    Fits an XGBRegressor with per-sample weights and reports validation
    metrics (MAE, RMSE, R2, directional accuracy).

    Args:
        X_train: Training features
        y_train: Training targets
        X_val: Validation features
        y_val: Validation targets
        sample_weights: Sample weights
        config: Training configuration

    Returns:
        Tuple of (model, metrics_dict)
    """
    params = dict(config.get('xgb_params', {}))

    # Force CPU training (GPU requires special build)
    if params.get('tree_method') == 'gpu_hist':
        params['tree_method'] = 'hist'
    params.pop('device', None)

    regressor = XGBRegressor(**params)

    # Fit with sample weights; eval_set enables per-round eval logging
    regressor.fit(
        X_train,
        y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Evaluate on the validation slice
    predictions = regressor.predict(X_val)

    scores = {
        'mae': mean_absolute_error(y_val, predictions),
        'rmse': np.sqrt(mean_squared_error(y_val, predictions)),
        'r2': r2_score(y_val, predictions),
        'directional_accuracy': np.mean(np.sign(y_val) == np.sign(predictions)),
        'n_train': len(X_train),
        'n_val': len(X_val)
    }

    return regressor, scores
|
|
|
|
|
|
def train_symbol_timeframe(
    symbol: str,
    timeframe: str,
    config: dict,
    db_config_path: str
) -> Dict[str, Any]:
    """
    Train models for a specific symbol and timeframe.

    Trains two XGBoost regressors: one predicting the future HIGH
    excursion and one the future LOW excursion over the timeframe's
    horizon (see HORIZONS and compute_targets).

    Args:
        symbol: Trading symbol
        timeframe: Timeframe
        config: Training configuration
        db_config_path: Database config path

    Returns:
        Dict keyed by '<symbol>_<timeframe>_<high|low>_h<horizon>', each
        value holding 'model', 'metrics' and 'feature_columns'; empty
        dict when data is missing or insufficient
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"Training {symbol} {timeframe}")
    logger.info(f"{'='*60}")

    # Load raw data (5m)
    df_raw = load_data_from_db(
        symbol,
        end_date='2024-12-31',  # Exclude 2025
        db_config_path=db_config_path
    )

    if df_raw.empty:
        logger.warning(f"No data for {symbol}")
        return {}

    # Resample if needed (vwap is dropped either way)
    if timeframe == '5m':
        df_tf = df_raw[['open', 'high', 'low', 'close', 'volume']].copy()
    else:
        df_tf = resample_to_timeframe(
            df_raw[['open', 'high', 'low', 'close', 'volume']],
            timeframe
        )

    if len(df_tf) < config.get('min_train_samples', 5000):
        logger.warning(f"Insufficient {timeframe} data: {len(df_tf)} rows")
        return {}

    # Generate reduced features
    logger.info("Generating reduced features (14 total)...")
    df_features = generate_reduced_features(df_tf)

    logger.info(f"Features shape: {df_features.shape}")
    logger.info(f"Features: {list(df_features.columns)}")

    # Split train/holdout; the holdout slice is discarded here (the
    # backtester recreates it from the same cutoff)
    train_df, _ = split_train_holdout(
        df_features,
        holdout_years=config.get('holdout_years', 1.0),
        train_years=config.get('train_years', 5.0)
    )

    # Get horizon (bars ahead to predict)
    horizon = HORIZONS.get(timeframe, 3)

    # Compute targets (NaN where no full future window exists)
    target_high, target_low = compute_targets(train_df, horizon)

    # Compute volatility-biased, mean-normalized sample weights
    sample_weights = compute_sample_weights(train_df, target_high, target_low, config)

    # Prepare features (excluding OHLCV for prediction, but keep for context)
    feature_cols = get_feature_columns_without_ohlcv()
    available_features = [c for c in feature_cols if c in train_df.columns]

    logger.info(f"Training features ({len(available_features)}): {available_features}")

    X = train_df[available_features].values

    # Remove invalid samples (NaN targets)
    valid_mask = ~(np.isnan(target_high) | np.isnan(target_low))
    X_valid = X[valid_mask]
    y_high_valid = target_high[valid_mask]
    y_low_valid = target_low[valid_mask]
    weights_valid = sample_weights[valid_mask]

    # Train/val split (time-based: validation is the most recent slice,
    # avoiding look-ahead leakage from a random split)
    val_split = config.get('val_split', 0.15)
    split_idx = int(len(X_valid) * (1 - val_split))

    X_train, X_val = X_valid[:split_idx], X_valid[split_idx:]
    y_high_train, y_high_val = y_high_valid[:split_idx], y_high_valid[split_idx:]
    y_low_train, y_low_val = y_low_valid[:split_idx], y_low_valid[split_idx:]
    weights_train = weights_valid[:split_idx]

    results = {}

    # Train HIGH model
    logger.info(f"\nTraining HIGH model...")
    model_high, metrics_high = train_model(
        X_train, y_high_train, X_val, y_high_val,
        weights_train, config
    )

    key_high = f"{symbol}_{timeframe}_high_h{horizon}"
    results[key_high] = {
        'model': model_high,
        'metrics': metrics_high,
        'feature_columns': available_features
    }

    logger.info(f"HIGH: MAE={metrics_high['mae']:.4f}, R2={metrics_high['r2']:.4f}, "
                f"DirAcc={metrics_high['directional_accuracy']:.2%}")

    # Train LOW model
    logger.info(f"\nTraining LOW model...")
    model_low, metrics_low = train_model(
        X_train, y_low_train, X_val, y_low_val,
        weights_train, config
    )

    key_low = f"{symbol}_{timeframe}_low_h{horizon}"
    results[key_low] = {
        'model': model_low,
        'metrics': metrics_low,
        'feature_columns': available_features
    }

    logger.info(f"LOW: MAE={metrics_low['mae']:.4f}, R2={metrics_low['r2']:.4f}, "
                f"DirAcc={metrics_low['directional_accuracy']:.2%}")

    return results
|
|
|
|
|
|
# ==============================================================================
|
|
# Main Training Pipeline
|
|
# ==============================================================================
|
|
|
|
def train_all_models(
    symbols: List[str],
    timeframes: List[str],
    output_dir: Path,
    db_config_path: str
) -> Dict[str, Any]:
    """
    Train models for all symbol/timeframe combinations.

    Persists each trained model as
    output_dir/reduced_features_models/<key>.joblib plus a joblib
    metadata bundle and a JSON training summary. A failure for one
    symbol/timeframe pair is logged and does not stop the others.

    Args:
        symbols: List of symbols to train
        timeframes: List of timeframes
        output_dir: Output directory for models
        db_config_path: Database config path

    Returns:
        Dict mapping model keys to their validation metrics (each entry
        also carries a 'feature_columns' list)
    """
    logger.info("=" * 60)
    logger.info("Reduced Features Model Training")
    logger.info("=" * 60)
    logger.info(f"Features: {COLUMNS_TO_TRAIN}")
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"Cutoff: 2024-12-31 (2025 reserved for backtesting)")

    all_results = {}
    all_models = {}

    for symbol in symbols:
        for timeframe in timeframes:
            try:
                results = train_symbol_timeframe(
                    symbol, timeframe,
                    TRAINING_CONFIG,
                    db_config_path
                )

                if results:
                    # Models and metrics are kept in separate maps; the
                    # metrics are augmented with the feature list so the
                    # report/summary can show which columns were used
                    for key, data in results.items():
                        all_models[key] = data['model']
                        all_results[key] = data['metrics']
                        all_results[key]['feature_columns'] = data['feature_columns']

            except Exception as e:
                # Isolate per-pair failures so remaining pairs still train
                logger.error(f"Training failed for {symbol} {timeframe}: {e}")
                import traceback
                traceback.print_exc()

    # Save models
    model_dir = output_dir / 'reduced_features_models'
    model_dir.mkdir(parents=True, exist_ok=True)

    for key, model in all_models.items():
        model_path = model_dir / f"{key}.joblib"
        joblib.dump(model, model_path)
        logger.info(f"Saved model: {model_path}")

    # Save metadata (full bundle, loadable by downstream tooling)
    metadata = {
        'features': COLUMNS_TO_TRAIN,
        'training_config': TRAINING_CONFIG,
        'results': all_results,
        'model_keys': list(all_models.keys()),
        'trained_at': datetime.now().isoformat()
    }

    metadata_path = model_dir / 'metadata.joblib'
    joblib.dump(metadata, metadata_path)

    # Save summary JSON (the kk != 'model' filter is defensive -- models
    # live in all_models, not in the metrics dicts)
    summary_path = model_dir / 'training_summary.json'
    with open(summary_path, 'w') as f:
        json.dump({
            'features': COLUMNS_TO_TRAIN,
            'symbols': symbols,
            'timeframes': timeframes,
            'results': {k: {kk: vv for kk, vv in v.items() if kk != 'model'}
                        for k, v in all_results.items()},
            'trained_at': datetime.now().isoformat()
        }, f, indent=2, default=str)

    logger.info(f"\nModels saved to {model_dir}")

    return all_results
|
|
|
|
|
|
def generate_training_report(results: Dict, output_dir: Path) -> Path:
    """
    Generate a Markdown training report.

    Writes a timestamped report under
    output_dir/reduced_features_models/ summarizing the feature set,
    the training configuration and per-model validation metrics.

    Args:
        results: Mapping of model key -> metrics dict (as returned by
            train_all_models)
        output_dir: Base output directory

    Returns:
        Path of the written report file
    """
    report_dir = output_dir / 'reduced_features_models'
    # Ensure the directory exists so the report can be generated
    # standalone, without train_all_models having created it first
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / f"TRAINING_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    report = f"""# Reduced Features Model Training Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Feature Set (14 Features)

| Category | Features |
|----------|----------|
| OHLCV | open, high, low, close, volume |
| Volatility | ATR |
| Trend | SAR |
| Momentum | RSI, MFI |
| Volume Flow | OBV, AD, CMF |
| Volume Derived | volume_z, volume_anomaly |

## Training Configuration

- **Training Data Cutoff:** 2024-12-31 (2025 reserved for backtesting)
- **Volatility Weighting:** Enabled (softplus, beta=4.0, w_max=3.0)
- **XGBoost:** n_estimators=300, max_depth=6, lr=0.03

## Results Summary

| Model | MAE | RMSE | R2 | Dir Accuracy | Train | Val |
|-------|-----|------|----|--------------| ----- | --- |
"""

    # One table row per trained model
    for key, metrics in results.items():
        report += f"| {key} | {metrics['mae']:.6f} | {metrics['rmse']:.6f} | "
        report += f"{metrics['r2']:.4f} | {metrics['directional_accuracy']:.2%} | "
        report += f"{metrics['n_train']} | {metrics['n_val']} |\n"

    report += f"""

## Usage Example

```python
import joblib
from config.reduced_features import generate_reduced_features

# Load model
model_high = joblib.load('models/reduced_features_models/XAUUSD_15m_high_h3.joblib')
model_low = joblib.load('models/reduced_features_models/XAUUSD_15m_low_h3.joblib')

# Prepare features
features = generate_reduced_features(df_ohlcv)
feature_cols = ['ATR', 'SAR', 'RSI', 'MFI', 'OBV', 'AD', 'CMF', 'volume_z', 'volume_anomaly']
X = features[feature_cols].values

# Predict
pred_high = model_high.predict(X)
pred_low = model_low.predict(X)
```

## Notes

1. Models trained on data up to 2024-12-31
2. 2025 data reserved for out-of-sample backtesting
3. Volatility-biased weighting emphasizes high-movement samples
4. Reduced feature set (14) for better generalization

---
*Report generated by Reduced Features Training Pipeline*
"""

    # Explicit utf-8 keeps the report portable across platform defaults
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)

    logger.info(f"Report saved to {report_path}")
    return report_path
|
|
|
|
|
|
# ==============================================================================
|
|
# CLI Entry Point
|
|
# ==============================================================================
|
|
|
|
def main():
    """CLI entry point: parse arguments, configure logging, train and report."""
    parser = argparse.ArgumentParser(
        description='Train ML models with reduced 14-feature set'
    )
    parser.add_argument('--symbols', nargs='+',
                        default=['XAUUSD', 'EURUSD', 'BTCUSD'],
                        help='Symbols to train (default: XAUUSD EURUSD BTCUSD)')
    parser.add_argument('--timeframes', nargs='+',
                        default=['5m', '15m'],
                        help='Timeframes to train (default: 5m 15m)')
    parser.add_argument('--output-dir', type=str,
                        default='models/',
                        help='Output directory for models')
    parser.add_argument('--db-config', type=str,
                        default='config/database.yaml',
                        help='Database configuration file')
    cli = parser.parse_args()

    # Resolve all paths relative to the repository root (parent of scripts/)
    repo_root = Path(__file__).parent.parent
    out_dir = repo_root / cli.output_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    # Route logs to both stderr (INFO) and a timestamped file (DEBUG)
    log_dir = out_dir / 'logs'
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"reduced_features_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")

    logger.info(f"Logging to {log_file}")

    # Run training; any unhandled failure exits non-zero after logging
    try:
        results = train_all_models(
            symbols=cli.symbols,
            timeframes=cli.timeframes,
            output_dir=out_dir,
            db_config_path=str(repo_root / cli.db_config)
        )

        # Generate the Markdown report
        generate_training_report(results, out_dir)

        # Print summary
        logger.info("\n" + "=" * 60)
        logger.info("TRAINING COMPLETE!")
        logger.info("=" * 60)

        for key, metrics in results.items():
            logger.info(f"{key}:")
            logger.info(f"  MAE={metrics['mae']:.6f}, R2={metrics['r2']:.4f}, "
                        f"DirAcc={metrics['directional_accuracy']:.2%}")

    except Exception as e:
        logger.exception(f"Training failed: {e}")
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|