trading-platform-ml-engine-v2/tests/test_symbol_timeframe_trainer.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

395 lines
14 KiB
Python

"""
Tests for SymbolTimeframeTrainer
================================
Tests for the improved symbol-timeframe trainer with:
- ATR-normalized targets
- Reduced sample weighting aggressiveness
- Optimized XGBoost hyperparameters
Author: Trading Platform Team
Version: 1.0.0
Created: 2026-01-07
"""
import pytest
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
import tempfile
import shutil
# Import the module under test
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from training.symbol_timeframe_trainer import (
SymbolTimeframeTrainer,
TrainerConfig,
SymbolConfig,
ModelKey,
TrainingResult,
SYMBOL_CONFIGS
)
class TestATRComputation:
    """Tests for ATR computation with shift(1)"""

    @staticmethod
    def _trainer() -> SymbolTimeframeTrainer:
        """Trainer wired for a single XAUUSD/15m configuration."""
        cfg = TrainerConfig(symbols=['XAUUSD'], timeframes=['15m'])
        return SymbolTimeframeTrainer(cfg)

    def create_sample_df(self, n: int = 100) -> pd.DataFrame:
        """Build a deterministic OHLCV frame on a 15-minute index."""
        np.random.seed(42)
        idx = pd.date_range('2025-01-01', periods=n, freq='15min')
        base = 2000 + np.cumsum(np.random.randn(n) * 2)
        # Column order matters: it fixes the RNG draw order, keeping the
        # generated data identical across tests that share the seed.
        frame = pd.DataFrame({
            'open': base,
            'high': base + np.abs(np.random.randn(n)) * 5,
            'low': base - np.abs(np.random.randn(n)) * 5,
            'close': base + np.random.randn(n) * 0.5,
            'volume': np.random.randint(100, 1000, n),
        }, index=idx)
        return frame

    def test_atr_computation_returns_correct_shape(self):
        """ATR output must have the same length as its input frame."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        atr = trainer._compute_atr(frame, period=14)
        assert len(atr) == len(frame)

    def test_atr_has_shift_one(self):
        """The leading entries are NaN because of rolling + shift(1)."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        atr = trainer._compute_atr(frame, period=14)
        assert np.isnan(atr[0])        # shift(1) leaves no value at t=0
        assert not np.isnan(atr[20])   # past the warmup, values exist

    def test_atr_no_future_leakage(self):
        """ATR at bar t must not incorporate data from bar t+1."""
        trainer = self._trainer()
        frame = self.create_sample_df(50)
        atr_before = trainer._compute_atr(frame, period=14)
        # Append a wildly out-of-range bar and recompute.
        last_close = frame['close'].iloc[-1]
        extreme = pd.DataFrame({
            'open': [last_close],
            'high': [last_close + 1000],   # extreme high
            'low': [last_close - 1000],    # extreme low
            'close': [last_close],
            'volume': [500],
        }, index=[frame.index[-1] + timedelta(minutes=15)])
        atr_after = trainer._compute_atr(pd.concat([frame, extreme]), period=14)
        # Because of shift(1), position 49 never sees row 50's data,
        # so the value must be unchanged by the appended bar.
        assert atr_before[49] == atr_after[49]

    def test_atr_values_are_positive(self):
        """ATR is a range measure, so valid values can never be negative."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        atr = trainer._compute_atr(frame, period=14)
        assert np.all(atr[~np.isnan(atr)] >= 0)
class TestTargetNormalization:
    """Tests for target normalization by ATR"""

    @staticmethod
    def _trainer() -> SymbolTimeframeTrainer:
        """Trainer wired for a single XAUUSD/15m configuration."""
        cfg = TrainerConfig(symbols=['XAUUSD'], timeframes=['15m'])
        return SymbolTimeframeTrainer(cfg)

    def create_sample_df(self, n: int = 100) -> pd.DataFrame:
        """Build a deterministic OHLCV frame on a 15-minute index."""
        np.random.seed(42)
        idx = pd.date_range('2025-01-01', periods=n, freq='15min')
        base = 2000 + np.cumsum(np.random.randn(n) * 2)
        frame = pd.DataFrame({
            'open': base,
            'high': base + np.abs(np.random.randn(n)) * 5,
            'low': base - np.abs(np.random.randn(n)) * 5,
            'close': base + np.random.randn(n) * 0.5,
            'volume': np.random.randint(100, 1000, n),
        }, index=idx)
        return frame

    def test_normalized_targets_scale(self):
        """Normalized targets land in the clipped ATR-multiple range [-5, 5]."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        t_high, t_low, _atr = trainer._compute_targets(frame, horizon_bars=3, normalize=True)
        for values in (t_high[~np.isnan(t_high)], t_low[~np.isnan(t_low)]):
            assert np.all(values >= -5)
            assert np.all(values <= 5)

    def test_raw_targets_different_from_normalized(self):
        """Normalization must actually change the target values."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        norm_high, _, _ = trainer._compute_targets(frame, horizon_bars=3, normalize=True)
        raw_high, _, _ = trainer._compute_targets(frame, horizon_bars=3, normalize=False)
        mask = ~(np.isnan(norm_high) | np.isnan(raw_high))
        # Only compare where both arrays carry valid values.
        if mask.any():
            assert not np.allclose(norm_high[mask], raw_high[mask])

    def test_targets_have_correct_sign(self):
        """Raw targets measure excursions, so most must be non-negative."""
        trainer = self._trainer()
        frame = self.create_sample_df(100)
        # Raw (unnormalized) targets preserve the sign convention.
        t_high, t_low, _ = trainer._compute_targets(frame, horizon_bars=3, normalize=False)
        # High target: future_high - close, positive for most bars.
        assert np.mean(t_high[~np.isnan(t_high)] >= 0) > 0.9
        # Low target: close - future_low, likewise mostly positive.
        assert np.mean(t_low[~np.isnan(t_low)] >= 0) > 0.9
class TestSampleWeighting:
    """Tests for sample weighting configuration"""

    def test_default_weighting_parameters(self):
        """Defaults carry the tuned, less aggressive softplus settings."""
        cfg = TrainerConfig()
        assert cfg.softplus_beta == 2.0, "softplus_beta should be 2.0 (reduced from 4.0)"
        assert cfg.softplus_w_max == 2.0, "softplus_w_max should be 2.0 (reduced from 3.0)"

    def test_weighting_can_be_disabled(self):
        """With dynamic factor weighting off, every weight collapses to 1.0."""
        trainer = SymbolTimeframeTrainer(
            TrainerConfig(use_dynamic_factor_weighting=False)
        )
        np.random.seed(42)
        n = 100
        idx = pd.date_range('2025-01-01', periods=n, freq='15min')
        base = 2000 + np.cumsum(np.random.randn(n) * 2)
        frame = pd.DataFrame({
            'open': base,
            'high': base + np.abs(np.random.randn(n)) * 5,
            'low': base - np.abs(np.random.randn(n)) * 5,
            'close': base + np.random.randn(n) * 0.5,
            'volume': np.random.randint(100, 1000, n),
        }, index=idx)
        # Targets are arbitrary noise; only the disable flag matters here.
        weights = trainer._compute_sample_weights(
            frame, np.random.randn(n), np.random.randn(n)
        )
        assert np.allclose(weights, 1.0)
class TestXGBoostHyperparameters:
    """Tests for XGBoost hyperparameter configuration"""

    def test_default_hyperparameters_are_optimized(self):
        """Defaults must match the tuned hyperparameter set exactly."""
        params = TrainerConfig().xgb_params
        expected = {
            'n_estimators': 150,
            'max_depth': 4,
            'learning_rate': 0.02,
            'min_child_weight': 20,
            'gamma': 0.3,
            'reg_alpha': 0.5,
            'reg_lambda': 5.0,
        }
        for key, value in expected.items():
            assert params[key] == value, f"{key} should be {value}"

    def test_regularization_is_stronger(self):
        """Regularization terms must sit at or above their tuned floors."""
        params = TrainerConfig().xgb_params
        assert params['reg_alpha'] >= 0.5, "L1 regularization should be >= 0.5"
        assert params['reg_lambda'] >= 5.0, "L2 regularization should be >= 5.0"
        assert params['gamma'] >= 0.3, "gamma should be >= 0.3"
class TestModelKey:
    """Tests for ModelKey dataclass"""

    @staticmethod
    def _sample_key() -> ModelKey:
        """Representative key shared by the assertions below."""
        return ModelKey(symbol='XAUUSD', timeframe='15m', target_type='high', horizon_bars=3)

    def test_model_key_string_representation(self):
        """str() flattens the key with underscore separators."""
        assert str(self._sample_key()) == 'XAUUSD_15m_high_h3'

    def test_model_key_path_representation(self):
        """to_path() nests symbol/timeframe as directory components."""
        assert self._sample_key().to_path() == 'XAUUSD/15m/high_h3'
class TestSymbolConfigs:
    """Tests for symbol configurations"""

    def test_common_symbols_configured(self):
        """The major traded symbols all have an entry in SYMBOL_CONFIGS."""
        for symbol in ('XAUUSD', 'BTCUSD', 'EURUSD', 'GBPUSD', 'USDJPY'):
            assert symbol in SYMBOL_CONFIGS, f"{symbol} should be in SYMBOL_CONFIGS"

    def test_symbol_config_has_required_fields(self):
        """Every configured symbol exposes the mandatory attributes."""
        for symbol, cfg in SYMBOL_CONFIGS.items():
            for field in ('symbol', 'base_factor', 'pip_value'):
                assert hasattr(cfg, field), f"{symbol} config should have '{field}'"
class TestTrainerIntegration:
    """Integration tests for the trainer"""

    def create_training_data(self, n: int = 1000) -> pd.DataFrame:
        """Synthesize n bars of 15-minute OHLCV plus feature columns.

        Volatility is widened during the 13:00-16:00 window so that
        session-dependent code paths see varying ranges.
        """
        np.random.seed(42)
        idx = pd.date_range('2023-01-01', periods=n, freq='15min')
        base = 2000 + np.cumsum(np.random.randn(n) * 0.5)
        vol = np.where((idx.hour >= 13) & (idx.hour < 16), 5.0, 2.0)
        return pd.DataFrame({
            'open': base,
            'high': base + np.abs(np.random.randn(n)) * vol,
            'low': base - np.abs(np.random.randn(n)) * vol,
            'close': base + np.random.randn(n) * 0.5,
            'volume': np.random.randint(100, 1000, n),
            # Feature columns consumed by the trainer.
            'rsi': 50 + np.random.randn(n) * 10,
            'macd': np.random.randn(n),
            'bb_width': 10 + np.random.randn(n),
        }, index=idx)

    def test_trainer_initialization(self):
        """Construction stores the config and resolves symbol configs."""
        cfg = TrainerConfig(
            symbols=['XAUUSD'],
            timeframes=['15m'],
            min_train_samples=100
        )
        trainer = SymbolTimeframeTrainer(cfg)
        assert trainer.config == cfg
        assert 'XAUUSD' in trainer.symbol_configs

    def test_trainer_can_train_single(self):
        """A single symbol/timeframe run yields a high and a low model."""
        cfg = TrainerConfig(
            symbols=['XAUUSD'],
            timeframes=['15m'],
            train_years=1.5,    # wide window so enough rows survive the split
            holdout_years=0.3,  # smaller holdout
            min_train_samples=100,
            xgb_params={
                'n_estimators': 10,  # tiny model keeps the test fast
                'max_depth': 3,
                'learning_rate': 0.1,
                'tree_method': 'hist',
                'random_state': 42,
            },
        )
        trainer = SymbolTimeframeTrainer(cfg)
        # 50,000 15-minute bars (~1.4 years) covers train + holdout windows.
        frame = self.create_training_data(50000)
        results = trainer.train_single(frame, 'XAUUSD', '15m')
        # One model per target type: high and low.
        assert len(results) == 2
        for outcome in results.values():
            assert isinstance(outcome, TrainingResult)
            assert outcome.n_train > 0
            assert outcome.n_val > 0

    def test_trainer_save_and_load(self):
        """Trained models must round-trip through save()/load() on disk."""
        cfg = TrainerConfig(
            symbols=['XAUUSD'],
            timeframes=['15m'],
            train_years=0.5,
            holdout_years=0.1,
            min_train_samples=100,
            xgb_params={
                'n_estimators': 10,
                'max_depth': 3,
                'learning_rate': 0.1,
                'tree_method': 'hist',
                'random_state': 42,
            },
        )
        trainer = SymbolTimeframeTrainer(cfg)
        trainer.train_single(self.create_training_data(1000), 'XAUUSD', '15m')
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer.save(tmpdir)
            # A fresh trainer loading the same directory must recover
            # exactly the models the first trainer persisted.
            reloaded = SymbolTimeframeTrainer(cfg)
            reloaded.load(tmpdir)
            assert len(reloaded.models) == len(trainer.models)
if __name__ == '__main__':
    # Allow running this module directly instead of via the pytest CLI.
    pytest.main([__file__, '-v'])