# trading-platform-ml-engine/src/training/walk_forward.py
"""
Walk-forward validation implementation
Based on best practices from analyzed projects
"""
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any, Optional, Union
from dataclasses import dataclass
from loguru import logger
import joblib
from pathlib import Path
import json
@dataclass
class WalkForwardSplit:
    """Container describing one train/validation window of a walk-forward run.

    Index fields are positional (``iloc``) offsets into the source frame;
    the two DataFrames are copies of the corresponding slices.
    """
    split_id: int
    train_start: int
    train_end: int
    val_start: int
    val_end: int
    train_data: pd.DataFrame
    val_data: pd.DataFrame

    @property
    def train_size(self) -> int:
        """Number of rows in the training window."""
        return len(self.train_data)

    @property
    def val_size(self) -> int:
        """Number of rows in the validation window."""
        return len(self.val_data)

    def __repr__(self) -> str:
        summary = (
            f"Split {self.split_id}: "
            f"Train[{self.train_start}:{self.train_end}] n={self.train_size}, "
            f"Val[{self.val_start}:{self.val_end}] n={self.val_size}"
        )
        return summary
class WalkForwardValidator:
    """Walk-forward validation for time series data.

    Builds chronologically ordered train/validation windows, optionally
    trains one model per window, aggregates per-split metrics, and can
    combine the per-split models into a single ensemble prediction.
    """

    def __init__(
        self,
        n_splits: int = 5,
        test_size: float = 0.2,
        gap: int = 0,
        expanding_window: bool = False,
        min_train_size: int = 10000
    ):
        """
        Initialize walk-forward validator.

        Args:
            n_splits: Number of splits to attempt.
            test_size: Validation size as a fraction of the step size.
            gap: Gap in rows between train and validation sets
                (to avoid look-ahead leakage).
            expanding_window: If True, the training window always starts at
                row 0 and grows; if False, a sliding window of one step is used.
            min_train_size: Minimum training samples required per split.
        """
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap
        self.expanding_window = expanding_window
        self.min_train_size = min_train_size
        # Populated by split() / train_model() respectively.
        self.splits: List[WalkForwardSplit] = []
        self.results: Dict[str, Any] = {}

    def split(
        self,
        data: pd.DataFrame
    ) -> List[WalkForwardSplit]:
        """
        Create walk-forward validation splits.

        Args:
            data: Complete DataFrame with a time index.  Rows are assumed to
                be in chronological order -- only positional slicing is used.

        Returns:
            List of WalkForwardSplit objects (also stored on ``self.splits``).
        """
        n_samples = len(data)
        # One extra step is reserved so the final split still has room for
        # its validation window after the last training window.
        step_size = n_samples // (self.n_splits + 1)
        test_size = int(step_size * self.test_size)
        if step_size < self.min_train_size:
            logger.warning(
                f"Step size ({step_size}) is less than minimum train size ({self.min_train_size}). "
                f"Reducing number of splits."
            )
            # NOTE: this permanently lowers self.n_splits for the instance.
            self.n_splits = max(1, n_samples // self.min_train_size - 1)
            step_size = n_samples // (self.n_splits + 1)
            test_size = int(step_size * self.test_size)
        self.splits = []
        for i in range(self.n_splits):
            if self.expanding_window:
                # Expanding window: always anchor at the start of the data.
                train_start = 0
            else:
                # Sliding window: advance the start by one step per split.
                # (The original `i * step_size if i > 0 else 0` was redundant:
                # for i == 0 both branches yield 0.)
                train_start = i * step_size
            train_end = (i + 1) * step_size
            val_start = train_end + self.gap
            val_end = min(val_start + test_size, n_samples)
            # Skip degenerate splits: empty validation window (val_end is
            # clamped above, so the old `val_end > n_samples` test could
            # never fire) or not enough training rows.
            if val_start >= val_end or (train_end - train_start) < self.min_train_size:
                logger.warning(f"Skipping split {i+1}: insufficient data")
                continue
            # Copies so downstream feature engineering cannot mutate `data`.
            split = WalkForwardSplit(
                split_id=i + 1,
                train_start=train_start,
                train_end=train_end,
                val_start=val_start,
                val_end=val_end,
                train_data=data.iloc[train_start:train_end].copy(),
                val_data=data.iloc[val_start:val_end].copy()
            )
            self.splits.append(split)
            logger.info(f"Created {split}")
        logger.info(f"✅ Created {len(self.splits)} walk-forward splits")
        return self.splits

    def train_model(
        self,
        model_class: Any,
        model_config: Dict[str, Any],
        data: pd.DataFrame,
        feature_cols: List[str],
        target_cols: List[str],
        save_models: bool = True,
        model_dir: str = "models/walk_forward"
    ) -> Dict[str, Any]:
        """
        Train a model per walk-forward split and collect validation metrics.

        Args:
            model_class: Model class to instantiate (one instance per split).
            model_config: Configuration passed to the model constructor.
            data: Complete DataFrame.
            feature_cols: List of feature column names.
            target_cols: List of target column names.
            save_models: Whether to persist each trained model to disk.
            model_dir: Directory to save models into.

        Returns:
            Dictionary with per-split results, averaged metrics, model paths,
            and the config used.  Also stored on ``self.results``.
        """
        # Create splits lazily if split() was not called beforehand.
        if not self.splits:
            self.splits = self.split(data)
        results = {
            'splits': [],
            'metrics': {
                # NOTE(review): the train_* lists are part of the result
                # schema but are not populated below -- training metrics come
                # back inside each split's `metrics` dict instead.
                'train_mse': [],
                'val_mse': [],
                'train_mae': [],
                'val_mae': [],
                'train_r2': [],
                'val_r2': []
            },
            'models': [],
            'config': model_config
        }
        for split in self.splits:
            logger.info(f"🏃 Training on {split}")
            # Prepare data
            X_train = split.train_data[feature_cols]
            y_train = split.train_data[target_cols]
            X_val = split.val_data[feature_cols]
            y_val = split.val_data[target_cols]
            # Fresh model per split -- no state leaks across windows.
            model = model_class(model_config)
            # Duck-typed training entry point: `train` (XGBoost-style
            # wrappers) vs `train_model` (PyTorch-style wrappers).
            if hasattr(model, 'train'):
                metrics = model.train(X_train, y_train, X_val, y_val)
            else:
                metrics = model.train_model(X_train, y_train, X_val, y_val)
            # Predictions for independent metric computation.
            if hasattr(model, 'predict'):
                val_predictions = model.predict(X_val)
            else:
                val_predictions = model(X_val)
            # Local import keeps sklearn optional at module-import time.
            from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
            if not isinstance(val_predictions, np.ndarray):
                # Handle torch tensors (or anything exposing detach/cpu/numpy).
                val_predictions = val_predictions.detach().cpu().numpy()
            val_mse = mean_squared_error(y_val.values, val_predictions)
            val_mae = mean_absolute_error(y_val.values, val_predictions)
            val_r2 = r2_score(y_val.values, val_predictions)
            # Store per-split results; model-reported metrics are merged in.
            split_results = {
                'split_id': split.split_id,
                'train_size': split.train_size,
                'val_size': split.val_size,
                'metrics': {
                    'val_mse': val_mse,
                    'val_mae': val_mae,
                    'val_r2': val_r2,
                    **metrics
                }
            }
            results['splits'].append(split_results)
            results['metrics']['val_mse'].append(val_mse)
            results['metrics']['val_mae'].append(val_mae)
            results['metrics']['val_r2'].append(val_r2)
            # Persist the model if requested.
            if save_models:
                model_path = Path(model_dir) / f"model_split_{split.split_id}.pkl"
                model_path.parent.mkdir(parents=True, exist_ok=True)
                if hasattr(model, 'save'):
                    model.save(str(model_path))
                else:
                    joblib.dump(model, model_path)
                results['models'].append(str(model_path))
                logger.info(f"💾 Saved model to {model_path}")
            logger.info(
                f"Split {split.split_id} - "
                f"Val MSE: {val_mse:.6f}, "
                f"Val MAE: {val_mae:.6f}, "
                f"Val R2: {val_r2:.4f}"
            )
        # Aggregate mean/std across splits.
        results['avg_metrics'] = {
            'val_mse': np.mean(results['metrics']['val_mse']),
            'val_mse_std': np.std(results['metrics']['val_mse']),
            'val_mae': np.mean(results['metrics']['val_mae']),
            'val_mae_std': np.std(results['metrics']['val_mae']),
            'val_r2': np.mean(results['metrics']['val_r2']),
            'val_r2_std': np.std(results['metrics']['val_r2'])
        }
        # Fixed: the original format string had a stray ')' with no opening
        # '(±', producing garbled output like "MSE: 0.0123450.001234)".
        logger.info(
            f"📊 Walk-Forward Average - "
            f"MSE: {results['avg_metrics']['val_mse']:.6f} (±{results['avg_metrics']['val_mse_std']:.6f}), "
            f"R2: {results['avg_metrics']['val_r2']:.4f} (±{results['avg_metrics']['val_r2_std']:.4f})"
        )
        self.results = results
        return results

    def combine_predictions(
        self,
        models: List[Any],
        X: pd.DataFrame,
        method: str = 'average'
    ) -> np.ndarray:
        """
        Combine predictions from multiple walk-forward models.

        Args:
            models: List of trained models.
            X: Features to predict on.
            method: Combination method ('average', 'weighted', 'best').
                'weighted' and 'best' require ``self.results`` from a prior
                train_model() run (weights are inverse validation MSE).

        Returns:
            Combined predictions as a numpy array.

        Raises:
            ValueError: If ``method`` is not one of the supported options.
        """
        predictions = []
        for model in models:
            if hasattr(model, 'predict'):
                pred = model.predict(X)
            else:
                pred = model(X)
            if hasattr(pred, 'detach'):
                # torch tensor -> numpy
                pred = pred.detach().cpu().numpy()
            predictions.append(pred)
        predictions = np.array(predictions)
        if method == 'average':
            # Simple unweighted mean across models.
            combined = np.mean(predictions, axis=0)
        elif method == 'weighted':
            # Weight by validation performance (inverse MSE, normalized).
            weights = 1 / np.array(self.results['metrics']['val_mse'])
            weights = weights / weights.sum()
            combined = np.average(predictions, axis=0, weights=weights)
        elif method == 'best':
            # Use the single best-performing model (lowest val MSE).
            best_idx = np.argmin(self.results['metrics']['val_mse'])
            combined = predictions[best_idx]
        else:
            raise ValueError(f"Unknown combination method: {method}")
        return combined

    def save_results(self, path: str):
        """Save validation results to a JSON file (creates parent dirs)."""
        save_path = Path(path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, 'w') as f:
            # default=str stringifies non-JSON-serializable values (e.g.
            # numpy scalars) rather than failing.
            json.dump(self.results, f, indent=2, default=str)
        logger.info(f"💾 Saved results to {save_path}")

    def load_results(self, path: str):
        """Load validation results from a JSON file into ``self.results``."""
        with open(path, 'r') as f:
            self.results = json.load(f)
        logger.info(f"📂 Loaded results from {path}")
        return self.results

    def plot_results(self, save_path: Optional[str] = None):
        """
        Plot walk-forward validation results (2x2 grid: MSE, MAE, R²,
        and train/val sample sizes per split).

        Args:
            save_path: Optional path to save the figure to (300 dpi).
        """
        import matplotlib.pyplot as plt
        if not self.results:
            logger.warning("No results to plot")
            return
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        # MSE across splits
        splits = [s['split_id'] for s in self.results['splits']]
        mse_values = self.results['metrics']['val_mse']
        axes[0, 0].bar(splits, mse_values, color='steelblue')
        axes[0, 0].axhline(
            y=self.results['avg_metrics']['val_mse'],
            color='red', linestyle='--', label='Average'
        )
        axes[0, 0].set_xlabel('Split')
        axes[0, 0].set_ylabel('MSE')
        axes[0, 0].set_title('Validation MSE by Split')
        axes[0, 0].legend()
        # MAE across splits
        mae_values = self.results['metrics']['val_mae']
        axes[0, 1].bar(splits, mae_values, color='forestgreen')
        axes[0, 1].axhline(
            y=self.results['avg_metrics']['val_mae'],
            color='red', linestyle='--', label='Average'
        )
        axes[0, 1].set_xlabel('Split')
        axes[0, 1].set_ylabel('MAE')
        axes[0, 1].set_title('Validation MAE by Split')
        axes[0, 1].legend()
        # R2 across splits
        r2_values = self.results['metrics']['val_r2']
        axes[1, 0].bar(splits, r2_values, color='coral')
        axes[1, 0].axhline(
            y=self.results['avg_metrics']['val_r2'],
            color='red', linestyle='--', label='Average'
        )
        axes[1, 0].set_xlabel('Split')
        # Fixed: the y-axis label was an empty string.
        axes[1, 0].set_ylabel('R²')
        axes[1, 0].set_title('Validation R² by Split')
        axes[1, 0].legend()
        # Sample sizes per split (grouped bars: train vs validation).
        train_sizes = [s['train_size'] for s in self.results['splits']]
        val_sizes = [s['val_size'] for s in self.results['splits']]
        x = np.arange(len(splits))
        width = 0.35
        axes[1, 1].bar(x - width/2, train_sizes, width, label='Train', color='navy')
        axes[1, 1].bar(x + width/2, val_sizes, width, label='Validation', color='orange')
        axes[1, 1].set_xlabel('Split')
        axes[1, 1].set_ylabel('Sample Size')
        axes[1, 1].set_title('Data Split Sizes')
        axes[1, 1].set_xticks(x)
        axes[1, 1].set_xticklabels(splits)
        axes[1, 1].legend()
        plt.suptitle('Walk-Forward Validation Results', fontsize=14, fontweight='bold')
        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"📊 Plot saved to {save_path}")
        plt.show()
if __name__ == "__main__":
    # Smoke test for walk-forward split creation (no model training).
    # Removed unused `from datetime import datetime, timedelta` import.
    # Create sample data: 50k rows of 5-minute bars with random features.
    dates = pd.date_range(start='2020-01-01', periods=50000, freq='5min')
    np.random.seed(42)
    df = pd.DataFrame({
        'feature1': np.random.randn(50000),
        'feature2': np.random.randn(50000),
        'feature3': np.random.randn(50000),
        'target': np.random.randn(50000)
    }, index=dates)
    # Initialize validator with a sliding window and no train/val gap.
    validator = WalkForwardValidator(
        n_splits=5,
        test_size=0.2,
        gap=0,
        expanding_window=False,
        min_train_size=5000
    )
    # Create and report splits.
    splits = validator.split(df)
    print(f"Created {len(splits)} splits:")
    for split in splits:
        print(f"  {split}")
    # Test plot (without actual training)
    # validator.plot_results()