"""
Walk-forward validation implementation
Based on best practices from analyzed projects
"""

import json
import pickle
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional, Union

import numpy as np
import pandas as pd

try:
    from loguru import logger
except ImportError:
    # loguru is only used for log output here; fall back to the standard
    # library so the module stays importable without it.
    import logging

    logger = logging.getLogger(__name__)

try:
    import joblib
except ImportError:
    # joblib is only needed to persist models that lack their own save().
    joblib = None


def _to_numpy(values: Any) -> np.ndarray:
    """Convert model output to a NumPy array.

    Objects exposing ``detach`` (torch tensors) are detached and moved to the
    CPU first; everything else goes through ``np.asarray``.
    """
    if hasattr(values, 'detach'):
        values = values.detach().cpu().numpy()
    return np.asarray(values)


def _mean_and_std(values: List[float]) -> Tuple[float, float]:
    """Return ``(mean, std)`` of *values*, or ``(nan, nan)`` when empty."""
    if len(values) == 0:
        # np.mean([]) would emit a RuntimeWarning before returning nan.
        return float('nan'), float('nan')
    return np.mean(values), np.std(values)


def _json_default(obj: Any) -> Any:
    """``json.dump`` fallback for objects the encoder cannot handle natively.

    NumPy scalars and arrays are converted to native Python numbers/lists so
    they remain numeric after a save/load round trip; anything else is
    stringified.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


@dataclass
class WalkForwardSplit:
    """Data class for a single walk-forward split"""
    # 1-based position of the split in the sequence produced by the validator
    split_id: int
    # Positional (iloc) bounds of the training window, end exclusive
    train_start: int
    train_end: int
    # Positional (iloc) bounds of the validation window, end exclusive
    val_start: int
    val_end: int
    # Copies of the corresponding row ranges of the source DataFrame
    train_data: pd.DataFrame
    val_data: pd.DataFrame

    @property
    def train_size(self) -> int:
        """Number of rows in the training frame."""
        return len(self.train_data)

    @property
    def val_size(self) -> int:
        """Number of rows in the validation frame."""
        return len(self.val_data)

    def __repr__(self) -> str:
        return (f"Split {self.split_id}: "
                f"Train[{self.train_start}:{self.train_end}] n={self.train_size}, "
                f"Val[{self.val_start}:{self.val_end}] n={self.val_size}")


class WalkForwardValidator:
    """Walk-forward validation for time series data"""

    _COMBINE_METHODS = ('average', 'weighted', 'best')

    def __init__(
        self,
        n_splits: int = 5,
        test_size: float = 0.2,
        gap: int = 0,
        expanding_window: bool = False,
        min_train_size: int = 10000
    ):
        """
        Initialize walk-forward validator

        Args:
            n_splits: Number of splits
            test_size: Test size as fraction of step size
            gap: Gap between train and test sets (to avoid look-ahead)
            expanding_window: If True, training window expands; if False, sliding window
            min_train_size: Minimum training samples required

        Raises:
            ValueError: If n_splits < 1, test_size <= 0 or gap < 0.
        """
        if n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {n_splits}")
        if test_size <= 0:
            raise ValueError(f"test_size must be positive, got {test_size}")
        if gap < 0:
            # A negative gap would make the validation window overlap training.
            raise ValueError(f"gap must be non-negative, got {gap}")

        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap
        self.expanding_window = expanding_window
        self.min_train_size = min_train_size

        self.splits: List[WalkForwardSplit] = []
        self.results: Dict[str, Any] = {}

    def split(
        self,
        data: pd.DataFrame
    ) -> List[WalkForwardSplit]:
        """
        Create walk-forward validation splits

        The data is cut into ``n_splits + 1`` equal steps. Split ``i`` trains
        on rows up to ``(i + 1) * step`` and validates on the
        ``int(step * test_size)`` rows that follow the gap. When a step is
        smaller than ``min_train_size`` the number of splits is reduced for
        this call only; ``self.n_splits`` is left untouched.

        Args:
            data: Complete DataFrame with time index

        Returns:
            List of WalkForwardSplit objects (also stored in ``self.splits``)
        """
        n_samples = len(data)
        n_splits = self.n_splits

        # Calculate step size
        step_size = n_samples // (n_splits + 1)

        if step_size < self.min_train_size:
            logger.warning(
                f"Step size ({step_size}) is less than minimum train size ({self.min_train_size}). "
                f"Reducing number of splits."
            )
            n_splits = max(1, n_samples // self.min_train_size - 1)
            step_size = n_samples // (n_splits + 1)

        test_size = int(step_size * self.test_size)

        self.splits = []

        for i in range(n_splits):
            # Expanding window always starts at row 0; sliding window moves
            # its start forward one step per split.
            train_start = 0 if self.expanding_window else i * step_size
            train_end = (i + 1) * step_size

            val_start = train_end + self.gap
            val_end = min(val_start + test_size, n_samples)

            # val_end is clamped to n_samples, so "not enough data" shows up
            # as an empty (or inverted) validation range, not as an overflow.
            if val_end <= val_start or (train_end - train_start) < self.min_train_size:
                logger.warning(f"Skipping split {i+1}: insufficient data")
                continue

            # Create split
            split = WalkForwardSplit(
                split_id=i + 1,
                train_start=train_start,
                train_end=train_end,
                val_start=val_start,
                val_end=val_end,
                train_data=data.iloc[train_start:train_end].copy(),
                val_data=data.iloc[val_start:val_end].copy()
            )

            self.splits.append(split)
            logger.info(f"Created {split}")

        logger.info(f"✅ Created {len(self.splits)} walk-forward splits")
        return self.splits

    def train_model(
        self,
        model_class: Any,
        model_config: Dict[str, Any],
        data: pd.DataFrame,
        feature_cols: List[str],
        target_cols: List[str],
        save_models: bool = True,
        model_dir: str = "models/walk_forward"
    ) -> Dict[str, Any]:
        """
        Train a model using walk-forward validation

        One model is instantiated as ``model_class(model_config)`` per split
        and trained through its ``train_model`` hook if it has one, otherwise
        through ``train``. Splits already stored on the validator are reused;
        ``data`` is only split when ``self.splits`` is empty.

        Args:
            model_class: Model class to instantiate
            model_config: Configuration for model
            data: Complete DataFrame
            feature_cols: List of feature column names
            target_cols: List of target column names
            save_models: Whether to save trained models
            model_dir: Directory to save models

        Returns:
            Dictionary with results for all splits (also stored in
            ``self.results``)
        """
        # Imported here so scikit-learn is only required when training.
        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

        # Create splits if not already done
        if not self.splits:
            self.split(data)

        results = {
            'splits': [],
            'metrics': {
                'train_mse': [],
                'val_mse': [],
                'train_mae': [],
                'val_mae': [],
                'train_r2': [],
                'val_r2': []
            },
            'models': [],
            'config': model_config
        }

        if not self.splits:
            logger.warning("No valid walk-forward splits; nothing to train")

        for split in self.splits:
            logger.info(f"🏃 Training on {split}")

            # Prepare data
            X_train = split.train_data[feature_cols]
            y_train = split.train_data[target_cols]
            X_val = split.val_data[feature_cols]
            y_val = split.val_data[target_cols]

            # Initialize model
            model = model_class(model_config)

            # Train model. torch.nn.Module defines train(mode) to toggle
            # training mode, so hasattr(model, 'train') is true for every
            # PyTorch module; the explicit train_model hook must win.
            if hasattr(model, 'train_model'):
                metrics = model.train_model(X_train, y_train, X_val, y_val)
            else:
                metrics = model.train(X_train, y_train, X_val, y_val)
            # A training hook may return nothing.
            model_metrics = dict(metrics) if metrics else {}

            # Make predictions for validation
            if hasattr(model, 'predict'):
                val_predictions = model.predict(X_val)
            else:
                val_predictions = model(X_val)
            val_predictions = _to_numpy(val_predictions)

            # Calculate validation metrics
            y_true = y_val.values
            val_mse = mean_squared_error(y_true, val_predictions)
            val_mae = mean_absolute_error(y_true, val_predictions)
            val_r2 = r2_score(y_true, val_predictions)

            # Store results; metrics reported by the model itself take
            # precedence over the ones computed here when keys collide.
            split_results = {
                'split_id': split.split_id,
                'train_size': split.train_size,
                'val_size': split.val_size,
                'metrics': {
                    'val_mse': val_mse,
                    'val_mae': val_mae,
                    'val_r2': val_r2,
                    **model_metrics
                }
            }

            results['splits'].append(split_results)
            results['metrics']['val_mse'].append(val_mse)
            results['metrics']['val_mae'].append(val_mae)
            results['metrics']['val_r2'].append(val_r2)

            # Save model if requested
            if save_models:
                model_path = Path(model_dir) / f"model_split_{split.split_id}.pkl"
                model_path.parent.mkdir(parents=True, exist_ok=True)

                if hasattr(model, 'save'):
                    model.save(str(model_path))
                elif joblib is not None:
                    joblib.dump(model, model_path)
                else:
                    with open(model_path, 'wb') as fh:
                        pickle.dump(model, fh)

                results['models'].append(str(model_path))
                logger.info(f"💾 Saved model to {model_path}")

            # Log split results
            logger.info(
                f"Split {split.split_id} - "
                f"Val MSE: {val_mse:.6f}, "
                f"Val MAE: {val_mae:.6f}, "
                f"Val R2: {val_r2:.4f}"
            )

        # Calculate average metrics
        avg_metrics = {}
        for key in ('val_mse', 'val_mae', 'val_r2'):
            mean, std = _mean_and_std(results['metrics'][key])
            avg_metrics[key] = mean
            avg_metrics[f'{key}_std'] = std
        results['avg_metrics'] = avg_metrics

        logger.info(
            f"📊 Walk-Forward Average - "
            f"MSE: {results['avg_metrics']['val_mse']:.6f} (±{results['avg_metrics']['val_mse_std']:.6f}), "
            f"R2: {results['avg_metrics']['val_r2']:.4f} (±{results['avg_metrics']['val_r2_std']:.4f})"
        )

        self.results = results
        return results

    def _validation_mse(self, n_models: int) -> np.ndarray:
        """Return the stored per-split validation MSEs as a float array.

        Raises:
            ValueError: If no results are stored, or their count differs from
                ``n_models``.
        """
        stored = (self.results or {}).get('metrics', {}).get('val_mse')
        if not stored:
            raise ValueError(
                "No validation results available; run train_model() or "
                "load_results() first"
            )
        if len(stored) != n_models:
            raise ValueError(
                f"Got {n_models} models but {len(stored)} validation results"
            )
        return np.asarray(stored, dtype=float)

    def combine_predictions(
        self,
        models: List[Any],
        X: pd.DataFrame,
        method: str = 'average'
    ) -> np.ndarray:
        """
        Combine predictions from multiple walk-forward models

        Args:
            models: List of trained models
            X: Features to predict on
            method: Combination method ('average', 'weighted', 'best').
                'weighted' and 'best' use the validation MSEs stored in
                ``self.results`` and expect one entry per model, in order.

        Returns:
            Combined predictions

        Raises:
            ValueError: If the method is unknown, ``models`` is empty, or the
                stored validation results are missing or do not match
                ``models`` in length.
        """
        # Validate before running any (potentially expensive) prediction.
        if method not in self._COMBINE_METHODS:
            raise ValueError(f"Unknown combination method: {method}")
        if not models:
            raise ValueError("At least one model is required")

        predictions = np.array([
            _to_numpy(model.predict(X) if hasattr(model, 'predict') else model(X))
            for model in models
        ])

        if method == 'average':
            # Simple average
            return np.mean(predictions, axis=0)

        val_mse = self._validation_mse(len(models))

        if method == 'weighted':
            # Weight by inverse validation MSE; the floor keeps a perfect
            # (zero-error) split from producing inf/nan weights.
            weights = 1 / np.maximum(val_mse, np.finfo(float).eps)
            weights = weights / weights.sum()
            return np.average(predictions, axis=0, weights=weights)

        # method == 'best': use best performing model
        return predictions[np.argmin(val_mse)]

    def save_results(self, path: str):
        """Save validation results to file

        Results are written as JSON. NumPy scalars/arrays are stored as
        numbers/lists; other non-serializable objects are stringified.
        """
        save_path = Path(path)
        save_path.parent.mkdir(parents=True, exist_ok=True)

        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, default=_json_default)

        logger.info(f"💾 Saved results to {save_path}")

    def load_results(self, path: str) -> Dict[str, Any]:
        """Load validation results from file and return them"""
        with open(path, 'r', encoding='utf-8') as f:
            self.results = json.load(f)

        logger.info(f"📂 Loaded results from {path}")
        return self.results

    def plot_results(self, save_path: Optional[str] = None):
        """
        Plot walk-forward validation results

        Draws a 2x2 figure: validation MSE, MAE and R² per split (each with
        its average as a dashed line) and the train/validation sample sizes.

        Args:
            save_path: Path to save plot
        """
        # Checked before importing matplotlib so the no-op path does not
        # require it.
        if not self.results:
            logger.warning("No results to plot")
            return

        import matplotlib.pyplot as plt

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        splits = [s['split_id'] for s in self.results['splits']]

        # One bar chart per validation metric: (axis, results key, label, color)
        panels = [
            (axes[0, 0], 'val_mse', 'MSE', 'steelblue'),
            (axes[0, 1], 'val_mae', 'MAE', 'forestgreen'),
            (axes[1, 0], 'val_r2', 'R²', 'coral'),
        ]
        for ax, key, label, color in panels:
            ax.bar(splits, self.results['metrics'][key], color=color)
            ax.axhline(
                y=self.results['avg_metrics'][key],
                color='red',
                linestyle='--',
                label='Average'
            )
            ax.set_xlabel('Split')
            ax.set_ylabel(label)
            ax.set_title(f'Validation {label} by Split')
            ax.legend()

        # Sample sizes
        train_sizes = [s['train_size'] for s in self.results['splits']]
        val_sizes = [s['val_size'] for s in self.results['splits']]

        x = np.arange(len(splits))
        width = 0.35

        size_ax = axes[1, 1]
        size_ax.bar(x - width/2, train_sizes, width, label='Train', color='navy')
        size_ax.bar(x + width/2, val_sizes, width, label='Validation', color='orange')
        size_ax.set_xlabel('Split')
        size_ax.set_ylabel('Sample Size')
        size_ax.set_title('Data Split Sizes')
        size_ax.set_xticks(x)
        size_ax.set_xticklabels(splits)
        size_ax.legend()

        plt.suptitle('Walk-Forward Validation Results', fontsize=14, fontweight='bold')
        plt.tight_layout()

        if save_path:
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"📊 Plot saved to {save_path}")

        plt.show()


if __name__ == "__main__":
    # Test walk-forward validation

    # Create sample data
    dates = pd.date_range(start='2020-01-01', periods=50000, freq='5min')
    np.random.seed(42)

    df = pd.DataFrame({
        'feature1': np.random.randn(50000),
        'feature2': np.random.randn(50000),
        'feature3': np.random.randn(50000),
        'target': np.random.randn(50000)
    }, index=dates)

    # Initialize validator
    validator = WalkForwardValidator(
        n_splits=5,
        test_size=0.2,
        gap=0,
        expanding_window=False,
        min_train_size=5000
    )

    # Create splits
    splits = validator.split(df)

    print(f"Created {len(splits)} splits:")
    for split in splits:
        print(f"  {split}")

    # Test plot (without actual training)
    # validator.plot_results()