"""
Walk-Forward Validation for Backtesting
=========================================

Implements walk-forward validation methodology for robust strategy evaluation.

Walk-forward validation simulates real trading conditions by:
1. Training on historical data
2. Testing on unseen future data
3. Rolling forward and repeating

This approach provides more realistic performance estimates than
standard train/test splits.

Author: ML-Specialist (NEXUS v4.0)
Version: 1.0.0
Created: 2026-01-25
"""

import json
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Generator, Any, Union, Callable

import numpy as np
import pandas as pd
from loguru import logger

from .rr_backtester import BacktestConfig, BacktestResult
from .metrics import TradingMetrics, TradeRecord, MetricsCalculator

# Per-split profit factors are capped at this value before averaging so that a
# single split with an extreme (or infinite) profit factor cannot dominate.
_PROFIT_FACTOR_CAP = 10.0

# Sharpe / Sortino ratios are clipped to +/- this bound before averaging.
_RATIO_CLIP = 10.0


def _stable_trunc(value: float) -> int:
    """Truncate ``value`` to an int while absorbing float representation error.

    A bare ``int(5000 * (1 - 0.8))`` yields 999 because the product evaluates
    to 999.9999999999998. Rounding to six decimals first restores the intended
    1000 and leaves genuinely fractional values (e.g. 246.8 -> 246) untouched.
    """
    return int(round(value, 6))


def _finite_clipped(value: float, bound: float) -> float:
    """Clip ``value`` to ``[-bound, bound]``; NaN and +/-inf become 0.0."""
    if not np.isfinite(value):
        return 0.0
    return float(np.clip(value, -bound, bound))


def _mean_and_std(values: List[float]) -> Tuple[float, float]:
    """Return ``(mean, population std)`` of ``values``; std is 0.0 for one value."""
    mean = float(np.mean(values))
    std = float(np.std(values)) if len(values) > 1 else 0.0
    return mean, std


@dataclass
class WalkForwardConfig:
    """Configuration for walk-forward validation."""

    n_splits: int = 5
    train_ratio: float = 0.8
    gap_bars: int = 0
    expanding_window: bool = False
    min_train_samples: int = 1000
    min_test_samples: int = 200
    overlap_allowed: bool = False


@dataclass
class WalkForwardSplit:
    """Single walk-forward split with train/test indices.

    Index bounds are positional and half-open (``[start, end)``). The
    ``*_date`` fields hold whatever label the source index carries at the
    first / last row of each window.
    """

    split_id: int
    train_start_idx: int
    train_end_idx: int
    test_start_idx: int
    test_end_idx: int
    train_start_date: Optional[datetime] = None
    train_end_date: Optional[datetime] = None
    test_start_date: Optional[datetime] = None
    test_end_date: Optional[datetime] = None

    @property
    def train_size(self) -> int:
        """Number of training samples."""
        return self.train_end_idx - self.train_start_idx

    @property
    def test_size(self) -> int:
        """Number of test samples."""
        return self.test_end_idx - self.test_start_idx

    @staticmethod
    def _label_to_str(label: Any) -> Optional[str]:
        """Stringify an index label, mapping only ``None`` to ``None``.

        An explicit ``is not None`` test is required: a truthiness test would
        drop valid but falsy labels such as ``0`` from an integer index.
        """
        return str(label) if label is not None else None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            'split_id': self.split_id,
            'train_start_idx': self.train_start_idx,
            'train_end_idx': self.train_end_idx,
            'test_start_idx': self.test_start_idx,
            'test_end_idx': self.test_end_idx,
            'train_start_date': self._label_to_str(self.train_start_date),
            'train_end_date': self._label_to_str(self.train_end_date),
            'test_start_date': self._label_to_str(self.test_start_date),
            'test_end_date': self._label_to_str(self.test_end_date),
            'train_size': self.train_size,
            'test_size': self.test_size,
        }


@dataclass
class AggregatedResult:
    """
    Aggregated results from walk-forward validation.

    Contains mean and standard deviation of all metrics across splits.
    """

    total_trades: float
    total_trades_std: float
    winrate: float
    winrate_std: float
    profit_factor: float
    profit_factor_std: float
    sharpe_ratio: float
    sharpe_ratio_std: float
    sortino_ratio: float
    sortino_ratio_std: float
    max_drawdown_pct: float
    max_drawdown_pct_std: float
    net_profit: float
    net_profit_std: float
    avg_trade_pnl: float
    avg_trade_pnl_std: float
    n_splits: int = 0
    split_results: List[BacktestResult] = field(default_factory=list)
    per_split_metrics: List[Dict[str, float]] = field(default_factory=list)
    consistency_score: float = 0.0
    robustness_score: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary (``split_results`` is intentionally omitted)."""
        return {
            'total_trades': {'mean': self.total_trades, 'std': self.total_trades_std},
            'winrate': {'mean': self.winrate, 'std': self.winrate_std},
            'profit_factor': {'mean': self.profit_factor, 'std': self.profit_factor_std},
            'sharpe_ratio': {'mean': self.sharpe_ratio, 'std': self.sharpe_ratio_std},
            'sortino_ratio': {'mean': self.sortino_ratio, 'std': self.sortino_ratio_std},
            'max_drawdown_pct': {'mean': self.max_drawdown_pct, 'std': self.max_drawdown_pct_std},
            'net_profit': {'mean': self.net_profit, 'std': self.net_profit_std},
            'avg_trade_pnl': {'mean': self.avg_trade_pnl, 'std': self.avg_trade_pnl_std},
            'n_splits': self.n_splits,
            'consistency_score': self.consistency_score,
            'robustness_score': self.robustness_score,
            'per_split_metrics': self.per_split_metrics,
        }

    def print_summary(self):
        """Print formatted summary."""
        print("\n" + "=" * 60)
        print("WALK-FORWARD VALIDATION RESULTS")
        print("=" * 60)
        print(f"Number of Splits: {self.n_splits}")
        print("\n--- Performance Metrics (Mean +/- Std) ---")
        print(f"Win Rate: {self.winrate:.2%} +/- {self.winrate_std:.2%}")
        print(f"Profit Factor: {self.profit_factor:.2f} +/- {self.profit_factor_std:.2f}")
        print(f"Sharpe Ratio: {self.sharpe_ratio:.2f} +/- {self.sharpe_ratio_std:.2f}")
        print(f"Sortino Ratio: {self.sortino_ratio:.2f} +/- {self.sortino_ratio_std:.2f}")
        print(f"Max Drawdown: {self.max_drawdown_pct:.2%} +/- {self.max_drawdown_pct_std:.2%}")
        print(f"Net Profit: ${self.net_profit:,.2f} +/- ${self.net_profit_std:,.2f}")
        print(f"Total Trades: {self.total_trades:.0f} +/- {self.total_trades_std:.0f}")
        print(f"Avg Trade P&L: ${self.avg_trade_pnl:.2f} +/- ${self.avg_trade_pnl_std:.2f}")
        print("\n--- Quality Scores ---")
        print(f"Consistency: {self.consistency_score:.2%}")
        print(f"Robustness: {self.robustness_score:.2%}")
        print("=" * 60)


class WalkForwardValidator:
    """
    Walk-forward validation for trading strategies.

    Implements time-series cross-validation that respects temporal order
    and simulates realistic trading conditions.

    Usage:
        validator = WalkForwardValidator(n_splits=5, train_ratio=0.8)

        # Generate splits
        for train, test in validator.split(data):
            # Train on train, evaluate on test
            model.fit(train)
            result = backtester.run(test, model)

        # Or run complete walk-forward
        results = validator.run_walk_forward(strategy, data)
        aggregated = validator.aggregate_results(results)
    """

    def __init__(
        self,
        n_splits: int = 5,
        train_ratio: float = 0.8,
        config: Optional[WalkForwardConfig] = None
    ):
        """
        Initialize walk-forward validator.

        Args:
            n_splits: Number of train/test splits
            train_ratio: Ratio of training data in each split
            config: Full configuration (overrides n_splits and train_ratio)
        """
        if config is not None:
            self.config = config
        else:
            self.config = WalkForwardConfig(
                n_splits=n_splits,
                train_ratio=train_ratio
            )

        self._splits: List[WalkForwardSplit] = []
        self._results: List[BacktestResult] = []
        self.metrics_calculator = MetricsCalculator()

    def split(
        self,
        data: pd.DataFrame
    ) -> Generator[Tuple[pd.DataFrame, pd.DataFrame], None, None]:
        """
        Generate train/test splits for walk-forward validation.

        Candidate splits whose train or test window is smaller than the
        configured minimum are skipped, so fewer than ``n_splits`` pairs may
        be yielded. Each yielded split is also recorded and available through
        ``get_splits()``.

        Args:
            data: Complete DataFrame with temporal index

        Yields:
            Tuple of (train_df, test_df) for each split (independent copies)

        Raises:
            ValueError: If ``n_splits`` is below 1, or ``data`` has fewer rows
                than ``min_train_samples + min_test_samples``. Being a
                generator, this is raised on first iteration.
        """
        cfg = self.config

        if cfg.n_splits < 1:
            raise ValueError(f"n_splits must be >= 1, got {cfg.n_splits}")

        n_samples = len(data)
        min_required = cfg.min_train_samples + cfg.min_test_samples
        if n_samples < min_required:
            raise ValueError(
                f"Insufficient data: {n_samples} samples, need at least "
                f"{min_required}"
            )

        test_fraction = 1 - cfg.train_ratio
        total_test_size = _stable_trunc(n_samples * test_fraction)
        test_size_per_split = max(
            total_test_size // cfg.n_splits,
            cfg.min_test_samples
        )

        # Offset between consecutive rolling-window train starts. It does not
        # depend on the split index, so compute it once. Clamped at zero so a
        # degenerate configuration can never produce negative positions.
        rolling_step = max(
            (n_samples - test_size_per_split - cfg.min_train_samples)
            // max(cfg.n_splits - 1, 1),
            0
        )

        index = data.index
        self._splits = []

        for split_idx in range(cfg.n_splits):
            if cfg.expanding_window:
                # Train always starts at row 0; the test window end advances
                # towards the end of the data with each split.
                train_start = 0
                remaining = cfg.n_splits - split_idx - 1
                test_end = _stable_trunc(
                    n_samples * (1 - remaining * test_fraction / cfg.n_splits)
                )
                test_start = test_end - test_size_per_split
                train_end = test_start - cfg.gap_bars
            else:
                train_start = split_idx * rolling_step
                train_end = train_start + _stable_trunc(
                    (n_samples - train_start) * cfg.train_ratio
                )
                # Leave room for the gap and a full test window after training.
                train_end = min(
                    train_end,
                    n_samples - test_size_per_split - cfg.gap_bars
                )
                test_start = train_end + cfg.gap_bars
                test_end = min(test_start + test_size_per_split, n_samples)

            if train_end - train_start < cfg.min_train_samples:
                continue
            if test_end - test_start < cfg.min_test_samples:
                continue

            split = WalkForwardSplit(
                split_id=split_idx + 1,
                train_start_idx=train_start,
                train_end_idx=train_end,
                test_start_idx=test_start,
                test_end_idx=test_end,
                train_start_date=index[train_start],
                train_end_date=index[train_end - 1],
                test_start_date=index[test_start],
                test_end_date=index[test_end - 1]
            )
            self._splits.append(split)

            train_data = data.iloc[train_start:train_end].copy()
            test_data = data.iloc[test_start:test_end].copy()

            logger.info(f"Split {split.split_id}: Train[{train_start}:{train_end}] "
                        f"({split.train_size} samples), "
                        f"Test[{test_start}:{test_end}] ({split.test_size} samples)")

            yield train_data, test_data

        logger.info(f"Generated {len(self._splits)} walk-forward splits")

    def run_walk_forward(
        self,
        strategy: Any,
        data: pd.DataFrame,
        backtest_config: Optional[BacktestConfig] = None,
        train_callback: Optional[Callable] = None
    ) -> List[BacktestResult]:
        """
        Run complete walk-forward validation.

        Args:
            strategy: Object exposing ``predict(features)``; each prediction
                must provide ``direction`` and ``confidence`` attributes
            data: Complete price data with features
            backtest_config: Configuration for backtesting (a default
                ``BacktestConfig()`` is used when omitted)
            train_callback: Optional callback invoked as
                ``train_callback(strategy, train_data)`` on each split

        Returns:
            List of BacktestResult for each split
        """
        # Imported here rather than at module level, as in the original code.
        from .rr_backtester import RRBacktester

        if backtest_config is None:
            backtest_config = BacktestConfig()

        self._results = []

        for train_data, test_data in self.split(data):
            # split() appends the descriptor right before yielding, so the
            # last stored entry always describes the current pair.
            split = self._splits[-1]

            logger.info(f"\n{'='*50}")
            logger.info(f"Walk-Forward Split {split.split_id}/{self.config.n_splits}")
            logger.info(f"{'='*50}")

            if train_callback is not None:
                logger.info("Training strategy on current split...")
                train_callback(strategy, train_data)

            logger.info("Running backtest on test period...")
            backtester = RRBacktester(backtest_config)
            signals_df = self._generate_signals_df(strategy, test_data)
            result = backtester.run_backtest(test_data, signals_df)

            result.metrics.start_date = split.test_start_date
            result.metrics.end_date = split.test_end_date
            self._results.append(result)

            logger.info(f"Split {split.split_id} Results: "
                        f"Trades={result.metrics.total_trades}, "
                        f"WR={result.metrics.winrate:.2%}, "
                        f"PF={result.metrics.profit_factor:.2f}, "
                        f"Net=${result.metrics.net_profit:,.2f}")

        return self._results

    def _generate_signals_df(
        self,
        strategy: Any,
        data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Generate signals DataFrame from strategy predictions.

        For each bar past the warm-up, the strategy sees only the ``lookback``
        bars strictly before it. A signal is recorded when
        ``abs(direction) > 0.1`` and ``confidence > 0.5``; other rows keep the
        defaults (NaN probability, 'long', zero confidence). The ``horizon``
        and ``rr_config`` columns are fixed to '15m' and 'rr_2_1'.

        Args:
            strategy: Strategy with predict() method
            data: Price data

        Returns:
            DataFrame with signal columns for backtesting
        """
        signals = pd.DataFrame(index=data.index)
        signals['prob_tp_first'] = np.nan
        signals['direction'] = 'long'
        signals['horizon'] = '15m'
        signals['rr_config'] = 'rr_2_1'
        signals['confidence'] = 0.0

        n_bars = len(data)
        lookback = min(100, n_bars // 2)

        for i in range(lookback, n_bars):
            # Window ends at i (exclusive): bar i itself is never shown.
            features = data.iloc[i - lookback:i]
            try:
                prediction = strategy.predict(features)
                if abs(prediction.direction) > 0.1 and prediction.confidence > 0.5:
                    bar_label = data.index[i]
                    signals.loc[bar_label, 'prob_tp_first'] = prediction.confidence
                    signals.loc[bar_label, 'direction'] = (
                        'long' if prediction.direction > 0 else 'short'
                    )
                    signals.loc[bar_label, 'confidence'] = prediction.confidence
            except Exception as e:
                # Deliberate best effort: a failing bar yields no signal.
                logger.debug(f"Signal generation error at index {i}: {e}")
                continue

        valid_signals = signals['prob_tp_first'].notna().sum()
        logger.info(f"Generated {valid_signals} valid signals from {len(data)} bars")
        return signals

    def aggregate_results(
        self,
        results: Optional[List[BacktestResult]] = None
    ) -> AggregatedResult:
        """
        Aggregate results from all walk-forward splits.

        Before averaging, profit factor is capped at 10, and Sharpe / Sortino
        are clipped to [-10, 10] with non-finite values (NaN, +/-inf) counted
        as 0. ``per_split_metrics`` keeps the raw, unclipped values.

        ``consistency_score`` is the fraction of splits with positive net
        profit. ``robustness_score`` is one minus the mean coefficient of
        variation of win rate and capped profit factor, floored at 0.

        Args:
            results: List of BacktestResult (uses stored results if None)

        Returns:
            AggregatedResult with mean and std of all metrics

        Raises:
            ValueError: If there are no results to aggregate
        """
        if results is None:
            results = self._results

        if not results:
            raise ValueError("No results to aggregate")

        total_trades_list = []
        winrate_list = []
        profit_factor_list = []
        sharpe_list = []
        sortino_list = []
        max_dd_list = []
        net_profit_list = []
        avg_trade_list = []
        per_split_metrics = []

        for i, result in enumerate(results):
            metrics = result.metrics

            total_trades_list.append(metrics.total_trades)
            winrate_list.append(metrics.winrate)
            profit_factor_list.append(min(metrics.profit_factor, _PROFIT_FACTOR_CAP))
            sharpe_list.append(_finite_clipped(metrics.sharpe_ratio, _RATIO_CLIP))
            sortino_list.append(_finite_clipped(metrics.sortino_ratio, _RATIO_CLIP))
            max_dd_list.append(abs(metrics.max_drawdown_pct))
            net_profit_list.append(metrics.net_profit)
            avg_trade_list.append(metrics.avg_trade if metrics.total_trades > 0 else 0)

            per_split_metrics.append({
                'split': i + 1,
                'total_trades': metrics.total_trades,
                'winrate': metrics.winrate,
                'profit_factor': metrics.profit_factor,
                'sharpe_ratio': metrics.sharpe_ratio,
                'net_profit': metrics.net_profit,
                'max_drawdown_pct': metrics.max_drawdown_pct
            })

        trades_mean, trades_std = _mean_and_std(total_trades_list)
        winrate_mean, winrate_std = _mean_and_std(winrate_list)
        pf_mean, pf_std = _mean_and_std(profit_factor_list)
        sharpe_mean, sharpe_std = _mean_and_std(sharpe_list)
        sortino_mean, sortino_std = _mean_and_std(sortino_list)
        max_dd_mean, max_dd_std = _mean_and_std(max_dd_list)
        net_mean, net_std = _mean_and_std(net_profit_list)
        avg_trade_mean, avg_trade_std = _mean_and_std(avg_trade_list)

        profitable_splits = sum(1 for pnl in net_profit_list if pnl > 0)
        consistency_score = profitable_splits / len(results)

        # A non-positive mean makes the coefficient of variation meaningless;
        # treat it as maximally unstable (1.0).
        winrate_coef_var = winrate_std / winrate_mean if winrate_mean > 0 else 1.0
        pf_coef_var = pf_std / pf_mean if pf_mean > 0 else 1.0
        robustness_score = float(max(0.0, 1 - (winrate_coef_var + pf_coef_var) / 2))

        return AggregatedResult(
            total_trades=trades_mean,
            total_trades_std=trades_std,
            winrate=winrate_mean,
            winrate_std=winrate_std,
            profit_factor=pf_mean,
            profit_factor_std=pf_std,
            sharpe_ratio=sharpe_mean,
            sharpe_ratio_std=sharpe_std,
            sortino_ratio=sortino_mean,
            sortino_ratio_std=sortino_std,
            max_drawdown_pct=max_dd_mean,
            max_drawdown_pct_std=max_dd_std,
            net_profit=net_mean,
            net_profit_std=net_std,
            avg_trade_pnl=avg_trade_mean,
            avg_trade_pnl_std=avg_trade_std,
            n_splits=len(results),
            split_results=results,
            per_split_metrics=per_split_metrics,
            consistency_score=consistency_score,
            robustness_score=robustness_score
        )

    def get_splits(self) -> List[WalkForwardSplit]:
        """Get the generated splits."""
        return self._splits

    def save_results(self, path: str):
        """
        Save validation results to JSON file.

        Logs a warning and writes nothing when no results are stored.
        Missing parent directories are created.

        Args:
            path: File path for saving
        """
        if not self._results:
            logger.warning("No results to save")
            return

        aggregated = self.aggregate_results()

        save_data = {
            'config': {
                'n_splits': self.config.n_splits,
                'train_ratio': self.config.train_ratio,
                'expanding_window': self.config.expanding_window
            },
            'splits': [s.to_dict() for s in self._splits],
            'aggregated': aggregated.to_dict(),
            'saved_at': datetime.now().isoformat()
        }

        save_path = Path(path)
        save_path.parent.mkdir(parents=True, exist_ok=True)

        # Explicit encoding keeps the output independent of the platform
        # default; default=str stringifies values json cannot encode natively.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(save_data, f, indent=2, default=str)

        logger.info(f"Saved walk-forward results to {save_path}")

    def plot_results(
        self,
        save_path: Optional[str] = None
    ):
        """
        Plot walk-forward validation results.

        Draws a 2x3 grid of per-split bar charts (win rate, profit factor,
        net profit, Sharpe ratio, max drawdown, trade count), each with a
        dashed line at the aggregated mean. Logs a warning and returns when
        matplotlib is unavailable or no results are stored.

        Args:
            save_path: Optional path to save the plot
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            logger.warning("Matplotlib not available for plotting")
            return

        if not self._results:
            logger.warning("No results to plot")
            return

        aggregated = self.aggregate_results()
        per_split = aggregated.per_split_metrics

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        fig.suptitle('Walk-Forward Validation Results', fontsize=14, fontweight='bold')

        splits = list(range(1, len(self._results) + 1))

        # Win rate
        ax = axes[0, 0]
        winrates = [m['winrate'] for m in per_split]
        ax.bar(splits, winrates, color='steelblue', alpha=0.7)
        ax.axhline(y=aggregated.winrate, color='red', linestyle='--',
                   label=f'Mean: {aggregated.winrate:.2%}')
        ax.set_xlabel('Split')
        ax.set_ylabel('Win Rate')
        ax.set_title('Win Rate by Split')
        ax.legend()
        ax.set_ylim(0, 1)

        # Profit factor (capped at 5 for display only)
        ax = axes[0, 1]
        pfs = [min(m['profit_factor'], 5) for m in per_split]
        ax.bar(splits, pfs, color='forestgreen', alpha=0.7)
        ax.axhline(y=min(aggregated.profit_factor, 5), color='red', linestyle='--',
                   label=f'Mean: {aggregated.profit_factor:.2f}')
        ax.axhline(y=1.0, color='black', linestyle='-', alpha=0.5)
        ax.set_xlabel('Split')
        ax.set_ylabel('Profit Factor')
        ax.set_title('Profit Factor by Split')
        ax.legend()

        # Net profit
        ax = axes[0, 2]
        net_profits = [m['net_profit'] for m in per_split]
        colors = ['green' if p > 0 else 'red' for p in net_profits]
        ax.bar(splits, net_profits, color=colors, alpha=0.7)
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)
        ax.axhline(y=aggregated.net_profit, color='blue', linestyle='--',
                   label=f'Mean: ${aggregated.net_profit:,.0f}')
        ax.set_xlabel('Split')
        ax.set_ylabel('Net Profit ($)')
        ax.set_title('Net Profit by Split')
        ax.legend()

        # Sharpe ratio (bars clipped to +/-5 for display only)
        ax = axes[1, 0]
        sharpes = [np.clip(m['sharpe_ratio'], -5, 5) for m in per_split]
        ax.bar(splits, sharpes, color='coral', alpha=0.7)
        ax.axhline(y=aggregated.sharpe_ratio, color='red', linestyle='--',
                   label=f'Mean: {aggregated.sharpe_ratio:.2f}')
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)
        ax.set_xlabel('Split')
        ax.set_ylabel('Sharpe Ratio')
        ax.set_title('Sharpe Ratio by Split')
        ax.legend()

        # Max drawdown
        ax = axes[1, 1]
        max_dds = [abs(m['max_drawdown_pct']) for m in per_split]
        ax.bar(splits, max_dds, color='crimson', alpha=0.7)
        ax.axhline(y=aggregated.max_drawdown_pct, color='blue', linestyle='--',
                   label=f'Mean: {aggregated.max_drawdown_pct:.2%}')
        ax.set_xlabel('Split')
        ax.set_ylabel('Max Drawdown')
        ax.set_title('Max Drawdown by Split')
        ax.legend()

        # Trade count
        ax = axes[1, 2]
        trades = [m['total_trades'] for m in per_split]
        ax.bar(splits, trades, color='purple', alpha=0.7)
        ax.axhline(y=aggregated.total_trades, color='red', linestyle='--',
                   label=f'Mean: {aggregated.total_trades:.0f}')
        ax.set_xlabel('Split')
        ax.set_ylabel('Number of Trades')
        ax.set_title('Trade Count by Split')
        ax.legend()

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
            logger.info(f"Plot saved to {save_path}")

        plt.show()


def _run_demo() -> None:
    """Smoke-test split generation on synthetic 5-minute OHLCV data."""
    print("Testing Walk-Forward Validator...")
    print("=" * 60)

    np.random.seed(42)
    n_samples = 5000

    dates = pd.date_range(start='2023-01-01', periods=n_samples, freq='5min')
    base_price = 2000
    returns = np.random.randn(n_samples) * 0.001
    prices = base_price * np.cumprod(1 + returns)

    df = pd.DataFrame({
        'open': prices * (1 + np.random.randn(n_samples) * 0.0005),
        'high': prices * (1 + np.abs(np.random.randn(n_samples)) * 0.001),
        'low': prices * (1 - np.abs(np.random.randn(n_samples)) * 0.001),
        'close': prices,
        'volume': np.random.randint(1000, 10000, n_samples)
    }, index=dates)

    # Make every bar internally consistent (high >= open/close >= low).
    df['high'] = df[['open', 'high', 'close']].max(axis=1)
    df['low'] = df[['open', 'low', 'close']].min(axis=1)

    validator = WalkForwardValidator(n_splits=5, train_ratio=0.8)

    print("\n--- Generated Splits ---")
    # The generator must be exhausted for get_splits() to be fully populated.
    generated_pairs = list(validator.split(df))
    print(f"Total splits: {len(generated_pairs)}")

    for split in validator.get_splits():
        print(f"Split {split.split_id}: "
              f"Train={split.train_size} samples, "
              f"Test={split.test_size} samples")

    print("\n--- Expanding Window Mode ---")
    expanding_config = WalkForwardConfig(
        n_splits=3,
        train_ratio=0.7,
        expanding_window=True,
        min_train_samples=500,
        min_test_samples=100
    )
    expanding_validator = WalkForwardValidator(config=expanding_config)

    for _ in expanding_validator.split(df):
        pass

    for split in expanding_validator.get_splits():
        print(f"Split {split.split_id}: "
              f"Train[{split.train_start_idx}:{split.train_end_idx}] = {split.train_size}, "
              f"Test[{split.test_start_idx}:{split.test_end_idx}] = {split.test_size}")

    print("\n" + "=" * 60)
    print("Walk-Forward Validator tests complete!")


if __name__ == "__main__":
    _run_demo()