trading-platform-ml-engine-v2/src/backtesting/walk_forward.py
Adrian Flores Cortes d015e2b0f3 feat(ml-engine): Phase 4 - PostgreSQL migration, dynamic OOS, data pipeline
- Fix database.py: Add DatabaseConnection alias for backward compat
- Fix train_symbol_timeframe_models.py: Use PostgreSQLConnection + native queries
- Fix run_oos_backtest.py: Fix broken import + add dynamic OOS support
- Update data_splitter.py: split_dynamic_oos() method (from previous session)
- Update validation_oos.yaml: Dynamic OOS config + all 6 symbols enabled
- Create ingest_ohlcv_polygon.py: Standalone Polygon→PostgreSQL ingestion script
- Fix .gitignore: /data/ instead of data/ to not ignore src/data/
- Add untracked src/ modules: backtesting, data, llm, models (attention/metamodel/strategies)
- Add aiohttp, sqlalchemy, psycopg2-binary to requirements.txt

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 04:39:05 -06:00

653 lines
25 KiB
Python

"""
Walk-Forward Validation for Backtesting
=========================================
Implements walk-forward validation methodology for robust strategy evaluation.
Walk-forward validation simulates real trading conditions by:
1. Training on historical data
2. Testing on unseen future data
3. Rolling forward and repeating
This approach provides more realistic performance estimates than
standard train/test splits.
Author: ML-Specialist (NEXUS v4.0)
Version: 1.0.0
Created: 2026-01-25
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Generator, Any, Union, Callable
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
import json
from loguru import logger
from .rr_backtester import BacktestConfig, BacktestResult
from .metrics import TradingMetrics, TradeRecord, MetricsCalculator
@dataclass
class WalkForwardConfig:
    """Tunable parameters controlling how walk-forward splits are built."""

    # How many train/test windows the validator attempts to produce.
    n_splits: int = 5
    # Share of the samples devoted to training when sizing windows.
    train_ratio: float = 0.8
    # Bars left unused between the end of training and the start of testing.
    gap_bars: int = 0
    # True: every training window starts at index 0 and grows;
    # False: the training window start slides forward with each split.
    expanding_window: bool = False
    # Splits whose train/test window is smaller than these are skipped.
    min_train_samples: int = 1000
    min_test_samples: int = 200
    # NOTE(review): not referenced by the code visible in this file.
    overlap_allowed: bool = False
@dataclass
class WalkForwardSplit:
    """
    Index (and optional date) boundaries of one train/test window.

    The ``*_end_idx`` values are exclusive upper bounds, so a window's size
    is simply ``end - start``.
    """

    split_id: int
    train_start_idx: int
    train_end_idx: int
    test_start_idx: int
    test_end_idx: int
    train_start_date: Optional[datetime] = None
    train_end_date: Optional[datetime] = None
    test_start_date: Optional[datetime] = None
    test_end_date: Optional[datetime] = None

    @property
    def train_size(self) -> int:
        """Length of the training window in samples."""
        return self.train_end_idx - self.train_start_idx

    @property
    def test_size(self) -> int:
        """Length of the test window in samples."""
        return self.test_end_idx - self.test_start_idx

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialise the split into a plain dictionary.

        Returns:
            Dict holding the raw indices, the boundary dates rendered with
            ``str()`` (``None`` when a date is unset) and the derived sizes.
        """
        index_fields = (
            'split_id',
            'train_start_idx',
            'train_end_idx',
            'test_start_idx',
            'test_end_idx',
        )
        date_fields = (
            'train_start_date',
            'train_end_date',
            'test_start_date',
            'test_end_date',
        )

        def _as_text(stamp: Any) -> Optional[str]:
            # Falsy values (i.e. an unset date) are exported as None.
            if stamp:
                return str(stamp)
            return None

        payload: Dict[str, Any] = {}
        for name in index_fields:
            payload[name] = getattr(self, name)
        for name in date_fields:
            payload[name] = _as_text(getattr(self, name))
        payload['train_size'] = self.train_size
        payload['test_size'] = self.test_size
        return payload
@dataclass
class AggregatedResult:
    """
    Summary of a walk-forward run across all of its splits.

    Every headline metric is stored as a mean plus a matching ``*_std``
    field; the remaining fields carry the per-split detail and two
    quality scores.
    """

    total_trades: float
    total_trades_std: float
    winrate: float
    winrate_std: float
    profit_factor: float
    profit_factor_std: float
    sharpe_ratio: float
    sharpe_ratio_std: float
    sortino_ratio: float
    sortino_ratio_std: float
    max_drawdown_pct: float
    max_drawdown_pct_std: float
    net_profit: float
    net_profit_std: float
    avg_trade_pnl: float
    avg_trade_pnl_std: float
    n_splits: int = 0
    split_results: List[BacktestResult] = field(default_factory=list)
    per_split_metrics: List[Dict[str, float]] = field(default_factory=list)
    consistency_score: float = 0.0
    robustness_score: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """
        Export the summary as a dictionary.

        Returns:
            Dict mapping each metric name to ``{'mean': ..., 'std': ...}``,
            followed by the split count, both quality scores and the
            per-split metric list. ``split_results`` is not included.
        """
        metric_names = (
            'total_trades',
            'winrate',
            'profit_factor',
            'sharpe_ratio',
            'sortino_ratio',
            'max_drawdown_pct',
            'net_profit',
            'avg_trade_pnl',
        )
        summary: Dict[str, Any] = {
            name: {
                'mean': getattr(self, name),
                'std': getattr(self, f'{name}_std'),
            }
            for name in metric_names
        }
        summary['n_splits'] = self.n_splits
        summary['consistency_score'] = self.consistency_score
        summary['robustness_score'] = self.robustness_score
        summary['per_split_metrics'] = self.per_split_metrics
        return summary

    def print_summary(self):
        """Write a human-readable report of the aggregated metrics to stdout."""
        rule = "=" * 60
        report = [
            "\n" + rule,
            "WALK-FORWARD VALIDATION RESULTS",
            rule,
            f"Number of Splits: {self.n_splits}",
            "\n--- Performance Metrics (Mean +/- Std) ---",
            f"Win Rate: {self.winrate:.2%} +/- {self.winrate_std:.2%}",
            f"Profit Factor: {self.profit_factor:.2f} +/- {self.profit_factor_std:.2f}",
            f"Sharpe Ratio: {self.sharpe_ratio:.2f} +/- {self.sharpe_ratio_std:.2f}",
            f"Sortino Ratio: {self.sortino_ratio:.2f} +/- {self.sortino_ratio_std:.2f}",
            f"Max Drawdown: {self.max_drawdown_pct:.2%} +/- {self.max_drawdown_pct_std:.2%}",
            f"Net Profit: ${self.net_profit:,.2f} +/- ${self.net_profit_std:,.2f}",
            f"Total Trades: {self.total_trades:.0f} +/- {self.total_trades_std:.0f}",
            f"Avg Trade P&L: ${self.avg_trade_pnl:.2f} +/- ${self.avg_trade_pnl_std:.2f}",
            "\n--- Quality Scores ---",
            f"Consistency: {self.consistency_score:.2%}",
            f"Robustness: {self.robustness_score:.2%}",
            rule,
        ]
        for line in report:
            print(line)
class WalkForwardValidator:
    """
    Walk-forward validation for trading strategies.

    Implements time-series cross-validation that respects temporal order
    and simulates realistic trading conditions.

    Usage:
        validator = WalkForwardValidator(n_splits=5, train_ratio=0.8)

        # Generate splits
        for train, test in validator.split(data):
            # Train on train, evaluate on test
            model.fit(train)
            result = backtester.run(test, model)

        # Or run complete walk-forward
        results = validator.run_walk_forward(strategy, data)
        aggregated = validator.aggregate_results(results)
    """

    def __init__(
        self,
        n_splits: int = 5,
        train_ratio: float = 0.8,
        config: Optional[WalkForwardConfig] = None
    ):
        """
        Initialize walk-forward validator.

        Args:
            n_splits: Number of train/test splits
            train_ratio: Ratio of training data in each split
            config: Full configuration (overrides n_splits and train_ratio)
        """
        if config is not None:
            self.config = config
        else:
            self.config = WalkForwardConfig(
                n_splits=n_splits,
                train_ratio=train_ratio
            )
        self._splits: List[WalkForwardSplit] = []
        self._results: List[BacktestResult] = []
        self.metrics_calculator = MetricsCalculator()

    def split(
        self,
        data: pd.DataFrame
    ) -> Generator[Tuple[pd.DataFrame, pd.DataFrame], None, None]:
        """
        Generate train/test splits for walk-forward validation.

        The splits produced so far are also recorded and available through
        ``get_splits()``. Candidate windows that are smaller than the
        configured minimums (or that would start before the data) are
        skipped, so fewer than ``n_splits`` pairs may be yielded.

        Args:
            data: Complete DataFrame with temporal index

        Yields:
            Tuple of (train_df, test_df) for each split (independent copies)

        Raises:
            ValueError: If ``n_splits`` is below 1 or ``data`` holds fewer
                than ``min_train_samples + min_test_samples`` rows. Because
                this is a generator, the error surfaces on first iteration.
        """
        cfg = self.config
        n_samples = len(data)

        # Fail with a clear message instead of a ZeroDivisionError (0) or a
        # silent empty run (negative values).
        if cfg.n_splits < 1:
            raise ValueError(f"n_splits must be >= 1, got {cfg.n_splits}")

        min_required = cfg.min_train_samples + cfg.min_test_samples
        if n_samples < min_required:
            raise ValueError(
                f"Insufficient data: {n_samples} samples, need at least "
                f"{min_required}"
            )

        total_test_size = int(n_samples * (1 - cfg.train_ratio))
        test_size_per_split = max(
            total_test_size // cfg.n_splits,
            cfg.min_test_samples
        )

        # Stride of the sliding training window; it does not depend on the
        # split index, so compute it once.
        step = (
            (n_samples - test_size_per_split - cfg.min_train_samples)
            // max(cfg.n_splits - 1, 1)
        )

        # NOTE(review): cfg.overlap_allowed is not consulted here; test
        # windows of consecutive sliding splits can overlap.
        self._splits = []
        for split_idx in range(cfg.n_splits):
            if cfg.expanding_window:
                # Training always starts at 0; test windows are anchored so
                # that the last split ends at the final sample.
                train_start = 0
                test_end = int(
                    n_samples
                    * (1 - (cfg.n_splits - split_idx - 1)
                       * (1 - cfg.train_ratio) / cfg.n_splits)
                )
                test_start = test_end - test_size_per_split
                train_end = test_start - cfg.gap_bars
            else:
                train_start = split_idx * step
                train_end = train_start + int(
                    (n_samples - train_start) * cfg.train_ratio
                )
                # Leave room for the gap and a full test window.
                train_end = min(
                    train_end,
                    n_samples - test_size_per_split - cfg.gap_bars
                )
                test_start = train_end + cfg.gap_bars
                test_end = min(test_start + test_size_per_split, n_samples)

            # Negative positions would wrap around in iloc and select the
            # wrong rows, so treat them as an invalid window.
            if train_start < 0 or test_start < 0:
                logger.warning(
                    f"Skipping split {split_idx + 1}: window starts before the data"
                )
                continue
            if train_end - train_start < cfg.min_train_samples:
                logger.warning(
                    f"Skipping split {split_idx + 1}: train window has "
                    f"{train_end - train_start} samples "
                    f"(< {cfg.min_train_samples})"
                )
                continue
            if test_end - test_start < cfg.min_test_samples:
                logger.warning(
                    f"Skipping split {split_idx + 1}: test window has "
                    f"{test_end - test_start} samples "
                    f"(< {cfg.min_test_samples})"
                )
                continue

            split = WalkForwardSplit(
                split_id=split_idx + 1,
                train_start_idx=train_start,
                train_end_idx=train_end,
                test_start_idx=test_start,
                test_end_idx=test_end,
                # End indices are exclusive, hence the "- 1" for end labels.
                train_start_date=data.index[train_start],
                train_end_date=data.index[train_end - 1],
                test_start_date=data.index[test_start],
                test_end_date=data.index[test_end - 1]
            )
            self._splits.append(split)

            train_data = data.iloc[train_start:train_end].copy()
            test_data = data.iloc[test_start:test_end].copy()

            logger.info(f"Split {split.split_id}: Train[{train_start}:{train_end}] "
                        f"({split.train_size} samples), "
                        f"Test[{test_start}:{test_end}] ({split.test_size} samples)")
            yield train_data, test_data

        logger.info(f"Generated {len(self._splits)} walk-forward splits")

    def run_walk_forward(
        self,
        strategy: Any,
        data: pd.DataFrame,
        backtest_config: Optional[BacktestConfig] = None,
        train_callback: Optional[Callable] = None
    ) -> List[BacktestResult]:
        """
        Run complete walk-forward validation.

        Args:
            strategy: Strategy object exposing ``predict(features)``
            data: Complete price data with features
            backtest_config: Configuration for backtesting (a default
                ``BacktestConfig()`` is used when omitted)
            train_callback: Optional callback invoked as
                ``train_callback(strategy, train_data)`` on each split

        Returns:
            List of BacktestResult for each split (also stored on the
            validator for ``aggregate_results`` / ``save_results``)
        """
        from .rr_backtester import RRBacktester

        if backtest_config is None:
            backtest_config = BacktestConfig()

        self._results = []
        for train_data, test_data in self.split(data):
            # split() appends the split descriptor right before yielding.
            split = self._splits[-1]

            logger.info(f"\n{'='*50}")
            logger.info(f"Walk-Forward Split {split.split_id}/{self.config.n_splits}")
            logger.info(f"{'='*50}")

            if train_callback is not None:
                logger.info("Training strategy on current split...")
                train_callback(strategy, train_data)

            logger.info("Running backtest on test period...")
            backtester = RRBacktester(backtest_config)
            signals_df = self._generate_signals_df(strategy, test_data)
            result = backtester.run_backtest(test_data, signals_df)
            result.metrics.start_date = split.test_start_date
            result.metrics.end_date = split.test_end_date
            self._results.append(result)

            logger.info(f"Split {split.split_id} Results: "
                        f"Trades={result.metrics.total_trades}, "
                        f"WR={result.metrics.winrate:.2%}, "
                        f"PF={result.metrics.profit_factor:.2f}, "
                        f"Net=${result.metrics.net_profit:,.2f}")

        return self._results

    def _generate_signals_df(
        self,
        strategy: Any,
        data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Generate signals DataFrame from strategy predictions.

        For every bar ``i`` past the warm-up, the strategy is shown the
        ``lookback`` bars strictly before ``i``. A signal is recorded on bar
        ``i`` when ``abs(prediction.direction) > 0.1`` and
        ``prediction.confidence > 0.5``.

        Prediction failures are tolerated (best effort): each one is logged
        at debug level and a single warning summarises how many bars failed,
        so a broken strategy is not mistaken for one with no signals.

        Args:
            strategy: Strategy with predict() method
            data: Price data

        Returns:
            DataFrame indexed like ``data`` with columns ``prob_tp_first``
            (NaN where there is no signal), ``direction``, ``horizon``,
            ``rr_config`` and ``confidence``
        """
        n_bars = len(data)

        # Positional buffers: avoids three label-based .loc writes per bar
        # and stays correct when the index contains duplicate timestamps.
        prob_tp_first = np.full(n_bars, np.nan)
        confidence = np.zeros(n_bars)
        direction = ['long'] * n_bars

        lookback = min(100, n_bars // 2)
        n_errors = 0
        first_error: Optional[Exception] = None

        for i in range(lookback, n_bars):
            features = data.iloc[i - lookback:i]
            try:
                prediction = strategy.predict(features)
                if abs(prediction.direction) > 0.1 and prediction.confidence > 0.5:
                    side = 'long' if prediction.direction > 0 else 'short'
                    prob_tp_first[i] = prediction.confidence
                    confidence[i] = prediction.confidence
                    direction[i] = side
            except Exception as e:
                # The strategy is arbitrary user code; keep going, but count.
                n_errors += 1
                if first_error is None:
                    first_error = e
                # Do not leave a half-written signal behind.
                prob_tp_first[i] = np.nan
                confidence[i] = 0.0
                logger.debug(f"Signal generation error at index {i}: {e}")
                continue

        if n_errors:
            logger.warning(
                f"Signal generation failed on {n_errors} of "
                f"{n_bars - lookback} bars; first error: {first_error!r}"
            )

        signals = pd.DataFrame(index=data.index)
        signals['prob_tp_first'] = prob_tp_first
        signals['direction'] = direction
        signals['horizon'] = '15m'
        signals['rr_config'] = 'rr_2_1'
        signals['confidence'] = confidence

        valid_signals = signals['prob_tp_first'].notna().sum()
        logger.info(f"Generated {valid_signals} valid signals from {len(data)} bars")
        return signals

    @staticmethod
    def _bounded_ratio(value: float, limit: float = 10.0) -> float:
        """Clip a ratio metric to ``[-limit, limit]``; NaN/inf become 0."""
        if not np.isfinite(value):
            return 0.0
        return float(np.clip(value, -limit, limit))

    @staticmethod
    def _mean_std(values: List[float]) -> Tuple[float, float]:
        """Return (mean, population std) of a non-empty list as floats."""
        return float(np.mean(values)), float(np.std(values))

    def aggregate_results(
        self,
        results: Optional[List[BacktestResult]] = None
    ) -> AggregatedResult:
        """
        Aggregate results from all walk-forward splits.

        Before averaging, profit factor is capped at 10 and Sharpe/Sortino
        are clipped to [-10, 10]; non-finite Sharpe/Sortino and NaN profit
        factor count as 0 so one degenerate split (e.g. no trades) cannot
        turn every aggregate into NaN. ``per_split_metrics`` keeps the raw,
        unmodified values.

        Args:
            results: List of BacktestResult (uses stored results if None)

        Returns:
            AggregatedResult with mean and std of all metrics, plus:
            consistency_score = fraction of splits with positive net profit;
            robustness_score = 1 - mean coefficient of variation of win rate
            and profit factor, floored at 0.

        Raises:
            ValueError: If there are no results to aggregate
        """
        if results is None:
            results = self._results
        if not results:
            raise ValueError("No results to aggregate")

        total_trades_list = []
        winrate_list = []
        profit_factor_list = []
        sharpe_list = []
        sortino_list = []
        max_dd_list = []
        net_profit_list = []
        avg_trade_list = []
        per_split_metrics = []

        for i, result in enumerate(results):
            metrics = result.metrics
            profit_factor = metrics.profit_factor

            total_trades_list.append(metrics.total_trades)
            winrate_list.append(metrics.winrate)
            # min(nan, 10.0) would return nan, so handle NaN explicitly.
            profit_factor_list.append(
                0.0 if np.isnan(profit_factor) else min(profit_factor, 10.0)
            )
            sharpe_list.append(self._bounded_ratio(metrics.sharpe_ratio))
            sortino_list.append(self._bounded_ratio(metrics.sortino_ratio))
            max_dd_list.append(abs(metrics.max_drawdown_pct))
            net_profit_list.append(metrics.net_profit)
            avg_trade_list.append(metrics.avg_trade if metrics.total_trades > 0 else 0)

            per_split_metrics.append({
                'split': i + 1,
                'total_trades': metrics.total_trades,
                'winrate': metrics.winrate,
                'profit_factor': metrics.profit_factor,
                'sharpe_ratio': metrics.sharpe_ratio,
                'net_profit': metrics.net_profit,
                'max_drawdown_pct': metrics.max_drawdown_pct
            })

        trades_mean, trades_std = self._mean_std(total_trades_list)
        winrate_mean, winrate_std = self._mean_std(winrate_list)
        pf_mean, pf_std = self._mean_std(profit_factor_list)
        sharpe_mean, sharpe_std = self._mean_std(sharpe_list)
        sortino_mean, sortino_std = self._mean_std(sortino_list)
        max_dd_mean, max_dd_std = self._mean_std(max_dd_list)
        net_mean, net_std = self._mean_std(net_profit_list)
        avg_trade_mean, avg_trade_std = self._mean_std(avg_trade_list)

        profitable_splits = sum(1 for pnl in net_profit_list if pnl > 0)
        consistency_score = profitable_splits / len(results)

        # Coefficient of variation; a non-positive mean is treated as
        # maximally unstable (1).
        winrate_coef_var = winrate_std / winrate_mean if winrate_mean > 0 else 1
        pf_coef_var = pf_std / pf_mean if pf_mean > 0 else 1
        robustness_score = float(max(0, 1 - (winrate_coef_var + pf_coef_var) / 2))

        return AggregatedResult(
            total_trades=trades_mean,
            total_trades_std=trades_std,
            winrate=winrate_mean,
            winrate_std=winrate_std,
            profit_factor=pf_mean,
            profit_factor_std=pf_std,
            sharpe_ratio=sharpe_mean,
            sharpe_ratio_std=sharpe_std,
            sortino_ratio=sortino_mean,
            sortino_ratio_std=sortino_std,
            max_drawdown_pct=max_dd_mean,
            max_drawdown_pct_std=max_dd_std,
            net_profit=net_mean,
            net_profit_std=net_std,
            avg_trade_pnl=avg_trade_mean,
            avg_trade_pnl_std=avg_trade_std,
            n_splits=len(results),
            split_results=results,
            per_split_metrics=per_split_metrics,
            consistency_score=consistency_score,
            robustness_score=robustness_score
        )

    def get_splits(self) -> List[WalkForwardSplit]:
        """Get the generated splits."""
        return self._splits

    def save_results(self, path: str):
        """
        Save validation results to JSON file.

        Writes the main config values, every recorded split and the
        aggregated metrics. Does nothing (beyond a warning) when no results
        have been stored yet. Parent directories are created as needed.

        Args:
            path: File path for saving
        """
        if not self._results:
            logger.warning("No results to save")
            return

        aggregated = self.aggregate_results()
        save_data = {
            'config': {
                'n_splits': self.config.n_splits,
                'train_ratio': self.config.train_ratio,
                'expanding_window': self.config.expanding_window
            },
            'splits': [s.to_dict() for s in self._splits],
            'aggregated': aggregated.to_dict(),
            'saved_at': datetime.now().isoformat()
        }

        save_path = Path(path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the output does not depend on the locale.
        with open(save_path, 'w', encoding='utf-8') as f:
            # default=str stringifies anything json cannot encode natively.
            json.dump(save_data, f, indent=2, default=str)
        logger.info(f"Saved walk-forward results to {save_path}")

    def plot_results(
        self,
        save_path: Optional[str] = None
    ):
        """
        Plot walk-forward validation results.

        Draws a 2x3 grid of per-split bar charts (win rate, profit factor,
        net profit, Sharpe ratio, max drawdown, trade count), each with a
        dashed line at the aggregated mean. Returns early with a warning if
        matplotlib is missing or there are no stored results. The figure is
        closed after being shown so repeated calls do not accumulate figures.

        Args:
            save_path: Optional path to save the plot
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            logger.warning("Matplotlib not available for plotting")
            return

        if not self._results:
            logger.warning("No results to plot")
            return

        def _decorate(ax, ylabel: str, title: str) -> None:
            # Shared axis cosmetics for every panel.
            ax.set_xlabel('Split')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.legend()

        aggregated = self.aggregate_results()
        per_split = aggregated.per_split_metrics

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        fig.suptitle('Walk-Forward Validation Results', fontsize=14, fontweight='bold')
        splits = list(range(1, len(self._results) + 1))

        # Win rate
        ax = axes[0, 0]
        winrates = [m['winrate'] for m in per_split]
        ax.bar(splits, winrates, color='steelblue', alpha=0.7)
        ax.axhline(y=aggregated.winrate, color='red', linestyle='--',
                   label=f'Mean: {aggregated.winrate:.2%}')
        _decorate(ax, 'Win Rate', 'Win Rate by Split')
        ax.set_ylim(0, 1)

        # Profit factor (display capped at 5; solid line marks break-even)
        ax = axes[0, 1]
        pfs = [min(m['profit_factor'], 5) for m in per_split]
        ax.bar(splits, pfs, color='forestgreen', alpha=0.7)
        ax.axhline(y=min(aggregated.profit_factor, 5), color='red', linestyle='--',
                   label=f'Mean: {aggregated.profit_factor:.2f}')
        ax.axhline(y=1.0, color='black', linestyle='-', alpha=0.5)
        _decorate(ax, 'Profit Factor', 'Profit Factor by Split')

        # Net profit
        ax = axes[0, 2]
        net_profits = [m['net_profit'] for m in per_split]
        colors = ['green' if p > 0 else 'red' for p in net_profits]
        ax.bar(splits, net_profits, color=colors, alpha=0.7)
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)
        ax.axhline(y=aggregated.net_profit, color='blue', linestyle='--',
                   label=f'Mean: ${aggregated.net_profit:,.0f}')
        _decorate(ax, 'Net Profit ($)', 'Net Profit by Split')

        # Sharpe ratio (bars and mean line share the same +/-5 display clip)
        ax = axes[1, 0]
        sharpes = [np.clip(m['sharpe_ratio'], -5, 5) for m in per_split]
        ax.bar(splits, sharpes, color='coral', alpha=0.7)
        ax.axhline(y=float(np.clip(aggregated.sharpe_ratio, -5, 5)),
                   color='red', linestyle='--',
                   label=f'Mean: {aggregated.sharpe_ratio:.2f}')
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)
        _decorate(ax, 'Sharpe Ratio', 'Sharpe Ratio by Split')

        # Max drawdown
        ax = axes[1, 1]
        max_dds = [abs(m['max_drawdown_pct']) for m in per_split]
        ax.bar(splits, max_dds, color='crimson', alpha=0.7)
        ax.axhline(y=aggregated.max_drawdown_pct, color='blue', linestyle='--',
                   label=f'Mean: {aggregated.max_drawdown_pct:.2%}')
        _decorate(ax, 'Max Drawdown', 'Max Drawdown by Split')

        # Trade count
        ax = axes[1, 2]
        trades = [m['total_trades'] for m in per_split]
        ax.bar(splits, trades, color='purple', alpha=0.7)
        ax.axhline(y=aggregated.total_trades, color='red', linestyle='--',
                   label=f'Mean: {aggregated.total_trades:.0f}')
        _decorate(ax, 'Number of Trades', 'Trade Count by Split')

        plt.tight_layout()
        try:
            if save_path:
                # Mirror save_results(): make sure the target folder exists.
                Path(save_path).parent.mkdir(parents=True, exist_ok=True)
                plt.savefig(save_path, dpi=150, bbox_inches='tight')
                logger.info(f"Plot saved to {save_path}")
            plt.show()
        finally:
            # pyplot keeps figures alive until they are closed explicitly.
            plt.close(fig)
def _run_demo() -> None:
    """Smoke-test the validator on synthetic 5-minute OHLCV data."""
    banner = "=" * 60
    print("Testing Walk-Forward Validator...")
    print(banner)

    # Synthetic random-walk price series (seeded for reproducibility).
    np.random.seed(42)
    n_samples = 5000
    dates = pd.date_range(start='2023-01-01', periods=n_samples, freq='5min')
    base_price = 2000
    returns = np.random.randn(n_samples) * 0.001
    prices = base_price * np.cumprod(1 + returns)

    # Draw the noise terms in the same order the columns are declared.
    open_px = prices * (1 + np.random.randn(n_samples) * 0.0005)
    high_px = prices * (1 + np.abs(np.random.randn(n_samples)) * 0.001)
    low_px = prices * (1 - np.abs(np.random.randn(n_samples)) * 0.001)
    volume = np.random.randint(1000, 10000, n_samples)
    df = pd.DataFrame({
        'open': open_px,
        'high': high_px,
        'low': low_px,
        'close': prices,
        'volume': volume
    }, index=dates)

    # Force high/low to bracket open and close.
    df['high'] = df[['open', 'high', 'close']].max(axis=1)
    df['low'] = df[['open', 'low', 'close']].min(axis=1)

    # Default (sliding) mode
    validator = WalkForwardValidator(n_splits=5, train_ratio=0.8)
    print("\n--- Generated Splits ---")
    generated = list(validator.split(df))
    print(f"Total splits: {len(generated)}")
    for item in validator.get_splits():
        print(f"Split {item.split_id}: "
              f"Train={item.train_size} samples, "
              f"Test={item.test_size} samples")

    # Expanding-window mode
    print("\n--- Expanding Window Mode ---")
    expanding_config = WalkForwardConfig(
        n_splits=3,
        train_ratio=0.7,
        expanding_window=True,
        min_train_samples=500,
        min_test_samples=100
    )
    expanding_validator = WalkForwardValidator(config=expanding_config)
    # Exhaust the generator so the validator records its splits.
    list(expanding_validator.split(df))
    for item in expanding_validator.get_splits():
        print(f"Split {item.split_id}: "
              f"Train[{item.train_start_idx}:{item.train_end_idx}] = {item.train_size}, "
              f"Test[{item.test_start_idx}:{item.test_end_idx}] = {item.test_size}")

    print("\n" + banner)
    print("Walk-Forward Validator tests complete!")


if __name__ == "__main__":
    _run_demo()