"""
|
|
Walk-forward validation implementation
|
|
Based on best practices from analyzed projects
|
|
"""
# Standard library
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

# Third-party
import joblib
import numpy as np
import pandas as pd
from loguru import logger
@dataclass
class WalkForwardSplit:
    """Container for a single walk-forward train/validation split."""

    split_id: int            # 1-based index of this split
    train_start: int         # positional (iloc) start of the training slice
    train_end: int           # positional end (exclusive) of the training slice
    val_start: int           # positional start of the validation slice
    val_end: int             # positional end (exclusive) of the validation slice
    train_data: pd.DataFrame
    val_data: pd.DataFrame

    @property
    def train_size(self) -> int:
        """Number of rows in the training slice."""
        return len(self.train_data)

    @property
    def val_size(self) -> int:
        """Number of rows in the validation slice."""
        return len(self.val_data)

    def __repr__(self) -> str:
        pieces = (
            f"Split {self.split_id}: ",
            f"Train[{self.train_start}:{self.train_end}] n={self.train_size}, ",
            f"Val[{self.val_start}:{self.val_end}] n={self.val_size}",
        )
        return "".join(pieces)
class WalkForwardValidator:
    """Walk-forward validation for time series data"""

    def __init__(
        self,
        n_splits: int = 5,
        test_size: float = 0.2,
        gap: int = 0,
        expanding_window: bool = False,
        min_train_size: int = 10000
    ):
        """
        Initialize walk-forward validator.

        Args:
            n_splits: Number of splits
            test_size: Test size as fraction of step size
            gap: Gap between train and test sets (to avoid look-ahead)
            expanding_window: If True, training window expands; if False, sliding window
            min_train_size: Minimum training samples required
        """
        # Configuration (immutable intent; `n_splits` may be reduced by split()
        # when the data is too short for the requested minimum train size).
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap
        self.expanding_window = expanding_window
        self.min_train_size = min_train_size

        # State populated later by split() and train_model().
        self.splits: List[WalkForwardSplit] = []
        self.results: Dict[str, Any] = {}
def split(
|
|
self,
|
|
data: pd.DataFrame
|
|
) -> List[WalkForwardSplit]:
|
|
"""
|
|
Create walk-forward validation splits
|
|
|
|
Args:
|
|
data: Complete DataFrame with time index
|
|
|
|
Returns:
|
|
List of WalkForwardSplit objects
|
|
"""
|
|
n_samples = len(data)
|
|
|
|
# Calculate step size
|
|
step_size = n_samples // (self.n_splits + 1)
|
|
test_size = int(step_size * self.test_size)
|
|
|
|
if step_size < self.min_train_size:
|
|
logger.warning(
|
|
f"Step size ({step_size}) is less than minimum train size ({self.min_train_size}). "
|
|
f"Reducing number of splits."
|
|
)
|
|
self.n_splits = max(1, n_samples // self.min_train_size - 1)
|
|
step_size = n_samples // (self.n_splits + 1)
|
|
test_size = int(step_size * self.test_size)
|
|
|
|
self.splits = []
|
|
|
|
for i in range(self.n_splits):
|
|
if self.expanding_window:
|
|
# Expanding window: always start from beginning
|
|
train_start = 0
|
|
else:
|
|
# Sliding window: move start forward
|
|
train_start = i * step_size if i > 0 else 0
|
|
|
|
train_end = (i + 1) * step_size
|
|
val_start = train_end + self.gap
|
|
val_end = min(val_start + test_size, n_samples)
|
|
|
|
# Ensure we have enough data
|
|
if val_end > n_samples or (train_end - train_start) < self.min_train_size:
|
|
logger.warning(f"Skipping split {i+1}: insufficient data")
|
|
continue
|
|
|
|
# Create split
|
|
split = WalkForwardSplit(
|
|
split_id=i + 1,
|
|
train_start=train_start,
|
|
train_end=train_end,
|
|
val_start=val_start,
|
|
val_end=val_end,
|
|
train_data=data.iloc[train_start:train_end].copy(),
|
|
val_data=data.iloc[val_start:val_end].copy()
|
|
)
|
|
|
|
self.splits.append(split)
|
|
logger.info(f"Created {split}")
|
|
|
|
logger.info(f"✅ Created {len(self.splits)} walk-forward splits")
|
|
return self.splits
|
|
|
|
    def train_model(
        self,
        model_class: Any,
        model_config: Dict[str, Any],
        data: pd.DataFrame,
        feature_cols: List[str],
        target_cols: List[str],
        save_models: bool = True,
        model_dir: str = "models/walk_forward"
    ) -> Dict[str, Any]:
        """
        Train a model using walk-forward validation.

        One fresh model instance is trained per split; validation metrics
        (MSE/MAE/R²) are computed with scikit-learn on each split's holdout.

        Args:
            model_class: Model class to instantiate (called as model_class(model_config))
            model_config: Configuration for model
            data: Complete DataFrame
            feature_cols: List of feature column names
            target_cols: List of target column names
            save_models: Whether to save trained models
            model_dir: Directory to save models

        Returns:
            Dictionary with per-split results, aggregated metrics, model paths
            and the config used; also stored on ``self.results``.
        """
        # Create splits if not already done
        if not self.splits:
            self.splits = self.split(data)

        results = {
            'splits': [],
            'metrics': {
                # NOTE(review): the train_* lists below are never appended to
                # anywhere in this method — only val_* metrics are collected.
                'train_mse': [],
                'val_mse': [],
                'train_mae': [],
                'val_mae': [],
                'train_r2': [],
                'val_r2': []
            },
            'models': [],
            'config': model_config
        }

        for split in self.splits:
            logger.info(f"🏃 Training on {split}")

            # Prepare data
            X_train = split.train_data[feature_cols]
            y_train = split.train_data[target_cols]
            X_val = split.val_data[feature_cols]
            y_val = split.val_data[target_cols]

            # Initialize model — a new instance per split so no state leaks
            # across folds.
            model = model_class(model_config)

            # Train model — duck-typed dispatch on the model's API.
            if hasattr(model, 'train'):
                # XGBoost style
                metrics = model.train(X_train, y_train, X_val, y_val)
            else:
                # PyTorch style
                metrics = model.train_model(X_train, y_train, X_val, y_val)

            # Make predictions for validation
            if hasattr(model, 'predict'):
                val_predictions = model.predict(X_val)
            else:
                val_predictions = model(X_val)

            # Calculate additional metrics if needed
            # NOTE(review): this import runs once per split iteration; it is
            # cached by Python but could be hoisted to module level.
            from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

            if isinstance(val_predictions, np.ndarray):
                val_mse = mean_squared_error(y_val.values, val_predictions)
                val_mae = mean_absolute_error(y_val.values, val_predictions)
                val_r2 = r2_score(y_val.values, val_predictions)
            else:
                # Handle torch tensors
                val_predictions_np = val_predictions.detach().cpu().numpy()
                val_mse = mean_squared_error(y_val.values, val_predictions_np)
                val_mae = mean_absolute_error(y_val.values, val_predictions_np)
                val_r2 = r2_score(y_val.values, val_predictions_np)

            # Store results
            # NOTE(review): `**metrics` is merged last, so keys returned by the
            # model's own training (e.g. 'val_mse') overwrite the sklearn
            # values computed above — confirm this precedence is intended.
            split_results = {
                'split_id': split.split_id,
                'train_size': split.train_size,
                'val_size': split.val_size,
                'metrics': {
                    'val_mse': val_mse,
                    'val_mae': val_mae,
                    'val_r2': val_r2,
                    **metrics
                }
            }

            results['splits'].append(split_results)
            results['metrics']['val_mse'].append(val_mse)
            results['metrics']['val_mae'].append(val_mae)
            results['metrics']['val_r2'].append(val_r2)

            # Save model if requested
            if save_models:
                model_path = Path(model_dir) / f"model_split_{split.split_id}.pkl"
                model_path.parent.mkdir(parents=True, exist_ok=True)

                # Prefer the model's own persistence hook; fall back to joblib.
                if hasattr(model, 'save'):
                    model.save(str(model_path))
                else:
                    joblib.dump(model, model_path)

                results['models'].append(str(model_path))
                logger.info(f"💾 Saved model to {model_path}")

            # Log split results
            logger.info(
                f"Split {split.split_id} - "
                f"Val MSE: {val_mse:.6f}, "
                f"Val MAE: {val_mae:.6f}, "
                f"Val R2: {val_r2:.4f}"
            )

        # Calculate average metrics (mean ± std across splits)
        results['avg_metrics'] = {
            'val_mse': np.mean(results['metrics']['val_mse']),
            'val_mse_std': np.std(results['metrics']['val_mse']),
            'val_mae': np.mean(results['metrics']['val_mae']),
            'val_mae_std': np.std(results['metrics']['val_mae']),
            'val_r2': np.mean(results['metrics']['val_r2']),
            'val_r2_std': np.std(results['metrics']['val_r2'])
        }

        logger.info(
            f"📊 Walk-Forward Average - "
            f"MSE: {results['avg_metrics']['val_mse']:.6f} (±{results['avg_metrics']['val_mse_std']:.6f}), "
            f"R2: {results['avg_metrics']['val_r2']:.4f} (±{results['avg_metrics']['val_r2_std']:.4f})"
        )

        self.results = results
        return results
def combine_predictions(
|
|
self,
|
|
models: List[Any],
|
|
X: pd.DataFrame,
|
|
method: str = 'average'
|
|
) -> np.ndarray:
|
|
"""
|
|
Combine predictions from multiple walk-forward models
|
|
|
|
Args:
|
|
models: List of trained models
|
|
X: Features to predict on
|
|
method: Combination method ('average', 'weighted', 'best')
|
|
|
|
Returns:
|
|
Combined predictions
|
|
"""
|
|
predictions = []
|
|
|
|
for model in models:
|
|
if hasattr(model, 'predict'):
|
|
pred = model.predict(X)
|
|
else:
|
|
pred = model(X)
|
|
if hasattr(pred, 'detach'):
|
|
pred = pred.detach().cpu().numpy()
|
|
predictions.append(pred)
|
|
|
|
predictions = np.array(predictions)
|
|
|
|
if method == 'average':
|
|
# Simple average
|
|
combined = np.mean(predictions, axis=0)
|
|
elif method == 'weighted':
|
|
# Weight by validation performance
|
|
weights = 1 / np.array(self.results['metrics']['val_mse'])
|
|
weights = weights / weights.sum()
|
|
combined = np.average(predictions, axis=0, weights=weights)
|
|
elif method == 'best':
|
|
# Use best performing model
|
|
best_idx = np.argmin(self.results['metrics']['val_mse'])
|
|
combined = predictions[best_idx]
|
|
else:
|
|
raise ValueError(f"Unknown combination method: {method}")
|
|
|
|
return combined
|
|
|
|
def save_results(self, path: str):
|
|
"""Save validation results to file"""
|
|
save_path = Path(path)
|
|
save_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(save_path, 'w') as f:
|
|
json.dump(self.results, f, indent=2, default=str)
|
|
|
|
logger.info(f"💾 Saved results to {save_path}")
|
|
|
|
def load_results(self, path: str):
|
|
"""Load validation results from file"""
|
|
with open(path, 'r') as f:
|
|
self.results = json.load(f)
|
|
|
|
logger.info(f"📂 Loaded results from {path}")
|
|
return self.results
|
|
|
|
def plot_results(self, save_path: Optional[str] = None):
|
|
"""
|
|
Plot walk-forward validation results
|
|
|
|
Args:
|
|
save_path: Path to save plot
|
|
"""
|
|
import matplotlib.pyplot as plt
|
|
|
|
if not self.results:
|
|
logger.warning("No results to plot")
|
|
return
|
|
|
|
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
|
|
|
|
# MSE across splits
|
|
splits = [s['split_id'] for s in self.results['splits']]
|
|
mse_values = self.results['metrics']['val_mse']
|
|
|
|
axes[0, 0].bar(splits, mse_values, color='steelblue')
|
|
axes[0, 0].axhline(
|
|
y=self.results['avg_metrics']['val_mse'],
|
|
color='red', linestyle='--', label='Average'
|
|
)
|
|
axes[0, 0].set_xlabel('Split')
|
|
axes[0, 0].set_ylabel('MSE')
|
|
axes[0, 0].set_title('Validation MSE by Split')
|
|
axes[0, 0].legend()
|
|
|
|
# MAE across splits
|
|
mae_values = self.results['metrics']['val_mae']
|
|
|
|
axes[0, 1].bar(splits, mae_values, color='forestgreen')
|
|
axes[0, 1].axhline(
|
|
y=self.results['avg_metrics']['val_mae'],
|
|
color='red', linestyle='--', label='Average'
|
|
)
|
|
axes[0, 1].set_xlabel('Split')
|
|
axes[0, 1].set_ylabel('MAE')
|
|
axes[0, 1].set_title('Validation MAE by Split')
|
|
axes[0, 1].legend()
|
|
|
|
# R2 across splits
|
|
r2_values = self.results['metrics']['val_r2']
|
|
|
|
axes[1, 0].bar(splits, r2_values, color='coral')
|
|
axes[1, 0].axhline(
|
|
y=self.results['avg_metrics']['val_r2'],
|
|
color='red', linestyle='--', label='Average'
|
|
)
|
|
axes[1, 0].set_xlabel('Split')
|
|
axes[1, 0].set_ylabel('R²')
|
|
axes[1, 0].set_title('Validation R² by Split')
|
|
axes[1, 0].legend()
|
|
|
|
# Sample sizes
|
|
train_sizes = [s['train_size'] for s in self.results['splits']]
|
|
val_sizes = [s['val_size'] for s in self.results['splits']]
|
|
|
|
x = np.arange(len(splits))
|
|
width = 0.35
|
|
|
|
axes[1, 1].bar(x - width/2, train_sizes, width, label='Train', color='navy')
|
|
axes[1, 1].bar(x + width/2, val_sizes, width, label='Validation', color='orange')
|
|
axes[1, 1].set_xlabel('Split')
|
|
axes[1, 1].set_ylabel('Sample Size')
|
|
axes[1, 1].set_title('Data Split Sizes')
|
|
axes[1, 1].set_xticks(x)
|
|
axes[1, 1].set_xticklabels(splits)
|
|
axes[1, 1].legend()
|
|
|
|
plt.suptitle('Walk-Forward Validation Results', fontsize=14, fontweight='bold')
|
|
plt.tight_layout()
|
|
|
|
if save_path:
|
|
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
|
logger.info(f"📊 Plot saved to {save_path}")
|
|
|
|
plt.show()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
# Test walk-forward validation
|
|
from datetime import datetime, timedelta
|
|
|
|
# Create sample data
|
|
dates = pd.date_range(start='2020-01-01', periods=50000, freq='5min')
|
|
np.random.seed(42)
|
|
|
|
df = pd.DataFrame({
|
|
'feature1': np.random.randn(50000),
|
|
'feature2': np.random.randn(50000),
|
|
'feature3': np.random.randn(50000),
|
|
'target': np.random.randn(50000)
|
|
}, index=dates)
|
|
|
|
# Initialize validator
|
|
validator = WalkForwardValidator(
|
|
n_splits=5,
|
|
test_size=0.2,
|
|
gap=0,
|
|
expanding_window=False,
|
|
min_train_size=5000
|
|
)
|
|
|
|
# Create splits
|
|
splits = validator.split(df)
|
|
|
|
print(f"Created {len(splits)} splits:")
|
|
for split in splits:
|
|
print(f" {split}")
|
|
|
|
# Test plot (without actual training)
|
|
# validator.plot_results() |