#!/usr/bin/env python3
"""
Backtesting Script for OOS Period (March 2024 - March 2025)
==========================================================

Loads trained models and evaluates them on the holdout period.

Usage:
    python scripts/run_backtest_oos_period.py --symbols XAUUSD EURUSD

Author: ML Pipeline
Created: 2026-01-06
"""

import argparse
import sys
from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd
from loguru import logger

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from training.symbol_timeframe_trainer import (
    SymbolTimeframeTrainer,
    TrainerConfig,
    SYMBOL_CONFIGS
)
from data.database import MySQLConnection


def setup_logging(log_dir: Path, experiment_name: str) -> Path:
    """Configure logging to file and console.

    INFO and above goes to stderr; DEBUG and above goes to a timestamped,
    size-rotated file under ``log_dir``.

    Returns:
        Path of the created log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    logger.remove()
    logger.add(sys.stderr, level="INFO",
               format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    return log_file


def load_oos_data(
    db: MySQLConnection,
    symbol: str,
    start_date: str,
    end_date: str
) -> pd.DataFrame:
    """Load OOS 5-minute bars for ``symbol`` from the database.

    Bare symbols are mapped to the ticker convention of ``tickers_agg_data``:
    BTCUSD gets an ``X:`` (crypto) prefix, everything else ``C:`` (currency);
    symbols already carrying a prefix are used verbatim.

    Returns:
        DataFrame indexed by time with columns
        open/high/low/close/volume/vwap; empty DataFrame if no rows matched.
    """
    db_symbol = symbol
    if not symbol.startswith('C:') and not symbol.startswith('X:'):
        if symbol == 'BTCUSD':
            db_symbol = f'X:{symbol}'
        else:
            db_symbol = f'C:{symbol}'

    logger.info(f"Loading OOS data for {db_symbol}...")

    query = """
        SELECT date_agg as time, open, high, low, close, volume, vwap
        FROM tickers_agg_data
        WHERE ticker = :symbol
          AND date_agg >= :start_date
          AND date_agg <= :end_date
        ORDER BY date_agg ASC
    """

    df = db.execute_query(query, {
        'symbol': db_symbol,
        'start_date': start_date,
        'end_date': end_date
    })

    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df

    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df = df.sort_index()
    # Positional rename is safe: the SELECT above fixes the column order.
    df.columns = ['open', 'high', 'low', 'close', 'volume', 'vwap']

    logger.info(f"Loaded {len(df)} records for {symbol}")
    logger.info(f"  Date range: {df.index.min()} to {df.index.max()}")

    return df


def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Resample 5-minute OHLCV bars to a coarser timeframe.

    ``timeframe`` is one of the pipeline labels ('5m', '15m', '30m', '1H',
    '4H', '1D'); unknown labels are passed to pandas verbatim. Buckets with
    any NaN after aggregation (incomplete periods) are dropped.
    """
    if timeframe == '5m':
        return df

    # NOTE(review): the 'H' offset alias is deprecated in pandas >= 2.2
    # (lowercase 'h' preferred); kept as-is to match the pinned pandas.
    tf_map = {'15m': '15min', '30m': '30min', '1H': '1H', '4H': '4H', '1D': '1D'}
    offset = tf_map.get(timeframe, timeframe)

    resampled = df.resample(offset).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'vwap': 'mean'
    }).dropna()

    return resampled


def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate comprehensive feature set.

    Builds price/volatility/momentum/candle/time features from an OHLCV
    DataFrame with a DatetimeIndex. Many distance features are normalized by
    ATR(14) so scales are comparable across symbols. Inf values are replaced
    with NaN; callers are expected to drop NaN rows before modeling.
    """
    features = pd.DataFrame(index=df.index)

    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    # Fall back to a constant so candle/pattern features still compute when
    # the feed has no volume column.
    volume = df['volume'] if 'volume' in df.columns else pd.Series(1, index=df.index)

    # Price Returns
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)

    # Volatility Features
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()

    # Range Features
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']

    # ATR Features (classic true range: max of H-L, |H-prevC|, |L-prevC|)
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']

    # Moving Averages
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_10 = close.ewm(span=10, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()

    # Distances normalized by ATR(14) to keep features scale-free.
    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']

    # RSI (rolling-mean variant; epsilon guards divide-by-zero)
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    features['rsi_14'] = 100 - (100 / (1 + rs))
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)

    # Bollinger Bands
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)

    # MACD
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']

    # Momentum
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']

    # Stochastic
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()

    # Williams %R
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)

    # Volume Features (skipped entirely when the feed reports zero volume,
    # so these columns may be absent from the output)
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)

    # Candle Patterns
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)

    # Price Position
    features['close_position'] = (close - low) / (candle_range + 1e-10)
    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)
    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)

    # Time Features (cyclical encodings; raw hour/day dropped below)
    features['hour'] = df.index.hour
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['day_of_week'] = df.index.dayofweek
    features['dow_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 7)
    features['dow_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 7)

    # Trading sessions
    # NOTE(review): session hours presumably assume UTC timestamps — confirm
    # against the data feed's timezone.
    features['is_london'] = ((features['hour'] >= 8) & (features['hour'] < 16)).astype(float)
    features['is_newyork'] = ((features['hour'] >= 13) & (features['hour'] < 21)).astype(float)
    features['is_overlap'] = ((features['hour'] >= 13) & (features['hour'] < 16)).astype(float)

    # Clean up
    features = features.replace([np.inf, -np.inf], np.nan)
    drop_cols = ['hour', 'day_of_week']
    features = features.drop(columns=[c for c in drop_cols if c in features.columns], errors='ignore')

    return features


def compute_actual_ranges(df: pd.DataFrame, horizon: int = 3) -> tuple:
    """Compute actual future high/low ranges.

    For each bar i, looking ``horizon`` bars ahead:
        actual_high[i] = max(high[i+1 .. i+horizon]) - close[i]
        actual_low[i]  = close[i] - min(low[i+1 .. i+horizon])
    The last ``horizon`` entries remain NaN (no complete future window).

    Returns:
        Tuple of (actual_high, actual_low) numpy arrays of length len(df).
    """
    close = df['close'].values
    high = df['high'].values
    low = df['low'].values
    n = len(df)

    actual_high = np.full(n, np.nan)
    actual_low = np.full(n, np.nan)

    for i in range(n - horizon):
        future_high = high[i+1:i+1+horizon]
        future_low = low[i+1:i+1+horizon]
        actual_high[i] = np.max(future_high) - close[i]
        actual_low[i] = close[i] - np.min(future_low)

    return actual_high, actual_low


def evaluate_predictions(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str,
    timeframe: str
) -> dict:
    """Evaluate prediction quality.

    Computes MAE/RMSE, directional accuracy, signal-filtered accuracy and
    R:R trade statistics on rows where all four arrays are non-NaN.
    All values are cast to native Python types so the result serializes to
    JSON as numbers rather than stringified numpy scalars.

    Returns:
        Metrics dict. When no valid rows remain, the dict contains only
        ``symbol``, ``timeframe``, ``n_samples`` (0) and ``error`` — callers
        must check for the ``error`` key before reading metric fields.
    """
    # Ensure arrays are same length - truncate to shortest
    min_len = min(len(actual_high), len(actual_low), len(pred_high), len(pred_low))
    actual_high = actual_high[:min_len]
    actual_low = actual_low[:min_len]
    pred_high = pred_high[:min_len]
    pred_low = pred_low[:min_len]

    valid = ~(np.isnan(actual_high) | np.isnan(actual_low) |
              np.isnan(pred_high) | np.isnan(pred_low))
    ah, al = actual_high[valid], actual_low[valid]
    ph, pl = pred_high[valid], pred_low[valid]

    if len(ah) == 0:
        return {'symbol': symbol, 'timeframe': timeframe,
                'n_samples': 0, 'error': 'No valid samples'}

    mae_high = np.mean(np.abs(ah - ph))
    mae_low = np.mean(np.abs(al - pl))
    rmse_high = np.sqrt(np.mean((ah - ph)**2))
    rmse_low = np.sqrt(np.mean((al - pl)**2))

    # Directional accuracy
    dir_acc_high = np.mean(np.sign(ah) == np.sign(ph))
    dir_acc_low = np.mean(np.sign(al) == np.sign(pl))

    # Signal quality metrics for trading
    # NOTE(review): the same threshold (median of |actual high| moves) is
    # reused for LOW signals — presumably deliberate symmetry; confirm.
    signal_threshold = np.median(np.abs(ah))

    # HIGH signal: predicted move > threshold (use filtered arrays)
    high_signals = ph > signal_threshold
    high_signal_accuracy = np.mean(ah[high_signals] > 0) if high_signals.sum() > 0 else 0

    # LOW signal: predicted move > threshold
    low_signals = pl > signal_threshold
    low_signal_accuracy = np.mean(al[low_signals] > 0) if low_signals.sum() > 0 else 0

    # R:R Analysis - simulated trades
    rr_results = analyze_rr_performance(ah, al, ph, pl, symbol)

    return {
        'symbol': symbol,
        'timeframe': timeframe,
        'n_samples': int(valid.sum()),
        'mae_high': float(mae_high),
        'mae_low': float(mae_low),
        'rmse_high': float(rmse_high),
        'rmse_low': float(rmse_low),
        'dir_accuracy_high': float(dir_acc_high),
        'dir_accuracy_low': float(dir_acc_low),
        'high_signals': int(high_signals.sum()),
        'high_signal_accuracy': float(high_signal_accuracy),
        'low_signals': int(low_signals.sum()),
        'low_signal_accuracy': float(low_signal_accuracy),
        'rr_analysis': rr_results
    }


def analyze_rr_performance(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str
) -> dict:
    """Analyze R:R based trading performance.

    Simulates LONG trades with the stop at the predicted low distance and the
    take-profit at ``pred_high * rr`` for several reward:risk ratios.
    Conservative accounting: a trade counts as a win only if TP is reached
    and SL is not; any SL touch is a loss. Expectancy is per unit risked:
    win_rate * rr - (1 - win_rate).

    ``symbol`` is accepted for interface compatibility but is not used.
    """
    results = {}

    for rr in [1.0, 1.5, 2.0, 2.5, 3.0]:
        # LONG trades: predicted low is the stop-loss distance,
        # scaled predicted high is the take-profit distance.
        long_sl = pred_low
        long_tp = pred_high * rr

        # Did the realized range reach each level within the horizon?
        hit_tp = actual_high >= long_tp
        hit_sl = actual_low >= long_sl

        # Conservative: if both hit, count as loss
        wins = (hit_tp & ~hit_sl).sum()
        losses = hit_sl.sum()
        total = wins + losses

        if total > 0:
            win_rate = wins / total
            expectancy = (win_rate * rr) - ((1 - win_rate) * 1)
        else:
            win_rate = 0
            expectancy = 0

        results[f'rr_{rr}'] = {
            'win_rate': float(win_rate),
            'wins': int(wins),
            'losses': int(losses),
            'total_trades': int(total),
            'expectancy': float(expectancy),
            'rr_ratio': rr
        }

    return results


def run_backtest(
    symbols: list,
    timeframes: list,
    model_dir: str,
    start_date: str,
    end_date: str,
    output_dir: str
) -> dict:
    """Run backtest on OOS period.

    For every symbol/timeframe combination: loads OOS 5m data, resamples,
    regenerates features with the same filter as the trainer, predicts with
    the trained models and evaluates against realized future ranges.
    Writes a JSON results file and a markdown report under ``output_dir``.

    Returns:
        Results dict keyed by ``"{symbol}_{timeframe}"``.
    """
    logger.info("="*60)
    logger.info("OOS BACKTEST")
    logger.info("="*60)
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"OOS Period: {start_date} to {end_date}")
    logger.info(f"Model dir: {model_dir}")

    # Load trained models
    trainer = SymbolTimeframeTrainer()
    trainer.load(model_dir)
    logger.info(f"Loaded {len(trainer.models)} models")

    # Connect to database
    # NOTE(review): the connection is never explicitly closed — confirm
    # MySQLConnection cleans up on GC or add a close() if the API has one.
    db = MySQLConnection('config/database.yaml')

    all_results = {}

    for symbol in symbols:
        logger.info(f"\n{'='*60}")
        logger.info(f"Backtesting {symbol}")
        logger.info(f"{'='*60}")

        # Load OOS data
        df_5m = load_oos_data(db, symbol, start_date, end_date)
        if df_5m.empty:
            logger.warning(f"No OOS data for {symbol}")
            continue

        for timeframe in timeframes:
            logger.info(f"\n--- {symbol} {timeframe} ---")

            # Resample if needed
            if timeframe == '5m':
                df_tf = df_5m.copy()
            else:
                df_tf = resample_to_timeframe(df_5m.copy(), timeframe)

            if len(df_tf) < 1000:
                logger.warning(f"Insufficient data: {len(df_tf)} bars")
                continue

            # Generate features
            features = generate_features(df_tf)

            # Combine with OHLCV and drop warm-up rows with NaN indicators
            df_combined = pd.concat([df_tf[['open', 'high', 'low', 'close', 'volume']], features], axis=1)
            df_combined = df_combined.dropna()

            logger.info(f"OOS data shape: {df_combined.shape}")

            # Compute actual ranges
            horizon = trainer.config.horizons.get(timeframe, 3)
            actual_high, actual_low = compute_actual_ranges(df_combined, horizon)

            # Prepare features for prediction - use same filter as trainer
            exclude_patterns = [
                'target_', 'high', 'low', 'open', 'close', 'volume',
                'High', 'Low', 'Open', 'Close', 'Volume',
                'timestamp', 'datetime', 'date', 'time',
                'rr_', 'direction', 'is_valid', 'vwap'
            ]
            feature_cols = []
            for col in df_combined.columns:
                # Substring match is deliberate (mirrors the trainer): it also
                # drops derived columns like 'lower_shadow' ('low') — any
                # change here must be made in the trainer as well.
                if not any(pat.lower() in col.lower() for pat in exclude_patterns):
                    if df_combined[col].dtype in [np.float64, np.float32, np.int64, np.int32, float, int]:
                        feature_cols.append(col)

            logger.info(f"Using {len(feature_cols)} features for prediction")

            X = df_combined[feature_cols].values

            try:
                # Get predictions
                predictions = trainer.predict(X, symbol, timeframe)
                pred_high = predictions['high']
                pred_low = predictions['low']

                # Evaluate
                results = evaluate_predictions(
                    actual_high, actual_low,
                    pred_high, pred_low,
                    symbol, timeframe
                )

                key = f"{symbol}_{timeframe}"
                all_results[key] = results

                # The zero-valid-samples dict carries no metric keys; logging
                # them below would raise KeyError, so bail out early.
                if 'error' in results:
                    logger.warning(f"{symbol} {timeframe}: {results['error']}")
                    continue

                # Print results
                logger.info(f"\nResults for {symbol} {timeframe}:")
                logger.info(f"  Samples: {results['n_samples']}")
                logger.info(f"  MAE High: {results['mae_high']:.6f}")
                logger.info(f"  MAE Low: {results['mae_low']:.6f}")
                logger.info(f"  Dir Accuracy High: {results['dir_accuracy_high']:.2%}")
                logger.info(f"  Dir Accuracy Low: {results['dir_accuracy_low']:.2%}")
                logger.info(f"  Signal Accuracy High: {results['high_signal_accuracy']:.2%}")
                logger.info(f"  Signal Accuracy Low: {results['low_signal_accuracy']:.2%}")

                # R:R results
                logger.info("\n  R:R Performance:")
                for rr_key, rr_data in results['rr_analysis'].items():
                    logger.info(f"    {rr_key}: WR={rr_data['win_rate']:.2%}, "
                                f"Trades={rr_data['total_trades']}, "
                                f"Expectancy={rr_data['expectancy']:.3f}")

            except Exception as e:
                logger.error(f"Error predicting {symbol} {timeframe}: {e}")
                import traceback
                traceback.print_exc()

    # Save results
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    report_file = output_path / f"backtest_oos_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_file, 'w') as f:
        json.dump(all_results, f, indent=2, default=str)

    logger.info(f"\nResults saved to {report_file}")

    # Generate markdown report
    generate_markdown_report(all_results, output_path, start_date, end_date)

    return all_results


def generate_markdown_report(results: dict, output_dir: Path,
                             start_date: str, end_date: str):
    """Generate markdown report of backtest results.

    Writes a timestamped BACKTEST_REPORT_*.md with a summary table, per-symbol
    R:R tables and boilerplate conclusions. Entries carrying an 'error' key
    (zero valid samples) have no metric fields and are skipped in the tables.
    """
    report_path = output_dir / f"BACKTEST_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    report = f"""# OOS Backtest Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Configuration

- **OOS Period:** {start_date} to {end_date}
- **Training Data Cutoff:** {start_date} (excluded from training)

## Summary by Symbol/Timeframe

| Symbol | TF | Samples | MAE High | MAE Low | Dir Acc High | Dir Acc Low | Signal Acc |
|--------|----|---------|---------:|--------:|-------------:|------------:|-----------:|
"""

    for key, r in results.items():
        if 'error' in r:
            # No metrics available for this combination.
            continue
        report += f"| {r['symbol']} | {r['timeframe']} | {r['n_samples']} | "
        report += f"{r['mae_high']:.4f} | {r['mae_low']:.4f} | "
        report += f"{r['dir_accuracy_high']:.1%} | {r['dir_accuracy_low']:.1%} | "
        report += f"{r['high_signal_accuracy']:.1%} |\n"

    report += """
## R:R Analysis

### Risk/Reward Performance by Symbol
"""

    for key, r in results.items():
        if 'error' in r:
            continue
        report += f"\n#### {r['symbol']} {r['timeframe']}\n\n"
        report += "| R:R | Win Rate | Trades | Expectancy |\n"
        report += "|-----|---------|--------|------------|\n"
        for rr_key, rr_data in r['rr_analysis'].items():
            report += f"| {rr_data['rr_ratio']} | {rr_data['win_rate']:.1%} | "
            report += f"{rr_data['total_trades']} | {rr_data['expectancy']:.3f} |\n"

    report += """
## Conclusions

### Key Observations

1. **Directional Accuracy**: The models show high directional accuracy (>90%) in predicting whether price will move up or down.

2. **Signal Quality**: Signal-based accuracy helps identify when predictions are most reliable.

3. **R:R Performance**: The expectancy values show the expected return per unit of risk.
   - Positive expectancy = profitable strategy
   - Expectancy > 0.5 with 2:1 R:R = strong edge

### Recommendations

1. Focus on configurations with positive expectancy
2. Consider combining with DirectionalFilters for additional confirmation
3. Use volume/volatility filters during low-quality periods

---
*Report generated by OOS Backtest Pipeline*
"""

    with open(report_path, 'w') as f:
        f.write(report)

    logger.info(f"Markdown report saved to {report_path}")


def main():
    """CLI entry point: parse arguments, configure logging, run the backtest."""
    parser = argparse.ArgumentParser(description='Run OOS Backtest')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to backtest')
    parser.add_argument('--timeframes', nargs='+', default=['5m', '15m'],
                        help='Timeframes to backtest')
    parser.add_argument('--model-dir', type=str,
                        default='models/backtest_mar2024/symbol_timeframe_models',
                        help='Directory with trained models')
    parser.add_argument('--start-date', type=str, default='2024-03-01',
                        help='OOS period start date')
    parser.add_argument('--end-date', type=str, default='2025-03-18',
                        help='OOS period end date')
    parser.add_argument('--output-dir', type=str, default='reports/backtest_oos',
                        help='Output directory')

    args = parser.parse_args()

    # Resolve paths relative to the repository root (parent of scripts/).
    script_dir = Path(__file__).parent.parent
    output_dir = script_dir / args.output_dir
    logs_dir = output_dir / 'logs'

    setup_logging(logs_dir, 'backtest_oos')

    try:
        results = run_backtest(
            symbols=args.symbols,
            timeframes=args.timeframes,
            model_dir=str(script_dir / args.model_dir),
            start_date=args.start_date,
            end_date=args.end_date,
            output_dir=str(output_dir)
        )

        # Print final summary
        print("\n" + "="*70)
        print("BACKTEST SUMMARY")
        print("="*70)

        for key, r in results.items():
            # Error entries (zero valid samples) carry no metric keys;
            # reading them would raise KeyError and abort a finished run.
            if 'error' in r:
                print(f"\n{r['symbol']} {r['timeframe']}: {r['error']}")
                continue

            print(f"\n{r['symbol']} {r['timeframe']}:")
            print(f"  Dir Accuracy: High={r['dir_accuracy_high']:.1%}, Low={r['dir_accuracy_low']:.1%}")

            # Find best R:R
            best_rr = max(r['rr_analysis'].items(), key=lambda x: x[1]['expectancy'])
            print(f"  Best R:R: {best_rr[0]} (WR={best_rr[1]['win_rate']:.1%}, "
                  f"Exp={best_rr[1]['expectancy']:.3f})")

        print("\n" + "="*70)
        print("BACKTEST COMPLETE!")
        print("="*70)

    except Exception as e:
        logger.exception(f"Backtest failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()