trading-platform-ml-engine-v2/scripts/run_backtest_oos_period.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

666 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Backtesting Script for OOS Period (March 2024 - March 2025)
==========================================================
Loads trained models and evaluates them on the holdout period.
Usage:
python scripts/run_backtest_oos_period.py --symbols XAUUSD EURUSD
Author: ML Pipeline
Created: 2026-01-06
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
import json
import numpy as np
import pandas as pd
from loguru import logger
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from training.symbol_timeframe_trainer import (
SymbolTimeframeTrainer,
TrainerConfig,
SYMBOL_CONFIGS
)
from data.database import MySQLConnection
def setup_logging(log_dir: Path, experiment_name: str):
    """Set up loguru sinks: INFO to the console, DEBUG to a timestamped file.

    Creates *log_dir* if needed and returns the path of the log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = log_dir / f"{experiment_name}_{stamp}.log"
    # Drop loguru's default handler before installing our two sinks.
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    return log_file
def load_oos_data(
    db: MySQLConnection,
    symbol: str,
    start_date: str,
    end_date: str
) -> pd.DataFrame:
    """Fetch 5-minute OHLCV bars for *symbol* in [start_date, end_date].

    Bare symbols are mapped to their database tickers: 'X:' prefix for
    crypto (BTCUSD), 'C:' for everything else; already-prefixed symbols
    pass through unchanged. Returns a DataFrame indexed by timestamp,
    or an empty frame when no rows match.
    """
    # Resolve the database ticker from the bare symbol.
    if symbol.startswith('C:') or symbol.startswith('X:'):
        db_symbol = symbol
    elif symbol == 'BTCUSD':
        db_symbol = f'X:{symbol}'
    else:
        db_symbol = f'C:{symbol}'
    logger.info(f"Loading OOS data for {db_symbol}...")
    query = """
        SELECT
            date_agg as time,
            open, high, low, close, volume, vwap
        FROM tickers_agg_data
        WHERE ticker = :symbol
          AND date_agg >= :start_date
          AND date_agg <= :end_date
        ORDER BY date_agg ASC
    """
    params = {
        'symbol': db_symbol,
        'start_date': start_date,
        'end_date': end_date
    }
    df = db.execute_query(query, params)
    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df
    # Index by timestamp and keep rows chronological.
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time').sort_index()
    # Positional rename: relies on the SELECT column order above.
    df.columns = ['open', 'high', 'low', 'close', 'volume', 'vwap']
    logger.info(f"Loaded {len(df)} records for {symbol}")
    logger.info(f" Date range: {df.index.min()} to {df.index.max()}")
    return df
def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """Aggregate 5-minute OHLCV bars up to *timeframe*.

    The base '5m' frame is returned untouched. Known timeframes are
    translated to pandas offset aliases; anything else is handed to
    pandas as-is. Bars with no data in the window are dropped.
    """
    if timeframe == '5m':
        return df
    known_offsets = {'15m': '15min', '30m': '30min', '1H': '1H', '4H': '4H', '1D': '1D'}
    rule = known_offsets.get(timeframe, timeframe)
    agg_spec = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'vwap': 'mean'
    }
    return df.resample(rule).agg(agg_spec).dropna()
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature matrix from an OHLCV frame.

    Expects columns 'open', 'high', 'low', 'close' (optionally 'volume')
    and a DatetimeIndex (the time-of-day features read df.index.hour /
    dayofweek). Returns a DataFrame aligned on df.index. Leading rows hold
    NaN until the longest rolling window (50 bars, sma_50) warms up;
    infinities from divisions are converted to NaN, but rows are NOT
    dropped here — callers are expected to dropna().
    """
    features = pd.DataFrame(index=df.index)
    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    # Fall back to constant volume 1 when the column is absent so the
    # remaining features still compute (the ratio features become trivial).
    volume = df['volume'] if 'volume' in df.columns else pd.Series(1, index=df.index)
    # Price returns over several lookbacks
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)
    # Rolling std of 1-bar returns at several windows
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()
    # Candle range (high-low) and its rolling behaviour
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']
    # ATR: true range is max of (h-l, |h-prev close|, |l-prev close|)
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']
    # Moving averages; distances below are normalized by atr_14 so they
    # are comparable across symbols with different price scales.
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_10 = close.ewm(span=10, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()
    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']
    # RSI(14) via simple (not Wilder) rolling means; epsilon avoids /0
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    features['rsi_14'] = 100 - (100 / (1 + rs))
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)
    # Bollinger Bands (20, 2): width and position of close within the band
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)
    # MACD (12, 26, 9), normalized by atr_14
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']
    # ATR-normalized momentum
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']
    # Stochastic oscillator %K / %D over 14 bars
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()
    # Williams %R (same 14-bar extremes, in [-100, 0])
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)
    # Volume features — only emitted when real volume exists.
    # NOTE(review): when volume.sum() == 0 these two columns are absent,
    # so the feature set differs between symbols — confirm the trainer's
    # column filter tolerates this.
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)
    # Candle anatomy as fractions of the bar's full range
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)
    # Close's position inside the current bar and recent high/low channels
    features['close_position'] = (close - low) / (candle_range + 1e-10)
    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)
    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)
    # Cyclic time-of-day / day-of-week encodings (requires DatetimeIndex)
    features['hour'] = df.index.hour
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['day_of_week'] = df.index.dayofweek
    features['dow_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 7)
    features['dow_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 7)
    # Trading session flags — assumes timestamps are UTC; TODO confirm
    features['is_london'] = ((features['hour'] >= 8) & (features['hour'] < 16)).astype(float)
    features['is_newyork'] = ((features['hour'] >= 13) & (features['hour'] < 21)).astype(float)
    features['is_overlap'] = ((features['hour'] >= 13) & (features['hour'] < 16)).astype(float)
    # Turn division artifacts into NaN and drop the raw helper columns
    # (the sin/cos encodings carry the time information instead).
    features = features.replace([np.inf, -np.inf], np.nan)
    drop_cols = ['hour', 'day_of_week']
    features = features.drop(columns=[c for c in drop_cols if c in features.columns], errors='ignore')
    return features
def compute_actual_ranges(df: pd.DataFrame, horizon: int = 3) -> tuple:
    """Realized upside/downside over the next *horizon* bars.

    For each bar i:
      actual_high[i] = max(high[i+1 : i+1+horizon]) - close[i]
      actual_low[i]  = close[i] - min(low[i+1 : i+1+horizon])
    The final *horizon* entries stay NaN (not enough future bars).
    """
    close = df['close'].values
    high = df['high'].values
    low = df['low'].values
    n = len(df)
    actual_high = np.full(n, np.nan)
    actual_low = np.full(n, np.nan)
    if n > horizon:
        # Window i over the shifted arrays covers high/low[i+1 : i+1+horizon].
        future_highs = np.lib.stride_tricks.sliding_window_view(high[1:], horizon)
        future_lows = np.lib.stride_tricks.sliding_window_view(low[1:], horizon)
        m = n - horizon
        actual_high[:m] = future_highs.max(axis=1) - close[:m]
        actual_low[:m] = close[:m] - future_lows.min(axis=1)
    return actual_high, actual_low
def evaluate_predictions(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str,
    timeframe: str
) -> dict:
    """Score predicted vs realized range extensions for one symbol/timeframe.

    The four arrays are truncated to their common length, rows with any NaN
    are discarded, and the result bundles error metrics (MAE/RMSE),
    directional accuracy, threshold-based signal accuracy and simulated R:R
    trade statistics. With no valid rows, a dict with an 'error' key and
    n_samples=0 is returned instead.
    """
    # Align everything on the shortest array before masking NaNs.
    n = min(len(actual_high), len(actual_low), len(pred_high), len(pred_low))
    arrays = [a[:n] for a in (actual_high, actual_low, pred_high, pred_low)]
    valid = ~np.isnan(np.vstack(arrays)).any(axis=0)
    ah, al, ph, pl = (a[valid] for a in arrays)
    if ah.size == 0:
        return {'symbol': symbol, 'timeframe': timeframe, 'n_samples': 0,
                'error': 'No valid samples'}
    err_high = ah - ph
    err_low = al - pl
    mae_high = np.mean(np.abs(err_high))
    mae_low = np.mean(np.abs(err_low))
    rmse_high = np.sqrt(np.mean(err_high ** 2))
    rmse_low = np.sqrt(np.mean(err_low ** 2))
    # Directional accuracy: did we at least get the sign right?
    dir_acc_high = np.mean(np.sign(ah) == np.sign(ph))
    dir_acc_low = np.mean(np.sign(al) == np.sign(pl))
    # A "signal" fires when the predicted move clears the median realized
    # upside; its accuracy is the fraction of signals where price moved.
    signal_threshold = np.median(np.abs(ah))
    high_signals = ph > signal_threshold
    low_signals = pl > signal_threshold
    high_signal_accuracy = np.mean(ah[high_signals] > 0) if high_signals.any() else 0
    low_signal_accuracy = np.mean(al[low_signals] > 0) if low_signals.any() else 0
    return {
        'symbol': symbol,
        'timeframe': timeframe,
        'n_samples': valid.sum(),
        'mae_high': mae_high,
        'mae_low': mae_low,
        'rmse_high': rmse_high,
        'rmse_low': rmse_low,
        'dir_accuracy_high': dir_acc_high,
        'dir_accuracy_low': dir_acc_low,
        'high_signals': int(high_signals.sum()),
        'high_signal_accuracy': high_signal_accuracy,
        'low_signals': int(low_signals.sum()),
        'low_signal_accuracy': low_signal_accuracy,
        # Simulated trade outcomes at several risk:reward ratios.
        'rr_analysis': analyze_rr_performance(ah, al, ph, pl, symbol)
    }
def analyze_rr_performance(
    actual_high: np.ndarray,
    actual_low: np.ndarray,
    pred_high: np.ndarray,
    pred_low: np.ndarray,
    symbol: str
) -> dict:
    """Simulate long trades at several risk:reward ratios.

    All inputs are distances from the entry close: take-profit sits
    ``pred_high * rr`` above entry, stop-loss sits ``pred_low`` below it.
    A bar is a win when the realized upside reaches the TP while the
    realized downside never reaches the SL; any bar whose downside reaches
    the SL counts as a loss (conservative: when both levels are hit the
    trade is scored as a loss, since intrabar ordering is unknown). Bars
    hitting neither level are ignored.

    Args:
        actual_high: realized upside per bar (>= 0 distance from entry).
        actual_low: realized downside per bar.
        pred_high: predicted upside per bar.
        pred_low: predicted downside per bar (used as the stop distance).
        symbol: kept for interface compatibility; not used in the math.

    Returns:
        Dict keyed 'rr_<ratio>' with win rate, win/loss/trade counts,
        expectancy in R units, and the ratio itself.
    """
    results = {}
    for rr in [1.0, 1.5, 2.0, 2.5, 3.0]:
        take_profit = pred_high * rr
        stop_loss = pred_low
        hit_tp = actual_high >= take_profit
        hit_sl = actual_low >= stop_loss
        # Conservative scoring: if both TP and SL were reached, assume the
        # stop was hit first and count the trade as a loss.
        wins = (hit_tp & ~hit_sl).sum()
        losses = hit_sl.sum()
        total = wins + losses
        if total > 0:
            win_rate = wins / total
            # Expectancy per 1R risked: reward on wins minus 1R on losses.
            expectancy = (win_rate * rr) - ((1 - win_rate) * 1)
        else:
            win_rate = 0
            expectancy = 0
        results[f'rr_{rr}'] = {
            'win_rate': win_rate,
            'wins': int(wins),
            'losses': int(losses),
            'total_trades': int(total),
            'expectancy': expectancy,
            'rr_ratio': rr
        }
    return results
def run_backtest(
    symbols: list,
    timeframes: list,
    model_dir: str,
    start_date: str,
    end_date: str,
    output_dir: str
) -> dict:
    """Run the full OOS backtest and write JSON + markdown reports.

    Loads trained per-symbol/timeframe models from *model_dir*, pulls OOS
    5-minute bars from the database for each symbol, resamples, rebuilds
    the feature matrix, predicts future high/low ranges, and scores the
    predictions against realized ranges.

    Returns {"<symbol>_<timeframe>": metrics-dict}. Note that entries with
    an 'error' key (no valid samples) can appear in the result: they are
    stored before the metric logging below raises KeyError, which is then
    swallowed by the broad except. Downstream consumers must tolerate them.
    """
    logger.info("="*60)
    logger.info("OOS BACKTEST")
    logger.info("="*60)
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"OOS Period: {start_date} to {end_date}")
    logger.info(f"Model dir: {model_dir}")
    # Load trained models (artifacts produced by the training scripts).
    trainer = SymbolTimeframeTrainer()
    trainer.load(model_dir)
    logger.info(f"Loaded {len(trainer.models)} models")
    # Connect to database
    db = MySQLConnection('config/database.yaml')
    all_results = {}
    for symbol in symbols:
        logger.info(f"\n{'='*60}")
        logger.info(f"Backtesting {symbol}")
        logger.info(f"{'='*60}")
        # The 5-minute series is the base; higher timeframes are resampled.
        df_5m = load_oos_data(db, symbol, start_date, end_date)
        if df_5m.empty:
            logger.warning(f"No OOS data for {symbol}")
            continue
        for timeframe in timeframes:
            logger.info(f"\n--- {symbol} {timeframe} ---")
            # Resample if needed (copies keep each timeframe independent).
            if timeframe == '5m':
                df_tf = df_5m.copy()
            else:
                df_tf = resample_to_timeframe(df_5m.copy(), timeframe)
            if len(df_tf) < 1000:
                logger.warning(f"Insufficient data: {len(df_tf)} bars")
                continue
            # Generate features
            features = generate_features(df_tf)
            # Combine with OHLCV so target computation and feature filtering
            # can work off a single frame; dropna removes warmup rows.
            df_combined = pd.concat([df_tf[['open', 'high', 'low', 'close', 'volume']], features], axis=1)
            df_combined = df_combined.dropna()
            logger.info(f"OOS data shape: {df_combined.shape}")
            # Realized future ranges; horizon comes from the trainer config
            # so evaluation matches the training target (default 3 bars).
            horizon = trainer.config.horizons.get(timeframe, 3)
            actual_high, actual_low = compute_actual_ranges(df_combined, horizon)
            # Prepare features for prediction - use same filter as trainer.
            # NOTE(review): these are case-insensitive *substring* matches,
            # so any feature whose name contains e.g. 'time' or 'close'
            # would also be excluded.
            exclude_patterns = [
                'target_', 'high', 'low', 'open', 'close', 'volume',
                'High', 'Low', 'Open', 'Close', 'Volume',
                'timestamp', 'datetime', 'date', 'time',
                'rr_', 'direction', 'is_valid', 'vwap'
            ]
            feature_cols = []
            for col in df_combined.columns:
                if not any(pat.lower() in col.lower() for pat in exclude_patterns):
                    # Keep only numeric columns; strings/objects can't feed the model.
                    if df_combined[col].dtype in [np.float64, np.float32, np.int64, np.int32, float, int]:
                        feature_cols.append(col)
            logger.info(f"Using {len(feature_cols)} features for prediction")
            X = df_combined[feature_cols].values
            try:
                # Get predictions (dict with 'high'/'low' arrays).
                predictions = trainer.predict(X, symbol, timeframe)
                pred_high = predictions['high']
                pred_low = predictions['low']
                # Evaluate
                results = evaluate_predictions(
                    actual_high, actual_low,
                    pred_high, pred_low,
                    symbol, timeframe
                )
                key = f"{symbol}_{timeframe}"
                all_results[key] = results
                # Print results
                logger.info(f"\nResults for {symbol} {timeframe}:")
                logger.info(f" Samples: {results['n_samples']}")
                logger.info(f" MAE High: {results['mae_high']:.6f}")
                logger.info(f" MAE Low: {results['mae_low']:.6f}")
                logger.info(f" Dir Accuracy High: {results['dir_accuracy_high']:.2%}")
                logger.info(f" Dir Accuracy Low: {results['dir_accuracy_low']:.2%}")
                logger.info(f" Signal Accuracy High: {results['high_signal_accuracy']:.2%}")
                logger.info(f" Signal Accuracy Low: {results['low_signal_accuracy']:.2%}")
                # R:R results
                logger.info("\n R:R Performance:")
                for rr_key, rr_data in results['rr_analysis'].items():
                    logger.info(f" {rr_key}: WR={rr_data['win_rate']:.2%}, "
                                f"Trades={rr_data['total_trades']}, "
                                f"Expectancy={rr_data['expectancy']:.3f}")
            except Exception as e:
                # Broad catch: one failing model must not abort the whole run.
                logger.error(f"Error predicting {symbol} {timeframe}: {e}")
                import traceback
                traceback.print_exc()
    # Save results; default=str handles numpy scalars in the metric dicts.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    report_file = output_path / f"backtest_oos_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_file, 'w') as f:
        json.dump(all_results, f, indent=2, default=str)
    logger.info(f"\nResults saved to {report_file}")
    # Generate markdown report
    generate_markdown_report(all_results, output_path, start_date, end_date)
    return all_results
def generate_markdown_report(results: dict, output_dir: Path, start_date: str, end_date: str):
    """Write a human-readable markdown summary of the backtest results.

    Args:
        results: mapping "<symbol>_<timeframe>" -> metrics dict as produced
            by evaluate_predictions(). Entries carrying an 'error' key (no
            valid samples) lack the metric fields and are skipped instead
            of crashing the report (fixes a KeyError on such entries).
        output_dir: directory to write the timestamped report into.
        start_date / end_date: OOS period bounds, echoed into the header.
    """
    report_path = output_dir / f"BACKTEST_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    report = f"""# OOS Backtest Report
**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Configuration
- **OOS Period:** {start_date} to {end_date}
- **Training Data Cutoff:** {start_date} (excluded from training)
## Summary by Symbol/Timeframe
| Symbol | TF | Samples | MAE High | MAE Low | Dir Acc High | Dir Acc Low | Signal Acc |
|--------|----|---------|---------:|--------:|-------------:|------------:|-----------:|
"""
    for key, r in results.items():
        # Skip combos that produced no valid samples: they have no metrics.
        if 'error' in r:
            continue
        report += f"| {r['symbol']} | {r['timeframe']} | {r['n_samples']} | "
        report += f"{r['mae_high']:.4f} | {r['mae_low']:.4f} | "
        report += f"{r['dir_accuracy_high']:.1%} | {r['dir_accuracy_low']:.1%} | "
        report += f"{r['high_signal_accuracy']:.1%} |\n"
    report += """
## R:R Analysis
### Risk/Reward Performance by Symbol
"""
    for key, r in results.items():
        # Same guard: error entries carry no rr_analysis table.
        if 'error' in r:
            continue
        report += f"\n#### {r['symbol']} {r['timeframe']}\n\n"
        report += "| R:R | Win Rate | Trades | Expectancy |\n"
        report += "|-----|---------|--------|------------|\n"
        for rr_key, rr_data in r['rr_analysis'].items():
            report += f"| {rr_data['rr_ratio']} | {rr_data['win_rate']:.1%} | "
            report += f"{rr_data['total_trades']} | {rr_data['expectancy']:.3f} |\n"
    report += """
## Conclusions
### Key Observations
1. **Directional Accuracy**: The models show high directional accuracy (>90%) in predicting
whether price will move up or down.
2. **Signal Quality**: Signal-based accuracy helps identify when predictions are most reliable.
3. **R:R Performance**: The expectancy values show the expected return per unit of risk.
- Positive expectancy = profitable strategy
- Expectancy > 0.5 with 2:1 R:R = strong edge
### Recommendations
1. Focus on configurations with positive expectancy
2. Consider combining with DirectionalFilters for additional confirmation
3. Use volume/volatility filters during low-quality periods
---
*Report generated by OOS Backtest Pipeline*
"""
    with open(report_path, 'w') as f:
        f.write(report)
    logger.info(f"Markdown report saved to {report_path}")
def main():
    """CLI entry point: parse arguments, run the OOS backtest, print a summary.

    All paths given via CLI flags are resolved relative to the repository
    root (the parent of this script's directory). Exits with status 1 on
    failure.
    """
    parser = argparse.ArgumentParser(description='Run OOS Backtest')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to backtest')
    parser.add_argument('--timeframes', nargs='+', default=['5m', '15m'],
                        help='Timeframes to backtest')
    parser.add_argument('--model-dir', type=str,
                        default='models/backtest_mar2024/symbol_timeframe_models',
                        help='Directory with trained models')
    parser.add_argument('--start-date', type=str, default='2024-03-01',
                        help='OOS period start date')
    parser.add_argument('--end-date', type=str, default='2025-03-18',
                        help='OOS period end date')
    parser.add_argument('--output-dir', type=str, default='reports/backtest_oos',
                        help='Output directory')
    args = parser.parse_args()
    script_dir = Path(__file__).parent.parent
    output_dir = script_dir / args.output_dir
    logs_dir = output_dir / 'logs'
    setup_logging(logs_dir, 'backtest_oos')
    try:
        results = run_backtest(
            symbols=args.symbols,
            timeframes=args.timeframes,
            model_dir=str(script_dir / args.model_dir),
            start_date=args.start_date,
            end_date=args.end_date,
            output_dir=str(output_dir)
        )
        # Print final summary
        print("\n" + "="*70)
        print("BACKTEST SUMMARY")
        print("="*70)
        for key, r in results.items():
            # Entries with an 'error' key (no valid samples) lack the metric
            # fields and rr_analysis; skip them instead of raising KeyError.
            if 'error' in r:
                continue
            print(f"\n{r['symbol']} {r['timeframe']}:")
            print(f" Dir Accuracy: High={r['dir_accuracy_high']:.1%}, Low={r['dir_accuracy_low']:.1%}")
            # Find best R:R by expectancy
            best_rr = max(r['rr_analysis'].items(),
                          key=lambda x: x[1]['expectancy'])
            print(f" Best R:R: {best_rr[0]} (WR={best_rr[1]['win_rate']:.1%}, "
                  f"Exp={best_rr[1]['expectancy']:.3f})")
        print("\n" + "="*70)
        print("BACKTEST COMPLETE!")
        print("="*70)
    except Exception as e:
        logger.exception(f"Backtest failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()