trading-platform-ml-engine-v2/scripts/evaluate_hierarchical.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

857 lines
31 KiB
Python

#!/usr/bin/env python3
"""
Hierarchical Pipeline Backtesting
=================================
Evaluates the 3-level hierarchical ML architecture with R:R 2:1 backtesting.
Key metrics:
- Win Rate with R:R 2:1 (target: >40%)
- Expectancy (target: >0.10)
- Trade filtering effectiveness
- Comparison: filtered vs unfiltered
Usage:
python scripts/evaluate_hierarchical.py --symbols XAUUSD EURUSD
python scripts/evaluate_hierarchical.py --symbols XAUUSD --rr 2.0 --attention-threshold 0.8
Author: ML Pipeline
Version: 1.0.0
Created: 2026-01-07
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
import json
import numpy as np
import pandas as pd
from loguru import logger
import joblib
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
# Import hierarchical pipeline directly to avoid __init__.py issues
import importlib.util
pipeline_path = Path(__file__).parent.parent / 'src' / 'pipelines' / 'hierarchical_pipeline.py'
# Load the module straight from its file path instead of via the package.
spec = importlib.util.spec_from_file_location("hierarchical_pipeline", pipeline_path)
hierarchical_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(hierarchical_module)
# Bind the names this script uses from the dynamically loaded module.
HierarchicalPipeline = hierarchical_module.HierarchicalPipeline
PipelineConfig = hierarchical_module.PipelineConfig
PredictionResult = hierarchical_module.PredictionResult
@dataclass
class TradeResult:
    """Result of a single trade signal produced by run_backtest."""
    timestamp: datetime       # entry bar timestamp (15m bar)
    symbol: str
    direction: str            # 'long' or 'short'
    entry_price: float        # close of the entry bar
    stop_loss: float
    take_profit: float
    risk: float               # price distance from entry to stop
    reward: float             # price distance from entry to target (risk * R:R)
    actual_high: float        # highest high over the outcome window
    actual_low: float         # lowest low over the outcome window
    hit_tp: bool              # take-profit level was reached
    hit_sl: bool              # stop-loss level was reached
    profit_r: float           # Profit in R multiples (+R:R on TP, -1 on SL)
    attention_score: float    # mean of the 5m and 15m attention scores
    confidence_proba: float   # confidence probability reported by the pipeline
    trade_quality: str        # quality label reported by the pipeline
    was_filtered: bool        # Would this trade be filtered by attention?
@dataclass
class BacktestMetrics:
    """Comprehensive backtest metrics for one symbol (built by calculate_metrics)."""
    symbol: str
    timeframe: str                    # hard-coded to '15m' by calculate_metrics
    period: str                       # "YYYY-MM-DD to YYYY-MM-DD" (first/last signal)
    risk_reward: float
    # Trade counts
    total_bars: int                   # NOTE(review): filled with the signal count, not raw bars
    total_trades: int                 # all generated signals
    filtered_trades: int              # signals rejected by the attention/confidence filter
    executed_trades: int              # signals actually traded
    # Win/Loss (executed trades only; profit_r == 0 counts as a loss)
    wins: int
    losses: int
    win_rate: float                   # wins / executed_trades
    # Profitability (in R multiples)
    total_profit_r: float
    avg_profit_r: float
    expectancy: float                 # (win_rate * avg_win) - (loss_rate * avg_loss)
    profit_factor: float              # gross profit / gross loss (inf when no losses)
    # Risk metrics
    max_consecutive_losses: int
    max_drawdown_r: float             # peak-to-trough drop of the cumulative-R curve
    # Attention analysis
    avg_attention_winners: float
    avg_attention_losers: float
    high_attention_win_rate: float    # trades with attention >= 2.0
    medium_attention_win_rate: float  # trades with 0.8 <= attention < 2.0
    low_attention_win_rate: float     # trades with attention < 0.8
    # Comparison: unfiltered (every signal traded)
    unfiltered_total_trades: int
    unfiltered_win_rate: float
    unfiltered_expectancy: float
    improvement_pct: float            # filtered vs unfiltered expectancy, in percent
def setup_logging(log_dir: Path, experiment_name: str) -> Path:
    """Route loguru output to both the console and a timestamped file.

    Args:
        log_dir: Directory for the log file (created if missing).
        experiment_name: Prefix for the log file name.

    Returns:
        Path of the created log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = log_dir / f"{experiment_name}_{stamp}.log"
    # Reset sinks so repeated calls do not duplicate handlers.
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    return log_file
def load_ohlcv_from_mysql(
    symbol: str,
    timeframe: str,
    start_date: str,
    end_date: str
) -> pd.DataFrame:
    """Load OHLCV data from MySQL and resample to the requested timeframe.

    Args:
        symbol: Trading symbol (e.g. 'XAUUSD'); mapped to a DB ticker.
        timeframe: '5m', '15m', '1h' or '4h'; any other value returns the
            base-frequency data unresampled.
        start_date: Inclusive lower bound, 'YYYY-MM-DD' (time suffix allowed).
        end_date: Inclusive upper bound, same format.

    Returns:
        DataFrame indexed by timestamp with open/high/low/close/volume;
        empty DataFrame when no rows match.

    Raises:
        ValueError: If the ticker or date strings contain characters outside
            the expected sets (guards the interpolated SQL below).
    """
    import re
    from data.database import MySQLConnection
    # Map symbol to the ticker naming used in tickers_agg_data
    ticker_map = {
        'XAUUSD': 'C:XAUUSD',
        'EURUSD': 'C:EURUSD',
        'GBPUSD': 'C:GBPUSD',
        'USDJPY': 'C:USDJPY',
        'BTCUSD': 'X:BTCUSD'
    }
    ticker = ticker_map.get(symbol, f'C:{symbol}')
    # SECURITY: the query below is built by string interpolation and symbol/
    # dates come from the CLI, so reject anything that could smuggle SQL in.
    if not re.fullmatch(r'[A-Za-z0-9:._-]+', ticker):
        raise ValueError(f"Invalid ticker derived from symbol: {ticker!r}")
    for name, value in (('start_date', start_date), ('end_date', end_date)):
        if not re.fullmatch(r'[0-9][0-9 :.\-]*', value):
            raise ValueError(f"Invalid {name}: {value!r}")
    logger.info(f"Loading {symbol} {timeframe} data from {start_date} to {end_date}...")
    try:
        db = MySQLConnection()
        # Load raw OHLCV data (base frequency)
        query = f"""
        SELECT date_agg as timestamp, open, high, low, close, volume
        FROM tickers_agg_data
        WHERE ticker = '{ticker}'
        AND date_agg >= '{start_date}'
        AND date_agg <= '{end_date}'
        ORDER BY date_agg ASC
        """
        df = pd.read_sql(query, db.engine)
        if df.empty:
            logger.warning(f"No data found for {symbol}")
            return df
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.set_index('timestamp', inplace=True)
        df.sort_index(inplace=True)
        logger.info(f" Loaded {len(df)} raw bars")
        # Resample base bars up to the requested timeframe
        agg_dict = {
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }
        resample_rules = {'5m': '5min', '15m': '15min', '1h': '1h', '4h': '4h'}
        rule = resample_rules.get(timeframe)
        if rule is not None:
            df = df.resample(rule).agg(agg_dict).dropna()
        logger.info(f" Resampled to {timeframe}: {len(df)} bars")
        return df
    except Exception as e:
        logger.error(f"Failed to load data from MySQL: {e}")
        raise
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate the feature set used at prediction time (must match training).

    Builds returns, volatility, range/ATR, moving-average distances, RSI,
    Bollinger, MACD, momentum, stochastic, Williams %R, volume, candle-shape,
    price-position and session/time features from an OHLCV frame.

    Args:
        df: OHLCV DataFrame indexed by timestamp. 'volume' is optional; when
            absent a constant volume of 1 is assumed.

    Returns:
        DataFrame with the available OHLCV columns plus all feature columns
        (infinities replaced by NaN). Empty input is returned unchanged.
    """
    if len(df) == 0:
        return df
    df = df.copy()
    features = pd.DataFrame(index=df.index)
    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    # Fallback: constant volume when the column is missing
    volume = df.get('volume', pd.Series(1, index=df.index))
    # Returns over several horizons
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)
    # Rolling volatility of 1-bar returns
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()
    # Candle range
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']
    # ATR (simple moving average of true range)
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']
    # Moving averages; distances are ATR-normalized
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()
    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']
    # RSI (simple rolling means; epsilon avoids division by zero)
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    features['rsi_14'] = 100 - (100 / (1 + rs))
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)
    # Bollinger Bands (20, 2)
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)
    # MACD (12, 26, 9), ATR-normalized
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']
    # Momentum, ATR-normalized
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']
    # Stochastic oscillator (14, 3)
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()
    # Williams %R
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)
    # Volume features (neutral constants when there is no volume data)
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)
    else:
        features['volume_ratio'] = 1.0
        features['volume_trend'] = 0.0
    # Candle shape
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)
    # Price position within the bar and recent ranges
    features['close_position'] = (close - low) / (candle_range + 1e-10)
    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)
    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)
    # Time features (only for datetime-like indexes)
    if hasattr(df.index, 'hour'):
        hour = df.index.hour
        day_of_week = df.index.dayofweek
        features['hour_sin'] = np.sin(2 * np.pi * hour / 24)
        features['hour_cos'] = np.cos(2 * np.pi * hour / 24)
        features['dow_sin'] = np.sin(2 * np.pi * day_of_week / 7)
        features['dow_cos'] = np.cos(2 * np.pi * day_of_week / 7)
        # Session flags; hours presumably UTC -- TODO confirm against data source
        features['is_london'] = ((hour >= 8) & (hour < 16)).astype(float)
        features['is_newyork'] = ((hour >= 13) & (hour < 21)).astype(float)
        features['is_overlap'] = ((hour >= 13) & (hour < 16)).astype(float)
    # Clean: divisions by zero-range bars produce infinities
    features = features.replace([np.inf, -np.inf], np.nan)
    # FIX: only select OHLCV columns that actually exist. The original indexed
    # 'volume' unconditionally and raised KeyError when the input had no volume
    # column, even though the fallback above explicitly supports that case.
    base_cols = [c for c in ('open', 'high', 'low', 'close', 'volume') if c in df.columns]
    result = pd.concat([df[base_cols], features], axis=1)
    return result
def run_backtest(
    pipeline: HierarchicalPipeline,
    df_5m: pd.DataFrame,
    df_15m: pd.DataFrame,
    symbol: str,
    risk_reward: float = 2.0,
    attention_threshold: float = 0.8,
    horizon_bars: int = 3,
    step_bars: int = 1
) -> List[TradeResult]:
    """
    Run backtest simulation.
    Args:
        pipeline: Hierarchical pipeline instance
        df_5m: 5-minute OHLCV data
        df_15m: 15-minute OHLCV data
        symbol: Trading symbol
        risk_reward: Risk/reward ratio for TP
        attention_threshold: Minimum attention to take trade
        horizon_bars: Bars to look forward for TP/SL
        step_bars: Step size between predictions
    Returns:
        List of TradeResult
    """
    trades = []
    min_lookback = 100  # Minimum bars for features
    # Ensure data is sorted
    df_5m = df_5m.sort_index()
    df_15m = df_15m.sort_index()
    # Add features
    df_5m_feat = generate_features(df_5m)
    df_15m_feat = generate_features(df_15m)
    # Get common valid range (skip warm-up bars with NaN features)
    valid_start_5m = df_5m_feat.index[min_lookback * 3]
    valid_start_15m = df_15m_feat.index[min_lookback]
    common_start = max(valid_start_5m, valid_start_15m)
    # Filter to common range leaving room for the forward horizon
    df_15m_test = df_15m_feat[df_15m_feat.index >= common_start].iloc[:-horizon_bars]
    logger.info(f"Backtesting {len(df_15m_test)} bars...")
    for i in range(0, len(df_15m_test), step_bars):
        current_time = df_15m_test.index[i]
        # Get historical data up to current time (no future bars leak in)
        df_5m_slice = df_5m_feat[df_5m_feat.index <= current_time].tail(min_lookback * 3)
        df_15m_slice = df_15m_feat[df_15m_feat.index <= current_time].tail(min_lookback)
        if len(df_5m_slice) < min_lookback or len(df_15m_slice) < 50:
            continue
        try:
            # Get prediction
            result = pipeline.predict(df_5m_slice, df_15m_slice, symbol)
            # Entry is at the close of the current 15m bar
            entry_price = float(df_15m_slice['close'].iloc[-1])
            # Determine direction from predicted deltas (10% dominance margin)
            delta_high = result.delta_high_final
            delta_low = result.delta_low_final
            if delta_high > delta_low * 1.1:
                direction = 'long'
            elif delta_low > delta_high * 1.1:
                direction = 'short'
            else:
                # Ambiguous deltas: fall back to recent momentum
                momentum = (df_15m_slice['close'].iloc[-1] / df_15m_slice['close'].iloc[-5]) - 1
                direction = 'long' if momentum > 0 else 'short'
            # SL from the adverse predicted delta; TP at risk * R:R
            if direction == 'long':
                stop_loss = entry_price - delta_low
                risk = entry_price - stop_loss
                take_profit = entry_price + (risk * risk_reward)
            else:
                stop_loss = entry_price + delta_high
                risk = stop_loss - entry_price
                take_profit = entry_price - (risk * risk_reward)
            # FIX (look-ahead bias): the outcome window must start at the bar
            # AFTER the entry bar. The original included the entry bar itself,
            # letting highs/lows that occurred before the entry (at that bar's
            # close) trigger TP/SL.
            entry_idx = df_15m_feat.index.get_loc(current_time)
            future_end_idx = min(entry_idx + 1 + horizon_bars, len(df_15m_feat))
            future_data = df_15m_feat.iloc[entry_idx + 1:future_end_idx]
            if len(future_data) < 1:
                continue
            actual_high = future_data['high'].max()
            actual_low = future_data['low'].min()
            # Determine outcome
            if direction == 'long':
                hit_tp = actual_high >= take_profit
                hit_sl = actual_low <= stop_loss
                if hit_tp and hit_sl:
                    # Both hit within the window -- bar data cannot order them,
                    # so assume the larger excursion happened first (simplified)
                    high_dist = actual_high - entry_price
                    low_dist = entry_price - actual_low
                    hit_tp = high_dist >= low_dist
                    hit_sl = not hit_tp
                if hit_tp:
                    profit_r = risk_reward
                elif hit_sl:
                    profit_r = -1.0
                else:
                    # Neither hit - mark to market at the window's last close
                    actual_pnl = future_data['close'].iloc[-1] - entry_price
                    profit_r = actual_pnl / risk if risk > 0 else 0
            else:
                hit_tp = actual_low <= take_profit
                hit_sl = actual_high >= stop_loss
                if hit_tp and hit_sl:
                    high_dist = actual_high - entry_price
                    low_dist = entry_price - actual_low
                    hit_tp = low_dist >= high_dist
                    hit_sl = not hit_tp
                if hit_tp:
                    profit_r = risk_reward
                elif hit_sl:
                    profit_r = -1.0
                else:
                    actual_pnl = entry_price - future_data['close'].iloc[-1]
                    profit_r = actual_pnl / risk if risk > 0 else 0
            # Attention filter: low combined attention or low confidence
            avg_attention = (result.attention_score_5m + result.attention_score_15m) / 2
            was_filtered = avg_attention < attention_threshold or not result.confidence
            trade = TradeResult(
                timestamp=current_time,
                symbol=symbol,
                direction=direction,
                entry_price=entry_price,
                stop_loss=stop_loss,
                take_profit=take_profit,
                risk=risk,
                reward=risk * risk_reward,
                actual_high=actual_high,
                actual_low=actual_low,
                hit_tp=hit_tp,
                hit_sl=hit_sl,
                profit_r=profit_r,
                attention_score=avg_attention,
                confidence_proba=result.confidence_proba,
                trade_quality=result.trade_quality,
                was_filtered=was_filtered
            )
            trades.append(trade)
        except Exception as e:
            logger.debug(f"Prediction failed at {current_time}: {e}")
            continue
        if (i + 1) % 500 == 0:
            logger.info(f" Processed {i + 1}/{len(df_15m_test)} bars...")
    return trades
def calculate_metrics(
    trades: List[TradeResult],
    symbol: str,
    risk_reward: float,
    attention_threshold: float
) -> Optional[BacktestMetrics]:
    """Calculate comprehensive backtest metrics.

    Args:
        trades: All generated signals (filtered and executed).
        symbol: Trading symbol the trades belong to.
        risk_reward: R:R ratio used in the backtest (recorded in the output).
        attention_threshold: Unused here; filtering was already recorded on
            each TradeResult via `was_filtered`.

    Returns:
        BacktestMetrics, or None when `trades` is empty.
    """
    if not trades:
        return None
    # All trades (signals), before the attention filter
    all_trades = trades
    total_trades = len(all_trades)
    # Filtered trades (executed) = signals that passed the filter
    executed_trades = [t for t in trades if not t.was_filtered]
    filtered_count = total_trades - len(executed_trades)
    # Win/Loss for executed trades (profit_r == 0 counts as a loss)
    wins = [t for t in executed_trades if t.profit_r > 0]
    losses = [t for t in executed_trades if t.profit_r <= 0]
    win_rate = len(wins) / len(executed_trades) if executed_trades else 0
    # Profitability
    total_profit_r = sum(t.profit_r for t in executed_trades)
    avg_profit_r = total_profit_r / len(executed_trades) if executed_trades else 0
    # Expectancy = (WinRate * AvgWin) - (LossRate * AvgLoss)
    avg_win = sum(t.profit_r for t in wins) / len(wins) if wins else 0
    avg_loss = abs(sum(t.profit_r for t in losses) / len(losses)) if losses else 0
    expectancy = (win_rate * avg_win) - ((1 - win_rate) * avg_loss)
    # Profit factor (inf when there are no losing trades)
    gross_profit = sum(t.profit_r for t in wins)
    gross_loss = abs(sum(t.profit_r for t in losses))
    profit_factor = gross_profit / gross_loss if gross_loss > 0 else float('inf')
    # Risk metrics: losing streaks and cumulative-R equity curve
    consecutive_losses = 0
    max_consecutive_losses = 0
    equity_curve = []
    cumulative = 0
    for t in executed_trades:
        cumulative += t.profit_r
        equity_curve.append(cumulative)
        if t.profit_r <= 0:
            consecutive_losses += 1
            max_consecutive_losses = max(max_consecutive_losses, consecutive_losses)
        else:
            consecutive_losses = 0
    # Max drawdown: peak-to-trough drop of the equity curve (in R)
    peak = 0
    max_dd = 0
    for eq in equity_curve:
        if eq > peak:
            peak = eq
        dd = peak - eq
        if dd > max_dd:
            max_dd = dd
    # Attention analysis: do winners carry higher attention scores?
    winners_attention = [t.attention_score for t in wins]
    losers_attention = [t.attention_score for t in losses]
    avg_attention_winners = np.mean(winners_attention) if winners_attention else 0
    avg_attention_losers = np.mean(losers_attention) if losers_attention else 0
    # Win rate bucketed by attention level (fixed thresholds 0.8 / 2.0)
    high_attention = [t for t in executed_trades if t.attention_score >= 2.0]
    medium_attention = [t for t in executed_trades if 0.8 <= t.attention_score < 2.0]
    low_attention = [t for t in executed_trades if t.attention_score < 0.8]
    high_attention_wr = sum(1 for t in high_attention if t.profit_r > 0) / len(high_attention) if high_attention else 0
    medium_attention_wr = sum(1 for t in medium_attention if t.profit_r > 0) / len(medium_attention) if medium_attention else 0
    low_attention_wr = sum(1 for t in low_attention if t.profit_r > 0) / len(low_attention) if low_attention else 0
    # Unfiltered comparison (all trades, as if every signal were executed)
    unfiltered_wins = [t for t in all_trades if t.profit_r > 0]
    unfiltered_win_rate = len(unfiltered_wins) / len(all_trades) if all_trades else 0
    unfiltered_profit = sum(t.profit_r for t in all_trades)
    unfiltered_expectancy = unfiltered_profit / len(all_trades) if all_trades else 0
    # Relative improvement of filtered expectancy over unfiltered
    improvement_pct = ((expectancy - unfiltered_expectancy) / abs(unfiltered_expectancy) * 100) if unfiltered_expectancy != 0 else 0
    # Covered period: first to last signal timestamp
    start_date = min(t.timestamp for t in trades)
    end_date = max(t.timestamp for t in trades)
    period = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    return BacktestMetrics(
        symbol=symbol,
        timeframe='15m',
        period=period,
        risk_reward=risk_reward,
        # NOTE(review): total_bars is filled with the signal count, not raw bars
        total_bars=len(trades),
        total_trades=total_trades,
        filtered_trades=filtered_count,
        executed_trades=len(executed_trades),
        wins=len(wins),
        losses=len(losses),
        win_rate=round(win_rate, 4),
        total_profit_r=round(total_profit_r, 2),
        avg_profit_r=round(avg_profit_r, 4),
        expectancy=round(expectancy, 4),
        profit_factor=round(profit_factor, 2),
        max_consecutive_losses=max_consecutive_losses,
        max_drawdown_r=round(max_dd, 2),
        avg_attention_winners=round(avg_attention_winners, 3),
        avg_attention_losers=round(avg_attention_losers, 3),
        high_attention_win_rate=round(high_attention_wr, 4),
        medium_attention_win_rate=round(medium_attention_wr, 4),
        low_attention_win_rate=round(low_attention_wr, 4),
        unfiltered_total_trades=total_trades,
        unfiltered_win_rate=round(unfiltered_win_rate, 4),
        unfiltered_expectancy=round(unfiltered_expectancy, 4),
        improvement_pct=round(improvement_pct, 1)
    )
def print_metrics(metrics: BacktestMetrics, target_wr: float = 0.40, target_exp: float = 0.10):
    """Print a formatted backtest summary with PASS/FAIL target checks.

    Args:
        metrics: Aggregated backtest metrics to display.
        target_wr: Win-rate target used for the PASS/FAIL flag.
        target_exp: Expectancy target used for the PASS/FAIL flag.
    """
    rule = '=' * 60
    wr_status = "PASS" if metrics.win_rate >= target_wr else "FAIL"
    exp_status = "PASS" if metrics.expectancy >= target_exp else "FAIL"
    filtered_pct = metrics.filtered_trades / metrics.total_trades * 100
    lines = [
        f"\n{rule}",
        f"BACKTEST RESULTS: {metrics.symbol}",
        f"{rule}",
        f"Period: {metrics.period}",
        f"Timeframe: {metrics.timeframe}",
        f"Risk:Reward: 1:{metrics.risk_reward}",
        "\n--- Trade Statistics ---",
        f"Total Signals: {metrics.total_trades}",
        f"Filtered Out: {metrics.filtered_trades} ({filtered_pct:.1f}%)",
        f"Executed Trades: {metrics.executed_trades}",
        f"Wins: {metrics.wins}",
        f"Losses: {metrics.losses}",
        "\n--- Key Metrics ---",
        f"Win Rate: {metrics.win_rate * 100:.1f}% (target: {target_wr * 100}%) [{wr_status}]",
        f"Expectancy: {metrics.expectancy:.4f} (target: {target_exp}) [{exp_status}]",
        f"Profit Factor: {metrics.profit_factor:.2f}",
        f"Total Profit (R): {metrics.total_profit_r:.2f}",
        f"Avg Profit/Trade (R): {metrics.avg_profit_r:.4f}",
        "\n--- Risk Metrics ---",
        f"Max Consecutive Losses: {metrics.max_consecutive_losses}",
        f"Max Drawdown (R): {metrics.max_drawdown_r:.2f}",
        "\n--- Attention Analysis ---",
        f"Avg Attention (Winners): {metrics.avg_attention_winners:.3f}",
        f"Avg Attention (Losers): {metrics.avg_attention_losers:.3f}",
        f"High Attention (>=2.0) Win Rate: {metrics.high_attention_win_rate * 100:.1f}%",
        f"Medium Attention (0.8-2.0) Win Rate: {metrics.medium_attention_win_rate * 100:.1f}%",
        f"Low Attention (<0.8) Win Rate: {metrics.low_attention_win_rate * 100:.1f}%",
        "\n--- Comparison: Filtered vs Unfiltered ---",
        f"Unfiltered Win Rate: {metrics.unfiltered_win_rate * 100:.1f}%",
        f"Unfiltered Expectancy: {metrics.unfiltered_expectancy:.4f}",
        f"Improvement: {metrics.improvement_pct:+.1f}%",
        f"\n{rule}",
    ]
    # A single joined print emits exactly what line-by-line prints would.
    print('\n'.join(lines))
def generate_report(all_metrics: List[BacktestMetrics], output_path: Path):
    """Write a markdown report summarizing all backtested symbols.

    Args:
        all_metrics: One BacktestMetrics entry per symbol.
        output_path: Destination .md file (parent dirs created as needed).
    """
    md: List[str] = [
        "# Hierarchical Pipeline Backtest Report",
        f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## Summary\n",
        "| Symbol | Period | Win Rate | Expectancy | Profit (R) | Improvement |",
        "|--------|--------|----------|------------|------------|-------------|",
    ]
    # One summary row per symbol, flagged against the fixed targets
    for m in all_metrics:
        wr_status = "PASS" if m.win_rate >= 0.40 else "FAIL"
        exp_status = "PASS" if m.expectancy >= 0.10 else "FAIL"
        md.append(
            f"| {m.symbol} | {m.period} | {m.win_rate * 100:.1f}% ({wr_status}) | "
            f"{m.expectancy:.4f} ({exp_status}) | {m.total_profit_r:.1f} | {m.improvement_pct:+.1f}% |"
        )
    # Detailed per-symbol sections
    for m in all_metrics:
        md.extend([
            f"\n## {m.symbol} Details\n",
            f"- **Total Signals:** {m.total_trades}",
            f"- **Filtered Out:** {m.filtered_trades} ({m.filtered_trades / m.total_trades * 100:.1f}%)",
            f"- **Executed Trades:** {m.executed_trades}",
            f"- **Win Rate:** {m.win_rate * 100:.1f}%",
            f"- **Expectancy:** {m.expectancy:.4f}",
            f"- **Profit Factor:** {m.profit_factor:.2f}",
            "\n### Attention Analysis\n",
            "| Attention Level | Win Rate |",
            "|-----------------|----------|",
            f"| High (>=2.0) | {m.high_attention_win_rate * 100:.1f}% |",
            f"| Medium (0.8-2.0) | {m.medium_attention_win_rate * 100:.1f}% |",
            f"| Low (<0.8) | {m.low_attention_win_rate * 100:.1f}% |",
        ])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text('\n'.join(md))
    logger.info(f"Report saved to: {output_path}")
def main():
    """CLI entry point: parse args, backtest each symbol, write reports."""
    parser = argparse.ArgumentParser(description='Hierarchical Pipeline Backtest')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to backtest')
    parser.add_argument('--start-date', type=str, default='2024-06-01',
                        help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end-date', type=str, default='2025-12-31',
                        help='End date (YYYY-MM-DD)')
    parser.add_argument('--rr', type=float, default=2.0,
                        help='Risk:Reward ratio')
    parser.add_argument('--attention-threshold', type=float, default=0.8,
                        help='Minimum attention score to trade')
    parser.add_argument('--horizon', type=int, default=3,
                        help='Bars to look forward for TP/SL')
    parser.add_argument('--step', type=int, default=1,
                        help='Step size between predictions')
    parser.add_argument('--models-dir', type=str, default='models',
                        help='Directory containing trained models')
    parser.add_argument('--output-dir', type=str, default='models/backtest_results',
                        help='Output directory for reports')
    args = parser.parse_args()
    # Setup output directory and logging sinks
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Returned path kept for reference; not otherwise used here
    log_file = setup_logging(output_dir / 'logs', 'hierarchical_backtest')
    logger.info("=" * 60)
    logger.info("HIERARCHICAL PIPELINE BACKTEST")
    logger.info("=" * 60)
    logger.info(f"Symbols: {args.symbols}")
    logger.info(f"Period: {args.start_date} to {args.end_date}")
    logger.info(f"R:R: 1:{args.rr}")
    logger.info(f"Attention Threshold: {args.attention_threshold}")
    # Initialize pipeline with model paths under --models-dir
    config = PipelineConfig(
        attention_model_path=f'{args.models_dir}/attention',
        base_model_path=f'{args.models_dir}/symbol_timeframe_models',
        metamodel_path=f'{args.models_dir}/metamodels',
        attention_threshold_low=args.attention_threshold,
        attention_threshold_high=2.0,
        confidence_threshold=0.5
    )
    pipeline = HierarchicalPipeline(config)
    all_metrics = []
    for symbol in args.symbols:
        logger.info(f"\n{'=' * 40}")
        logger.info(f"Processing {symbol}...")
        logger.info(f"{'=' * 40}")
        # Load models; skip the symbol if any level is missing
        if not pipeline.load_models(symbol):
            logger.warning(f"Could not load all models for {symbol}, skipping...")
            continue
        # Load data for both timeframes used by the pipeline
        try:
            df_5m = load_ohlcv_from_mysql(symbol, '5m', args.start_date, args.end_date)
            df_15m = load_ohlcv_from_mysql(symbol, '15m', args.start_date, args.end_date)
            if df_5m.empty or df_15m.empty:
                logger.warning(f"No data for {symbol}, skipping...")
                continue
        except Exception as e:
            logger.error(f"Data loading failed for {symbol}: {e}")
            continue
        # Run backtest
        trades = run_backtest(
            pipeline=pipeline,
            df_5m=df_5m,
            df_15m=df_15m,
            symbol=symbol,
            risk_reward=args.rr,
            attention_threshold=args.attention_threshold,
            horizon_bars=args.horizon,
            step_bars=args.step
        )
        if not trades:
            logger.warning(f"No trades generated for {symbol}")
            continue
        # Calculate metrics (None when no trades; guarded above and below)
        metrics = calculate_metrics(
            trades=trades,
            symbol=symbol,
            risk_reward=args.rr,
            attention_threshold=args.attention_threshold
        )
        if metrics:
            all_metrics.append(metrics)
            print_metrics(metrics)
        # Save the per-symbol trade log as CSV
        trades_df = pd.DataFrame([asdict(t) for t in trades])
        trades_file = output_dir / f'{symbol}_trades_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        trades_df.to_csv(trades_file, index=False)
        logger.info(f"Trades saved to: {trades_file}")
    # Generate final report (markdown + JSON) across all symbols
    if all_metrics:
        report_file = output_dir / f'backtest_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.md'
        generate_report(all_metrics, report_file)
        # Save metrics as JSON (default=str handles inf/datetime fields)
        metrics_json = output_dir / f'backtest_metrics_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(metrics_json, 'w') as f:
            json.dump([asdict(m) for m in all_metrics], f, indent=2, default=str)
        logger.info(f"Metrics saved to: {metrics_json}")
    logger.info("\n" + "=" * 60)
    logger.info("BACKTEST COMPLETE")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()