trading-platform-ml-engine-v2/scripts/evaluate_hierarchical.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00

857 lines
31 KiB
Python

#!/usr/bin/env python3
"""
Hierarchical Pipeline Backtesting
=================================
Evaluates the 3-level hierarchical ML architecture with R:R 2:1 backtesting.
Key metrics:
- Win Rate with R:R 2:1 (target: >40%)
- Expectancy (target: >0.10)
- Trade filtering effectiveness
- Comparison: filtered vs unfiltered
Usage:
python scripts/evaluate_hierarchical.py --symbols XAUUSD EURUSD
python scripts/evaluate_hierarchical.py --symbols XAUUSD --rr 2.0 --attention-threshold 0.8
Author: ML Pipeline
Version: 1.0.0
Created: 2026-01-07
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
import json
import numpy as np
import pandas as pd
from loguru import logger
import joblib
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
# Import hierarchical pipeline directly to avoid __init__.py issues
import importlib.util
pipeline_path = Path(__file__).parent.parent / 'src' / 'pipelines' / 'hierarchical_pipeline.py'
# Load the module straight from its file path instead of via the package.
spec = importlib.util.spec_from_file_location("hierarchical_pipeline", pipeline_path)
hierarchical_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(hierarchical_module)
# Bind the names this script uses from the dynamically loaded module.
HierarchicalPipeline = hierarchical_module.HierarchicalPipeline
PipelineConfig = hierarchical_module.PipelineConfig
PredictionResult = hierarchical_module.PredictionResult
@dataclass
class TradeResult:
    """Result of a single trade signal produced by run_backtest."""
    timestamp: datetime       # entry bar timestamp (15m bar)
    symbol: str
    direction: str            # 'long' or 'short'
    entry_price: float        # close of the entry bar
    stop_loss: float
    take_profit: float
    risk: float               # price distance from entry to stop
    reward: float             # price distance from entry to target (risk * R:R)
    actual_high: float        # highest high over the outcome window
    actual_low: float         # lowest low over the outcome window
    hit_tp: bool              # take-profit level was reached
    hit_sl: bool              # stop-loss level was reached
    profit_r: float           # Profit in R multiples (+R:R on TP, -1 on SL)
    attention_score: float    # mean of the 5m and 15m attention scores
    confidence_proba: float   # confidence probability reported by the pipeline
    trade_quality: str        # quality label reported by the pipeline
    was_filtered: bool        # Would this trade be filtered by attention?
@dataclass
class BacktestMetrics:
    """Comprehensive backtest metrics for one symbol (built by calculate_metrics)."""
    symbol: str
    timeframe: str                    # hard-coded to '15m' by calculate_metrics
    period: str                       # "YYYY-MM-DD to YYYY-MM-DD" (first/last signal)
    risk_reward: float
    # Trade counts
    total_bars: int                   # NOTE(review): filled with the signal count, not raw bars
    total_trades: int                 # all generated signals
    filtered_trades: int              # signals rejected by the attention/confidence filter
    executed_trades: int              # signals actually traded
    # Win/Loss (executed trades only; profit_r == 0 counts as a loss)
    wins: int
    losses: int
    win_rate: float                   # wins / executed_trades
    # Profitability (in R multiples)
    total_profit_r: float
    avg_profit_r: float
    expectancy: float                 # (win_rate * avg_win) - (loss_rate * avg_loss)
    profit_factor: float              # gross profit / gross loss (inf when no losses)
    # Risk metrics
    max_consecutive_losses: int
    max_drawdown_r: float             # peak-to-trough drop of the cumulative-R curve
    # Attention analysis
    avg_attention_winners: float
    avg_attention_losers: float
    high_attention_win_rate: float    # trades with attention >= 2.0
    medium_attention_win_rate: float  # trades with 0.8 <= attention < 2.0
    low_attention_win_rate: float     # trades with attention < 0.8
    # Comparison: unfiltered (every signal traded)
    unfiltered_total_trades: int
    unfiltered_win_rate: float
    unfiltered_expectancy: float
    improvement_pct: float            # filtered vs unfiltered expectancy, in percent
def setup_logging(log_dir: Path, experiment_name: str) -> Path:
    """Route loguru output to both the console and a timestamped file.

    Args:
        log_dir: Directory for the log file (created if missing).
        experiment_name: Prefix for the log file name.

    Returns:
        Path of the created log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = log_dir / f"{experiment_name}_{stamp}.log"
    # Reset sinks so repeated calls do not duplicate handlers.
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")
    return log_file
def load_ohlcv_from_mysql(
    symbol: str,
    timeframe: str,
    start_date: str,
    end_date: str
) -> pd.DataFrame:
    """Load OHLCV data from MySQL and resample to the requested timeframe.

    Args:
        symbol: Trading symbol (e.g. 'XAUUSD'); mapped to a DB ticker.
        timeframe: '5m', '15m', '1h' or '4h'; any other value returns the
            base-frequency data unresampled.
        start_date: Inclusive lower bound, 'YYYY-MM-DD' (time suffix allowed).
        end_date: Inclusive upper bound, same format.

    Returns:
        DataFrame indexed by timestamp with open/high/low/close/volume;
        empty DataFrame when no rows match.

    Raises:
        ValueError: If the ticker or date strings contain characters outside
            the expected sets (guards the interpolated SQL below).
    """
    import re
    from data.database import MySQLConnection
    # Map symbol to the ticker naming used in tickers_agg_data
    ticker_map = {
        'XAUUSD': 'C:XAUUSD',
        'EURUSD': 'C:EURUSD',
        'GBPUSD': 'C:GBPUSD',
        'USDJPY': 'C:USDJPY',
        'BTCUSD': 'X:BTCUSD'
    }
    ticker = ticker_map.get(symbol, f'C:{symbol}')
    # SECURITY: the query below is built by string interpolation and symbol/
    # dates come from the CLI, so reject anything that could smuggle SQL in.
    if not re.fullmatch(r'[A-Za-z0-9:._-]+', ticker):
        raise ValueError(f"Invalid ticker derived from symbol: {ticker!r}")
    for name, value in (('start_date', start_date), ('end_date', end_date)):
        if not re.fullmatch(r'[0-9][0-9 :.\-]*', value):
            raise ValueError(f"Invalid {name}: {value!r}")
    logger.info(f"Loading {symbol} {timeframe} data from {start_date} to {end_date}...")
    try:
        db = MySQLConnection()
        # Load raw OHLCV data (base frequency)
        query = f"""
        SELECT date_agg as timestamp, open, high, low, close, volume
        FROM tickers_agg_data
        WHERE ticker = '{ticker}'
        AND date_agg >= '{start_date}'
        AND date_agg <= '{end_date}'
        ORDER BY date_agg ASC
        """
        df = pd.read_sql(query, db.engine)
        if df.empty:
            logger.warning(f"No data found for {symbol}")
            return df
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.set_index('timestamp', inplace=True)
        df.sort_index(inplace=True)
        logger.info(f" Loaded {len(df)} raw bars")
        # Resample base bars up to the requested timeframe
        agg_dict = {
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }
        resample_rules = {'5m': '5min', '15m': '15min', '1h': '1h', '4h': '4h'}
        rule = resample_rules.get(timeframe)
        if rule is not None:
            df = df.resample(rule).agg(agg_dict).dropna()
        logger.info(f" Resampled to {timeframe}: {len(df)} bars")
        return df
    except Exception as e:
        logger.error(f"Failed to load data from MySQL: {e}")
        raise
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate the feature set used at prediction time (must match training).

    Builds returns, volatility, range/ATR, moving-average distances, RSI,
    Bollinger, MACD, momentum, stochastic, Williams %R, volume, candle-shape,
    price-position and session/time features from an OHLCV frame.

    Args:
        df: OHLCV DataFrame indexed by timestamp. 'volume' is optional; when
            absent a constant volume of 1 is assumed.

    Returns:
        DataFrame with the available OHLCV columns plus all feature columns
        (infinities replaced by NaN). Empty input is returned unchanged.
    """
    if len(df) == 0:
        return df
    df = df.copy()
    features = pd.DataFrame(index=df.index)
    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    # Fallback: constant volume when the column is missing
    volume = df.get('volume', pd.Series(1, index=df.index))
    # Returns over several horizons
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)
    # Rolling volatility of 1-bar returns
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()
    # Candle range
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']
    # ATR (simple moving average of true range)
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']
    # Moving averages; distances are ATR-normalized
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()
    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']
    # RSI (simple rolling means; epsilon avoids division by zero)
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    features['rsi_14'] = 100 - (100 / (1 + rs))
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)
    # Bollinger Bands (20, 2)
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)
    # MACD (12, 26, 9), ATR-normalized
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']
    # Momentum, ATR-normalized
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']
    # Stochastic oscillator (14, 3)
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()
    # Williams %R
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)
    # Volume features (neutral constants when there is no volume data)
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)
    else:
        features['volume_ratio'] = 1.0
        features['volume_trend'] = 0.0
    # Candle shape
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)
    # Price position within the bar and recent ranges
    features['close_position'] = (close - low) / (candle_range + 1e-10)
    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)
    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)
    # Time features (only for datetime-like indexes)
    if hasattr(df.index, 'hour'):
        hour = df.index.hour
        day_of_week = df.index.dayofweek
        features['hour_sin'] = np.sin(2 * np.pi * hour / 24)
        features['hour_cos'] = np.cos(2 * np.pi * hour / 24)
        features['dow_sin'] = np.sin(2 * np.pi * day_of_week / 7)
        features['dow_cos'] = np.cos(2 * np.pi * day_of_week / 7)
        # Session flags; hours presumably UTC -- TODO confirm against data source
        features['is_london'] = ((hour >= 8) & (hour < 16)).astype(float)
        features['is_newyork'] = ((hour >= 13) & (hour < 21)).astype(float)
        features['is_overlap'] = ((hour >= 13) & (hour < 16)).astype(float)
    # Clean: divisions by zero-range bars produce infinities
    features = features.replace([np.inf, -np.inf], np.nan)
    # FIX: only select OHLCV columns that actually exist. The original indexed
    # 'volume' unconditionally and raised KeyError when the input had no volume
    # column, even though the fallback above explicitly supports that case.
    base_cols = [c for c in ('open', 'high', 'low', 'close', 'volume') if c in df.columns]
    result = pd.concat([df[base_cols], features], axis=1)
    return result
def run_backtest(
    pipeline: HierarchicalPipeline,
    df_5m: pd.DataFrame,
    df_15m: pd.DataFrame,
    symbol: str,
    risk_reward: float = 2.0,
    attention_threshold: float = 0.8,
    horizon_bars: int = 3,
    step_bars: int = 1
) -> List[TradeResult]:
    """
    Run backtest simulation.
    Args:
        pipeline: Hierarchical pipeline instance
        df_5m: 5-minute OHLCV data
        df_15m: 15-minute OHLCV data
        symbol: Trading symbol
        risk_reward: Risk/reward ratio for TP
        attention_threshold: Minimum attention to take trade
        horizon_bars: Bars to look forward for TP/SL
        step_bars: Step size between predictions
    Returns:
        List of TradeResult
    """
    trades = []
    min_lookback = 100  # Minimum bars for features
    # Ensure data is sorted
    df_5m = df_5m.sort_index()
    df_15m = df_15m.sort_index()
    # Add features
    df_5m_feat = generate_features(df_5m)
    df_15m_feat = generate_features(df_15m)
    # Get common valid range (skip warm-up bars with NaN features)
    valid_start_5m = df_5m_feat.index[min_lookback * 3]
    valid_start_15m = df_15m_feat.index[min_lookback]
    common_start = max(valid_start_5m, valid_start_15m)
    # Filter to common range leaving room for the forward horizon
    df_15m_test = df_15m_feat[df_15m_feat.index >= common_start].iloc[:-horizon_bars]
    logger.info(f"Backtesting {len(df_15m_test)} bars...")
    for i in range(0, len(df_15m_test), step_bars):
        current_time = df_15m_test.index[i]
        # Get historical data up to current time (no future bars leak in)
        df_5m_slice = df_5m_feat[df_5m_feat.index <= current_time].tail(min_lookback * 3)
        df_15m_slice = df_15m_feat[df_15m_feat.index <= current_time].tail(min_lookback)
        if len(df_5m_slice) < min_lookback or len(df_15m_slice) < 50:
            continue
        try:
            # Get prediction
            result = pipeline.predict(df_5m_slice, df_15m_slice, symbol)
            # Entry is at the close of the current 15m bar
            entry_price = float(df_15m_slice['close'].iloc[-1])
            # Determine direction from predicted deltas (10% dominance margin)
            delta_high = result.delta_high_final
            delta_low = result.delta_low_final
            if delta_high > delta_low * 1.1:
                direction = 'long'
            elif delta_low > delta_high * 1.1:
                direction = 'short'
            else:
                # Ambiguous deltas: fall back to recent momentum
                momentum = (df_15m_slice['close'].iloc[-1] / df_15m_slice['close'].iloc[-5]) - 1
                direction = 'long' if momentum > 0 else 'short'
            # SL from the adverse predicted delta; TP at risk * R:R
            if direction == 'long':
                stop_loss = entry_price - delta_low
                risk = entry_price - stop_loss
                take_profit = entry_price + (risk * risk_reward)
            else:
                stop_loss = entry_price + delta_high
                risk = stop_loss - entry_price
                take_profit = entry_price - (risk * risk_reward)
            # FIX (look-ahead bias): the outcome window must start at the bar
            # AFTER the entry bar. The original included the entry bar itself,
            # letting highs/lows that occurred before the entry (at that bar's
            # close) trigger TP/SL.
            entry_idx = df_15m_feat.index.get_loc(current_time)
            future_end_idx = min(entry_idx + 1 + horizon_bars, len(df_15m_feat))
            future_data = df_15m_feat.iloc[entry_idx + 1:future_end_idx]
            if len(future_data) < 1:
                continue
            actual_high = future_data['high'].max()
            actual_low = future_data['low'].min()
            # Determine outcome
            if direction == 'long':
                hit_tp = actual_high >= take_profit
                hit_sl = actual_low <= stop_loss
                if hit_tp and hit_sl:
                    # Both hit within the window -- bar data cannot order them,
                    # so assume the larger excursion happened first (simplified)
                    high_dist = actual_high - entry_price
                    low_dist = entry_price - actual_low
                    hit_tp = high_dist >= low_dist
                    hit_sl = not hit_tp
                if hit_tp:
                    profit_r = risk_reward
                elif hit_sl:
                    profit_r = -1.0
                else:
                    # Neither hit - mark to market at the window's last close
                    actual_pnl = future_data['close'].iloc[-1] - entry_price
                    profit_r = actual_pnl / risk if risk > 0 else 0
            else:
                hit_tp = actual_low <= take_profit
                hit_sl = actual_high >= stop_loss
                if hit_tp and hit_sl:
                    high_dist = actual_high - entry_price
                    low_dist = entry_price - actual_low
                    hit_tp = low_dist >= high_dist
                    hit_sl = not hit_tp
                if hit_tp:
                    profit_r = risk_reward
                elif hit_sl:
                    profit_r = -1.0
                else:
                    actual_pnl = entry_price - future_data['close'].iloc[-1]
                    profit_r = actual_pnl / risk if risk > 0 else 0
            # Attention filter: low combined attention or low confidence
            avg_attention = (result.attention_score_5m + result.attention_score_15m) / 2
            was_filtered = avg_attention < attention_threshold or not result.confidence
            trade = TradeResult(
                timestamp=current_time,
                symbol=symbol,
                direction=direction,
                entry_price=entry_price,
                stop_loss=stop_loss,
                take_profit=take_profit,
                risk=risk,
                reward=risk * risk_reward,
                actual_high=actual_high,
                actual_low=actual_low,
                hit_tp=hit_tp,
                hit_sl=hit_sl,
                profit_r=profit_r,
                attention_score=avg_attention,
                confidence_proba=result.confidence_proba,
                trade_quality=result.trade_quality,
                was_filtered=was_filtered
            )
            trades.append(trade)
        except Exception as e:
            logger.debug(f"Prediction failed at {current_time}: {e}")
            continue
        if (i + 1) % 500 == 0:
            logger.info(f" Processed {i + 1}/{len(df_15m_test)} bars...")
    return trades
def calculate_metrics(
    trades: List[TradeResult],
    symbol: str,
    risk_reward: float,
    attention_threshold: float
) -> Optional[BacktestMetrics]:
    """Calculate comprehensive backtest metrics.

    Args:
        trades: All generated signals (filtered and executed).
        symbol: Trading symbol the trades belong to.
        risk_reward: R:R ratio used in the backtest (recorded in the output).
        attention_threshold: Unused here; filtering was already recorded on
            each TradeResult via `was_filtered`.

    Returns:
        BacktestMetrics, or None when `trades` is empty.
    """
    if not trades:
        return None
    # All trades (signals), before the attention filter
    all_trades = trades
    total_trades = len(all_trades)
    # Filtered trades (executed) = signals that passed the filter
    executed_trades = [t for t in trades if not t.was_filtered]
    filtered_count = total_trades - len(executed_trades)
    # Win/Loss for executed trades (profit_r == 0 counts as a loss)
    wins = [t for t in executed_trades if t.profit_r > 0]
    losses = [t for t in executed_trades if t.profit_r <= 0]
    win_rate = len(wins) / len(executed_trades) if executed_trades else 0
    # Profitability
    total_profit_r = sum(t.profit_r for t in executed_trades)
    avg_profit_r = total_profit_r / len(executed_trades) if executed_trades else 0
    # Expectancy = (WinRate * AvgWin) - (LossRate * AvgLoss)
    avg_win = sum(t.profit_r for t in wins) / len(wins) if wins else 0
    avg_loss = abs(sum(t.profit_r for t in losses) / len(losses)) if losses else 0
    expectancy = (win_rate * avg_win) - ((1 - win_rate) * avg_loss)
    # Profit factor (inf when there are no losing trades)
    gross_profit = sum(t.profit_r for t in wins)
    gross_loss = abs(sum(t.profit_r for t in losses))
    profit_factor = gross_profit / gross_loss if gross_loss > 0 else float('inf')
    # Risk metrics: losing streaks and cumulative-R equity curve
    consecutive_losses = 0
    max_consecutive_losses = 0
    equity_curve = []
    cumulative = 0
    for t in executed_trades:
        cumulative += t.profit_r
        equity_curve.append(cumulative)
        if t.profit_r <= 0:
            consecutive_losses += 1
            max_consecutive_losses = max(max_consecutive_losses, consecutive_losses)
        else:
            consecutive_losses = 0
    # Max drawdown: peak-to-trough drop of the equity curve (in R)
    peak = 0
    max_dd = 0
    for eq in equity_curve:
        if eq > peak:
            peak = eq
        dd = peak - eq
        if dd > max_dd:
            max_dd = dd
    # Attention analysis: do winners carry higher attention scores?
    winners_attention = [t.attention_score for t in wins]
    losers_attention = [t.attention_score for t in losses]
    avg_attention_winners = np.mean(winners_attention) if winners_attention else 0
    avg_attention_losers = np.mean(losers_attention) if losers_attention else 0
    # Win rate bucketed by attention level (fixed thresholds 0.8 / 2.0)
    high_attention = [t for t in executed_trades if t.attention_score >= 2.0]
    medium_attention = [t for t in executed_trades if 0.8 <= t.attention_score < 2.0]
    low_attention = [t for t in executed_trades if t.attention_score < 0.8]
    high_attention_wr = sum(1 for t in high_attention if t.profit_r > 0) / len(high_attention) if high_attention else 0
    medium_attention_wr = sum(1 for t in medium_attention if t.profit_r > 0) / len(medium_attention) if medium_attention else 0
    low_attention_wr = sum(1 for t in low_attention if t.profit_r > 0) / len(low_attention) if low_attention else 0
    # Unfiltered comparison (all trades, as if every signal were executed)
    unfiltered_wins = [t for t in all_trades if t.profit_r > 0]
    unfiltered_win_rate = len(unfiltered_wins) / len(all_trades) if all_trades else 0
    unfiltered_profit = sum(t.profit_r for t in all_trades)
    unfiltered_expectancy = unfiltered_profit / len(all_trades) if all_trades else 0
    # Relative improvement of filtered expectancy over unfiltered
    improvement_pct = ((expectancy - unfiltered_expectancy) / abs(unfiltered_expectancy) * 100) if unfiltered_expectancy != 0 else 0
    # Covered period: first to last signal timestamp
    start_date = min(t.timestamp for t in trades)
    end_date = max(t.timestamp for t in trades)
    period = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    return BacktestMetrics(
        symbol=symbol,
        timeframe='15m',
        period=period,
        risk_reward=risk_reward,
        # NOTE(review): total_bars is filled with the signal count, not raw bars
        total_bars=len(trades),
        total_trades=total_trades,
        filtered_trades=filtered_count,
        executed_trades=len(executed_trades),
        wins=len(wins),
        losses=len(losses),
        win_rate=round(win_rate, 4),
        total_profit_r=round(total_profit_r, 2),
        avg_profit_r=round(avg_profit_r, 4),
        expectancy=round(expectancy, 4),
        profit_factor=round(profit_factor, 2),
        max_consecutive_losses=max_consecutive_losses,
        max_drawdown_r=round(max_dd, 2),
        avg_attention_winners=round(avg_attention_winners, 3),
        avg_attention_losers=round(avg_attention_losers, 3),
        high_attention_win_rate=round(high_attention_wr, 4),
        medium_attention_win_rate=round(medium_attention_wr, 4),
        low_attention_win_rate=round(low_attention_wr, 4),
        unfiltered_total_trades=total_trades,
        unfiltered_win_rate=round(unfiltered_win_rate, 4),
        unfiltered_expectancy=round(unfiltered_expectancy, 4),
        improvement_pct=round(improvement_pct, 1)
    )
def print_metrics(metrics: BacktestMetrics, target_wr: float = 0.40, target_exp: float = 0.10):
    """Print a formatted backtest summary with PASS/FAIL target checks.

    Args:
        metrics: Aggregated backtest metrics to display.
        target_wr: Win-rate target used for the PASS/FAIL flag.
        target_exp: Expectancy target used for the PASS/FAIL flag.
    """
    rule = '=' * 60
    wr_status = "PASS" if metrics.win_rate >= target_wr else "FAIL"
    exp_status = "PASS" if metrics.expectancy >= target_exp else "FAIL"
    filtered_pct = metrics.filtered_trades / metrics.total_trades * 100
    lines = [
        f"\n{rule}",
        f"BACKTEST RESULTS: {metrics.symbol}",
        f"{rule}",
        f"Period: {metrics.period}",
        f"Timeframe: {metrics.timeframe}",
        f"Risk:Reward: 1:{metrics.risk_reward}",
        "\n--- Trade Statistics ---",
        f"Total Signals: {metrics.total_trades}",
        f"Filtered Out: {metrics.filtered_trades} ({filtered_pct:.1f}%)",
        f"Executed Trades: {metrics.executed_trades}",
        f"Wins: {metrics.wins}",
        f"Losses: {metrics.losses}",
        "\n--- Key Metrics ---",
        f"Win Rate: {metrics.win_rate * 100:.1f}% (target: {target_wr * 100}%) [{wr_status}]",
        f"Expectancy: {metrics.expectancy:.4f} (target: {target_exp}) [{exp_status}]",
        f"Profit Factor: {metrics.profit_factor:.2f}",
        f"Total Profit (R): {metrics.total_profit_r:.2f}",
        f"Avg Profit/Trade (R): {metrics.avg_profit_r:.4f}",
        "\n--- Risk Metrics ---",
        f"Max Consecutive Losses: {metrics.max_consecutive_losses}",
        f"Max Drawdown (R): {metrics.max_drawdown_r:.2f}",
        "\n--- Attention Analysis ---",
        f"Avg Attention (Winners): {metrics.avg_attention_winners:.3f}",
        f"Avg Attention (Losers): {metrics.avg_attention_losers:.3f}",
        f"High Attention (>=2.0) Win Rate: {metrics.high_attention_win_rate * 100:.1f}%",
        f"Medium Attention (0.8-2.0) Win Rate: {metrics.medium_attention_win_rate * 100:.1f}%",
        f"Low Attention (<0.8) Win Rate: {metrics.low_attention_win_rate * 100:.1f}%",
        "\n--- Comparison: Filtered vs Unfiltered ---",
        f"Unfiltered Win Rate: {metrics.unfiltered_win_rate * 100:.1f}%",
        f"Unfiltered Expectancy: {metrics.unfiltered_expectancy:.4f}",
        f"Improvement: {metrics.improvement_pct:+.1f}%",
        f"\n{rule}",
    ]
    # A single joined print emits exactly what line-by-line prints would.
    print('\n'.join(lines))
def generate_report(all_metrics: List[BacktestMetrics], output_path: Path):
    """Write a markdown report summarizing all backtested symbols.

    Args:
        all_metrics: One BacktestMetrics entry per symbol.
        output_path: Destination .md file (parent dirs created as needed).
    """
    md: List[str] = [
        "# Hierarchical Pipeline Backtest Report",
        f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## Summary\n",
        "| Symbol | Period | Win Rate | Expectancy | Profit (R) | Improvement |",
        "|--------|--------|----------|------------|------------|-------------|",
    ]
    # One summary row per symbol, flagged against the fixed targets
    for m in all_metrics:
        wr_status = "PASS" if m.win_rate >= 0.40 else "FAIL"
        exp_status = "PASS" if m.expectancy >= 0.10 else "FAIL"
        md.append(
            f"| {m.symbol} | {m.period} | {m.win_rate * 100:.1f}% ({wr_status}) | "
            f"{m.expectancy:.4f} ({exp_status}) | {m.total_profit_r:.1f} | {m.improvement_pct:+.1f}% |"
        )
    # Detailed per-symbol sections
    for m in all_metrics:
        md.extend([
            f"\n## {m.symbol} Details\n",
            f"- **Total Signals:** {m.total_trades}",
            f"- **Filtered Out:** {m.filtered_trades} ({m.filtered_trades / m.total_trades * 100:.1f}%)",
            f"- **Executed Trades:** {m.executed_trades}",
            f"- **Win Rate:** {m.win_rate * 100:.1f}%",
            f"- **Expectancy:** {m.expectancy:.4f}",
            f"- **Profit Factor:** {m.profit_factor:.2f}",
            "\n### Attention Analysis\n",
            "| Attention Level | Win Rate |",
            "|-----------------|----------|",
            f"| High (>=2.0) | {m.high_attention_win_rate * 100:.1f}% |",
            f"| Medium (0.8-2.0) | {m.medium_attention_win_rate * 100:.1f}% |",
            f"| Low (<0.8) | {m.low_attention_win_rate * 100:.1f}% |",
        ])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text('\n'.join(md))
    logger.info(f"Report saved to: {output_path}")
def main():
    """CLI entry point: parse args, backtest each symbol, write reports."""
    parser = argparse.ArgumentParser(description='Hierarchical Pipeline Backtest')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to backtest')
    parser.add_argument('--start-date', type=str, default='2024-06-01',
                        help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end-date', type=str, default='2025-12-31',
                        help='End date (YYYY-MM-DD)')
    parser.add_argument('--rr', type=float, default=2.0,
                        help='Risk:Reward ratio')
    parser.add_argument('--attention-threshold', type=float, default=0.8,
                        help='Minimum attention score to trade')
    parser.add_argument('--horizon', type=int, default=3,
                        help='Bars to look forward for TP/SL')
    parser.add_argument('--step', type=int, default=1,
                        help='Step size between predictions')
    parser.add_argument('--models-dir', type=str, default='models',
                        help='Directory containing trained models')
    parser.add_argument('--output-dir', type=str, default='models/backtest_results',
                        help='Output directory for reports')
    args = parser.parse_args()
    # Setup output directory and logging sinks
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Returned path kept for reference; not otherwise used here
    log_file = setup_logging(output_dir / 'logs', 'hierarchical_backtest')
    logger.info("=" * 60)
    logger.info("HIERARCHICAL PIPELINE BACKTEST")
    logger.info("=" * 60)
    logger.info(f"Symbols: {args.symbols}")
    logger.info(f"Period: {args.start_date} to {args.end_date}")
    logger.info(f"R:R: 1:{args.rr}")
    logger.info(f"Attention Threshold: {args.attention_threshold}")
    # Initialize pipeline with model paths under --models-dir
    config = PipelineConfig(
        attention_model_path=f'{args.models_dir}/attention',
        base_model_path=f'{args.models_dir}/symbol_timeframe_models',
        metamodel_path=f'{args.models_dir}/metamodels',
        attention_threshold_low=args.attention_threshold,
        attention_threshold_high=2.0,
        confidence_threshold=0.5
    )
    pipeline = HierarchicalPipeline(config)
    all_metrics = []
    for symbol in args.symbols:
        logger.info(f"\n{'=' * 40}")
        logger.info(f"Processing {symbol}...")
        logger.info(f"{'=' * 40}")
        # Load models; skip the symbol if any level is missing
        if not pipeline.load_models(symbol):
            logger.warning(f"Could not load all models for {symbol}, skipping...")
            continue
        # Load data for both timeframes used by the pipeline
        try:
            df_5m = load_ohlcv_from_mysql(symbol, '5m', args.start_date, args.end_date)
            df_15m = load_ohlcv_from_mysql(symbol, '15m', args.start_date, args.end_date)
            if df_5m.empty or df_15m.empty:
                logger.warning(f"No data for {symbol}, skipping...")
                continue
        except Exception as e:
            logger.error(f"Data loading failed for {symbol}: {e}")
            continue
        # Run backtest
        trades = run_backtest(
            pipeline=pipeline,
            df_5m=df_5m,
            df_15m=df_15m,
            symbol=symbol,
            risk_reward=args.rr,
            attention_threshold=args.attention_threshold,
            horizon_bars=args.horizon,
            step_bars=args.step
        )
        if not trades:
            logger.warning(f"No trades generated for {symbol}")
            continue
        # Calculate metrics (None when no trades; guarded above and below)
        metrics = calculate_metrics(
            trades=trades,
            symbol=symbol,
            risk_reward=args.rr,
            attention_threshold=args.attention_threshold
        )
        if metrics:
            all_metrics.append(metrics)
            print_metrics(metrics)
        # Save the per-symbol trade log as CSV
        trades_df = pd.DataFrame([asdict(t) for t in trades])
        trades_file = output_dir / f'{symbol}_trades_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        trades_df.to_csv(trades_file, index=False)
        logger.info(f"Trades saved to: {trades_file}")
    # Generate final report (markdown + JSON) across all symbols
    if all_metrics:
        report_file = output_dir / f'backtest_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.md'
        generate_report(all_metrics, report_file)
        # Save metrics as JSON (default=str handles inf/datetime fields)
        metrics_json = output_dir / f'backtest_metrics_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(metrics_json, 'w') as f:
            json.dump([asdict(m) for m in all_metrics], f, indent=2, default=str)
        logger.info(f"Metrics saved to: {metrics_json}")
    logger.info("\n" + "=" * 60)
    logger.info("BACKTEST COMPLETE")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()