#!/usr/bin/env python3
"""
Symbol-Timeframe Training Script
================================

Trains separate ML models for each symbol and timeframe combination.

This script uses the SymbolTimeframeTrainer to train models for:
- XAUUSD (Gold)
- EURUSD (Euro/USD)
- BTCUSD (Bitcoin) - if data available

Each symbol is trained for both 5m and 15m timeframes.

Features:
- Loads data from PostgreSQL database
- Excludes last year (2025) for backtesting
- Uses dynamic factor-based sample weighting
- Generates comprehensive feature set
- Saves models and training reports

Usage:
    python scripts/train_symbol_timeframe_models.py
    python scripts/train_symbol_timeframe_models.py --symbols XAUUSD EURUSD --timeframes 5m 15m

Author: ML Training Pipeline
Version: 1.0.0
Created: 2026-01-05
"""

import argparse
import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import Optional
import json

import numpy as np
import pandas as pd
from loguru import logger

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from training.symbol_timeframe_trainer import (
    SymbolTimeframeTrainer,
    TrainerConfig,
    SYMBOL_CONFIGS
)
from data.database import PostgreSQLConnection


def setup_logging(log_dir: Path, experiment_name: str) -> Path:
    """Configure loguru logging to both console (INFO) and a file (DEBUG).

    Args:
        log_dir: Directory in which the log file is created.
        experiment_name: Prefix for the timestamped log file name.

    Returns:
        Path to the created log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    # Replace the default sink so the console format is consistent.
    logger.remove()
    logger.add(sys.stderr, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
    logger.add(log_file, level="DEBUG", rotation="10 MB")

    logger.info(f"Logging to {log_file}")
    return log_file


def load_data_from_db(
    db: PostgreSQLConnection,
    symbol: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """
    Load OHLCV data from PostgreSQL database.

    Args:
        db: PostgreSQL connection
        symbol: Trading symbol (e.g., 'XAUUSD')
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)
        limit: Maximum records to fetch

    Returns:
        DataFrame with OHLCV data (may be empty if no rows match).
    """
    logger.info(f"Loading data for {symbol}...")

    # Use PostgreSQLConnection.get_ticker_data which handles the
    # market_data.ohlcv_5m JOIN with tickers table
    df = db.get_ticker_data(
        symbol=symbol,
        timeframe='5m',
        limit=limit or 500000,
        start_date=start_date,
        end_date=end_date
    )

    if df.empty:
        logger.warning(f"No data found for {symbol}")
        return df

    logger.info(f"Loaded {len(df)} records for {symbol}")
    # NOTE(review): assumes get_ticker_data returns a DatetimeIndex — confirm.
    logger.info(f"  Date range: {df.index.min()} to {df.index.max()}")
    return df


def resample_to_timeframe(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """
    Resample 5-minute data to different timeframe.

    Args:
        df: DataFrame with 5m data
        timeframe: Target timeframe ('5m', '15m', '1H', etc.)

    Returns:
        Resampled DataFrame
    """
    if timeframe == '5m':
        return df  # Already in 5m

    # Map timeframe to pandas offset
    tf_map = {
        '15m': '15min',
        '30m': '30min',
        '1H': '1H',
        '4H': '4H',
        '1D': '1D'
    }
    offset = tf_map.get(timeframe, timeframe)

    resampled = df.resample(offset).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'vwap': 'mean'
    }).dropna()

    logger.info(f"Resampled to {timeframe}: {len(resampled)} bars")
    return resampled


def generate_features(df: pd.DataFrame, symbol: str) -> pd.DataFrame:
    """
    Generate comprehensive feature set for training.

    Args:
        df: OHLCV DataFrame (must have a DatetimeIndex for time features)
        symbol: Symbol for context-specific features

    Returns:
        DataFrame with features (same index as ``df``; rows may contain NaN
        until rolling windows warm up — caller drops them).
    """
    logger.info(f"Generating features for {symbol}...")

    features = pd.DataFrame(index=df.index)
    close = df['close']
    high = df['high']
    low = df['low']
    open_price = df['open']
    volume = df['volume'] if 'volume' in df.columns else pd.Series(1, index=df.index)

    # ===== Price Returns =====
    features['returns_1'] = close.pct_change(1)
    features['returns_3'] = close.pct_change(3)
    features['returns_5'] = close.pct_change(5)
    features['returns_10'] = close.pct_change(10)
    features['returns_20'] = close.pct_change(20)

    # ===== Volatility Features =====
    features['volatility_5'] = close.pct_change().rolling(5).std()
    features['volatility_10'] = close.pct_change().rolling(10).std()
    features['volatility_20'] = close.pct_change().rolling(20).std()

    # ===== Range Features =====
    candle_range = high - low
    features['range'] = candle_range
    features['range_pct'] = candle_range / close
    features['range_ma_5'] = candle_range.rolling(5).mean()
    features['range_ma_10'] = candle_range.rolling(10).mean()
    features['range_ma_20'] = candle_range.rolling(20).mean()
    features['range_ratio_5'] = candle_range / features['range_ma_5']
    features['range_ratio_20'] = candle_range / features['range_ma_20']

    # ===== ATR Features =====
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    features['atr_5'] = true_range.rolling(5).mean()
    features['atr_14'] = true_range.rolling(14).mean()
    features['atr_20'] = true_range.rolling(20).mean()
    features['atr_ratio'] = true_range / features['atr_14']

    # ===== Moving Averages =====
    # Distances are normalized by ATR so they are comparable across symbols.
    sma_5 = close.rolling(5).mean()
    sma_10 = close.rolling(10).mean()
    sma_20 = close.rolling(20).mean()
    sma_50 = close.rolling(50).mean()
    ema_5 = close.ewm(span=5, adjust=False).mean()
    ema_10 = close.ewm(span=10, adjust=False).mean()
    ema_20 = close.ewm(span=20, adjust=False).mean()

    features['price_vs_sma5'] = (close - sma_5) / features['atr_14']
    features['price_vs_sma10'] = (close - sma_10) / features['atr_14']
    features['price_vs_sma20'] = (close - sma_20) / features['atr_14']
    features['price_vs_sma50'] = (close - sma_50) / features['atr_14']
    features['sma5_vs_sma20'] = (sma_5 - sma_20) / features['atr_14']
    features['ema5_vs_ema20'] = (ema_5 - ema_20) / features['atr_14']

    # ===== RSI =====
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)  # epsilon avoids division by zero
    features['rsi_14'] = 100 - (100 / (1 + rs))

    # RSI extremes
    features['rsi_oversold'] = (features['rsi_14'] < 30).astype(float)
    features['rsi_overbought'] = (features['rsi_14'] > 70).astype(float)

    # ===== Bollinger Bands =====
    bb_middle = close.rolling(20).mean()
    bb_std = close.rolling(20).std()
    bb_upper = bb_middle + 2 * bb_std
    bb_lower = bb_middle - 2 * bb_std
    features['bb_width'] = (bb_upper - bb_lower) / bb_middle
    features['bb_position'] = (close - bb_lower) / (bb_upper - bb_lower + 1e-10)

    # ===== MACD =====
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    features['macd'] = macd / features['atr_14']
    features['macd_signal'] = macd_signal / features['atr_14']
    features['macd_hist'] = (macd - macd_signal) / features['atr_14']

    # ===== Momentum =====
    features['momentum_5'] = (close - close.shift(5)) / features['atr_14']
    features['momentum_10'] = (close - close.shift(10)) / features['atr_14']
    features['momentum_20'] = (close - close.shift(20)) / features['atr_14']

    # ===== Stochastic =====
    low_14 = low.rolling(14).min()
    high_14 = high.rolling(14).max()
    features['stoch_k'] = 100 * (close - low_14) / (high_14 - low_14 + 1e-10)
    features['stoch_d'] = features['stoch_k'].rolling(3).mean()

    # ===== Williams %R =====
    features['williams_r'] = -100 * (high_14 - close) / (high_14 - low_14 + 1e-10)

    # ===== Volume Features =====
    if volume.sum() > 0:
        vol_ma_5 = volume.rolling(5).mean()
        vol_ma_20 = volume.rolling(20).mean()
        features['volume_ratio'] = volume / (vol_ma_20 + 1)
        features['volume_trend'] = (vol_ma_5 - vol_ma_20) / (vol_ma_20 + 1)
    else:
        # Keep the feature set identical across symbols even when the
        # volume column exists but is all zeros.
        features['volume_ratio'] = 0.0
        features['volume_trend'] = 0.0

    # ===== Candle Patterns =====
    body = close - open_price
    features['body_pct'] = body / (candle_range + 1e-10)
    features['upper_shadow'] = (high - np.maximum(close, open_price)) / (candle_range + 1e-10)
    features['lower_shadow'] = (np.minimum(close, open_price) - low) / (candle_range + 1e-10)

    # ===== Price Position =====
    features['close_position'] = (close - low) / (candle_range + 1e-10)

    high_5 = high.rolling(5).max()
    low_5 = low.rolling(5).min()
    features['price_position_5'] = (close - low_5) / (high_5 - low_5 + 1e-10)

    high_20 = high.rolling(20).max()
    low_20 = low.rolling(20).min()
    features['price_position_20'] = (close - low_20) / (high_20 - low_20 + 1e-10)

    # ===== Time Features =====
    # Cyclical sin/cos encoding so hour 23 is close to hour 0.
    features['hour'] = df.index.hour
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['day_of_week'] = df.index.dayofweek
    features['dow_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 7)
    features['dow_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 7)

    # Trading sessions (hours presumably in UTC — TODO confirm against data source)
    features['is_london'] = ((features['hour'] >= 8) & (features['hour'] < 16)).astype(float)
    features['is_newyork'] = ((features['hour'] >= 13) & (features['hour'] < 21)).astype(float)
    features['is_overlap'] = ((features['hour'] >= 13) & (features['hour'] < 16)).astype(float)

    # Clean up
    features = features.replace([np.inf, -np.inf], np.nan)

    # Drop non-feature columns used for intermediate calculations
    drop_cols = ['hour', 'day_of_week']
    features = features.drop(columns=[c for c in drop_cols if c in features.columns], errors='ignore')

    logger.info(f"Generated {len(features.columns)} features")
    return features


def train_models(
    symbols: list,
    timeframes: list,
    output_dir: Path,
    cutoff_date: str = '2024-12-31',
    db_config_path: str = 'config/database.yaml',
    use_attention: bool = False,
    attention_model_path: str = 'models/attention'
) -> dict:
    """
    Train models for all symbol/timeframe combinations.

    Args:
        symbols: List of symbols to train
        timeframes: List of timeframes
        output_dir: Directory to save models
        cutoff_date: Training data cutoff date
        db_config_path: Path to database config
        use_attention: Whether to include attention features from pre-trained model
        attention_model_path: Path to trained attention models

    Returns:
        Dictionary with training results ('results', 'summary', 'model_dir',
        'cutoff_date').
    """
    logger.info("="*60)
    logger.info("Symbol-Timeframe Model Training")
    logger.info("="*60)
    logger.info(f"Symbols: {symbols}")
    logger.info(f"Timeframes: {timeframes}")
    logger.info(f"Cutoff date: {cutoff_date}")
    logger.info(f"Use attention features: {use_attention}")

    # Connect to database
    db = PostgreSQLConnection(db_config_path)

    # Configure trainer with improved parameters for better R^2
    # Key improvements:
    # 1. Targets are now normalized by ATR (handled in SymbolTimeframeTrainer)
    # 2. Reduced sample weighting aggressiveness
    # 3. More regularization in XGBoost
    config = TrainerConfig(
        symbols=symbols,
        timeframes=timeframes,
        # NOTE: only 5m/15m have horizons defined; other timeframes need entries here.
        horizons={
            '5m': 3,   # 15 minutes ahead
            '15m': 3,  # 45 minutes ahead
        },
        train_years=5.0,
        holdout_years=1.0,  # Exclude 2025 for backtesting
        use_dynamic_factor_weighting=True,
        factor_window=200,
        softplus_beta=2.0,   # Reduced from 4.0 - less aggressive weighting
        softplus_w_max=2.0,  # Reduced from 3.0 - cap weights lower
        xgb_params={
            'n_estimators': 150,      # Reduced from 300
            'max_depth': 4,           # Reduced from 6 - shallower trees
            'learning_rate': 0.02,    # Reduced from 0.03
            'subsample': 0.7,         # Reduced from 0.8
            'colsample_bytree': 0.7,  # Reduced from 0.8
            'min_child_weight': 20,   # Increased from 10 - more regularization
            'gamma': 0.3,             # Increased from 0.1
            'reg_alpha': 0.5,         # Increased from 0.1 - L1 regularization
            'reg_lambda': 5.0,        # Increased from 1.0 - L2 regularization
            'tree_method': 'hist',
            'random_state': 42
        },
        min_train_samples=5000,
        use_attention_features=use_attention,
        attention_model_path=attention_model_path
    )

    trainer = SymbolTimeframeTrainer(config)

    # Prepare data dictionary
    data_dict = {}
    all_results = {}

    for symbol in symbols:
        logger.info(f"\n{'='*60}")
        logger.info(f"Processing {symbol}")
        logger.info(f"{'='*60}")

        # Load raw data (5m)
        df_5m = load_data_from_db(db, symbol, end_date=cutoff_date)

        if df_5m.empty:
            logger.warning(f"No data for {symbol}, skipping...")
            continue

        # Verify we have enough data
        if len(df_5m) < 50000:
            logger.warning(f"Insufficient data for {symbol}: {len(df_5m)} rows")
            continue

        data_dict[symbol] = {}

        for timeframe in timeframes:
            logger.info(f"\n--- {symbol} {timeframe} ---")

            # Resample if needed
            if timeframe == '5m':
                df_tf = df_5m.copy()
            else:
                df_tf = resample_to_timeframe(df_5m.copy(), timeframe)

            if len(df_tf) < 10000:
                logger.warning(f"Insufficient {timeframe} data: {len(df_tf)} rows")
                continue

            # Generate features
            features = generate_features(df_tf, symbol)

            # Combine OHLCV with features
            df_combined = pd.concat([df_tf[['open', 'high', 'low', 'close', 'volume']], features], axis=1)

            # Drop NaN rows
            df_combined = df_combined.dropna()

            logger.info(f"Final data shape: {df_combined.shape}")
            data_dict[symbol][timeframe] = df_combined

            # Train for this symbol/timeframe
            try:
                results = trainer.train_single(df_combined, symbol, timeframe)
                all_results.update(results)

                for key, result in results.items():
                    logger.info(f"\n{key}:")
                    logger.info(f"  MAE: {result.mae:.6f}")
                    logger.info(f"  RMSE: {result.rmse:.6f}")
                    logger.info(f"  R2: {result.r2:.4f}")
                    logger.info(f"  Dir Accuracy: {result.directional_accuracy:.2%}")
                    logger.info(f"  Train samples: {result.n_train}")
                    logger.info(f"  Val samples: {result.n_val}")
            except Exception as e:
                # Log the full traceback but keep training the remaining combos.
                logger.exception(f"Training failed for {symbol} {timeframe}: {e}")

    # Save models
    model_dir = output_dir / 'symbol_timeframe_models'
    trainer.save(str(model_dir))
    logger.info(f"\nModels saved to {model_dir}")

    # Generate summary report
    summary_df = trainer.get_training_summary()
    if not summary_df.empty:
        report_path = output_dir / 'training_summary.csv'
        summary_df.to_csv(report_path, index=False)
        logger.info(f"Summary saved to {report_path}")

        logger.info("\n" + "="*60)
        logger.info("TRAINING SUMMARY")
        logger.info("="*60)
        print(summary_df.to_string(index=False))

    return {
        'results': all_results,
        'summary': summary_df.to_dict() if not summary_df.empty else {},
        'model_dir': str(model_dir),
        # Pass through so the markdown report states the actual cutoff used.
        'cutoff_date': cutoff_date
    }


def generate_markdown_report(results: dict, output_dir: Path) -> Path:
    """Generate a Markdown training report from the train_models result dict.

    Args:
        results: Dict returned by train_models.
        output_dir: Directory for the timestamped report file.

    Returns:
        Path to the written report.
    """
    report_path = output_dir / f"TRAINING_REPORT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    # Default matches train_models' default cutoff for older result dicts.
    cutoff = results.get('cutoff_date', '2024-12-31')

    report = f"""# Symbol-Timeframe Model Training Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Configuration

- **Training Data Cutoff:** {cutoff} (excluding final year for backtesting)
- **Dynamic Factor Weighting:** Enabled
- **Sample Weight Method:** Softplus with beta=2.0, w_max=2.0

## Training Results Summary

| Model | Symbol | Timeframe | Target | MAE | RMSE | R2 | Dir Accuracy | Train | Val |
|-------|--------|-----------|--------|-----|------|----|--------------|-------|-----|
"""

    for key, result in results.get('results', {}).items():
        report += f"| {key} | {result.model_key.symbol} | {result.model_key.timeframe} | "
        report += f"{result.model_key.target_type} | {result.mae:.6f} | {result.rmse:.6f} | "
        report += f"{result.r2:.4f} | {result.directional_accuracy:.2%} | "
        report += f"{result.n_train} | {result.n_val} |\n"

    model_dir_str = results.get('model_dir', 'N/A')

    report += f"""
## Model Files

Models saved to: `{model_dir_str}`

### Model Naming Convention
- `{{symbol}}_{{timeframe}}_high_h{{horizon}}.joblib` - High range predictor
- `{{symbol}}_{{timeframe}}_low_h{{horizon}}.joblib` - Low range predictor

## Usage Example

```python
from training.symbol_timeframe_trainer import SymbolTimeframeTrainer

# Load trained models
trainer = SymbolTimeframeTrainer()
trainer.load('models/symbol_timeframe_models/')

# Predict for XAUUSD 15m
predictions = trainer.predict(features, 'XAUUSD', '15m')
print(f"Predicted High: {{predictions['high']}}")
print(f"Predicted Low: {{predictions['low']}}")
```

## Notes

1. Models exclude the final year of data for out-of-sample backtesting
2. Dynamic factor weighting emphasizes high-movement samples
3. Separate models for HIGH and LOW predictions per symbol/timeframe

---
*Report generated by Symbol-Timeframe Training Pipeline*
"""

    with open(report_path, 'w') as f:
        f.write(report)

    logger.info(f"Report saved to {report_path}")
    return report_path


def main():
    """CLI entry point: parse args, set up logging, train, and report."""
    parser = argparse.ArgumentParser(description='Train Symbol-Timeframe ML Models')
    parser.add_argument('--symbols', nargs='+', default=['XAUUSD', 'EURUSD'],
                        help='Symbols to train (default: XAUUSD EURUSD)')
    parser.add_argument('--timeframes', nargs='+', default=['5m', '15m'],
                        help='Timeframes to train (default: 5m 15m)')
    parser.add_argument('--output-dir', type=str, default='models/',
                        help='Output directory for models')
    parser.add_argument('--cutoff-date', type=str, default='2024-12-31',
                        help='Training data cutoff date')
    parser.add_argument('--db-config', type=str, default='config/database.yaml',
                        help='Database configuration file')
    parser.add_argument('--use-attention', action='store_true',
                        help='Use attention features from pre-trained attention model')
    parser.add_argument('--attention-model-path', type=str, default='models/attention',
                        help='Path to trained attention models')

    args = parser.parse_args()

    # Setup paths relative to the repository root (parent of scripts/)
    script_dir = Path(__file__).parent.parent
    output_dir = script_dir / args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    logs_dir = output_dir / 'logs'
    setup_logging(logs_dir, 'symbol_timeframe_training')

    # Run training
    try:
        results = train_models(
            symbols=args.symbols,
            timeframes=args.timeframes,
            output_dir=output_dir,
            cutoff_date=args.cutoff_date,
            db_config_path=str(script_dir / args.db_config),
            use_attention=args.use_attention,
            attention_model_path=str(script_dir / args.attention_model_path)
        )

        # Generate report
        generate_markdown_report(results, output_dir)

        logger.info("\n" + "="*60)
        logger.info("TRAINING COMPLETE!")
        logger.info("="*60)
    except Exception as e:
        logger.exception(f"Training failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()