""" Training Data Loader for ML Engine =================================== Provides efficient data loading from PostgreSQL for ML training. This module implements: - Batch loading for large datasets - Streaming support for memory-efficient processing - Filtering by symbol, timeframe, and date range - Feature and target extraction for model training Author: ML Pipeline (NEXUS v4.0) Created: 2026-01-25 """ from typing import Optional, Dict, Any, List, Iterator, Tuple from datetime import datetime, timedelta from dataclasses import dataclass, field import pandas as pd import numpy as np from sqlalchemy import text from loguru import logger from .database import PostgreSQLConnection, DatabaseManager @dataclass class TrainingDataConfig: """Configuration for training data loading.""" # Database connection host: str = 'localhost' port: int = 5432 database: str = 'trading_platform' user: str = 'trading_user' password: str = 'trading_dev_2026' # Default query parameters default_batch_size: int = 50000 default_timeframe: str = '5m' # Feature generation return_horizons: List[int] = field(default_factory=lambda: [1, 5, 12, 24]) volatility_window: int = 20 # Streaming configuration stream_chunk_size: int = 10000 prefetch_chunks: int = 2 class TrainingDataLoader: """ Efficient data loader for ML model training. Provides batch loading, streaming, and feature extraction from the PostgreSQL trading_platform database. Usage: loader = TrainingDataLoader() # Simple batch loading df = loader.get_training_data('XAUUSD', '2023-01-01', '2024-12-31') # Streaming for large datasets for batch in loader.stream_training_data('XAUUSD', batch_size=50000): process_batch(batch) # Get features and targets X, y = loader.get_features_and_targets('XAUUSD', '5m') """ def __init__( self, config: Optional[TrainingDataConfig] = None, db_connection: Optional[PostgreSQLConnection] = None ): """ Initialize the training data loader. Args: config: Configuration object for data loading db_connection: Existing database connection (creates new if None) """ self.config = config or TrainingDataConfig() self.db = db_connection or PostgreSQLConnection() self._cache: Dict[str, pd.DataFrame] = {} logger.info("TrainingDataLoader initialized") logger.info(f" Database: {self.config.database}") logger.info(f" Default batch size: {self.config.default_batch_size}") def get_training_data( self, symbol: str, start_date: str, end_date: str, timeframe: str = '5m', batch_size: Optional[int] = None, include_features: bool = True ) -> pd.DataFrame: """ Load training data for a symbol within a date range. Args: symbol: Trading symbol (e.g., 'XAUUSD', 'EURUSD') start_date: Start date (YYYY-MM-DD format) end_date: End date (YYYY-MM-DD format) timeframe: Data timeframe ('5m', '15m', '1h', '4h', 'd') batch_size: Number of records per batch (None for all at once) include_features: Whether to compute derived features Returns: DataFrame with OHLCV data and optional features, indexed by timestamp """ logger.info(f"Loading training data for {symbol} ({timeframe})") logger.info(f" Date range: {start_date} to {end_date}") if batch_size is None: # Load all data at once df = self.db.get_ticker_data( symbol=symbol, timeframe=timeframe, start_date=start_date, end_date=end_date, limit=5000000 # High limit for all data ) else: # Load in batches and concatenate batches = [] for batch_df in self._load_batches( symbol, start_date, end_date, timeframe, batch_size ): batches.append(batch_df) if not batches: logger.warning(f"No data found for {symbol}") return pd.DataFrame() df = pd.concat(batches, axis=0) df = df.sort_index() df = df[~df.index.duplicated(keep='first')] if df.empty: logger.warning(f"No data loaded for {symbol}") return df logger.info(f" Loaded {len(df):,} records") if include_features: df = self._compute_basic_features(df) logger.info(f" Added {len(df.columns) - 6} derived features") return df def _load_batches( self, symbol: str, start_date: str, end_date: str, timeframe: str, batch_size: int ) -> Iterator[pd.DataFrame]: """ Load data in batches using offset pagination. Args: symbol: Trading symbol start_date: Start date end_date: End date timeframe: Timeframe batch_size: Records per batch Yields: DataFrames for each batch """ table_map = { '5m': 'ohlcv_5m', '15m': 'ohlcv_15m', '1h': 'ohlcv_1h', '4h': 'ohlcv_4h', 'd': 'ohlcv_daily', '1d': 'ohlcv_daily', } table = table_map.get(timeframe.lower(), 'ohlcv_5m') clean_symbol = symbol if symbol.startswith('C:') or symbol.startswith('X:') or symbol.startswith('I:'): clean_symbol = symbol[2:] offset = 0 total_loaded = 0 while True: query = f""" SELECT o.timestamp, o.open, o.high, o.low, o.close, o.volume, o.vwap FROM market_data.{table} o JOIN market_data.tickers t ON t.id = o.ticker_id WHERE UPPER(t.symbol) = UPPER(:symbol) AND o.timestamp >= :start_date AND o.timestamp <= :end_date ORDER BY o.timestamp ASC LIMIT :limit OFFSET :offset """ params = { 'symbol': clean_symbol, 'start_date': start_date, 'end_date': end_date, 'limit': batch_size, 'offset': offset } batch_df = pd.read_sql(text(query), self.db.engine, params=params) if batch_df.empty: break batch_df['timestamp'] = pd.to_datetime(batch_df['timestamp']) batch_df.set_index('timestamp', inplace=True) total_loaded += len(batch_df) logger.debug(f" Loaded batch: {len(batch_df)} records (total: {total_loaded:,})") yield batch_df if len(batch_df) < batch_size: break offset += batch_size def stream_training_data( self, symbol: str, timeframe: str = '5m', start_date: Optional[str] = None, end_date: Optional[str] = None, batch_size: Optional[int] = None ) -> Iterator[pd.DataFrame]: """ Stream training data in chunks for memory-efficient processing. This method is ideal for processing very large datasets that don't fit in memory. Each chunk is loaded, processed, and can be discarded before loading the next. Args: symbol: Trading symbol timeframe: Data timeframe start_date: Start date (defaults to 5 years ago) end_date: End date (defaults to today) batch_size: Records per chunk (defaults to config) Yields: DataFrames for each chunk with computed features """ if start_date is None: start_date = (datetime.now() - timedelta(days=365 * 5)).strftime('%Y-%m-%d') if end_date is None: end_date = datetime.now().strftime('%Y-%m-%d') if batch_size is None: batch_size = self.config.stream_chunk_size logger.info(f"Streaming data for {symbol} ({timeframe})") logger.info(f" Date range: {start_date} to {end_date}") logger.info(f" Chunk size: {batch_size:,}") chunk_count = 0 total_records = 0 for batch_df in self._load_batches( symbol, start_date, end_date, timeframe, batch_size ): chunk_count += 1 total_records += len(batch_df) # Compute features for this chunk batch_df = self._compute_basic_features(batch_df) logger.debug(f" Yielding chunk {chunk_count}: {len(batch_df)} records") yield batch_df logger.info(f" Streamed {chunk_count} chunks, {total_records:,} total records") def get_features_and_targets( self, symbol: str, timeframe: str = '5m', start_date: Optional[str] = None, end_date: Optional[str] = None, target_horizon: int = 12, target_type: str = 'return' ) -> Tuple[pd.DataFrame, pd.Series]: """ Get feature matrix and target vector for model training. Args: symbol: Trading symbol timeframe: Data timeframe start_date: Start date (defaults to 3 years ago) end_date: End date (defaults to today) target_horizon: Number of bars ahead for target target_type: 'return' for % returns, 'direction' for up/down Returns: Tuple of (X features DataFrame, y target Series) """ if start_date is None: start_date = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d') if end_date is None: end_date = datetime.now().strftime('%Y-%m-%d') logger.info(f"Preparing features and targets for {symbol} ({timeframe})") logger.info(f" Target: {target_type} at horizon {target_horizon}") # Load data with features df = self.get_training_data( symbol=symbol, start_date=start_date, end_date=end_date, timeframe=timeframe, include_features=True ) if df.empty: return pd.DataFrame(), pd.Series() # Compute target if target_type == 'return': target = df['close'].pct_change(target_horizon).shift(-target_horizon) elif target_type == 'direction': future_return = df['close'].pct_change(target_horizon).shift(-target_horizon) target = (future_return > 0).astype(int) else: raise ValueError(f"Unknown target_type: {target_type}") # Select feature columns (exclude OHLCV and target-related) feature_cols = [ col for col in df.columns if col not in ['open', 'high', 'low', 'close', 'volume', 'vwap'] and not col.startswith('target_') ] X = df[feature_cols] y = target # Remove rows with NaN targets (end of series) valid_mask = ~y.isna() & ~X.isna().any(axis=1) X = X[valid_mask] y = y[valid_mask] logger.info(f" Features shape: {X.shape}") logger.info(f" Feature columns: {list(X.columns)}") return X, y def _compute_basic_features(self, df: pd.DataFrame) -> pd.DataFrame: """ Compute basic derived features for the DataFrame. Features computed: - Returns at multiple horizons - Volatility (rolling std) - Range and body percentages - Volume ratios Args: df: DataFrame with OHLCV columns Returns: DataFrame with additional feature columns """ df = df.copy() # Price returns at different horizons for horizon in self.config.return_horizons: df[f'return_{horizon}'] = df['close'].pct_change(horizon) # Log returns (more suitable for ML) df['log_return'] = np.log(df['close'] / df['close'].shift(1)) # Volatility (rolling standard deviation of returns) window = self.config.volatility_window df['volatility'] = df['log_return'].rolling(window=window).std() df['volatility_pct'] = df['volatility'] * np.sqrt(252 * 12) # Annualized for 5m # High-Low range as percentage df['range_pct'] = (df['high'] - df['low']) / df['close'] # Body as percentage of range df['body_pct'] = np.abs(df['close'] - df['open']) / (df['high'] - df['low'] + 1e-10) # Upper and lower wicks body_high = df[['open', 'close']].max(axis=1) body_low = df[['open', 'close']].min(axis=1) df['upper_wick_pct'] = (df['high'] - body_high) / (df['high'] - df['low'] + 1e-10) df['lower_wick_pct'] = (body_low - df['low']) / (df['high'] - df['low'] + 1e-10) # Bullish/Bearish candle df['is_bullish'] = (df['close'] > df['open']).astype(int) # Volume features if 'volume' in df.columns and df['volume'].sum() > 0: df['volume_sma'] = df['volume'].rolling(window=window).mean() df['volume_ratio'] = df['volume'] / (df['volume_sma'] + 1e-10) df['volume_std'] = df['volume'].rolling(window=window).std() else: df['volume_sma'] = 0 df['volume_ratio'] = 1 df['volume_std'] = 0 # Price momentum indicators df['momentum_5'] = df['close'] - df['close'].shift(5) df['momentum_12'] = df['close'] - df['close'].shift(12) # Moving average distances df['sma_10'] = df['close'].rolling(window=10).mean() df['sma_20'] = df['close'].rolling(window=20).mean() df['dist_sma_10'] = (df['close'] - df['sma_10']) / df['close'] df['dist_sma_20'] = (df['close'] - df['sma_20']) / df['close'] return df def get_multi_symbol_data( self, symbols: List[str], start_date: str, end_date: str, timeframe: str = '5m' ) -> Dict[str, pd.DataFrame]: """ Load training data for multiple symbols. Args: symbols: List of trading symbols start_date: Start date end_date: End date timeframe: Data timeframe Returns: Dictionary mapping symbol to DataFrame """ logger.info(f"Loading data for {len(symbols)} symbols") data_dict = {} for symbol in symbols: df = self.get_training_data( symbol=symbol, start_date=start_date, end_date=end_date, timeframe=timeframe, include_features=True ) if not df.empty: data_dict[symbol] = df logger.info(f" {symbol}: {len(df):,} records") else: logger.warning(f" {symbol}: No data found") return data_dict def get_data_summary( self, symbol: str, timeframe: str = '5m' ) -> Dict[str, Any]: """ Get summary statistics for available data. Args: symbol: Trading symbol timeframe: Data timeframe Returns: Dictionary with summary statistics """ table_map = { '5m': 'ohlcv_5m', '15m': 'ohlcv_15m', '1h': 'ohlcv_1h', '4h': 'ohlcv_4h', 'd': 'ohlcv_daily', } table = table_map.get(timeframe.lower(), 'ohlcv_5m') clean_symbol = symbol if symbol.startswith('C:') or symbol.startswith('X:'): clean_symbol = symbol[2:] query = f""" SELECT COUNT(*) as total_records, MIN(o.timestamp) as first_date, MAX(o.timestamp) as last_date, AVG(o.close) as avg_price, STDDEV(o.close) as std_price, AVG(o.volume) as avg_volume FROM market_data.{table} o JOIN market_data.tickers t ON t.id = o.ticker_id WHERE UPPER(t.symbol) = UPPER(:symbol) """ result = pd.read_sql(text(query), self.db.engine, params={'symbol': clean_symbol}) if result.empty or result['total_records'].iloc[0] == 0: return {'symbol': symbol, 'timeframe': timeframe, 'error': 'No data found'} row = result.iloc[0] return { 'symbol': symbol, 'timeframe': timeframe, 'total_records': int(row['total_records']), 'first_date': str(row['first_date']), 'last_date': str(row['last_date']), 'avg_price': float(row['avg_price']) if row['avg_price'] else 0, 'std_price': float(row['std_price']) if row['std_price'] else 0, 'avg_volume': float(row['avg_volume']) if row['avg_volume'] else 0, } def clear_cache(self): """Clear the internal data cache.""" self._cache.clear() logger.info("Data cache cleared") def load_training_data( symbol: str, start_date: str, end_date: str, timeframe: str = '5m' ) -> pd.DataFrame: """ Convenience function to load training data. Args: symbol: Trading symbol start_date: Start date (YYYY-MM-DD) end_date: End date (YYYY-MM-DD) timeframe: Data timeframe Returns: DataFrame with OHLCV data and features """ loader = TrainingDataLoader() return loader.get_training_data(symbol, start_date, end_date, timeframe) if __name__ == "__main__": # Test the training data loader print("Testing TrainingDataLoader...") loader = TrainingDataLoader() # Test data summary print("\nData summary for XAUUSD:") summary = loader.get_data_summary('XAUUSD', '5m') for key, value in summary.items(): print(f" {key}: {value}") # Test batch loading print("\nTesting batch loading:") df = loader.get_training_data( symbol='XAUUSD', start_date='2024-01-01', end_date='2024-12-31', timeframe='5m', batch_size=50000 ) print(f" Loaded {len(df):,} records") print(f" Columns: {list(df.columns)}") # Test features and targets print("\nTesting features and targets:") X, y = loader.get_features_and_targets( symbol='XAUUSD', timeframe='5m', start_date='2024-01-01', end_date='2024-06-30', target_horizon=12 ) print(f" X shape: {X.shape}") print(f" y shape: {y.shape}") print(f" Features: {list(X.columns)}") # Test streaming print("\nTesting streaming:") chunk_count = 0 total_records = 0 for chunk in loader.stream_training_data( symbol='XAUUSD', timeframe='5m', start_date='2024-01-01', end_date='2024-03-31', batch_size=10000 ): chunk_count += 1 total_records += len(chunk) if chunk_count >= 3: print(f" (stopped after {chunk_count} chunks for test)") break print(f" Chunks: {chunk_count}, Records: {total_records:,}") print("\nTest complete!")