trading-platform-ml-engine-v2/src/data/training_loader.py
Adrian Flores Cortes d015e2b0f3 feat(ml-engine): Phase 4 - PostgreSQL migration, dynamic OOS, data pipeline
- Fix database.py: Add DatabaseConnection alias for backward compat
- Fix train_symbol_timeframe_models.py: Use PostgreSQLConnection + native queries
- Fix run_oos_backtest.py: Fix broken import + add dynamic OOS support
- Update data_splitter.py: split_dynamic_oos() method (from previous session)
- Update validation_oos.yaml: Dynamic OOS config + all 6 symbols enabled
- Create ingest_ohlcv_polygon.py: Standalone Polygon→PostgreSQL ingestion script
- Fix .gitignore: /data/ instead of data/ to not ignore src/data/
- Add untracked src/ modules: backtesting, data, llm, models (attention/metamodel/strategies)
- Add aiohttp, sqlalchemy, psycopg2-binary to requirements.txt

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 04:39:05 -06:00

612 lines
19 KiB
Python

"""
Training Data Loader for ML Engine
===================================
Provides efficient data loading from PostgreSQL for ML training.
This module implements:
- Batch loading for large datasets
- Streaming support for memory-efficient processing
- Filtering by symbol, timeframe, and date range
- Feature and target extraction for model training
Author: ML Pipeline (NEXUS v4.0)
Created: 2026-01-25
"""
from typing import Optional, Dict, Any, List, Iterator, Tuple
from datetime import datetime, timedelta
from dataclasses import dataclass, field
import pandas as pd
import numpy as np
from sqlalchemy import text
from loguru import logger
from .database import PostgreSQLConnection, DatabaseManager
@dataclass
class TrainingDataConfig:
    """
    Configuration for training data loading.

    Groups the database connection settings, the default query parameters,
    the feature-generation parameters and the streaming parameters.
    Field order is part of the positional constructor and must not change.
    """

    # Database connection
    host: str = 'localhost'
    port: int = 5432
    database: str = 'trading_platform'
    user: str = 'trading_user'
    # The default is kept for backward compatibility; it is a credential living
    # in source code, so callers should pass their own value. repr=False keeps
    # the secret out of repr()/str() output (and therefore out of logs).
    password: str = field(default='trading_dev_2026', repr=False)

    # Default query parameters
    default_batch_size: int = 50000
    default_timeframe: str = '5m'

    # Feature generation
    # Horizons (in bars) for the return_<h> feature columns.
    return_horizons: List[int] = field(default_factory=lambda: [1, 5, 12, 24])
    # Rolling window (in bars) for the volatility and volume statistics.
    volatility_window: int = 20

    # Streaming configuration
    stream_chunk_size: int = 10000
    prefetch_chunks: int = 2
class TrainingDataLoader:
    """
    Efficient data loader for ML model training.

    Provides batch loading, streaming, and feature extraction
    from the PostgreSQL trading_platform database.

    Usage:
        loader = TrainingDataLoader()

        # Simple batch loading
        df = loader.get_training_data('XAUUSD', '2023-01-01', '2024-12-31')

        # Streaming for large datasets
        for batch in loader.stream_training_data('XAUUSD', batch_size=50000):
            process_batch(batch)

        # Get features and targets
        X, y = loader.get_features_and_targets('XAUUSD', '5m')
    """

    # Timeframe -> OHLCV table. Shared by every query-building method so the
    # supported timeframes cannot drift apart between methods.
    _TABLE_MAP: Dict[str, str] = {
        '5m': 'ohlcv_5m',
        '15m': 'ohlcv_15m',
        '1h': 'ohlcv_1h',
        '4h': 'ohlcv_4h',
        'd': 'ohlcv_daily',
        '1d': 'ohlcv_daily',
    }
    # Table used when the timeframe is not in _TABLE_MAP.
    _DEFAULT_TABLE = 'ohlcv_5m'
    # Two-character prefixes removed from symbols before the ticker lookup.
    _SYMBOL_PREFIXES = ('C:', 'X:', 'I:')
    # Raw columns that are never returned as model features.
    _BASE_COLUMNS = ('open', 'high', 'low', 'close', 'volume', 'vwap')
    # Longest fixed lookback in _compute_basic_features (the 20-bar SMA).
    _MIN_FEATURE_LOOKBACK = 20

    def __init__(
        self,
        config: Optional["TrainingDataConfig"] = None,
        db_connection: Optional["PostgreSQLConnection"] = None
    ):
        """
        Initialize the training data loader.

        Args:
            config: Configuration object for data loading
            db_connection: Existing database connection (creates new if None)
        """
        self.config = config or TrainingDataConfig()
        self.db = db_connection or PostgreSQLConnection()
        # No method in this class writes to the cache; only clear_cache() touches it.
        self._cache: Dict[str, pd.DataFrame] = {}
        logger.info("TrainingDataLoader initialized")
        logger.info(f" Database: {self.config.database}")
        logger.info(f" Default batch size: {self.config.default_batch_size}")

    @classmethod
    def _resolve_table(cls, timeframe: str) -> str:
        """
        Map a timeframe string (case-insensitive) to its OHLCV table name.

        Unknown timeframes keep the historical fallback to the 5m table, but
        the fallback is logged so wrong-granularity data is not loaded silently.
        """
        table = cls._TABLE_MAP.get(timeframe.lower())
        if table is None:
            logger.warning(
                f"Unknown timeframe '{timeframe}', falling back to {cls._DEFAULT_TABLE}"
            )
            return cls._DEFAULT_TABLE
        return table

    @classmethod
    def _strip_symbol_prefix(cls, symbol: str) -> str:
        """Return the symbol without a leading 'C:', 'X:' or 'I:' prefix."""
        if symbol.startswith(cls._SYMBOL_PREFIXES):
            return symbol[2:]
        return symbol

    def get_training_data(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str = '5m',
        batch_size: Optional[int] = None,
        include_features: bool = True
    ) -> pd.DataFrame:
        """
        Load training data for a symbol within a date range.

        Args:
            symbol: Trading symbol (e.g., 'XAUUSD', 'EURUSD')
            start_date: Start date (YYYY-MM-DD format)
            end_date: End date (YYYY-MM-DD format)
            timeframe: Data timeframe ('5m', '15m', '1h', '4h', 'd')
            batch_size: Number of records per batch (None for all at once)
            include_features: Whether to compute derived features

        Returns:
            DataFrame with OHLCV data and optional features, indexed by timestamp.
            An empty DataFrame is returned when nothing was loaded.
        """
        logger.info(f"Loading training data for {symbol} ({timeframe})")
        logger.info(f" Date range: {start_date} to {end_date}")

        if batch_size is None:
            # Load all data at once
            df = self.db.get_ticker_data(
                symbol=symbol,
                timeframe=timeframe,
                start_date=start_date,
                end_date=end_date,
                limit=5000000  # High limit for all data
            )
        else:
            # Load in batches and concatenate
            batches = list(
                self._load_batches(symbol, start_date, end_date, timeframe, batch_size)
            )
            if not batches:
                logger.warning(f"No data found for {symbol}")
                return pd.DataFrame()
            df = pd.concat(batches, axis=0).sort_index()
            df = df[~df.index.duplicated(keep='first')]

        if df.empty:
            logger.warning(f"No data loaded for {symbol}")
            return df

        logger.info(f" Loaded {len(df):,} records")

        if include_features:
            # Count the columns actually present instead of assuming six base
            # columns; the two loading paths need not return the same schema.
            base_column_count = len(df.columns)
            df = self._compute_basic_features(df)
            logger.info(f" Added {len(df.columns) - base_column_count} derived features")

        return df

    def _load_batches(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str,
        batch_size: int
    ) -> Iterator[pd.DataFrame]:
        """
        Load data in batches using offset pagination.

        Args:
            symbol: Trading symbol
            start_date: Start date
            end_date: End date
            timeframe: Timeframe
            batch_size: Records per batch (must be positive)

        Yields:
            DataFrames for each batch, indexed by timestamp

        Raises:
            ValueError: If batch_size is not positive (raised on first iteration,
                because this is a generator).
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        table = self._resolve_table(timeframe)
        clean_symbol = self._strip_symbol_prefix(symbol)

        # The table name comes from the fixed _TABLE_MAP whitelist, never from
        # caller input, so interpolating it is safe; all values are bound params.
        # NOTE(review): a date-only end_date presumably compares as midnight, which
        # would exclude later bars on that day — confirm this is intended.
        query = text(f"""
            SELECT
                o.timestamp,
                o.open,
                o.high,
                o.low,
                o.close,
                o.volume,
                o.vwap
            FROM market_data.{table} o
            JOIN market_data.tickers t ON t.id = o.ticker_id
            WHERE UPPER(t.symbol) = UPPER(:symbol)
                AND o.timestamp >= :start_date
                AND o.timestamp <= :end_date
            ORDER BY o.timestamp ASC
            LIMIT :limit OFFSET :offset
        """)

        offset = 0
        total_loaded = 0
        while True:
            params = {
                'symbol': clean_symbol,
                'start_date': start_date,
                'end_date': end_date,
                'limit': batch_size,
                'offset': offset
            }
            batch_df = pd.read_sql(query, self.db.engine, params=params)
            if batch_df.empty:
                break

            batch_df['timestamp'] = pd.to_datetime(batch_df['timestamp'])
            batch_df.set_index('timestamp', inplace=True)
            total_loaded += len(batch_df)
            logger.debug(f" Loaded batch: {len(batch_df)} records (total: {total_loaded:,})")
            yield batch_df

            # A short batch means the result set is exhausted.
            if len(batch_df) < batch_size:
                break
            offset += batch_size

    def _feature_lookback(self) -> int:
        """Return how many preceding bars the derived features need as context."""
        return max(
            max(self.config.return_horizons, default=0),
            self.config.volatility_window,
            self._MIN_FEATURE_LOOKBACK,
        )

    def _iter_feature_chunks(
        self,
        raw_chunks: Iterator[pd.DataFrame]
    ) -> Iterator[pd.DataFrame]:
        """
        Compute features chunk by chunk without losing context at chunk borders.

        The tail of the previously seen raw rows is prepended to each chunk
        before the features are computed and removed again afterwards, so only
        the very first rows of the stream have warm-up NaNs.

        Args:
            raw_chunks: Consecutive raw OHLCV DataFrames in chronological order

        Yields:
            One feature DataFrame per input chunk, with the same rows as the input
        """
        lookback = self._feature_lookback()
        carry: Optional[pd.DataFrame] = None
        for raw in raw_chunks:
            combined = raw if carry is None else pd.concat([carry, raw], axis=0)
            context_rows = len(combined) - len(raw)
            featured = self._compute_basic_features(combined).iloc[context_rows:]
            # Copy so the carried tail does not keep the whole chunk alive.
            carry = combined.iloc[-lookback:].copy()
            yield featured

    def stream_training_data(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        batch_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """
        Stream training data in chunks for memory-efficient processing.

        This method is ideal for processing very large datasets that
        don't fit in memory. Each chunk is loaded, processed, and
        can be discarded before loading the next. A short tail of the
        previous chunk is kept so features at the start of a chunk are
        computed from real history instead of being NaN.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 5 years ago)
            end_date: End date (defaults to today)
            batch_size: Records per chunk (defaults to config)

        Yields:
            DataFrames for each chunk with computed features
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 5)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if batch_size is None:
            batch_size = self.config.stream_chunk_size

        logger.info(f"Streaming data for {symbol} ({timeframe})")
        logger.info(f" Date range: {start_date} to {end_date}")
        logger.info(f" Chunk size: {batch_size:,}")

        chunk_count = 0
        total_records = 0
        raw_chunks = self._load_batches(
            symbol, start_date, end_date, timeframe, batch_size
        )
        for batch_df in self._iter_feature_chunks(raw_chunks):
            chunk_count += 1
            total_records += len(batch_df)
            logger.debug(f" Yielding chunk {chunk_count}: {len(batch_df)} records")
            yield batch_df

        logger.info(f" Streamed {chunk_count} chunks, {total_records:,} total records")

    @staticmethod
    def _build_target(
        close: pd.Series,
        target_horizon: int,
        target_type: str
    ) -> pd.Series:
        """
        Build the prediction target from a close-price series.

        Args:
            close: Close prices in chronological order
            target_horizon: Number of bars ahead for target
            target_type: 'return' for % returns, 'direction' for up/down

        Returns:
            Float Series aligned with ``close``. The last ``target_horizon``
            rows are NaN for both target types because their future is unknown.

        Raises:
            ValueError: If target_type is not 'return' or 'direction'
        """
        future_return = close.pct_change(target_horizon).shift(-target_horizon)
        if target_type == 'return':
            return future_return
        if target_type == 'direction':
            # Comparing NaN with 0 yields False, which would label the unknown
            # tail as "down"; keep those rows NaN so they get filtered out.
            direction = (future_return > 0).astype(float)
            return direction.where(future_return.notna())
        raise ValueError(f"Unknown target_type: {target_type}")

    def get_features_and_targets(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        target_horizon: int = 12,
        target_type: str = 'return'
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Get feature matrix and target vector for model training.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 3 years ago)
            end_date: End date (defaults to today)
            target_horizon: Number of bars ahead for target
            target_type: 'return' for % returns, 'direction' for up/down

        Returns:
            Tuple of (X features DataFrame, y target Series). Rows with a NaN
            feature or an unknown target are dropped. Both are empty when no
            data was loaded.

        Raises:
            ValueError: If target_type is unknown (only checked once data was loaded)
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')

        logger.info(f"Preparing features and targets for {symbol} ({timeframe})")
        logger.info(f" Target: {target_type} at horizon {target_horizon}")

        # Load data with features
        df = self.get_training_data(
            symbol=symbol,
            start_date=start_date,
            end_date=end_date,
            timeframe=timeframe,
            include_features=True
        )
        if df.empty:
            # An explicit dtype avoids the empty-Series default-dtype warning.
            return pd.DataFrame(), pd.Series(dtype=float)

        target = self._build_target(df['close'], target_horizon, target_type)

        # Select feature columns (exclude OHLCV and target-related)
        feature_cols = [
            col for col in df.columns
            if col not in self._BASE_COLUMNS and not col.startswith('target_')
        ]
        X = df[feature_cols]

        # Remove warm-up rows (NaN features) and end-of-series rows (NaN target)
        valid_mask = target.notna() & X.notna().all(axis=1)
        X = X[valid_mask]
        y = target[valid_mask]
        if target_type == 'direction':
            # Safe now that the NaN rows are gone; restores the 0/1 integer labels.
            y = y.astype(int)

        logger.info(f" Features shape: {X.shape}")
        logger.info(f" Feature columns: {list(X.columns)}")
        return X, y

    def _compute_basic_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute basic derived features for the DataFrame.

        Features computed:
        - Returns at multiple horizons
        - Volatility (rolling std)
        - Range and body percentages
        - Volume ratios
        - Momentum and moving-average distances

        Args:
            df: DataFrame with OHLCV columns

        Returns:
            Copy of the DataFrame with additional feature columns
        """
        df = df.copy()
        close = df['close']

        # Price returns at different horizons
        for horizon in self.config.return_horizons:
            df[f'return_{horizon}'] = close.pct_change(horizon)

        # Log returns (more suitable for ML)
        df['log_return'] = np.log(close / close.shift(1))

        # Volatility (rolling standard deviation of returns)
        window = self.config.volatility_window
        df['volatility'] = df['log_return'].rolling(window=window).std()
        # NOTE(review): the scaling factor is fixed regardless of timeframe, and
        # 252 * 12 does not obviously equal the number of 5m bars per year —
        # confirm. Left unchanged so existing feature values stay the same.
        df['volatility_pct'] = df['volatility'] * np.sqrt(252 * 12)  # Annualized for 5m

        # High-Low range as percentage
        df['range_pct'] = (df['high'] - df['low']) / close

        # Candle range with a small epsilon so flat candles do not divide by zero
        candle_range = df['high'] - df['low'] + 1e-10

        # Body as percentage of range
        df['body_pct'] = np.abs(close - df['open']) / candle_range

        # Upper and lower wicks
        body_high = df[['open', 'close']].max(axis=1)
        body_low = df[['open', 'close']].min(axis=1)
        df['upper_wick_pct'] = (df['high'] - body_high) / candle_range
        df['lower_wick_pct'] = (body_low - df['low']) / candle_range

        # Bullish/Bearish candle
        df['is_bullish'] = (close > df['open']).astype(int)

        # Volume features (neutral constants when there is no volume data)
        if 'volume' in df.columns and df['volume'].sum() > 0:
            df['volume_sma'] = df['volume'].rolling(window=window).mean()
            df['volume_ratio'] = df['volume'] / (df['volume_sma'] + 1e-10)
            df['volume_std'] = df['volume'].rolling(window=window).std()
        else:
            df['volume_sma'] = 0
            df['volume_ratio'] = 1
            df['volume_std'] = 0

        # Price momentum indicators
        df['momentum_5'] = close - close.shift(5)
        df['momentum_12'] = close - close.shift(12)

        # Moving average distances
        df['sma_10'] = close.rolling(window=10).mean()
        df['sma_20'] = close.rolling(window=20).mean()
        df['dist_sma_10'] = (close - df['sma_10']) / close
        df['dist_sma_20'] = (close - df['sma_20']) / close

        return df

    def get_multi_symbol_data(
        self,
        symbols: List[str],
        start_date: str,
        end_date: str,
        timeframe: str = '5m'
    ) -> Dict[str, pd.DataFrame]:
        """
        Load training data for multiple symbols.

        Args:
            symbols: List of trading symbols
            start_date: Start date
            end_date: End date
            timeframe: Data timeframe

        Returns:
            Dictionary mapping symbol to DataFrame; symbols without data are omitted
        """
        logger.info(f"Loading data for {len(symbols)} symbols")
        data_dict = {}
        for symbol in symbols:
            df = self.get_training_data(
                symbol=symbol,
                start_date=start_date,
                end_date=end_date,
                timeframe=timeframe,
                include_features=True
            )
            if df.empty:
                logger.warning(f" {symbol}: No data found")
                continue
            data_dict[symbol] = df
            logger.info(f" {symbol}: {len(df):,} records")
        return data_dict

    def get_data_summary(
        self,
        symbol: str,
        timeframe: str = '5m'
    ) -> Dict[str, Any]:
        """
        Get summary statistics for available data.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe

        Returns:
            Dictionary with summary statistics, or a dictionary with an
            'error' key when no rows exist for the symbol
        """
        # Same timeframe and prefix handling as _load_batches.
        table = self._resolve_table(timeframe)
        clean_symbol = self._strip_symbol_prefix(symbol)

        # The table name comes from the fixed whitelist; the symbol is a bound parameter.
        query = f"""
            SELECT
                COUNT(*) as total_records,
                MIN(o.timestamp) as first_date,
                MAX(o.timestamp) as last_date,
                AVG(o.close) as avg_price,
                STDDEV(o.close) as std_price,
                AVG(o.volume) as avg_volume
            FROM market_data.{table} o
            JOIN market_data.tickers t ON t.id = o.ticker_id
            WHERE UPPER(t.symbol) = UPPER(:symbol)
        """
        result = pd.read_sql(text(query), self.db.engine, params={'symbol': clean_symbol})
        if result.empty or result['total_records'].iloc[0] == 0:
            return {'symbol': symbol, 'timeframe': timeframe, 'error': 'No data found'}

        row = result.iloc[0]

        def as_float(value: Any) -> float:
            # A plain truthiness test lets NaN through (NaN is truthy), so check
            # explicitly; missing aggregates are reported as 0.
            return float(value) if pd.notna(value) else 0

        return {
            'symbol': symbol,
            'timeframe': timeframe,
            'total_records': int(row['total_records']),
            'first_date': str(row['first_date']),
            'last_date': str(row['last_date']),
            'avg_price': as_float(row['avg_price']),
            'std_price': as_float(row['std_price']),
            'avg_volume': as_float(row['avg_volume']),
        }

    def clear_cache(self):
        """Clear the internal data cache."""
        self._cache.clear()
        logger.info("Data cache cleared")
def load_training_data(
    symbol: str,
    start_date: str,
    end_date: str,
    timeframe: str = '5m'
) -> pd.DataFrame:
    """
    One-call shortcut: build a default TrainingDataLoader and load data with it.

    Args:
        symbol: Trading symbol
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        timeframe: Data timeframe

    Returns:
        Whatever TrainingDataLoader.get_training_data returns for these
        arguments with its remaining parameters left at their defaults
    """
    return TrainingDataLoader().get_training_data(
        symbol=symbol,
        start_date=start_date,
        end_date=end_date,
        timeframe=timeframe,
    )
if __name__ == "__main__":
    # Manual smoke test: exercises each public loader entry point for XAUUSD
    # against whatever database TrainingDataLoader() connects to.
    print("Testing TrainingDataLoader...")
    loader = TrainingDataLoader()

    # Summary statistics
    print("\nData summary for XAUUSD:")
    summary = loader.get_data_summary('XAUUSD', '5m')
    for key in summary:
        print(f" {key}: {summary[key]}")

    # Batched loading over a full year
    print("\nTesting batch loading:")
    batch_args = {
        'symbol': 'XAUUSD',
        'start_date': '2024-01-01',
        'end_date': '2024-12-31',
        'timeframe': '5m',
        'batch_size': 50000,
    }
    df = loader.get_training_data(**batch_args)
    print(f" Loaded {len(df):,} records")
    print(f" Columns: {list(df.columns)}")

    # Feature matrix and target vector
    print("\nTesting features and targets:")
    target_args = {
        'symbol': 'XAUUSD',
        'timeframe': '5m',
        'start_date': '2024-01-01',
        'end_date': '2024-06-30',
        'target_horizon': 12,
    }
    X, y = loader.get_features_and_targets(**target_args)
    print(f" X shape: {X.shape}")
    print(f" y shape: {y.shape}")
    print(f" Features: {list(X.columns)}")

    # Streaming, cut short after three chunks
    print("\nTesting streaming:")
    chunk_count = 0
    total_records = 0
    stream = loader.stream_training_data(
        symbol='XAUUSD',
        timeframe='5m',
        start_date='2024-01-01',
        end_date='2024-03-31',
        batch_size=10000
    )
    for chunk_count, chunk in enumerate(stream, start=1):
        total_records += len(chunk)
        if chunk_count >= 3:
            print(f" (stopped after {chunk_count} chunks for test)")
            break
    print(f" Chunks: {chunk_count}, Records: {total_records:,}")

    print("\nTest complete!")