# Commit notes:
# - Fix database.py: Add DatabaseConnection alias for backward compat
# - Fix train_symbol_timeframe_models.py: Use PostgreSQLConnection + native queries
# - Fix run_oos_backtest.py: Fix broken import + add dynamic OOS support
# - Update data_splitter.py: split_dynamic_oos() method (from previous session)
# - Update validation_oos.yaml: Dynamic OOS config + all 6 symbols enabled
# - Create ingest_ohlcv_polygon.py: Standalone Polygon→PostgreSQL ingestion script
# - Fix .gitignore: /data/ instead of data/ to not ignore src/data/
# - Add untracked src/ modules: backtesting, data, llm, models (attention/metamodel/strategies)
# - Add aiohttp, sqlalchemy, psycopg2-binary to requirements.txt
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#
# File info: 612 lines, 19 KiB, Python
"""
Training Data Loader for ML Engine
===================================
Provides efficient data loading from PostgreSQL for ML training.

This module implements:
- Batch loading for large datasets
- Streaming support for memory-efficient processing
- Filtering by symbol, timeframe, and date range
- Feature and target extraction for model training

Author: ML Pipeline (NEXUS v4.0)
Created: 2026-01-25
"""

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, ClassVar, Dict, Iterator, List, Optional, Tuple

import numpy as np
import pandas as pd
from loguru import logger
from sqlalchemy import text

from .database import PostgreSQLConnection, DatabaseManager


@dataclass
class TrainingDataConfig:
    """Configuration for training data loading.

    Field order is part of the public constructor signature (positional
    arguments), so new fields must only ever be appended.
    """

    # Database connection
    host: str = 'localhost'
    port: int = 5432
    database: str = 'trading_platform'
    user: str = 'trading_user'
    # Development default only. Excluded from the generated __repr__ so that
    # logging or printing a config object never writes the credential out.
    # NOTE(review): prefer supplying the real password from the environment
    # or a secrets store instead of relying on this source-level default.
    password: str = field(default='trading_dev_2026', repr=False)

    # Default query parameters
    default_batch_size: int = 50000
    default_timeframe: str = '5m'

    # Feature generation
    # Horizons (in bars) for the return_<h> feature columns; default_factory
    # gives every instance its own list.
    return_horizons: List[int] = field(default_factory=lambda: [1, 5, 12, 24])
    # Rolling window (in bars) used for volatility and volume statistics.
    volatility_window: int = 20

    # Streaming configuration
    stream_chunk_size: int = 10000
    prefetch_chunks: int = 2


class TrainingDataLoader:
    """
    Efficient data loader for ML model training.

    Provides batch loading, streaming, and feature extraction
    from the PostgreSQL trading_platform database.

    Usage:
        loader = TrainingDataLoader()

        # Simple batch loading
        df = loader.get_training_data('XAUUSD', '2023-01-01', '2024-12-31')

        # Streaming for large datasets
        for batch in loader.stream_training_data('XAUUSD', batch_size=50000):
            process_batch(batch)

        # Get features and targets
        X, y = loader.get_features_and_targets('XAUUSD', '5m')
    """

    # Whitelist of timeframe code -> OHLCV table in the market_data schema.
    # Table names are interpolated into SQL, so they must only come from here.
    _TABLE_MAP: ClassVar[Dict[str, str]] = {
        '5m': 'ohlcv_5m',
        '15m': 'ohlcv_15m',
        '1h': 'ohlcv_1h',
        '4h': 'ohlcv_4h',
        'd': 'ohlcv_daily',
        '1d': 'ohlcv_daily',
    }
    _DEFAULT_TABLE: ClassVar[str] = 'ohlcv_5m'

    # Two-character prefixes removed from a symbol before the ticker lookup.
    _SYMBOL_PREFIXES: ClassVar[Tuple[str, ...]] = ('C:', 'X:', 'I:')

    # Raw columns that are never used as model features.
    _RAW_COLUMNS: ClassVar[Tuple[str, ...]] = (
        'open', 'high', 'low', 'close', 'volume', 'vwap',
    )

    # Fixed lags/windows (in bars) of the momentum_<n> and sma_<n> features.
    _MOMENTUM_LAGS: ClassVar[Tuple[int, ...]] = (5, 12)
    _SMA_WINDOWS: ClassVar[Tuple[int, ...]] = (10, 20)

    _TARGET_TYPES: ClassVar[Tuple[str, ...]] = ('return', 'direction')

    def __init__(
        self,
        config: Optional["TrainingDataConfig"] = None,
        db_connection: Optional["PostgreSQLConnection"] = None
    ):
        """
        Initialize the training data loader.

        Args:
            config: Configuration object for data loading
            db_connection: Existing database connection (creates new if None)
        """
        self.config = config or TrainingDataConfig()
        # NOTE(review): the fallback connection is built without arguments, so
        # the host/port/user/password fields of `config` are not forwarded
        # here. Confirm PostgreSQLConnection picks up its settings elsewhere.
        self.db = db_connection or PostgreSQLConnection()
        # Nothing in this class writes to the cache; it only backs clear_cache().
        self._cache: Dict[str, pd.DataFrame] = {}

        logger.info("TrainingDataLoader initialized")
        logger.info(f" Database: {self.config.database}")
        logger.info(f" Default batch size: {self.config.default_batch_size}")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @classmethod
    def _resolve_table(cls, timeframe: str) -> str:
        """Return the OHLCV table for a timeframe (case-insensitive).

        Unknown timeframes keep the historical fallback to the 5-minute
        table, but the fallback is logged instead of happening silently.
        """
        table = cls._TABLE_MAP.get(timeframe.lower())
        if table is None:
            logger.warning(
                f"Unknown timeframe '{timeframe}', falling back to {cls._DEFAULT_TABLE}"
            )
            return cls._DEFAULT_TABLE
        return table

    @classmethod
    def _clean_symbol(cls, symbol: str) -> str:
        """Strip a leading 'C:', 'X:' or 'I:' prefix from a symbol."""
        if symbol.startswith(cls._SYMBOL_PREFIXES):
            return symbol[2:]
        return symbol

    @staticmethod
    def _to_float(value: Any) -> float:
        """Convert a SQL aggregate to float, mapping NULL/NaN to 0.0."""
        return float(value) if pd.notna(value) else 0.0

    def _feature_lookback(self) -> int:
        """Return how many prior bars the derived features need to be defined."""
        longest_horizon = max(self.config.return_horizons, default=0)
        return max(
            longest_horizon,
            self.config.volatility_window,
            max(self._SMA_WINDOWS),
            max(self._MOMENTUM_LAGS),
        )

    def _featurize_chunk(
        self,
        batch_df: pd.DataFrame,
        context: Optional[pd.DataFrame]
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Compute features for one streamed chunk using warm-up rows.

        Args:
            batch_df: Raw rows of the current chunk
            context: Raw trailing rows of the previous chunk(s), or None

        Returns:
            Tuple of (featured rows of this chunk only, raw context to pass
            to the next call)
        """
        if context is None or context.empty:
            combined = batch_df
            warmup_rows = 0
        else:
            combined = pd.concat([context, batch_df], axis=0)
            warmup_rows = len(context)

        featured = self._compute_basic_features(combined)
        if warmup_rows:
            # Drop the warm-up rows: they were already yielded previously.
            featured = featured.iloc[warmup_rows:].copy()

        # Copy so the small tail does not keep the whole chunk alive.
        next_context = combined.iloc[-self._feature_lookback():].copy()
        return featured, next_context

    @staticmethod
    def _build_target(
        close: pd.Series,
        target_horizon: int,
        target_type: str
    ) -> pd.Series:
        """Build the forward-looking target aligned to the current bar.

        Rows whose future price is unknown (the last `target_horizon` bars)
        are NaN for both target types so callers can drop them.

        Args:
            close: Close price series
            target_horizon: Number of bars ahead for target
            target_type: 'return' for % returns, 'direction' for up/down

        Returns:
            Float Series; 'direction' holds 1.0 (up) / 0.0 (not up) / NaN

        Raises:
            ValueError: If target_type is not 'return' or 'direction'
        """
        future_return = close.pct_change(target_horizon).shift(-target_horizon)
        if target_type == 'return':
            return future_return
        if target_type == 'direction':
            # A bare (NaN > 0) is False, which would label unknown futures as
            # "down"; keep them NaN instead.
            return (future_return > 0).astype(float).where(future_return.notna())
        raise ValueError(f"Unknown target_type: {target_type}")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_training_data(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str = '5m',
        batch_size: Optional[int] = None,
        include_features: bool = True
    ) -> pd.DataFrame:
        """
        Load training data for a symbol within a date range.

        Args:
            symbol: Trading symbol (e.g., 'XAUUSD', 'EURUSD')
            start_date: Start date (YYYY-MM-DD format)
            end_date: End date (YYYY-MM-DD format)
            timeframe: Data timeframe ('5m', '15m', '1h', '4h', 'd')
            batch_size: Number of records per batch (None for all at once)
            include_features: Whether to compute derived features

        Returns:
            DataFrame with OHLCV data and optional features, indexed by timestamp

        Raises:
            ValueError: If batch_size is given and is not positive
        """
        logger.info(f"Loading training data for {symbol} ({timeframe})")
        logger.info(f" Date range: {start_date} to {end_date}")

        if batch_size is None:
            # Load all data at once
            df = self.db.get_ticker_data(
                symbol=symbol,
                timeframe=timeframe,
                start_date=start_date,
                end_date=end_date,
                limit=5000000  # High limit for all data
            )
        else:
            # Load in batches and concatenate
            batches = list(self._load_batches(
                symbol, start_date, end_date, timeframe, batch_size
            ))

            if not batches:
                logger.warning(f"No data found for {symbol}")
                return pd.DataFrame()

            df = pd.concat(batches, axis=0).sort_index()
            df = df[~df.index.duplicated(keep='first')]

        if df.empty:
            logger.warning(f"No data loaded for {symbol}")
            return df

        logger.info(f" Loaded {len(df):,} records")

        if include_features:
            # Count against the columns actually loaded rather than assuming
            # a fixed number of raw columns.
            raw_column_count = len(df.columns)
            df = self._compute_basic_features(df)
            logger.info(f" Added {len(df.columns) - raw_column_count} derived features")

        return df

    def _load_batches(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str,
        batch_size: int
    ) -> Iterator[pd.DataFrame]:
        """
        Load data in batches using offset pagination.

        Args:
            symbol: Trading symbol
            start_date: Start date
            end_date: End date
            timeframe: Timeframe
            batch_size: Records per batch

        Yields:
            DataFrames for each batch, indexed by timestamp

        Raises:
            ValueError: If batch_size is not positive (raised on first iteration)
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        table = self._resolve_table(timeframe)
        clean_symbol = self._clean_symbol(symbol)

        # The statement is loop-invariant: only :offset changes per page.
        # `table` comes from the internal whitelist, never from caller text.
        statement = text(f"""
            SELECT
                o.timestamp,
                o.open,
                o.high,
                o.low,
                o.close,
                o.volume,
                o.vwap
            FROM market_data.{table} o
            JOIN market_data.tickers t ON t.id = o.ticker_id
            WHERE UPPER(t.symbol) = UPPER(:symbol)
              AND o.timestamp >= :start_date
              AND o.timestamp <= :end_date
            ORDER BY o.timestamp ASC
            LIMIT :limit OFFSET :offset
        """)

        offset = 0
        total_loaded = 0

        while True:
            params = {
                'symbol': clean_symbol,
                'start_date': start_date,
                'end_date': end_date,
                'limit': batch_size,
                'offset': offset
            }

            batch_df = pd.read_sql(statement, self.db.engine, params=params)

            if batch_df.empty:
                break

            batch_df['timestamp'] = pd.to_datetime(batch_df['timestamp'])
            batch_df.set_index('timestamp', inplace=True)

            total_loaded += len(batch_df)
            logger.debug(f" Loaded batch: {len(batch_df)} records (total: {total_loaded:,})")

            yield batch_df

            # A short page means the result set is exhausted.
            if len(batch_df) < batch_size:
                break

            offset += batch_size

    def stream_training_data(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        batch_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """
        Stream training data in chunks for memory-efficient processing.

        This method is ideal for processing very large datasets that
        don't fit in memory. Each chunk is loaded, processed, and
        can be discarded before loading the next.

        A short tail of raw rows from the previous chunk is used as warm-up
        context, so rolling and lagged features at the start of a chunk are
        computed from real history instead of being NaN. Only the very first
        rows of the stream lack that history.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 5 years ago)
            end_date: End date (defaults to today)
            batch_size: Records per chunk (defaults to config)

        Yields:
            DataFrames for each chunk with computed features
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 5)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if batch_size is None:
            batch_size = self.config.stream_chunk_size

        logger.info(f"Streaming data for {symbol} ({timeframe})")
        logger.info(f" Date range: {start_date} to {end_date}")
        logger.info(f" Chunk size: {batch_size:,}")

        chunk_count = 0
        total_records = 0
        context: Optional[pd.DataFrame] = None

        for batch_df in self._load_batches(
            symbol, start_date, end_date, timeframe, batch_size
        ):
            chunk_count += 1
            total_records += len(batch_df)

            # Compute features for this chunk, seeded with the previous tail
            featured, context = self._featurize_chunk(batch_df, context)

            logger.debug(f" Yielding chunk {chunk_count}: {len(featured)} records")
            yield featured

        logger.info(f" Streamed {chunk_count} chunks, {total_records:,} total records")

    def get_features_and_targets(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        target_horizon: int = 12,
        target_type: str = 'return'
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Get feature matrix and target vector for model training.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 3 years ago)
            end_date: End date (defaults to today)
            target_horizon: Number of bars ahead for target
            target_type: 'return' for % returns, 'direction' for up/down

        Returns:
            Tuple of (X features DataFrame, y target Series). Rows with any
            NaN feature or an unknown future price are removed from both.

        Raises:
            ValueError: If target_type is not 'return' or 'direction'
        """
        # Fail fast before touching the database.
        if target_type not in self._TARGET_TYPES:
            raise ValueError(f"Unknown target_type: {target_type}")

        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')

        logger.info(f"Preparing features and targets for {symbol} ({timeframe})")
        logger.info(f" Target: {target_type} at horizon {target_horizon}")

        # Load data with features
        df = self.get_training_data(
            symbol=symbol,
            start_date=start_date,
            end_date=end_date,
            timeframe=timeframe,
            include_features=True
        )

        if df.empty:
            return pd.DataFrame(), pd.Series(dtype=float)

        # Compute target (NaN where the future is unknown)
        target = self._build_target(df['close'], target_horizon, target_type)

        # Select feature columns (exclude OHLCV and target-related)
        feature_cols = [
            col for col in df.columns
            if col not in self._RAW_COLUMNS and not col.startswith('target_')
        ]
        X = df[feature_cols]

        # Remove rows with NaN targets (end of series) or NaN features
        # (warm-up period at the start of the series)
        valid_mask = target.notna() & X.notna().all(axis=1)
        X = X[valid_mask]
        y = target[valid_mask]
        if target_type == 'direction':
            # Safe now that the NaN rows are gone.
            y = y.astype(int)

        logger.info(f" Features shape: {X.shape}")
        logger.info(f" Feature columns: {list(X.columns)}")

        return X, y

    def _compute_basic_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute basic derived features for the DataFrame.

        Features computed:
        - Returns at multiple horizons
        - Volatility (rolling std)
        - Range and body percentages
        - Volume ratios
        - Momentum and moving-average distances

        The input is not modified; column order of the result is stable.

        Args:
            df: DataFrame with OHLCV columns

        Returns:
            DataFrame with additional feature columns
        """
        df = df.copy()
        close = df['close']

        # Price returns at different horizons
        for horizon in self.config.return_horizons:
            df[f'return_{horizon}'] = close.pct_change(horizon)

        # Log returns (more suitable for ML)
        df['log_return'] = np.log(close / close.shift(1))

        # Volatility (rolling standard deviation of returns)
        window = self.config.volatility_window
        df['volatility'] = df['log_return'].rolling(window=window).std()
        # NOTE(review): the factor 252 * 12 is applied for every timeframe;
        # confirm it matches the intended bars-per-year before relying on the
        # absolute scale of this column.
        df['volatility_pct'] = df['volatility'] * np.sqrt(252 * 12)  # Annualized for 5m

        # High-Low range as percentage
        df['range_pct'] = (df['high'] - df['low']) / close

        # Epsilon keeps zero-range candles from dividing by zero.
        candle_range = df['high'] - df['low'] + 1e-10

        # Body as percentage of range
        df['body_pct'] = np.abs(close - df['open']) / candle_range

        # Upper and lower wicks
        body_high = df[['open', 'close']].max(axis=1)
        body_low = df[['open', 'close']].min(axis=1)
        df['upper_wick_pct'] = (df['high'] - body_high) / candle_range
        df['lower_wick_pct'] = (body_low - df['low']) / candle_range

        # Bullish/Bearish candle
        df['is_bullish'] = (close > df['open']).astype(int)

        # Volume features (neutral constants when no volume is available)
        if 'volume' in df.columns and df['volume'].sum() > 0:
            df['volume_sma'] = df['volume'].rolling(window=window).mean()
            df['volume_ratio'] = df['volume'] / (df['volume_sma'] + 1e-10)
            df['volume_std'] = df['volume'].rolling(window=window).std()
        else:
            df['volume_sma'] = 0
            df['volume_ratio'] = 1
            df['volume_std'] = 0

        # Price momentum indicators
        for lag in self._MOMENTUM_LAGS:
            df[f'momentum_{lag}'] = close - close.shift(lag)

        # Moving averages first, then distances, to keep the column order
        for sma_window in self._SMA_WINDOWS:
            df[f'sma_{sma_window}'] = close.rolling(window=sma_window).mean()
        for sma_window in self._SMA_WINDOWS:
            df[f'dist_sma_{sma_window}'] = (close - df[f'sma_{sma_window}']) / close

        return df

    def get_multi_symbol_data(
        self,
        symbols: List[str],
        start_date: str,
        end_date: str,
        timeframe: str = '5m'
    ) -> Dict[str, pd.DataFrame]:
        """
        Load training data for multiple symbols.

        Args:
            symbols: List of trading symbols
            start_date: Start date
            end_date: End date
            timeframe: Data timeframe

        Returns:
            Dictionary mapping symbol to DataFrame; symbols without data
            are omitted
        """
        logger.info(f"Loading data for {len(symbols)} symbols")

        data_dict = {}
        for symbol in symbols:
            df = self.get_training_data(
                symbol=symbol,
                start_date=start_date,
                end_date=end_date,
                timeframe=timeframe,
                include_features=True
            )
            if not df.empty:
                data_dict[symbol] = df
                logger.info(f" {symbol}: {len(df):,} records")
            else:
                logger.warning(f" {symbol}: No data found")

        return data_dict

    def get_data_summary(
        self,
        symbol: str,
        timeframe: str = '5m'
    ) -> Dict[str, Any]:
        """
        Get summary statistics for available data.

        Uses the same timeframe/table mapping and symbol-prefix handling as
        the batch loader, so a summary always describes the data that
        loading the same symbol/timeframe would return.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe

        Returns:
            Dictionary with summary statistics, or a dictionary with an
            'error' key when no rows exist
        """
        table = self._resolve_table(timeframe)
        clean_symbol = self._clean_symbol(symbol)

        # `table` comes from the internal whitelist, never from caller text.
        query = f"""
            SELECT
                COUNT(*) as total_records,
                MIN(o.timestamp) as first_date,
                MAX(o.timestamp) as last_date,
                AVG(o.close) as avg_price,
                STDDEV(o.close) as std_price,
                AVG(o.volume) as avg_volume
            FROM market_data.{table} o
            JOIN market_data.tickers t ON t.id = o.ticker_id
            WHERE UPPER(t.symbol) = UPPER(:symbol)
        """

        result = pd.read_sql(text(query), self.db.engine, params={'symbol': clean_symbol})

        if result.empty or result['total_records'].iloc[0] == 0:
            return {'symbol': symbol, 'timeframe': timeframe, 'error': 'No data found'}

        row = result.iloc[0]
        return {
            'symbol': symbol,
            'timeframe': timeframe,
            'total_records': int(row['total_records']),
            'first_date': str(row['first_date']),
            'last_date': str(row['last_date']),
            # Aggregates can come back as NULL/NaN; report those as 0.0.
            'avg_price': self._to_float(row['avg_price']),
            'std_price': self._to_float(row['std_price']),
            'avg_volume': self._to_float(row['avg_volume']),
        }

    def clear_cache(self):
        """Clear the internal data cache."""
        self._cache.clear()
        logger.info("Data cache cleared")


def load_training_data(
    symbol: str,
    start_date: str,
    end_date: str,
    timeframe: str = '5m'
) -> pd.DataFrame:
    """
    One-shot helper: build a default loader and fetch a symbol's data.

    A fresh TrainingDataLoader (default config, default connection) is
    created on every call.

    Args:
        symbol: Trading symbol
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        timeframe: Data timeframe

    Returns:
        DataFrame with OHLCV data and features
    """
    return TrainingDataLoader().get_training_data(
        symbol, start_date, end_date, timeframe
    )


if __name__ == "__main__":
    # Manual smoke test: exercises summary, batch loading, feature/target
    # extraction and streaming against the configured database.
    print("Testing TrainingDataLoader...")

    loader = TrainingDataLoader()

    # 1) Summary statistics
    print("\nData summary for XAUUSD:")
    for stat_name, stat_value in loader.get_data_summary('XAUUSD', '5m').items():
        print(f" {stat_name}: {stat_value}")

    # 2) Batched load of a full year
    print("\nTesting batch loading:")
    year_df = loader.get_training_data(
        symbol='XAUUSD',
        start_date='2024-01-01',
        end_date='2024-12-31',
        timeframe='5m',
        batch_size=50000
    )
    print(f" Loaded {len(year_df):,} records")
    print(f" Columns: {list(year_df.columns)}")

    # 3) Feature matrix and target vector
    print("\nTesting features and targets:")
    features, targets = loader.get_features_and_targets(
        symbol='XAUUSD',
        timeframe='5m',
        start_date='2024-01-01',
        end_date='2024-06-30',
        target_horizon=12
    )
    print(f" X shape: {features.shape}")
    print(f" y shape: {targets.shape}")
    print(f" Features: {list(features.columns)}")

    # 4) Streaming, capped at three chunks to keep the test short
    print("\nTesting streaming:")
    chunks_seen = 0
    rows_seen = 0
    stream = loader.stream_training_data(
        symbol='XAUUSD',
        timeframe='5m',
        start_date='2024-01-01',
        end_date='2024-03-31',
        batch_size=10000
    )
    for chunk in stream:
        chunks_seen += 1
        rows_seen += len(chunk)
        if chunks_seen < 3:
            continue
        print(f" (stopped after {chunks_seen} chunks for test)")
        break

    print(f" Chunks: {chunks_seen}, Records: {rows_seen:,}")

    print("\nTest complete!")