trading-platform-ml-engine-v2/src/data/training_loader.py

"""
Training Data Loader for ML Engine
===================================
Provides efficient data loading from PostgreSQL for ML training.

This module implements:
- Batch loading for large datasets
- Streaming support for memory-efficient processing
- Filtering by symbol, timeframe, and date range
- Feature and target extraction for model training

Author: ML Pipeline (NEXUS v4.0)
Created: 2026-01-25
"""

from typing import Optional, Dict, Any, List, Iterator, Tuple
from datetime import datetime, timedelta
from dataclasses import dataclass, field

import pandas as pd
import numpy as np
from sqlalchemy import text
from loguru import logger

from .database import PostgreSQLConnection, DatabaseManager


@dataclass
class TrainingDataConfig:
    """Settings bundle for training data loading.

    Every field carries a default, so ``TrainingDataConfig()`` is a
    complete configuration. Fields are grouped into connection settings,
    query defaults, feature-generation parameters and streaming sizes.
    """

    # --- Database connection -------------------------------------------
    host: str = "localhost"
    port: int = 5432
    database: str = "trading_platform"
    user: str = "trading_user"
    # NOTE(review): a development credential is committed as the default;
    # production deployments should pass the real value in explicitly.
    password: str = "trading_dev_2026"

    # --- Default query parameters --------------------------------------
    default_batch_size: int = 50_000
    default_timeframe: str = "5m"

    # --- Feature generation --------------------------------------------
    # Look-back distances (in bars) for the ``return_<n>`` columns; a fresh
    # list is created per instance so configs never share mutable state.
    return_horizons: List[int] = field(
        default_factory=lambda: [1, 5, 12, 24],
    )
    # Rolling window length (in bars) for volatility and volume statistics.
    volatility_window: int = 20

    # --- Streaming configuration ---------------------------------------
    # Rows per chunk when a streaming caller does not pass a batch size.
    stream_chunk_size: int = 10_000
    prefetch_chunks: int = 2


class TrainingDataLoader:
    """
    Efficient data loader for ML model training.

    Provides batch loading, streaming, and feature extraction
    from the PostgreSQL trading_platform database.

    Usage:
        loader = TrainingDataLoader()

        # Simple batch loading
        df = loader.get_training_data('XAUUSD', '2023-01-01', '2024-12-31')

        # Streaming for large datasets
        for batch in loader.stream_training_data('XAUUSD', batch_size=50000):
            process_batch(batch)

        # Get features and targets
        X, y = loader.get_features_and_targets('XAUUSD', '5m')
    """

    def __init__(
        self,
        config: Optional[TrainingDataConfig] = None,
        db_connection: Optional[PostgreSQLConnection] = None
    ):
        """
        Initialize the training data loader.

        Args:
            config: Configuration object for data loading
            db_connection: Existing database connection (creates new if None)
        """
        self.config = config or TrainingDataConfig()
        self.db = db_connection or PostgreSQLConnection()
        self._cache: Dict[str, pd.DataFrame] = {}

        logger.info("TrainingDataLoader initialized")
        logger.info(f"  Database: {self.config.database}")
        logger.info(f"  Default batch size: {self.config.default_batch_size}")

    def get_training_data(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str = '5m',
        batch_size: Optional[int] = None,
        include_features: bool = True
    ) -> pd.DataFrame:
        """
        Load training data for a symbol within a date range.

        Args:
            symbol: Trading symbol (e.g., 'XAUUSD', 'EURUSD')
            start_date: Start date (YYYY-MM-DD format)
            end_date: End date (YYYY-MM-DD format)
            timeframe: Data timeframe ('5m', '15m', '1h', '4h', 'd')
            batch_size: Number of records per batch (None for all at once)
            include_features: Whether to compute derived features

        Returns:
            DataFrame with OHLCV data and optional features, indexed by timestamp
        """
        logger.info(f"Loading training data for {symbol} ({timeframe})")
        logger.info(f"  Date range: {start_date} to {end_date}")

        if batch_size is None:
            # Load all data at once
            df = self.db.get_ticker_data(
                symbol=symbol,
                timeframe=timeframe,
                start_date=start_date,
                end_date=end_date,
                limit=5000000  # High limit for all data
            )
        else:
            # Load in batches and concatenate
            batches = []
            for batch_df in self._load_batches(
                symbol, start_date, end_date, timeframe, batch_size
            ):
                batches.append(batch_df)

            if not batches:
                logger.warning(f"No data found for {symbol}")
                return pd.DataFrame()

            df = pd.concat(batches, axis=0)
            df = df.sort_index()
            df = df[~df.index.duplicated(keep='first')]

        if df.empty:
            logger.warning(f"No data loaded for {symbol}")
            return df

        logger.info(f"  Loaded {len(df):,} records")

        if include_features:
            df = self._compute_basic_features(df)
            logger.info(f"  Added {len(df.columns) - 6} derived features")

        return df

    def _load_batches(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        timeframe: str,
        batch_size: int
    ) -> Iterator[pd.DataFrame]:
        """
        Load data in batches using offset pagination.

        Args:
            symbol: Trading symbol
            start_date: Start date
            end_date: End date
            timeframe: Timeframe
            batch_size: Records per batch

        Yields:
            DataFrames for each batch
        """
        table_map = {
            '5m': 'ohlcv_5m',
            '15m': 'ohlcv_15m',
            '1h': 'ohlcv_1h',
            '4h': 'ohlcv_4h',
            'd': 'ohlcv_daily',
            '1d': 'ohlcv_daily',
        }
        table = table_map.get(timeframe.lower(), 'ohlcv_5m')

        clean_symbol = symbol
        if symbol.startswith('C:') or symbol.startswith('X:') or symbol.startswith('I:'):
            clean_symbol = symbol[2:]

        offset = 0
        total_loaded = 0

        while True:
            query = f"""
            SELECT
                o.timestamp,
                o.open,
                o.high,
                o.low,
                o.close,
                o.volume,
                o.vwap
            FROM market_data.{table} o
            JOIN market_data.tickers t ON t.id = o.ticker_id
            WHERE UPPER(t.symbol) = UPPER(:symbol)
              AND o.timestamp >= :start_date
              AND o.timestamp <= :end_date
            ORDER BY o.timestamp ASC
            LIMIT :limit OFFSET :offset
            """

            params = {
                'symbol': clean_symbol,
                'start_date': start_date,
                'end_date': end_date,
                'limit': batch_size,
                'offset': offset
            }

            batch_df = pd.read_sql(text(query), self.db.engine, params=params)

            if batch_df.empty:
                break

            batch_df['timestamp'] = pd.to_datetime(batch_df['timestamp'])
            batch_df.set_index('timestamp', inplace=True)

            total_loaded += len(batch_df)
            logger.debug(f"  Loaded batch: {len(batch_df)} records (total: {total_loaded:,})")

            yield batch_df

            if len(batch_df) < batch_size:
                break

            offset += batch_size

    def stream_training_data(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        batch_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """
        Stream training data in chunks for memory-efficient processing.

        This method is ideal for processing very large datasets that
        don't fit in memory. Each chunk is loaded, processed, and
        can be discarded before loading the next.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 5 years ago)
            end_date: End date (defaults to today)
            batch_size: Records per chunk (defaults to config)

        Yields:
            DataFrames for each chunk with computed features
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 5)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if batch_size is None:
            batch_size = self.config.stream_chunk_size

        logger.info(f"Streaming data for {symbol} ({timeframe})")
        logger.info(f"  Date range: {start_date} to {end_date}")
        logger.info(f"  Chunk size: {batch_size:,}")

        chunk_count = 0
        total_records = 0

        for batch_df in self._load_batches(
            symbol, start_date, end_date, timeframe, batch_size
        ):
            chunk_count += 1
            total_records += len(batch_df)

            # Compute features for this chunk
            batch_df = self._compute_basic_features(batch_df)

            logger.debug(f"  Yielding chunk {chunk_count}: {len(batch_df)} records")
            yield batch_df

        logger.info(f"  Streamed {chunk_count} chunks, {total_records:,} total records")

    def get_features_and_targets(
        self,
        symbol: str,
        timeframe: str = '5m',
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        target_horizon: int = 12,
        target_type: str = 'return'
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Get feature matrix and target vector for model training.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe
            start_date: Start date (defaults to 3 years ago)
            end_date: End date (defaults to today)
            target_horizon: Number of bars ahead for target
            target_type: 'return' for % returns, 'direction' for up/down

        Returns:
            Tuple of (X features DataFrame, y target Series)
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')

        logger.info(f"Preparing features and targets for {symbol} ({timeframe})")
        logger.info(f"  Target: {target_type} at horizon {target_horizon}")

        # Load data with features
        df = self.get_training_data(
            symbol=symbol,
            start_date=start_date,
            end_date=end_date,
            timeframe=timeframe,
            include_features=True
        )

        if df.empty:
            return pd.DataFrame(), pd.Series()

        # Compute target
        if target_type == 'return':
            target = df['close'].pct_change(target_horizon).shift(-target_horizon)
        elif target_type == 'direction':
            future_return = df['close'].pct_change(target_horizon).shift(-target_horizon)
            target = (future_return > 0).astype(int)
        else:
            raise ValueError(f"Unknown target_type: {target_type}")

        # Select feature columns (exclude OHLCV and target-related)
        feature_cols = [
            col for col in df.columns
            if col not in ['open', 'high', 'low', 'close', 'volume', 'vwap']
            and not col.startswith('target_')
        ]

        X = df[feature_cols]
        y = target

        # Remove rows with NaN targets (end of series)
        valid_mask = ~y.isna() & ~X.isna().any(axis=1)
        X = X[valid_mask]
        y = y[valid_mask]

        logger.info(f"  Features shape: {X.shape}")
        logger.info(f"  Feature columns: {list(X.columns)}")

        return X, y

    def _compute_basic_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute basic derived features for the DataFrame.

        Features computed:
        - Returns at multiple horizons
        - Volatility (rolling std)
        - Range and body percentages
        - Volume ratios

        Args:
            df: DataFrame with OHLCV columns

        Returns:
            DataFrame with additional feature columns
        """
        df = df.copy()

        # Price returns at different horizons
        for horizon in self.config.return_horizons:
            df[f'return_{horizon}'] = df['close'].pct_change(horizon)

        # Log returns (more suitable for ML)
        df['log_return'] = np.log(df['close'] / df['close'].shift(1))

        # Volatility (rolling standard deviation of returns)
        window = self.config.volatility_window
        df['volatility'] = df['log_return'].rolling(window=window).std()
        df['volatility_pct'] = df['volatility'] * np.sqrt(252 * 12)  # Annualized for 5m

        # High-Low range as percentage
        df['range_pct'] = (df['high'] - df['low']) / df['close']

        # Body as percentage of range
        df['body_pct'] = np.abs(df['close'] - df['open']) / (df['high'] - df['low'] + 1e-10)

        # Upper and lower wicks
        body_high = df[['open', 'close']].max(axis=1)
        body_low = df[['open', 'close']].min(axis=1)
        df['upper_wick_pct'] = (df['high'] - body_high) / (df['high'] - df['low'] + 1e-10)
        df['lower_wick_pct'] = (body_low - df['low']) / (df['high'] - df['low'] + 1e-10)

        # Bullish/Bearish candle
        df['is_bullish'] = (df['close'] > df['open']).astype(int)

        # Volume features
        if 'volume' in df.columns and df['volume'].sum() > 0:
            df['volume_sma'] = df['volume'].rolling(window=window).mean()
            df['volume_ratio'] = df['volume'] / (df['volume_sma'] + 1e-10)
            df['volume_std'] = df['volume'].rolling(window=window).std()
        else:
            df['volume_sma'] = 0
            df['volume_ratio'] = 1
            df['volume_std'] = 0

        # Price momentum indicators
        df['momentum_5'] = df['close'] - df['close'].shift(5)
        df['momentum_12'] = df['close'] - df['close'].shift(12)

        # Moving average distances
        df['sma_10'] = df['close'].rolling(window=10).mean()
        df['sma_20'] = df['close'].rolling(window=20).mean()
        df['dist_sma_10'] = (df['close'] - df['sma_10']) / df['close']
        df['dist_sma_20'] = (df['close'] - df['sma_20']) / df['close']

        return df

    def get_multi_symbol_data(
        self,
        symbols: List[str],
        start_date: str,
        end_date: str,
        timeframe: str = '5m'
    ) -> Dict[str, pd.DataFrame]:
        """
        Load training data for multiple symbols.

        Args:
            symbols: List of trading symbols
            start_date: Start date
            end_date: End date
            timeframe: Data timeframe

        Returns:
            Dictionary mapping symbol to DataFrame
        """
        logger.info(f"Loading data for {len(symbols)} symbols")

        data_dict = {}
        for symbol in symbols:
            df = self.get_training_data(
                symbol=symbol,
                start_date=start_date,
                end_date=end_date,
                timeframe=timeframe,
                include_features=True
            )
            if not df.empty:
                data_dict[symbol] = df
                logger.info(f"  {symbol}: {len(df):,} records")
            else:
                logger.warning(f"  {symbol}: No data found")

        return data_dict

    def get_data_summary(
        self,
        symbol: str,
        timeframe: str = '5m'
    ) -> Dict[str, Any]:
        """
        Get summary statistics for available data.

        Args:
            symbol: Trading symbol
            timeframe: Data timeframe

        Returns:
            Dictionary with summary statistics
        """
        table_map = {
            '5m': 'ohlcv_5m',
            '15m': 'ohlcv_15m',
            '1h': 'ohlcv_1h',
            '4h': 'ohlcv_4h',
            'd': 'ohlcv_daily',
        }
        table = table_map.get(timeframe.lower(), 'ohlcv_5m')

        clean_symbol = symbol
        if symbol.startswith('C:') or symbol.startswith('X:'):
            clean_symbol = symbol[2:]

        query = f"""
        SELECT
            COUNT(*) as total_records,
            MIN(o.timestamp) as first_date,
            MAX(o.timestamp) as last_date,
            AVG(o.close) as avg_price,
            STDDEV(o.close) as std_price,
            AVG(o.volume) as avg_volume
        FROM market_data.{table} o
        JOIN market_data.tickers t ON t.id = o.ticker_id
        WHERE UPPER(t.symbol) = UPPER(:symbol)
        """

        result = pd.read_sql(text(query), self.db.engine, params={'symbol': clean_symbol})

        if result.empty or result['total_records'].iloc[0] == 0:
            return {'symbol': symbol, 'timeframe': timeframe, 'error': 'No data found'}

        row = result.iloc[0]
        return {
            'symbol': symbol,
            'timeframe': timeframe,
            'total_records': int(row['total_records']),
            'first_date': str(row['first_date']),
            'last_date': str(row['last_date']),
            'avg_price': float(row['avg_price']) if row['avg_price'] else 0,
            'std_price': float(row['std_price']) if row['std_price'] else 0,
            'avg_volume': float(row['avg_volume']) if row['avg_volume'] else 0,
        }

    def clear_cache(self):
        """Clear the internal data cache."""
        self._cache.clear()
        logger.info("Data cache cleared")


def load_training_data(
    symbol: str,
    start_date: str,
    end_date: str,
    timeframe: str = '5m'
) -> pd.DataFrame:
    """
    One-shot helper: build a default loader and fetch training data.

    A new ``TrainingDataLoader`` with default configuration is created on
    every call; the data is loaded in a single request with derived
    features included.

    Args:
        symbol: Trading symbol
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        timeframe: Data timeframe

    Returns:
        DataFrame with OHLCV data and features
    """
    return TrainingDataLoader().get_training_data(
        symbol=symbol,
        start_date=start_date,
        end_date=end_date,
        timeframe=timeframe,
    )


def _run_smoke_test() -> None:
    """Manually exercise the loader against the configured database.

    Prints a data summary, then runs a batched load, a feature/target
    extraction and a short streaming pass for XAUUSD 5m data.
    """
    print("Testing TrainingDataLoader...")

    loader = TrainingDataLoader()
    ticker = 'XAUUSD'
    bars = '5m'

    # 1) Summary statistics
    print("\nData summary for XAUUSD:")
    for key, value in loader.get_data_summary(ticker, bars).items():
        print(f"  {key}: {value}")

    # 2) Batched load of one year
    print("\nTesting batch loading:")
    frame = loader.get_training_data(
        symbol=ticker,
        start_date='2024-01-01',
        end_date='2024-12-31',
        timeframe=bars,
        batch_size=50000
    )
    print(f"  Loaded {len(frame):,} records")
    print(f"  Columns: {list(frame.columns)}")

    # 3) Feature matrix / target vector
    print("\nTesting features and targets:")
    features, target = loader.get_features_and_targets(
        symbol=ticker,
        timeframe=bars,
        start_date='2024-01-01',
        end_date='2024-06-30',
        target_horizon=12
    )
    print(f"  X shape: {features.shape}")
    print(f"  y shape: {target.shape}")
    print(f"  Features: {list(features.columns)}")

    # 4) Streaming, capped at three chunks to keep the run short
    print("\nTesting streaming:")
    chunks_seen = 0
    rows_seen = 0
    stream = loader.stream_training_data(
        symbol=ticker,
        timeframe=bars,
        start_date='2024-01-01',
        end_date='2024-03-31',
        batch_size=10000
    )
    for chunks_seen, chunk in enumerate(stream, start=1):
        rows_seen += len(chunk)
        if chunks_seen >= 3:
            print(f"  (stopped after {chunks_seen} chunks for test)")
            break

    print(f"  Chunks: {chunks_seen}, Records: {rows_seen:,}")

    print("\nTest complete!")


if __name__ == "__main__":
    _run_smoke_test()