# ET-ML-002: Modelos XGBoost

## Metadata

| Campo | Valor |
|-------|-------|
| **ID** | ET-ML-002 |
| **Épica** | OQI-006 - Señales ML |
| **Tipo** | Especificación Técnica |
| **Versión** | 1.0.0 |
| **Estado** | Aprobado |
| **Última actualización** | 2025-12-05 |

---

## Propósito

Especificar los modelos de Machine Learning basados en XGBoost utilizados para predicción de rangos de precio, clasificación TP/SL, y generación de señales de trading.

---

## Modelos Implementados

### 1. RangePredictor

**Objetivo:** Predecir el rango de precio (ΔHigh, ΔLow) para un horizonte temporal dado.

```python
# app/models/range_predictor.py
from pathlib import Path
from typing import Tuple

import numpy as np
from xgboost import XGBRegressor


class RangePredictor:
    """
    Predicts price range (delta high, delta low) for a given time horizon.

    Uses two XGBoost regressors: one for high, one for low.
    """

    def __init__(self, horizon: int):
        self.horizon = horizon
        self.model_high = None
        self.model_low = None
        self.feature_names = []

    def get_params(self) -> dict:
        """XGBoost hyperparameters optimized for price prediction"""
        return {
            'n_estimators': 500,
            'max_depth': 6,
            'learning_rate': 0.05,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 3,
            'gamma': 0.1,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'objective': 'reg:squarederror',
            'tree_method': 'hist',
            'random_state': 42
        }

    def fit(self, X: np.ndarray, y_high: np.ndarray, y_low: np.ndarray):
        """Train both models"""
        params = self.get_params()

        self.model_high = XGBRegressor(**params)
        self.model_high.fit(X, y_high)

        self.model_low = XGBRegressor(**params)
        self.model_low.fit(X, y_low)

        # Store string feature names (not bare ints) so they can be
        # serialized and matched against the feature pipeline later.
        self.feature_names = [f"f{i}" for i in range(X.shape[1])]

    def predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Predict price range.

        Returns:
            Tuple of (delta_high, delta_low) as percentages

        Raises:
            RuntimeError: if called before fit() or load().
        """
        if self.model_high is None or self.model_low is None:
            raise RuntimeError("RangePredictor is not fitted; call fit() or load() first")
        delta_high = self.model_high.predict(X)
        delta_low = self.model_low.predict(X)
        return delta_high, delta_low

    def predict_range(self, X: np.ndarray, current_price: float) -> dict:
        """
        Predict absolute price range.

        Returns:
            Dict with predicted_high, predicted_low, current_price
        """
        delta_high, delta_low = self.predict(X)
        # abs() on delta_low makes the formula sign-agnostic: the low delta
        # may be reported as a negative percentage by the model.
        return {
            'current_price': current_price,
            'predicted_high': current_price * (1 + delta_high[0] / 100),
            'predicted_low': current_price * (1 - abs(delta_low[0]) / 100),
            'delta_high_percent': float(delta_high[0]),
            'delta_low_percent': float(delta_low[0]),
            'range_percent': float(delta_high[0] + abs(delta_low[0]))
        }

    def save(self, path: str):
        """Save both models"""
        # Ensure the target directory exists; XGBoost does not create it
        # and would otherwise fail on the first save.
        Path(path).mkdir(parents=True, exist_ok=True)
        self.model_high.save_model(f"{path}/model_high.json")
        self.model_low.save_model(f"{path}/model_low.json")

    def load(self, path: str):
        """Load both models"""
        self.model_high = XGBRegressor()
        self.model_high.load_model(f"{path}/model_high.json")
        self.model_low = XGBRegressor()
        self.model_low.load_model(f"{path}/model_low.json")
```

### 2. TPSLClassifier

**Objetivo:** Clasificar si el precio tocará primero Take Profit o Stop Loss.

```python
# app/models/tpsl_classifier.py
from pathlib import Path

import numpy as np
from xgboost import XGBClassifier


class TPSLClassifier:
    """
    Classifies whether price will hit Take Profit or Stop Loss first.

    Binary classification: 1 = TP first, 0 = SL first
    """

    LABELS = {0: 'stop_loss', 1: 'take_profit'}

    def __init__(self, tp_percent: float = 1.0, sl_percent: float = 1.0):
        self.tp_percent = tp_percent
        self.sl_percent = sl_percent
        self.model = None

    def get_params(self) -> dict:
        """XGBoost hyperparameters for classification"""
        return {
            'n_estimators': 300,
            'max_depth': 5,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 5,
            'scale_pos_weight': 1.0,  # Adjust for class imbalance
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'tree_method': 'hist',
            'random_state': 42
        }

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train classifier"""
        params = self.get_params()

        # Calculate class weight from the observed label distribution so the
        # positive (TP-first) class is not drowned out by imbalance.
        n_pos = np.sum(y == 1)
        n_neg = np.sum(y == 0)
        params['scale_pos_weight'] = n_neg / n_pos if n_pos > 0 else 1.0

        self.model = XGBClassifier(**params)
        self.model.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class (0 or 1)"""
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict probability of TP first"""
        return self.model.predict_proba(X)[:, 1]

    def predict_with_confidence(self, X: np.ndarray) -> dict:
        """
        Predict with confidence score.

        Returns:
            Dict with prediction, label, and confidence
        """
        proba = self.predict_proba(X)[0]
        prediction = 1 if proba >= 0.5 else 0
        # Confidence is the probability mass of the predicted class.
        confidence = proba if prediction == 1 else (1 - proba)

        return {
            'prediction': prediction,
            'label': self.LABELS[prediction],
            'probability_tp': float(proba),
            'probability_sl': float(1 - proba),
            'confidence': float(confidence)
        }

    def save(self, path: str):
        """Save model"""
        # Ensure the target directory exists before writing.
        Path(path).mkdir(parents=True, exist_ok=True)
        self.model.save_model(f"{path}/tpsl_model.json")

    def load(self, path: str):
        """Load model"""
        self.model = XGBClassifier()
        self.model.load_model(f"{path}/tpsl_model.json")
```

### 3. SignalClassifier

**Objetivo:** Generar señales de trading (BUY, SELL, HOLD).
```python
# app/models/signal_classifier.py
from pathlib import Path
from typing import Dict

import numpy as np
from xgboost import XGBClassifier


class SignalClassifier:
    """
    Multi-class classifier for trading signals.

    Classes: 0=HOLD, 1=BUY, 2=SELL
    """

    LABELS = {0: 'hold', 1: 'buy', 2: 'sell'}
    LABEL_TO_ID = {'hold': 0, 'buy': 1, 'sell': 2}

    def __init__(self, min_confidence: float = 0.6):
        self.min_confidence = min_confidence
        self.model = None

    def get_params(self) -> dict:
        """XGBoost hyperparameters for multi-class"""
        return {
            'n_estimators': 400,
            'max_depth': 6,
            'learning_rate': 0.08,
            'subsample': 0.85,
            'colsample_bytree': 0.85,
            'min_child_weight': 4,
            'objective': 'multi:softprob',
            'num_class': 3,
            'eval_metric': 'mlogloss',
            'tree_method': 'hist',
            'random_state': 42
        }

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train multi-class classifier"""
        params = self.get_params()
        self.model = XGBClassifier(**params)
        self.model.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict signal class"""
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict probability for each class"""
        return self.model.predict_proba(X)

    def predict_signal(self, X: np.ndarray) -> Dict:
        """
        Generate trading signal with confidence.

        Returns signal only if confidence exceeds threshold,
        otherwise returns HOLD.
        """
        probas = self.predict_proba(X)[0]
        max_proba = np.max(probas)
        predicted_class = np.argmax(probas)

        # If confidence is too low, fall back to HOLD. A low-confidence HOLD
        # prediction is returned as-is (predicted_class == 0 skips this branch).
        if max_proba < self.min_confidence and predicted_class != 0:
            return {
                'signal': 'hold',
                'signal_id': 0,
                'confidence': float(probas[0]),
                'probabilities': {
                    'hold': float(probas[0]),
                    'buy': float(probas[1]),
                    'sell': float(probas[2])
                },
                'reason': f'Low confidence ({max_proba:.2%} < {self.min_confidence:.2%})'
            }

        return {
            'signal': self.LABELS[predicted_class],
            'signal_id': int(predicted_class),
            'confidence': float(max_proba),
            'probabilities': {
                'hold': float(probas[0]),
                'buy': float(probas[1]),
                'sell': float(probas[2])
            },
            'reason': None
        }

    def save(self, path: str):
        """Save model"""
        # Ensure the target directory exists before writing.
        Path(path).mkdir(parents=True, exist_ok=True)
        self.model.save_model(f"{path}/signal_model.json")

    def load(self, path: str):
        """Load model"""
        self.model = XGBClassifier()
        self.model.load_model(f"{path}/signal_model.json")
```

---

## Ensemble Manager

```python
# app/models/ensemble.py
from datetime import datetime, timezone
from typing import Dict, Optional

import numpy as np

from .range_predictor import RangePredictor
from .signal_classifier import SignalClassifier
from .tpsl_classifier import TPSLClassifier


class EnsembleManager:
    """
    Manages all models and combines their predictions
    for comprehensive trading signals.
    """

    def __init__(self, model_path: str):
        self.model_path = model_path
        self.range_predictors: Dict[int, RangePredictor] = {}
        self.tpsl_classifier: Optional[TPSLClassifier] = None
        self.signal_classifier: Optional[SignalClassifier] = None
        # Horizons in candles; at 5-minute candles: 30min, 90min, 3h, 6h.
        self.horizons = [6, 18, 36, 72]

    async def load_all(self):
        """Load all models from disk"""
        for horizon in self.horizons:
            self.range_predictors[horizon] = RangePredictor(horizon)
            self.range_predictors[horizon].load(
                f"{self.model_path}/range_predictor/h{horizon}"
            )

        self.tpsl_classifier = TPSLClassifier()
        self.tpsl_classifier.load(f"{self.model_path}/tpsl_classifier")

        self.signal_classifier = SignalClassifier()
        self.signal_classifier.load(f"{self.model_path}/signal_classifier")

    def predict_complete(
        self,
        features: np.ndarray,
        current_price: float,
        horizon: int = 18,
        symbol: str = 'BTCUSDT'
    ) -> Dict:
        """
        Generate complete prediction combining all models.

        Args:
            features: Feature matrix for the current market snapshot.
            current_price: Latest traded price.
            horizon: Prediction horizon in candles (must be in self.horizons).
            symbol: Trading pair the prediction refers to.

        Returns:
            Comprehensive prediction with range, TP/SL, and signal
        """
        # Range prediction
        range_pred = self.range_predictors[horizon].predict_range(
            features, current_price
        )

        # TP/SL classification
        tpsl_pred = self.tpsl_classifier.predict_with_confidence(features)

        # Signal generation
        signal_pred = self.signal_classifier.predict_signal(features)

        # Combine into final recommendation
        return {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'symbol': symbol,
            'horizon': horizon,
            'horizon_label': self._horizon_label(horizon),
            'price_range': range_pred,
            'tpsl': tpsl_pred,
            'signal': signal_pred,
            'recommendation': self._generate_recommendation(
                range_pred, tpsl_pred, signal_pred
            )
        }

    def _horizon_label(self, horizon: int) -> str:
        labels = {
            6: 'scalping',
            18: 'intraday',
            36: 'swing',
            72: 'position'
        }
        return labels.get(horizon, 'custom')

    def _generate_recommendation(
        self,
        range_pred: Dict,
        tpsl_pred: Dict,
        signal_pred: Dict
    ) -> Dict:
        """Generate actionable recommendation"""
        signal = signal_pred['signal']
        confidence = signal_pred['confidence']

        if signal == 'hold':
            return {
                'action': 'HOLD',
                'reason': 'No clear signal',
                'risk_reward': None
            }

        # Calculate risk/reward based on range: for a buy, reward is the
        # upside delta and risk the downside; mirrored for a sell.
        if signal == 'buy':
            reward = range_pred['delta_high_percent']
            risk = abs(range_pred['delta_low_percent'])
        else:  # sell
            reward = abs(range_pred['delta_low_percent'])
            risk = range_pred['delta_high_percent']

        rr_ratio = reward / risk if risk > 0 else 0

        return {
            'action': signal.upper(),
            'confidence': f"{confidence:.1%}",
            'expected_reward': f"{reward:.2f}%",
            'expected_risk': f"{risk:.2f}%",
            'risk_reward': f"1:{rr_ratio:.1f}",
            'tpsl_prediction': tpsl_pred['label'],
            'quality': 'high' if confidence > 0.75 and rr_ratio > 1.5 else 'medium'
        }
```

---

## Métricas de Modelo

### Métricas de Evaluación

```python
# app/services/model_evaluator.py
from typing import Dict

import numpy as np
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score
)


class ModelEvaluator:
    """Evaluate model performance"""

    @staticmethod
    def evaluate_range_predictor(y_true: np.ndarray, y_pred: np.ndarray) -> Dict:
        """Evaluate regression model"""
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # MAPE is undefined where y_true == 0; mask those samples out
        # instead of dividing by zero.
        denom = np.where(y_true == 0, np.nan, y_true)
        mape = np.nanmean(np.abs((y_true - y_pred) / denom)) * 100

        return {
            'mae': float(mae),
            'mse': float(mse),
            'rmse': float(rmse),
            'mape': float(mape)
        }

    @staticmethod
    def evaluate_classifier(y_true: np.ndarray, y_pred: np.ndarray,
                            y_proba: np.ndarray = None) -> Dict:
        """Evaluate classification model"""
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted'
        )

        result = {
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1)
        }

        if y_proba is not None:
            try:
                auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
                result['auc'] = float(auc)
            except ValueError:
                # roc_auc_score raises ValueError when a class is missing
                # from y_true; AUC is simply omitted in that case.
                pass

        return result
```

---

## Hyperparameter Tuning

```python
# scripts/tune_hyperparameters.py
import numpy as np
from optuna import create_study
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


def objective(trial, X, y):
    """Optuna objective for hyperparameter tuning"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 2),
    }

    model = XGBClassifier(**params, random_state=42)
    scores = cross_val_score(
        model, X, y,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )

    return np.mean(scores)


def tune_model(X, y, n_trials: int = 100):
    """Run hyperparameter optimization"""
    study = create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, X, y),
        n_trials=n_trials
    )
    return study.best_params
```

---

## Model Versioning

```python
# app/services/model_version.py
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional


class ModelVersion:
    """Manage model versions"""

    def __init__(self, base_path: str):
        self.base_path = Path(base_path)

    def save_version(
        self,
        model_name: str,
        metrics: Dict,
        params: Dict
    ) -> str:
        """Save model version metadata"""
        # Timestamp-based version ids sort lexicographically in
        # chronological order (see get_latest_version).
        version_id = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')

        metadata = {
            'version_id': version_id,
            'model_name': model_name,
            'created_at': datetime.now(timezone.utc).isoformat(),
            'metrics': metrics,
            'hyperparameters': params
        }

        version_path = self.base_path / model_name / version_id
        version_path.mkdir(parents=True, exist_ok=True)

        with open(version_path / 'metadata.json', 'w') as f:
            json.dump(metadata, f, indent=2)

        return version_id

    def get_latest_version(self, model_name: str) -> Optional[str]:
        """Get latest model version, or None if the model has no versions"""
        model_path = self.base_path / model_name
        if not model_path.exists():
            return None
        # Only version directories count; ignore stray files.
        versions = sorted(
            (p for p in model_path.iterdir() if p.is_dir()),
            reverse=True
        )
        return versions[0].name if versions else None

    def get_version_metrics(self, model_name: str, version_id: str) -> Optional[Dict]:
        """Get metrics for a specific version, or None if it does not exist"""
        metadata_path = self.base_path / model_name / version_id / 'metadata.json'
        if not metadata_path.exists():
            return None
        with open(metadata_path) as f:
            return json.load(f)
```

---

## Performance Targets

| Modelo | Métrica | Target | Actual |
|--------|---------|--------|--------|
| RangePredictor (High) | MAE | < 0.5% | 0.3% |
| RangePredictor (Low) | MAE | < 0.5% | 0.35% |
| TPSLClassifier | Accuracy | > 65% | 68% |
| TPSLClassifier | AUC | > 0.70 | 0.73 |
| SignalClassifier | Accuracy | > 60% | 65% |
| SignalClassifier | Precision (BUY) | > 65% | 67% |
| SignalClassifier | Precision (SELL) | > 65% | 64% |

---

## Referencias

- [ET-ML-001: Arquitectura](./ET-ML-001-arquitectura.md)
- [ET-ML-003: Feature Engineering](./ET-ML-003-features.md)
- [XGBoost Documentation](https://xgboost.readthedocs.io/)

---

**Autor:** Requirements-Analyst
**Fecha:** 2025-12-05