🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
id: "ET-ML-avm"
title: "Especificacion Tecnica - Modelo de Valuacion Automatizada (AVM)"
type: "Technical Specification"
epic: "IAI-008"
status: "Draft"
version: "1.0"
project: "inmobiliaria-analytics"
created_date: "2026-01-04"
updated_date: "2026-01-04"
---
|
|
|
|
# ET-IA-008-avm: Modelo de Valuacion Automatizada (AVM)
|
|
|
|
---
|
|
|
|
## 1. Resumen
|
|
|
|
Sistema de Machine Learning para estimar el valor de mercado de propiedades inmobiliarias basado en caracteristicas fisicas, ubicacion, condiciones de mercado y comparables recientes.
|
|
|
|
---
|
|
|
|
## 2. Arquitectura del Sistema
|
|
|
|
```
|
|
┌─────────────────────────────────────────────────────────────────────┐
|
|
│ AVM PIPELINE │
|
|
├─────────────────────────────────────────────────────────────────────┤
|
|
│ │
|
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
│ │ Feature │───▶│ Model │───▶│ Post │ │
|
|
│ │ Engine │ │ Ensemble │ │ Process │ │
|
|
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
|
│ │ │ │ │
|
|
│ ▼ ▼ ▼ │
|
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
│ │ Property │ │ XGBoost │ │ Confidence │ │
|
|
│ │ Features │ │ LightGBM │ │ Intervals │ │
|
|
│ │ Location │ │ CatBoost │ │ SHAP │ │
|
|
│ │ Market │ │ Averaging │ │ Comparable │ │
|
|
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
|
│ │
|
|
└─────────────────────────────────────────────────────────────────────┘
|
|
│
|
|
▼
|
|
┌─────────────────┐
|
|
│ FastAPI │
|
|
│ Endpoint │
|
|
└─────────────────┘
|
|
```
|
|
|
|
---
|
|
|
|
## 3. Features del Modelo
|
|
|
|
### 3.1 Categorias de Features
|
|
|
|
```python
|
|
# src/ml/avm/features.py
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional
|
|
from enum import Enum
|
|
|
|
class FeatureCategory(Enum):
    """High-level groups the AVM features are organized into (see section 3.1)."""
    PHYSICAL = "physical"
    LOCATION = "location"
    TEMPORAL = "temporal"
    MARKET = "market"
    DERIVED = "derived"
|
|
|
|
@dataclass
class PropertyFeatures:
    """Physical characteristics of the property."""
    property_type: str  # e.g. "casa", "departamento", etc.
    constructed_area_m2: float  # built area in square meters
    land_area_m2: Optional[float]  # lot area in m2; None when unknown/not applicable
    bedrooms: int
    bathrooms: float  # float to allow half bathrooms (e.g. 2.5)
    parking_spaces: int
    floors: int
    year_built: Optional[int]  # None when construction year is unknown
    has_pool: bool
    has_garden: bool
    has_gym: bool
    has_security: bool
    has_elevator: bool
    amenities_count: int  # total number of listed amenities
|
|
|
|
@dataclass
class LocationFeatures:
    """Location-related features."""
    neighborhood: str
    municipality: str
    latitude: float
    longitude: float
    distance_to_center_km: float
    distance_to_metro_km: Optional[float]  # None when unavailable
    distance_to_park_km: float
    distance_to_school_km: float
    distance_to_hospital_km: float
    walk_score: float  # 0-100
    crime_index: float  # 0-100 (higher = safer)
    noise_level: float  # 0-100
    avg_income_zone: float  # average income of the zone
|
|
|
|
@dataclass
class TemporalFeatures:
    """Time-dependent features captured at scoring time."""
    month: int
    quarter: int
    year: int
    days_since_listing: int
    is_holiday_season: bool
    inflation_rate: float  # macro indicator, in percent
    interest_rate: float  # macro indicator, in percent
|
|
|
|
@dataclass
class MarketFeatures:
    """Zone-level market condition features."""
    avg_price_m2_neighborhood: float
    median_price_m2_neighborhood: float
    price_trend_3m: float  # % change over the last 3 months
    price_trend_12m: float  # % change over the last year
    inventory_count: int  # active listings in the zone
    absorption_rate: float  # sales / inventory
    days_on_market_avg: float  # average days on market
    comparable_count: int  # number of comparables found
    supply_demand_ratio: float
|
|
|
|
@dataclass
class DerivedFeatures:
    """Features derived/calculated from the raw inputs."""
    price_per_m2_estimated: float
    age_years: Optional[int]  # None when year_built is unknown
    bathroom_bedroom_ratio: float
    parking_per_bedroom: float
    area_per_bedroom: float
    is_new_construction: bool  # built within the last 2 years
    is_premium_zone: bool
    relative_size: float  # vs zone average
    quality_score: float  # 0-100 based on amenities
|
|
```
|
|
|
|
### 3.2 Feature Engineering Pipeline
|
|
|
|
```python
|
|
# src/ml/avm/feature_engineering.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.base import BaseEstimator, TransformerMixin
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.pipeline import Pipeline
|
|
from geopy.distance import geodesic
|
|
|
|
class PropertyFeatureEngineer(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the AVM.

    Enriches a raw property DataFrame with derived, location, market and
    temporal features, then encodes the categorical columns. Zone-level
    market statistics are fetched from the injected market data service.
    """

    def __init__(self, market_data_service):
        # Service exposing zone stats and neighborhood price means.
        self.market_data = market_data_service
        # Reference point (lat, lng) for distance features: Guadalajara center.
        self.city_center = (20.6736, -103.3927)  # Guadalajara centro

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned at fit time.
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of X enriched with all engineered feature columns."""
        df = X.copy()

        # 1. Derived property features (ratios, flags, log transforms)
        df = self._add_property_derived(df)

        # 2. Location features (distance to center, premium flag, municipality factor)
        df = self._add_location_features(df)

        # 3. Market features (zone price stats via the market data service)
        df = self._add_market_features(df)

        # 4. Temporal features (scoring-time month/quarter/year, season, macro rates)
        df = self._add_temporal_features(df)

        # 5. Categorical encoding (target encoding + one-hot)
        df = self._encode_categoricals(df)

        return df

    def _add_property_derived(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ratios, flags and log-transformed areas derived from raw fields."""
        # Property age; NaN when year_built is unknown.
        current_year = pd.Timestamp.now().year
        df['age_years'] = np.where(
            df['year_built'].notna(),
            current_year - df['year_built'],
            np.nan
        )

        # Ratios; bedrooms is clipped to >= 1 to avoid division by zero.
        df['bathroom_bedroom_ratio'] = df['bathrooms'] / df['bedrooms'].clip(lower=1)
        df['parking_per_bedroom'] = df['parking_spaces'] / df['bedrooms'].clip(lower=1)
        df['area_per_bedroom'] = df['constructed_area_m2'] / df['bedrooms'].clip(lower=1)

        # Flags. NaN age compares False, so unknown year -> not new construction.
        df['is_new_construction'] = df['age_years'] <= 2
        df['has_land'] = df['land_area_m2'].notna() & (df['land_area_m2'] > 0)

        # Quality score from amenities: 5 boolean amenities * 20 points -> 0-100.
        amenity_cols = ['has_pool', 'has_garden', 'has_gym', 'has_security', 'has_elevator']
        df['quality_score'] = df[amenity_cols].sum(axis=1) * 20

        # Log transforms to tame the heavy right tail of area distributions.
        df['log_constructed_area'] = np.log1p(df['constructed_area_m2'])
        df['log_land_area'] = np.log1p(df['land_area_m2'].fillna(0))

        return df

    def _add_location_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add geographic features: center distance, premium flag, municipality factor."""
        # Geodesic distance to the city center; NaN when coordinates are missing.
        def calc_distance(row):
            if pd.isna(row['latitude']) or pd.isna(row['longitude']):
                return np.nan
            return geodesic(
                (row['latitude'], row['longitude']),
                self.city_center
            ).kilometers

        df['distance_to_center_km'] = df.apply(calc_distance, axis=1)

        # Premium flag based on a hand-curated list of upscale neighborhoods.
        premium_zones = [
            'providencia', 'americana', 'lafayette', 'country',
            'puerta de hierro', 'real', 'bugambilias', 'chapalita'
        ]
        df['is_premium_zone'] = df['neighborhood'].str.lower().isin(premium_zones)

        # Coarse geographic price factor using municipality as a proxy cluster;
        # unknown municipalities fall back to 0.9.
        municipality_encoding = {
            'zapopan': 1.2,
            'guadalajara': 1.0,
            'tlaquepaque': 0.85,
            'tonala': 0.75,
            'tlajomulco': 0.8,
        }
        df['municipality_factor'] = df['municipality'].str.lower().map(municipality_encoding).fillna(0.9)

        return df

    def _add_market_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach zone-level market stats fetched per row from the market service.

        NOTE(review): this issues one service call per row; consider batching
        by (neighborhood, property_type) for large frames.
        """
        for idx, row in df.iterrows():
            market_stats = self.market_data.get_zone_stats(
                neighborhood=row['neighborhood'],
                property_type=row['property_type']
            )

            df.at[idx, 'avg_price_m2_zone'] = market_stats.get('avg_price_m2', np.nan)
            df.at[idx, 'median_price_m2_zone'] = market_stats.get('median_price_m2', np.nan)
            df.at[idx, 'price_trend_3m'] = market_stats.get('trend_3m', 0)
            df.at[idx, 'inventory_zone'] = market_stats.get('inventory', 0)
            df.at[idx, 'days_on_market_zone'] = market_stats.get('avg_dom', 60)

        # NOTE(review): this divides an area (m2) by a price-per-m2, so the
        # result is not a dimensionless "size vs zone average" as the name
        # suggests — it likely should divide by the zone's average AREA.
        # Confirm intent before changing, since the model was trained on it.
        df['relative_size'] = df['constructed_area_m2'] / df['avg_price_m2_zone'].clip(lower=1)

        return df

    def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add scoring-time temporal features (identical for every row)."""
        now = pd.Timestamp.now()

        df['month'] = now.month
        df['quarter'] = now.quarter
        df['year'] = now.year

        # High season: January-March and September-November.
        df['is_high_season'] = df['month'].isin([1, 2, 3, 9, 10, 11])

        # Macro indicators are hard-coded placeholders — replace with a real API feed.
        df['inflation_rate'] = 4.5  # %
        df['interest_rate'] = 11.0  # %

        return df

    def _encode_categoricals(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categoricals: target-encode neighborhood, one-hot property_type."""
        # Target encoding with precomputed per-neighborhood price means;
        # unseen neighborhoods fall back to the global mean.
        neighborhood_means = self.market_data.get_neighborhood_price_means()
        df['neighborhood_encoded'] = df['neighborhood'].map(neighborhood_means)
        df['neighborhood_encoded'] = df['neighborhood_encoded'].fillna(
            neighborhood_means.mean()
        )

        # One-hot encode property_type; drop_first avoids perfect collinearity.
        property_dummies = pd.get_dummies(
            df['property_type'],
            prefix='type',
            drop_first=True
        )
        df = pd.concat([df, property_dummies], axis=1)

        return df
|
|
|
|
|
|
def create_feature_pipeline(market_service) -> Pipeline:
    """Assemble the full feature pipeline: engineering step + preprocessing step."""

    # Columns standardized by the numeric branch of the preprocessor.
    scaled_columns = [
        'constructed_area_m2', 'land_area_m2', 'bedrooms', 'bathrooms',
        'parking_spaces', 'latitude', 'longitude', 'distance_to_center_km',
        'age_years', 'quality_score', 'avg_price_m2_zone', 'price_trend_3m'
    ]

    # Columns one-hot encoded by the categorical branch.
    encoded_columns = ['property_type', 'municipality']

    # Remaining columns flow through untouched (remainder='passthrough').
    column_prep = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), encoded_columns),
        ],
        remainder='passthrough',
    )

    return Pipeline([
        ('feature_engineer', PropertyFeatureEngineer(market_service)),
        ('preprocessor', column_prep),
    ])
|
|
```
|
|
|
|
---
|
|
|
|
## 4. Modelo Ensemble
|
|
|
|
### 4.1 Arquitectura del Ensemble
|
|
|
|
```python
|
|
# src/ml/avm/ensemble.py
|
|
import numpy as np
|
|
import pandas as pd
|
|
from typing import Dict, List, Tuple, Optional
|
|
from sklearn.base import BaseEstimator, RegressorMixin
|
|
from sklearn.model_selection import cross_val_predict
|
|
import xgboost as xgb
|
|
import lightgbm as lgb
|
|
from catboost import CatBoostRegressor
|
|
import joblib
|
|
|
|
class AVMEnsemble(BaseEstimator, RegressorMixin):
    """Ensemble of gradient-boosting regressors for automated valuation.

    Combines XGBoost, LightGBM and CatBoost either by a fixed weighted
    average (default) or by stacking, where a Ridge meta-model is fit on
    out-of-fold base-model predictions.
    """

    def __init__(
        self,
        weights: Optional[Dict[str, float]] = None,
        use_stacking: bool = False
    ):
        # Blend weights for averaging mode, keyed by model name.
        self.weights = weights or {
            'xgboost': 0.35,
            'lightgbm': 0.35,
            'catboost': 0.30,
        }
        self.use_stacking = use_stacking
        self.models = {}        # name -> fitted base estimator (set in fit/load)
        self.meta_model = None  # Ridge meta-model, only when use_stacking

    def _create_models(self) -> Dict[str, BaseEstimator]:
        """Instantiate the (unfitted) base estimators with fixed hyperparameters."""

        models = {
            'xgboost': xgb.XGBRegressor(
                n_estimators=500,
                max_depth=8,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=42,
                n_jobs=-1,
            ),

            'lightgbm': lgb.LGBMRegressor(
                n_estimators=500,
                max_depth=10,
                learning_rate=0.05,
                num_leaves=31,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=42,
                n_jobs=-1,
                verbose=-1,
            ),

            'catboost': CatBoostRegressor(
                iterations=500,
                depth=8,
                learning_rate=0.05,
                l2_leaf_reg=3,
                random_seed=42,
                verbose=False,
            ),
        }

        return models

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AVMEnsemble':
        """Fit all base models; in stacking mode also fit the Ridge meta-model."""

        self.models = self._create_models()

        if self.use_stacking:
            # Stacking: out-of-fold predictions become the meta-features, so
            # the meta-model never sees in-fold (leaky) predictions.
            meta_features = np.zeros((len(y), len(self.models)))

            for i, (name, model) in enumerate(self.models.items()):
                # Out-of-fold predictions (5-fold CV)
                oof_preds = cross_val_predict(
                    model, X, y, cv=5, n_jobs=-1
                )
                meta_features[:, i] = oof_preds

                # Refit on the full dataset for inference time.
                model.fit(X, y)

            # Linear meta-model over the base-model predictions.
            from sklearn.linear_model import Ridge
            self.meta_model = Ridge(alpha=1.0)
            self.meta_model.fit(meta_features, y)

        else:
            # Simple averaging: just fit every base model on all data.
            for name, model in self.models.items():
                model.fit(X, y)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Blend base-model predictions (meta-model or weighted average)."""

        predictions = {}
        for name, model in self.models.items():
            predictions[name] = model.predict(X)

        if self.use_stacking:
            # Column order matches insertion order of self.models, which is
            # the same order used to build meta_features in fit().
            meta_features = np.column_stack(list(predictions.values()))
            return self.meta_model.predict(meta_features)
        else:
            # Weighted average using the configured per-model weights.
            weighted_sum = np.zeros(len(X))
            for name, preds in predictions.items():
                weighted_sum += preds * self.weights[name]
            return weighted_sum

    def predict_with_uncertainty(
        self,
        X: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Predict with a 95% interval derived from base-model disagreement.

        Returns:
            (mean, lower, upper) arrays. NOTE: uses the unweighted mean/std
            across base models, even in stacking mode.
        """

        predictions = []
        for name, model in self.models.items():
            predictions.append(model.predict(X))

        predictions = np.array(predictions)

        # Mean and standard deviation across the base models.
        mean_pred = np.mean(predictions, axis=0)
        std_pred = np.std(predictions, axis=0)

        # 95% interval assuming approximately normal model disagreement.
        lower = mean_pred - 1.96 * std_pred
        upper = mean_pred + 1.96 * std_pred

        return mean_pred, lower, upper

    def get_feature_importance(self) -> pd.DataFrame:
        """Average feature importances across base models that expose them.

        Returns an empty DataFrame when no base model provides importances.
        NOTE(review): the result is indexed by position only — feature names
        are not available at this layer.
        """

        importances = []

        for name, model in self.models.items():
            if hasattr(model, 'feature_importances_'):
                imp = model.feature_importances_
                importances.append(imp)

        if not importances:
            return pd.DataFrame()

        avg_importance = np.mean(importances, axis=0)
        return pd.DataFrame({
            'importance': avg_importance
        }).sort_values('importance', ascending=False)

    def save(self, path: str):
        """Persist the fitted ensemble (base models, weights, meta-model) with joblib."""
        joblib.dump({
            'models': self.models,
            'weights': self.weights,
            'meta_model': self.meta_model,
            'use_stacking': self.use_stacking,
        }, path)

    @classmethod
    def load(cls, path: str) -> 'AVMEnsemble':
        """Restore an ensemble previously written by save()."""
        data = joblib.load(path)
        ensemble = cls(
            weights=data['weights'],
            use_stacking=data['use_stacking']
        )
        ensemble.models = data['models']
        ensemble.meta_model = data['meta_model']
        return ensemble
|
|
```
|
|
|
|
### 4.2 Training Pipeline
|
|
|
|
```python
|
|
# src/ml/avm/training.py
from datetime import datetime
from typing import Dict, List, Optional

import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score

from .ensemble import AVMEnsemble
from .feature_engineering import create_feature_pipeline
from .data_loader import PropertyDataLoader
|
|
|
|
class AVMTrainer:
    """End-to-end training pipeline for the AVM ensemble with MLflow tracking."""

    def __init__(
        self,
        data_loader: PropertyDataLoader,
        market_service,
        mlflow_experiment: str = "avm-training"
    ):
        self.data_loader = data_loader
        self.market_service = market_service
        # Select (or create) the MLflow experiment all runs are logged to.
        mlflow.set_experiment(mlflow_experiment)

    def train(
        self,
        property_types: Optional[List[str]] = None,
        min_samples: int = 1000,
        test_size: float = 0.2
    ) -> Dict:
        """Train the AVM ensemble end to end.

        Loads training data, engineers features, fits a stacking ensemble,
        evaluates it on a held-out split, and logs parameters, metrics and
        artifacts (model + feature pipeline) to MLflow.

        Args:
            property_types: optional filter of property types to train on.
            min_samples: minimum samples required by the data loader.
            test_size: held-out fraction for evaluation.

        Returns:
            Dict with 'metrics', 'model_path' and 'feature_importance'.
        """

        with mlflow.start_run(run_name=f"avm-{datetime.now().strftime('%Y%m%d_%H%M')}"):
            # 1. Load training data
            df = self.data_loader.load_training_data(
                property_types=property_types,
                min_samples=min_samples
            )

            mlflow.log_param("n_samples", len(df))
            mlflow.log_param("property_types", property_types)

            # 2. Feature engineering (fit_transform on the full frame)
            feature_pipeline = create_feature_pipeline(self.market_service)
            X = feature_pipeline.fit_transform(df)
            y = df['price'].values

            # Log feature count (names are best-effort, see _get_feature_names)
            feature_names = self._get_feature_names(feature_pipeline, df)
            mlflow.log_param("n_features", len(feature_names))

            # 3. Train/test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42
            )

            # 4. Fit the stacking ensemble
            ensemble = AVMEnsemble(use_stacking=True)
            ensemble.fit(X_train, y_train)

            # 5. Evaluate on both splits
            metrics = self._evaluate(ensemble, X_train, y_train, X_test, y_test)

            for name, value in metrics.items():
                mlflow.log_metric(name, value)

            # 6. Persist the model and attach it as a run artifact
            model_path = f"models/avm_{datetime.now().strftime('%Y%m%d')}.joblib"
            ensemble.save(model_path)
            mlflow.log_artifact(model_path)

            # 7. Persist the fitted feature pipeline alongside the model
            import joblib
            pipeline_path = "models/feature_pipeline.joblib"
            joblib.dump(feature_pipeline, pipeline_path)
            mlflow.log_artifact(pipeline_path)

            return {
                'metrics': metrics,
                'model_path': model_path,
                'feature_importance': ensemble.get_feature_importance()
            }

    def _evaluate(
        self,
        model: AVMEnsemble,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray
    ) -> Dict[str, float]:
        """Compute train/test regression metrics (MAE, MAPE, R2, hit rates)."""

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # MAE
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)

        # MAPE
        mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
        mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

        # Median absolute error (robust to outlier listings)
        median_ae_test = np.median(np.abs(y_test - y_pred_test))

        # Share of predictions within 10% / 15% of the actual price
        # (assumes strictly positive prices in y_test)
        within_10pct = np.mean(np.abs(y_test - y_pred_test) / y_test < 0.10)
        within_15pct = np.mean(np.abs(y_test - y_pred_test) / y_test < 0.15)

        # R2
        from sklearn.metrics import r2_score
        r2_test = r2_score(y_test, y_pred_test)

        return {
            'mae_train': mae_train,
            'mae_test': mae_test,
            'mape_train': mape_train * 100,
            'mape_test': mape_test * 100,
            'median_ae_test': median_ae_test,
            'within_10pct': within_10pct * 100,
            'within_15pct': within_15pct * 100,
            'r2_test': r2_test,
        }

    def _get_feature_names(self, pipeline, df) -> List[str]:
        """Best-effort feature names (pre-transform column names only)."""
        # Simplified — a production version would introspect the fitted
        # ColumnTransformer for the post-encoding output names.
        return list(df.columns)
|
|
```
|
|
|
|
---
|
|
|
|
## 5. Explicabilidad con SHAP
|
|
|
|
```python
|
|
# src/ml/avm/explainability.py
|
|
import shap
|
|
import numpy as np
|
|
import pandas as pd
|
|
from typing import Dict, List
|
|
|
|
class AVMExplainer:
    """SHAP-based explainability for AVM valuations.

    Produces per-feature peso impacts for a prediction plus a short
    Spanish-language natural-text summary.
    """

    def __init__(self, model, feature_names: List[str]):
        self.model = model
        self.feature_names = feature_names
        self.explainer = None  # set by initialize_explainer()

    def initialize_explainer(self, X_background: np.ndarray):
        """Build the SHAP TreeExplainer for tree-based models.

        Args:
            X_background: background sample. Currently unused by the
                TreeExplainer construction; kept for interface stability.
        """
        if hasattr(self.model, 'models'):
            # Ensemble: explain via the first base model only.
            # NOTE(review): SHAP values therefore approximate, rather than
            # exactly decompose, the ensemble's blended prediction.
            base_model = list(self.model.models.values())[0]
            self.explainer = shap.TreeExplainer(base_model)
        else:
            self.explainer = shap.TreeExplainer(self.model)

    def explain(
        self,
        X: np.ndarray,
        feature_values: Dict[str, object] = None
    ) -> Dict:
        """Generate a structured explanation for one or more predictions.

        Args:
            X: feature matrix, or a single 1-D feature vector.
            feature_values: optional human-readable values keyed by feature
                name, reported instead of the raw matrix values.

        Returns:
            A dict for a single sample, or a list of dicts for several.

        Raises:
            ValueError: if initialize_explainer() has not been called.
        """

        if self.explainer is None:
            raise ValueError("Explainer not initialized. Call initialize_explainer first.")

        # SHAP values: one additive contribution per (sample, feature).
        shap_values = self.explainer.shap_values(X)

        # Normalize a single sample to 2-D so the loop below is uniform.
        if len(X.shape) == 1:
            X = X.reshape(1, -1)
            shap_values = shap_values.reshape(1, -1)

        # Base value = the explainer's expected (average) prediction.
        base_value = self.explainer.expected_value
        if isinstance(base_value, np.ndarray):
            base_value = base_value[0]

        explanations = []

        for i in range(len(X)):
            feature_impacts = []

            for j, (name, shap_val) in enumerate(
                zip(self.feature_names, shap_values[i])
            ):
                feature_val = X[i, j] if feature_values is None else feature_values.get(name, X[i, j])

                feature_impacts.append({
                    'feature': name,
                    'value': float(feature_val),
                    'shap_value': float(shap_val),
                    'impact': 'positive' if shap_val > 0 else 'negative',
                    'impact_formatted': self._format_impact(shap_val),
                })

            # Largest absolute contribution first.
            feature_impacts.sort(key=lambda x: abs(x['shap_value']), reverse=True)

            explanations.append({
                'base_value': float(base_value),
                'predicted_value': float(base_value + sum(shap_values[i])),
                'top_positive': [f for f in feature_impacts if f['impact'] == 'positive'][:5],
                'top_negative': [f for f in feature_impacts if f['impact'] == 'negative'][:5],
                'all_impacts': feature_impacts,
            })

        return explanations[0] if len(explanations) == 1 else explanations

    def generate_natural_language(self, explanation: Dict) -> str:
        """Render a structured explanation as short Spanish-language text."""

        lines = []
        predicted = explanation['predicted_value']
        base = explanation['base_value']

        lines.append(f"El valor estimado es ${predicted:,.0f} MXN")
        lines.append(f"(Valor base promedio: ${base:,.0f} MXN)")
        lines.append("")

        # Top value-increasing factors
        if explanation['top_positive']:
            lines.append("Factores que AUMENTAN el valor:")
            for factor in explanation['top_positive'][:3]:
                lines.append(f"  + {self._humanize_feature(factor['feature'])}: "
                             f"{factor['impact_formatted']}")

        # Top value-decreasing factors
        if explanation['top_negative']:
            lines.append("")
            lines.append("Factores que REDUCEN el valor:")
            for factor in explanation['top_negative'][:3]:
                lines.append(f"  - {self._humanize_feature(factor['feature'])}: "
                             f"{factor['impact_formatted']}")

        return "\n".join(lines)

    def _format_impact(self, shap_value: float) -> str:
        """Format an impact in pesos with an explicit sign, e.g. '+$120,000'.

        Bug fix: negatives previously rendered as '$-120,000' (sign inside
        the currency symbol); the sign now precedes it: '-$120,000'.
        """
        if shap_value < 0:
            return f"-${abs(shap_value):,.0f}"
        prefix = "+" if shap_value > 0 else ""
        return f"{prefix}${shap_value:,.0f}"

    def _humanize_feature(self, feature: str) -> str:
        """Map a feature name to a human-readable (Spanish) label.

        Unknown features fall back to a title-cased version of the name.
        """
        mappings = {
            'constructed_area_m2': 'Superficie construida',
            'land_area_m2': 'Superficie de terreno',
            'bedrooms': 'Numero de recamaras',
            'bathrooms': 'Numero de banos',
            'parking_spaces': 'Estacionamientos',
            'is_premium_zone': 'Ubicacion premium',
            'distance_to_center_km': 'Distancia al centro',
            'age_years': 'Antiguedad',
            'quality_score': 'Calidad/amenidades',
            'avg_price_m2_zone': 'Precio promedio de zona',
            'price_trend_3m': 'Tendencia de mercado',
            'has_pool': 'Cuenta con alberca',
            'has_garden': 'Cuenta con jardin',
        }
        return mappings.get(feature, feature.replace('_', ' ').title())
|
|
```
|
|
|
|
---
|
|
|
|
## 6. API de Valuacion
|
|
|
|
```python
|
|
# src/ml/avm/api.py
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
from typing import List, Optional
|
|
import numpy as np
|
|
|
|
from .ensemble import AVMEnsemble
|
|
from .explainability import AVMExplainer
|
|
from .comparables import ComparablesFinder
|
|
|
|
app = FastAPI(title="AVM API", version="1.0.0")

# Module-level singletons, expected to be populated at application startup
# before the endpoints receive traffic.
avm_model: Optional[AVMEnsemble] = None
explainer: Optional[AVMExplainer] = None
comparables_finder: Optional[ComparablesFinder] = None
|
|
|
|
class PropertyInput(BaseModel):
    """Request payload describing the property to be valued."""
    property_type: str = Field(..., example="casa")
    constructed_area_m2: float = Field(..., gt=0, example=180)  # built area, m2
    land_area_m2: Optional[float] = Field(None, example=250)  # lot area, m2
    bedrooms: int = Field(..., ge=1, example=3)
    bathrooms: float = Field(..., ge=1, example=2.5)  # float allows half baths
    parking_spaces: int = Field(0, ge=0, example=2)
    latitude: float = Field(..., example=20.6736)
    longitude: float = Field(..., example=-103.3927)
    neighborhood: str = Field(..., example="Providencia")
    municipality: str = Field(..., example="Guadalajara")
    year_built: Optional[int] = Field(None, example=2018)
    amenities: List[str] = Field(default_factory=list)  # free-form amenity names
|
|
|
|
class ValuationResponse(BaseModel):
    """Valuation result returned by POST /valuate."""
    estimated_value: float  # point estimate in MXN (rounded to thousands)
    confidence: float  # 0-100 confidence score
    range_low: float  # lower bound of the 95% interval
    range_high: float  # upper bound of the 95% interval
    price_per_m2: float
    explanation: dict  # SHAP-based structured explanation
    comparables: List[dict]  # nearby comparable listings
|
|
|
|
@app.post("/valuate", response_model=ValuationResponse)
async def valuate_property(property_data: PropertyInput):
    """Return an automated valuation for a single property.

    Pipeline: feature prep -> ensemble prediction with uncertainty ->
    confidence score -> SHAP explanation -> comparable listings.
    """

    try:
        # 1. Build the model feature vector from the API payload.
        features = prepare_features(property_data)

        # 2. Point estimate plus 95% interval from model disagreement.
        mean_pred, lower, upper = avm_model.predict_with_uncertainty(features)

        estimated_value = float(mean_pred[0])
        range_low = float(lower[0])
        range_high = float(upper[0])

        # 3. 0-100 confidence score derived from the interval width.
        confidence = calculate_confidence(
            features,
            estimated_value,
            range_low,
            range_high
        )

        # 4. SHAP-based per-feature explanation.
        explanation = explainer.explain(features)

        # 5. Nearby comparable listings.
        # NOTE(review): .dict() is the pydantic v1 API; v2 renames it to
        # model_dump() — confirm the pinned pydantic version.
        comparables = comparables_finder.find(
            property_data.dict(),
            limit=5
        )

        # 6. Price per square meter (input validated gt=0, so no div-by-zero).
        price_per_m2 = estimated_value / property_data.constructed_area_m2

        return ValuationResponse(
            estimated_value=round(estimated_value, -3),  # round to thousands
            confidence=round(confidence, 2),
            range_low=round(range_low, -3),
            range_high=round(range_high, -3),
            price_per_m2=round(price_per_m2, 0),
            explanation=explanation,
            comparables=comparables,
        )

    except Exception as e:
        # NOTE(review): blanket 500 hides the failure class and may leak
        # internal detail via str(e); consider mapping known error types.
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@app.get("/market-stats/{neighborhood}")
async def get_market_stats(neighborhood: str):
    """Return zone-level market statistics for a neighborhood.

    NOTE(review): `market_service` is not defined anywhere in this module —
    as written this handler raises NameError at request time. Confirm where
    the service instance is meant to be injected/imported.
    """

    stats = market_service.get_zone_stats(neighborhood)

    return {
        "neighborhood": neighborhood,
        "avg_price_m2": stats.get('avg_price_m2'),
        "median_price_m2": stats.get('median_price_m2'),
        "inventory": stats.get('inventory'),
        "trend_3m": stats.get('trend_3m'),
        "trend_12m": stats.get('trend_12m'),
        "avg_days_on_market": stats.get('avg_dom'),
    }
|
|
|
|
def prepare_features(property_data: PropertyInput) -> np.ndarray:
    """Convert the API payload into the model's feature vector.

    Not yet implemented: must apply the same persisted feature-engineering
    pipeline used at training time. Raising here (instead of the previous
    bare `pass`, which silently returned None) makes the /valuate endpoint
    fail with a clear message rather than a confusing downstream error
    inside the model's predict call.

    Raises:
        NotImplementedError: always, until the transformation is wired up.
    """
    raise NotImplementedError(
        "prepare_features: apply the persisted feature pipeline here"
    )
|
|
|
|
def calculate_confidence(
    features: np.ndarray,
    prediction: float,
    lower: float,
    upper: float
) -> float:
    """Compute a 0-100 confidence score for a valuation.

    Blends three signals: the relative width of the prediction interval,
    the number of comparables available, and input data quality (the last
    two are placeholders until wired to real data).

    Args:
        features: model feature vector (reserved for future signals; unused).
        prediction: point estimate in MXN.
        lower: lower bound of the prediction interval.
        upper: upper bound of the prediction interval.

    Returns:
        Confidence score clamped to [0, 100].
    """

    # Guard: a non-positive prediction makes the relative interval width
    # meaningless (and would previously raise ZeroDivisionError at 0).
    if prediction <= 0:
        return 0.0

    # 1. Relative interval width: narrower interval -> higher confidence.
    #    Width >= 50% of the prediction scores 0.
    interval_width = (upper - lower) / prediction
    interval_score = max(0, 100 - interval_width * 200)

    # 2. Number of comparables available
    #    (would be computed from real data)
    comparables_score = 80  # placeholder

    # 3. Input data quality
    data_quality_score = 90  # placeholder

    # Weighted blend of the three signals (weights sum to 1.0).
    confidence = (
        interval_score * 0.4 +
        comparables_score * 0.35 +
        data_quality_score * 0.25
    )

    return min(100, max(0, confidence))
|
|
```
|
|
|
|
---
|
|
|
|
## 7. Busqueda de Comparables
|
|
|
|
```python
|
|
# src/ml/avm/comparables.py
|
|
from typing import List, Dict
|
|
import numpy as np
|
|
from sqlalchemy import text
|
|
from .database import get_db_connection
|
|
|
|
class ComparablesFinder:
    """Find recently-seen comparable properties ("comps") near a subject property.

    Uses PostGIS geography distance plus a hand-tuned 0-100 similarity score
    over distance, area, bedrooms, bathrooms and listing recency.
    """

    def __init__(self, db_connection):
        # SQLAlchemy connection/session exposing .execute(text, params).
        self.db = db_connection

    def find(
        self,
        property_data: Dict,
        limit: int = 5,
        max_distance_km: float = 2.0,
        max_age_days: int = 180
    ) -> List[Dict]:
        """Return up to `limit` comparables ordered by similarity score.

        Args:
            property_data: dict with latitude, longitude, property_type,
                bedrooms, bathrooms and constructed_area_m2 of the subject.
            limit: maximum number of comparables returned.
            max_distance_km: search radius around the subject property.
            max_age_days: only consider listings seen within this window.
        """

        # BUG FIX: the previous query used `INTERVAL ':max_age days'` — a
        # bind-parameter name inside a quoted string literal is NOT
        # substituted by SQLAlchemy, so max_age_days was silently ignored.
        # Multiplying the bound integer by INTERVAL '1 day' binds correctly.
        query = text("""
            WITH target AS (
                SELECT
                    ST_SetSRID(ST_MakePoint(:lng, :lat), 4326)::geography as location,
                    :property_type as ptype,
                    :bedrooms as beds,
                    :bathrooms as baths,
                    :area as area
            )
            SELECT
                p.id,
                p.title,
                p.price,
                p.constructed_area_m2,
                p.bedrooms,
                p.bathrooms,
                p.neighborhood,
                p.source_url,
                p.last_seen_at,
                ST_Distance(p.coordinates::geography, t.location) / 1000 as distance_km,

                -- Similarity score (0-100)
                (
                    -- Distance penalty: 30 pts at 0 km, tapering to 0 at >= 2 km
                    (1 - LEAST(ST_Distance(p.coordinates::geography, t.location) / 2000, 1)) * 30 +

                    -- Area similarity (only scores within 30% of target area)
                    CASE WHEN ABS(p.constructed_area_m2 - t.area) / t.area < 0.3
                         THEN (1 - ABS(p.constructed_area_m2 - t.area) / t.area) * 25
                         ELSE 0 END +

                    -- Bedroom match
                    CASE WHEN p.bedrooms = t.beds THEN 20
                         WHEN ABS(p.bedrooms - t.beds) = 1 THEN 10
                         ELSE 0 END +

                    -- Bathroom match
                    CASE WHEN p.bathrooms = t.baths THEN 15
                         WHEN ABS(p.bathrooms - t.baths) <= 0.5 THEN 8
                         ELSE 0 END +

                    -- Recency (more recent = better, up to 10 pts)
                    LEAST(10, 180 - EXTRACT(DAY FROM NOW() - p.last_seen_at)) / 18
                ) as similarity_score

            FROM properties p, target t
            WHERE p.property_type = t.ptype
              AND p.status IN ('active', 'sold')
              AND ST_DWithin(
                  p.coordinates::geography,
                  t.location,
                  :max_distance * 1000
              )
              AND p.last_seen_at > NOW() - (:max_age * INTERVAL '1 day')
              AND p.constructed_area_m2 BETWEEN t.area * 0.7 AND t.area * 1.3

            ORDER BY similarity_score DESC
            LIMIT :limit
        """)

        result = self.db.execute(query, {
            'lat': property_data['latitude'],
            'lng': property_data['longitude'],
            'property_type': property_data['property_type'],
            'bedrooms': property_data['bedrooms'],
            'bathrooms': property_data['bathrooms'],
            'area': property_data['constructed_area_m2'],
            'max_distance': max_distance_km,
            'max_age': max_age_days,
            'limit': limit,
        })

        comparables = []
        for row in result:
            comparables.append({
                'id': row.id,
                'title': row.title,
                'price': float(row.price),
                'price_per_m2': float(row.price / row.constructed_area_m2),
                'area_m2': float(row.constructed_area_m2),
                'bedrooms': row.bedrooms,
                'bathrooms': float(row.bathrooms),
                'neighborhood': row.neighborhood,
                'distance_km': round(row.distance_km, 2),
                'similarity_score': round(row.similarity_score, 1),
                'url': row.source_url,
                'last_seen': row.last_seen_at.isoformat(),
            })

        return comparables
|
|
```
|
|
|
|
---
|
|
|
|
## 8. Tests
|
|
|
|
```python
|
|
# src/ml/avm/__tests__/test_ensemble.py
|
|
import pytest
|
|
import numpy as np
|
|
from ..ensemble import AVMEnsemble
|
|
|
|
class TestAVMEnsemble:
    """Unit tests for AVMEnsemble: fit/predict, uncertainty intervals,
    stacking mode, and save/load round-tripping."""

    @pytest.fixture
    def sample_data(self):
        # Synthetic regression data: 100 samples x 10 features,
        # target ~N(3M, 1M) — prices roughly in the 2-4M MXN range.
        np.random.seed(42)
        X = np.random.randn(100, 10)
        y = np.random.randn(100) * 1000000 + 3000000  # prices roughly 2-4M
        return X, y

    def test_fit_predict(self, sample_data):
        # The fitted ensemble yields one positive price per sample.
        X, y = sample_data
        model = AVMEnsemble()
        model.fit(X, y)

        predictions = model.predict(X)

        assert len(predictions) == len(y)
        assert predictions.min() > 0

    def test_predict_with_uncertainty(self, sample_data):
        # Interval bounds must bracket the mean prediction elementwise.
        X, y = sample_data
        model = AVMEnsemble()
        model.fit(X, y)

        mean, lower, upper = model.predict_with_uncertainty(X)

        assert len(mean) == len(y)
        assert all(lower <= mean)
        assert all(mean <= upper)

    def test_stacking_ensemble(self, sample_data):
        # Stacking mode must fit a meta-model and still predict.
        X, y = sample_data
        model = AVMEnsemble(use_stacking=True)
        model.fit(X, y)

        predictions = model.predict(X)
        assert model.meta_model is not None

    def test_save_load(self, sample_data, tmp_path):
        # A persisted-and-reloaded ensemble must reproduce its predictions.
        X, y = sample_data
        model = AVMEnsemble()
        model.fit(X, y)

        path = tmp_path / "model.joblib"
        model.save(str(path))

        loaded = AVMEnsemble.load(str(path))
        preds_original = model.predict(X)
        preds_loaded = loaded.predict(X)

        np.testing.assert_array_almost_equal(preds_original, preds_loaded)
|
|
```
|
|
|
|
---
|
|
|
|
## 9. Metricas y Monitoreo
|
|
|
|
```yaml
|
|
# Metricas a trackear en produccion
|
|
metrics:
|
|
model_performance:
|
|
- name: avm_mape
|
|
type: gauge
|
|
description: "Mean Absolute Percentage Error"
|
|
|
|
- name: avm_predictions_total
|
|
type: counter
|
|
description: "Total valuations performed"
|
|
labels: [property_type, zone]
|
|
|
|
- name: avm_prediction_latency_seconds
|
|
type: histogram
|
|
description: "Prediction latency"
|
|
buckets: [0.1, 0.25, 0.5, 1, 2]
|
|
|
|
- name: avm_confidence_score
|
|
type: histogram
|
|
description: "Confidence scores distribution"
|
|
buckets: [50, 60, 70, 80, 90, 100]
|
|
|
|
data_quality:
|
|
- name: avm_missing_features
|
|
type: counter
|
|
description: "Predictions with missing features"
|
|
labels: [feature]
|
|
|
|
- name: avm_comparables_found
|
|
type: histogram
|
|
description: "Number of comparables found"
|
|
buckets: [0, 1, 3, 5, 10]
|
|
|
|
drift:
|
|
- name: avm_feature_drift_score
|
|
type: gauge
|
|
description: "Feature distribution drift"
|
|
labels: [feature]
|
|
|
|
- name: avm_prediction_drift_score
|
|
type: gauge
|
|
description: "Prediction distribution drift"
|
|
```
|
|
|
|
---
|
|
|
|
**Siguiente:** [ET-IA-008-survival.md](./ET-IA-008-survival.md) - Modelo de prediccion de tiempo de venta
|