inmobiliaria-analytics/docs/01-fase-alcance-inicial/IAI-007-webscraper/especificaciones/ET-SCR-002-etl.md
rckrdmrd f570727617 feat: Documentation and orchestration updates
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 05:35:40 -06:00

1389 lines
38 KiB
Markdown

---
id: "ET-SCR-002"
title: "Especificacion Tecnica - Pipeline ETL y Normalizacion"
type: "Technical Specification"
epic: "IAI-007"
status: "Draft"
version: "1.0"
project: "inmobiliaria-analytics"
created_date: "2026-01-04"
updated_date: "2026-01-04"
---
# ET-SCR-002: Pipeline ETL y Normalizacion
---
## 1. Resumen
Pipeline de Extract-Transform-Load para procesar datos crudos de propiedades scrapeadas, normalizarlos a un esquema unificado, enriquecerlos con geocoding y detectar duplicados.
---
## 2. Arquitectura del Pipeline
```
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   EXTRACT    │────▶│  TRANSFORM   │────▶│     LOAD     │
│              │     │              │     │              │
│ - Raw HTML   │     │ - Parse      │     │ - Validate   │
│ - JSON APIs  │     │ - Normalize  │     │ - Dedupe     │
│ - Sitemap    │     │ - Geocode    │     │ - Upsert     │
└──────────────┘     └──────────────┘     └──────────────┘
       │                    │                    │
       ▼                    ▼                    ▼
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   staging    │     │   enriched   │     │  properties  │
│    _raw      │     │   _staging   │     │   (final)    │
└──────────────┘     └──────────────┘     └──────────────┘
```
---
## 3. Esquema de Datos
### 3.1 Raw Data (Entrada)
```typescript
/**
 * Listing data as captured by a source-specific extractor, before any
 * normalization. Listing attributes are kept as the raw strings scraped from
 * the page; parsing into numbers, enums and structured locations happens later
 * in the normalizer.
 */
interface RawPropertyData {
// Name of the portal the listing was scraped from (e.g. 'inmuebles24').
source: string;
// Listing identifier within that portal.
sourceId: string;
// URL the listing was fetched from.
sourceUrl: string;
// Moment the page was scraped.
scrapedAt: Date;
// Optional original payloads, kept alongside the extracted fields.
rawHtml?: string;
rawJson?: Record<string, unknown>;
// Extracted fields (may vary by source)
titulo?: string;
precio?: string;
ubicacion?: string;
superficie?: string;
recamaras?: string;
banos?: string;
descripcion?: string;
imagenes?: string[];
amenidades?: string[];
// Publisher / agent contact details as shown on the listing.
contacto?: {
nombre?: string;
telefono?: string;
email?: string;
};
}
```
### 3.2 Normalized Data (Salida)
```typescript
/**
 * Unified property record produced by the normalizer and written by the
 * loader. Every source is mapped onto this shape regardless of how the portal
 * presents its data.
 */
interface NormalizedProperty {
// Identifiers
id: string; // internal id (NOTE(review): described as a UUID, but the normalizer emits a 32-char SHA-256 hex prefix)
sourceId: string;
source: string;
sourceUrl: string;
// Basic information
title: string;
description: string;
propertyType: PropertyType;
transactionType: TransactionType;
// Prices
price: number;
currency: 'MXN' | 'USD';
pricePerSqm: number | null;
// Surface
landArea: number | null; // m2 of land
constructedArea: number | null; // m2 of construction
// Features
bedrooms: number | null;
bathrooms: number | null;
parkingSpaces: number | null;
floors: number | null;
yearBuilt: number | null;
// Location
location: {
rawAddress: string;
street: string | null;
neighborhood: string; // "colonia"
municipality: string; // "municipio"
state: string;
postalCode: string | null;
country: string;
// null when the address could not be geocoded
coordinates: {
lat: number;
lng: number;
} | null;
// 0 when geocoding did not run or returned nothing
geocodeConfidence: number;
};
// Media
images: PropertyImage[];
virtualTour: string | null;
video: string | null;
// Amenities
amenities: string[];
// Contact
agent: {
name: string | null;
phone: string | null;
email: string | null;
agency: string | null;
};
// Metadata
firstSeenAt: Date;
lastSeenAt: Date;
publishedAt: Date | null;
status: PropertyStatus;
// Data quality
dataQuality: {
score: number; // 0-100
missingFields: string[];
warnings: string[];
};
}
/**
 * Kind of real-estate asset. The string values are what gets stored in the
 * `property_type` column, so they must stay stable.
 */
enum PropertyType {
CASA = 'casa',
DEPARTAMENTO = 'departamento',
TERRENO = 'terreno',
LOCAL_COMERCIAL = 'local_comercial',
OFICINA = 'oficina',
BODEGA = 'bodega',
EDIFICIO = 'edificio',
// Fallback when no known type keyword is found.
OTRO = 'otro'
}
/**
 * Commercial operation offered by the listing: sale, rental, or transfer of an
 * existing lease/business ("traspaso").
 */
enum TransactionType {
VENTA = 'venta',
RENTA = 'renta',
TRASPASO = 'traspaso'
}
/**
 * Lifecycle state of a listing, stored verbatim in `properties.status`.
 *
 * `MERGED` is included because the deduplication service marks rows that were
 * folded into another listing with the literal `'merged'`; without the member
 * that persisted value had no typed counterpart.
 */
enum PropertyStatus {
  ACTIVE = 'active',
  SOLD = 'sold',
  RENTED = 'rented',
  INACTIVE = 'inactive',
  REMOVED = 'removed',
  /** Row was merged into another listing by deduplication. */
  MERGED = 'merged',
}
/** One picture of a listing, in gallery order. */
interface PropertyImage {
url: string;
thumbnailUrl: string | null;
// Zero-based position in the gallery.
order: number;
// True for the cover picture (the normalizer flags the first image).
isMain: boolean;
}
```
---
## 4. Implementacion del Pipeline
### 4.1 Extractor Base
```typescript
// src/etl/extractors/base.extractor.ts
import { RawPropertyData } from '../types';
/**
 * Common contract and text helpers for source-specific extractors.
 *
 * Subclasses declare which portal they handle and how to pull raw listing
 * fields out of an HTML page or a JSON payload.
 */
export abstract class BaseExtractor {
  /** Name of the portal handled by the concrete extractor. */
  abstract source: string;

  /** Extracts the raw listing fields from a fetched HTML document. */
  abstract extractFromHtml(html: string, url: string): Partial<RawPropertyData>;

  /** Extracts the raw listing fields from a JSON payload (JSON-LD, internal API). */
  abstract extractFromJson(json: unknown, url: string): Partial<RawPropertyData>;

  /**
   * Collapses every run of whitespace into a single space and trims the ends.
   *
   * @param text Scraped text; `null`, `undefined` and `''` all yield `''`.
   * @returns The cleaned, single-line text.
   */
  protected cleanText(text: string | null | undefined): string {
    if (!text) return '';
    // `\s` already covers \n, \r and \t, so a single pass is enough.
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Returns every number found in `text`, in order of appearance. Commas are
   * treated as thousands separators and a dot as the decimal separator.
   *
   * Each match must start with a digit, so punctuation commas in prose
   * ("Casa, 3 recamaras") are not reported as `NaN`.
   *
   * @param text Text to scan.
   * @returns The parsed numbers; empty when there are none.
   */
  protected extractNumbers(text: string): number[] {
    if (!text) return [];
    const matches = text.match(/\d[\d,]*(?:\.\d+)?/g) ?? [];
    return matches.map(token => Number.parseFloat(token.replace(/,/g, '')));
  }
}
```
### 4.2 Extractor Inmuebles24
```typescript
// src/etl/extractors/inmuebles24.extractor.ts
import * as cheerio from 'cheerio';
import { BaseExtractor } from './base.extractor';
import { RawPropertyData } from '../types';
/**
 * Extractor for inmuebles24 listings.
 *
 * Reads the listing either from the rendered HTML (via CSS selectors) or from
 * a JSON payload typed as `RealEstateListing`.
 */
export class Inmuebles24Extractor extends BaseExtractor {
  source = 'inmuebles24';

  /**
   * Scrapes the listing fields out of the page markup.
   *
   * @param html Full HTML of the listing page.
   * @param url URL the page was fetched from; also used to derive the source id.
   * @returns The fields that could be read; text fields are `''` when the
   *   selector matched nothing.
   */
  extractFromHtml(html: string, url: string): Partial<RawPropertyData> {
    const $ = cheerio.load(html);

    const contacto: NonNullable<RawPropertyData['contacto']> = {
      nombre: this.cleanText($('.publisher-name').text()),
    };
    // The field is `string | undefined`; omit it rather than storing null.
    const telefono = $('[data-phone]').attr('data-phone');
    if (telefono) contacto.telefono = telefono;

    return {
      source: this.source,
      sourceUrl: url,
      sourceId: this.extractSourceId(url),
      titulo: this.cleanText($('h1.title-type-sup').text()),
      precio: this.cleanText($('.price-value').text()),
      ubicacion: this.cleanText($('.location-container').text()),
      superficie: this.extractSuperficie($),
      recamaras: this.extractFeature($, 'recamaras'),
      banos: this.extractFeature($, 'banos'),
      descripcion: this.cleanText($('.description-content').text()),
      imagenes: this.extractImages($),
      amenidades: this.extractAmenidades($),
      contacto,
    };
  }

  /**
   * Maps a JSON payload (JSON-LD or internal API) onto raw listing fields.
   *
   * @param json Parsed payload of unknown shape; anything that is not an
   *   object tagged `@type: 'RealEstateListing'` yields `{}`.
   * @param url URL the payload belongs to; used as fallback for the source id.
   * @returns The mapped fields, or `{}` when the payload is not a listing.
   */
  extractFromJson(json: unknown, url: string): Partial<RawPropertyData> {
    if (typeof json !== 'object' || json === null) return {};
    const listing = json as Record<string, unknown>;
    if (listing['@type'] !== 'RealEstateListing') return {};

    const { identifier, name, offers } = listing;
    const price =
      typeof offers === 'object' && offers !== null
        ? (offers as Record<string, unknown>).price
        : undefined;
    const isScalar = (value: unknown): value is string | number =>
      typeof value === 'string' || typeof value === 'number';

    return {
      source: this.source,
      sourceUrl: url,
      // Identifiers may arrive as numbers; the raw schema wants a string.
      sourceId: isScalar(identifier) ? String(identifier) : this.extractSourceId(url),
      titulo: typeof name === 'string' ? this.cleanText(name) : undefined,
      precio: isScalar(price) ? String(price) : undefined,
      // ... map the remaining fields
    };
  }

  /** Reads the numeric listing id from a `.../propiedades/<id>` URL; `''` when absent. */
  private extractSourceId(url: string): string {
    const match = url.match(/propiedades\/(\d+)/);
    return match ? match[1] : '';
  }

  /** Returns the cleaned text of the surface container. */
  private extractSuperficie($: cheerio.CheerioAPI): string {
    return this.cleanText($('.surface-container').text());
  }

  /** Returns the cleaned text of the `.feature-<feature>` element. */
  private extractFeature($: cheerio.CheerioAPI, feature: string): string {
    return this.cleanText($(`.feature-${feature}`).text());
  }

  /**
   * Collects gallery image URLs in document order, without duplicates.
   * When `src` holds an inline `data:` placeholder, `data-src` is used instead.
   */
  private extractImages($: cheerio.CheerioAPI): string[] {
    const urls = new Set<string>();
    $('img.gallery-image').each((_, el) => {
      const src = $(el).attr('src');
      const lazySrc = $(el).attr('data-src');
      const url = src && !src.startsWith('data:') ? src : lazySrc;
      if (url) urls.add(url);
    });
    return [...urls];
  }

  /** Collects the non-empty amenity labels in document order. */
  private extractAmenidades($: cheerio.CheerioAPI): string[] {
    const amenities: string[] = [];
    $('.amenity-item').each((_, el) => {
      const label = this.cleanText($(el).text());
      if (label) amenities.push(label);
    });
    return amenities;
  }
}
```
### 4.3 Transformador/Normalizador
```typescript
// src/etl/transformers/normalizer.ts
import { createHash } from 'node:crypto';
import { RawPropertyData, NormalizedProperty, PropertyStatus, PropertyType, TransactionType } from '../types';
import { GeocodingService } from '../services/geocoding.service';
/**
 * Converts a scraped {@link RawPropertyData} record into the unified
 * {@link NormalizedProperty} shape: parses prices, areas and counts, classifies
 * the listing, resolves the location through the geocoder and scores data
 * quality.
 */
export class PropertyNormalizer {
  /** Municipalities recognised by the manual address fallback (stored without accents). */
  private static readonly MUNICIPALITIES: readonly string[] = [
    'Guadalajara', 'Zapopan', 'Tlaquepaque', 'Tonala',
    'Tlajomulco', 'El Salto', 'Ixtlahuacan',
  ];

  /** Accent-free keyword -> canonical amenity label; first matching keyword wins. */
  private static readonly AMENITY_ALIASES: ReadonlyArray<readonly [string, string]> = [
    ['alberca', 'Alberca'],
    ['piscina', 'Alberca'],
    ['jardin', 'Jardin'],
    ['gym', 'Gimnasio'],
    ['gimnasio', 'Gimnasio'],
    ['roof', 'Roof Garden'],
    ['terraza', 'Terraza'],
    ['seguridad', 'Seguridad 24/7'],
    ['vigilancia', 'Seguridad 24/7'],
    ['estacionamiento', 'Estacionamiento'],
    ['cochera', 'Estacionamiento'],
  ];

  /**
   * Whole-word patterns per property type, in precedence order. Keys are enum
   * member names so the enum itself is only read when a listing is classified.
   */
  private static readonly PROPERTY_TYPE_KEYWORDS: ReadonlyArray<
    readonly [keyof typeof PropertyType, RegExp]
  > = [
    ['DEPARTAMENTO', /\b(?:departamentos?|deptos?)\b/],
    ['CASA', /\bcasas?\b/],
    ['TERRENO', /\b(?:terrenos?|lotes?)\b/],
    ['LOCAL_COMERCIAL', /\b(?:local(?:es)?|comercial(?:es)?)\b/],
    ['OFICINA', /\boficinas?\b/],
    ['BODEGA', /\bbodegas?\b/],
    ['EDIFICIO', /\bedificios?\b/],
  ];

  /** Matches the `title VARCHAR(500)` column of the properties table. */
  private static readonly MAX_TITLE_LENGTH = 500;

  /** Sale prices in MXN below this amount are flagged as suspicious. */
  private static readonly LOW_SALE_PRICE_MXN = 100000;

  constructor(private geocoder: GeocodingService) {}

  /**
   * Normalizes one raw listing.
   *
   * @param raw Listing as produced by an extractor.
   * @returns The normalized property. Missing inputs degrade to `null`, `''`
   *   or `0` rather than throwing; geocoder failures fall back to manual
   *   address parsing.
   */
  async normalize(raw: RawPropertyData): Promise<NormalizedProperty> {
    const price = this.parsePrice(raw.precio);
    const areas = this.parseAreas(raw.superficie);
    const location = await this.normalizeLocation(raw.ubicacion);

    // A price of 0 means "unknown"; do not publish a bogus 0 per square metre.
    const pricePerSqm =
      price.amount > 0 && areas.constructed
        ? Math.round(price.amount / areas.constructed)
        : null;

    const normalized: NormalizedProperty = {
      id: this.generateId(raw),
      sourceId: raw.sourceId,
      source: raw.source,
      sourceUrl: raw.sourceUrl,
      title: this.normalizeTitle(raw.titulo),
      description: raw.descripcion || '',
      propertyType: this.detectPropertyType(raw),
      transactionType: this.detectTransactionType(raw),
      price: price.amount,
      currency: price.currency,
      pricePerSqm,
      landArea: areas.land,
      constructedArea: areas.constructed,
      bedrooms: this.parseNumber(raw.recamaras),
      bathrooms: this.parseNumber(raw.banos),
      parkingSpaces: this.extractParkingSpaces(raw),
      floors: null,
      yearBuilt: null,
      location,
      images: this.normalizeImages(raw.imagenes),
      virtualTour: null,
      video: null,
      amenities: this.normalizeAmenities(raw.amenidades),
      agent: {
        name: raw.contacto?.nombre || null,
        phone: this.normalizePhone(raw.contacto?.telefono),
        email: raw.contacto?.email || null,
        agency: null,
      },
      firstSeenAt: raw.scrapedAt,
      lastSeenAt: raw.scrapedAt,
      publishedAt: null,
      status: PropertyStatus.ACTIVE,
      dataQuality: this.calculateDataQuality(raw),
    };
    return normalized;
  }

  /** Lower-cases `text` and strips diacritics so keyword matching ignores accents. */
  private foldText(text: string): string {
    return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '').toLowerCase();
  }

  /** Collapses whitespace, trims, and caps the title at the column length. */
  private normalizeTitle(title?: string): string {
    return (title ?? '')
      .replace(/\s+/g, ' ')
      .trim()
      .slice(0, PropertyNormalizer.MAX_TITLE_LENGTH);
  }

  /**
   * Parses a price label such as `"$4,500,000 MXN"` or `"$350,000 USD"`.
   *
   * Only the first numeric token is used, so labels that mention two prices
   * are not concatenated into one huge number. Commas are thousands
   * separators; several dots ("4.500.000") are treated the same way.
   *
   * @returns `amount` 0 when nothing could be parsed; currency defaults to MXN.
   */
  private parsePrice(priceStr?: string): { amount: number; currency: 'MXN' | 'USD' } {
    if (!priceStr) return { amount: 0, currency: 'MXN' };

    const currency = /usd|us\s*\$|d[oó]lares|\bdlls?\b|\bdls\b/i.test(priceStr) ? 'USD' : 'MXN';

    const token = priceStr.match(/\d[\d.,]*/)?.[0] ?? '';
    let digits = token.replace(/[.,]+$/, '').replace(/,/g, '');
    if ((digits.match(/\./g) ?? []).length > 1) {
      digits = digits.replace(/\./g, '');
    }
    const amount = Number.parseFloat(digits) || 0;
    return { amount, currency };
  }

  /**
   * Parses surface labels such as `"180 m2 construccion, 250 m2 terreno"`.
   *
   * When no label qualifies the figure and the text holds exactly one number,
   * that number is taken as the constructed area.
   */
  private parseAreas(superficieStr?: string): { land: number | null; constructed: number | null } {
    const result = { land: null as number | null, constructed: null as number | null };
    if (!superficieStr) return result;

    const toNumber = (token: string): number => Number.parseFloat(token.replace(/,/g, ''));

    const constMatch = superficieStr.match(/(\d[\d,]*(?:\.\d+)?)\s*m(?:2|²)?\s*(?:de\s+)?const/i);
    const landMatch = superficieStr.match(/(\d[\d,]*(?:\.\d+)?)\s*m(?:2|²)?\s*(?:de\s+)?(?:terr|lote)/i);
    if (constMatch) result.constructed = toNumber(constMatch[1]);
    if (landMatch) result.land = toNumber(landMatch[1]);

    if (!result.constructed && !result.land) {
      // Drop the unit exponent first so the "2" of "m2" is not counted as a number.
      const withoutUnit = superficieStr.replace(/m\s*[2²]/gi, 'm');
      const numbers = withoutUnit.match(/\d[\d,]*(?:\.\d+)?/g);
      if (numbers && numbers.length === 1) {
        result.constructed = toNumber(numbers[0]);
      }
    }
    return result;
  }

  /**
   * Resolves the free-text address into a structured location.
   *
   * The geocoder result is preferred; colony and municipality fall back to
   * what can be read from the address text. If the geocoder throws, only the
   * manual parsing is used and confidence stays at 0.
   */
  private async normalizeLocation(rawAddress?: string): Promise<NormalizedProperty['location']> {
    const defaultLocation: NormalizedProperty['location'] = {
      rawAddress: rawAddress || '',
      street: null,
      neighborhood: '',
      municipality: '',
      state: 'Jalisco',
      postalCode: null,
      country: 'Mexico',
      coordinates: null,
      geocodeConfidence: 0,
    };
    if (!rawAddress) return defaultLocation;

    const parsedNeighborhood = this.extractColonia(rawAddress);
    const parsedMunicipality = this.extractMunicipio(rawAddress);

    try {
      const geocoded = await this.geocoder.geocode(rawAddress);
      return {
        rawAddress,
        street: geocoded.street,
        neighborhood: geocoded.neighborhood || parsedNeighborhood,
        // Prefer what the address itself says before assuming the default city.
        // NOTE(review): 'Guadalajara' as last resort is kept from the original; confirm it is wanted.
        municipality: geocoded.municipality || parsedMunicipality || 'Guadalajara',
        state: geocoded.state || 'Jalisco',
        postalCode: geocoded.postalCode,
        country: 'Mexico',
        coordinates: geocoded.coordinates,
        geocodeConfidence: geocoded.confidence,
      };
    } catch {
      // Fallback: manual parsing only.
      return {
        ...defaultLocation,
        neighborhood: parsedNeighborhood,
        municipality: parsedMunicipality,
      };
    }
  }

  /** Extracts the colony from patterns like "Col. Providencia" or "Colonia Americana". */
  private extractColonia(address: string): string {
    const match = address.match(/\b(?:colonia|col\.?)\s+([^,]+)/i);
    return match ? match[1].trim() : '';
  }

  /** Returns the first known municipality mentioned in the address, ignoring case and accents. */
  private extractMunicipio(address: string): string {
    const haystack = this.foldText(address);
    return (
      PropertyNormalizer.MUNICIPALITIES.find(name => haystack.includes(this.foldText(name))) ?? ''
    );
  }

  /**
   * Classifies the listing by whole-word keywords. The title is consulted
   * first and the description only when the title is inconclusive, so a plot
   * described as "ideal para casa" is still a plot.
   */
  private detectPropertyType(raw: RawPropertyData): PropertyType {
    const sources = [raw.titulo ?? '', raw.descripcion ?? ''].map(text => this.foldText(text));
    for (const text of sources) {
      for (const [typeKey, pattern] of PropertyNormalizer.PROPERTY_TYPE_KEYWORDS) {
        if (pattern.test(text)) return PropertyType[typeKey];
      }
    }
    return PropertyType.OTRO;
  }

  /**
   * Detects rental/transfer listings from the title and URL using whole
   * words, so "rentable" or a domain such as "rentacasa" do not count.
   * Defaults to sale.
   */
  private detectTransactionType(raw: RawPropertyData): TransactionType {
    // Turn URL separators (-, _, /) into spaces so word boundaries work on slugs.
    const text = this.foldText(`${raw.titulo ?? ''} ${raw.sourceUrl ?? ''}`).replace(/[^a-z0-9]+/g, ' ');
    if (/\b(?:rent(?:a|as|an|o)|alquiler(?:es)?)\b/.test(text)) {
      return TransactionType.RENTA;
    }
    if (/\btraspasos?\b/.test(text)) {
      return TransactionType.TRASPASO;
    }
    return TransactionType.VENTA;
  }

  /**
   * Reduces a Mexican phone number to its digits, dropping the `52` country
   * code and the legacy `521` mobile prefix. Other lengths are returned as-is.
   */
  private normalizePhone(phone?: string | null): string | null {
    if (!phone) return null;
    const digits = phone.replace(/\D/g, '');
    if (digits.length === 12 && digits.startsWith('52')) {
      return digits.substring(2);
    }
    if (digits.length === 13 && digits.startsWith('521')) {
      return digits.substring(3);
    }
    return digits || null;
  }

  /** Normalizes, de-duplicates and orders image URLs; the first one is the main image. */
  private normalizeImages(images?: string[]): NormalizedProperty['images'] {
    if (!images || images.length === 0) return [];
    const urls = [...new Set(images.map(url => this.normalizeImageUrl(url)).filter(url => url !== ''))];
    return urls.map((url, index) => ({
      url,
      // Derived from the normalized URL so the thumbnail is HTTPS as well.
      thumbnailUrl: this.generateThumbnailUrl(url),
      order: index,
      isMain: index === 0,
    }));
  }

  /** Trims the URL and forces HTTPS, including protocol-relative `//host/...` URLs. */
  private normalizeImageUrl(url: string): string {
    return url.trim().replace(/^\/\//, 'https://').replace(/^http:/i, 'https:');
  }

  /** Derives the thumbnail URL (depends on the CDN in use). */
  private generateThumbnailUrl(url: string): string {
    return url.replace('/images/', '/thumbnails/');
  }

  /**
   * Maps amenity labels onto canonical names, ignoring case and accents, and
   * removes blanks and duplicates. Unknown labels are kept trimmed.
   */
  private normalizeAmenities(amenities?: string[]): string[] {
    if (!amenities) return [];
    const normalized = new Set<string>();
    for (const amenity of amenities) {
      const label = amenity.trim();
      if (!label) continue;
      const folded = this.foldText(label);
      const alias = PropertyNormalizer.AMENITY_ALIASES.find(([keyword]) => folded.includes(keyword));
      normalized.add(alias ? alias[1] : label);
    }
    return Array.from(normalized);
  }

  /**
   * Reads the first number in the text, keeping decimals ("2.5 banos" is 2.5,
   * not 25).
   */
  private parseNumber(str?: string): number | null {
    if (!str) return null;
    const match = str.match(/\d+(?:\.\d+)?/);
    return match ? Number.parseFloat(match[0]) : null;
  }

  /** Looks for "N estacionamiento(s)/cochera(s)/parking" in the description and amenities. */
  private extractParkingSpaces(raw: RawPropertyData): number | null {
    const text = this.foldText([raw.descripcion ?? '', ...(raw.amenidades ?? [])].join(' '));
    const match = text.match(/\b(\d{1,2})\s*(?:cajon(?:es)?\s+de\s+)?(?:estacionamiento|cochera|parking)/);
    return match ? Number.parseInt(match[1], 10) : null;
  }

  /** Deterministic 32-hex-char id derived from source + sourceId. */
  private generateId(raw: RawPropertyData): string {
    const input = `${raw.source}:${raw.sourceId}`;
    return createHash('sha256').update(input).digest('hex').substring(0, 32);
  }

  /**
   * Scores completeness of the raw record from 100 downwards: 20 per missing
   * required field, 5 per missing optional field, 10 per warning.
   *
   * Blank strings and empty arrays count as missing. The low-price warning
   * applies to MXN sale listings only, because rents are legitimately low.
   */
  private calculateDataQuality(raw: RawPropertyData): NormalizedProperty['dataQuality'] {
    const requiredFields: Array<keyof RawPropertyData> = ['titulo', 'precio', 'ubicacion', 'superficie'];
    const optionalFields: Array<keyof RawPropertyData> = ['recamaras', 'banos', 'descripcion', 'imagenes'];

    const isMissing = (field: keyof RawPropertyData): boolean => {
      const value = raw[field];
      if (value == null) return true;
      if (typeof value === 'string') return value.trim() === '';
      if (Array.isArray(value)) return value.length === 0;
      return false;
    };
    const missingRequired = requiredFields.filter(isMissing);
    const missingOptional = optionalFields.filter(isMissing);

    const warnings: string[] = [];
    const price = this.parsePrice(raw.precio);
    const isSale = this.detectTransactionType(raw) === TransactionType.VENTA;
    if (
      raw.precio &&
      isSale &&
      price.currency === 'MXN' &&
      price.amount < PropertyNormalizer.LOW_SALE_PRICE_MXN
    ) {
      warnings.push('Precio sospechosamente bajo');
    }
    // An empty gallery is already counted as a missing field above.
    if (raw.imagenes && raw.imagenes.length > 0 && raw.imagenes.length < 3) {
      warnings.push('Pocas imagenes');
    }

    const score = Math.max(
      0,
      100 - missingRequired.length * 20 - missingOptional.length * 5 - warnings.length * 10,
    );
    return {
      score,
      missingFields: [...missingRequired, ...missingOptional],
      warnings,
    };
  }
}
```
---
## 5. Servicio de Geocoding
```typescript
// src/etl/services/geocoding.service.ts
import { Redis } from 'ioredis';
/**
 * Structured address returned by the geocoding service. Every component is
 * null when the provider did not supply it.
 */
interface GeocodedResult {
street: string | null;
neighborhood: string | null;
municipality: string | null;
state: string | null;
postalCode: string | null;
// null when the address could not be located
coordinates: { lat: number; lng: number } | null;
// 0 when nothing was found; otherwise derived from the provider's importance score
confidence: number;
}
/**
 * Address geocoder backed by Nominatim, with a Redis cache in front and a
 * process-wide throttle of roughly one request per second.
 */
export class GeocodingService {
  /** How long a provider answer (match or genuine "no match") stays cached: 30 days. */
  private static readonly CACHE_TTL_SECONDS = 60 * 60 * 24 * 30;

  private redis: Redis;
  private nominatimUrl = 'https://nominatim.openstreetmap.org/search';
  private rateLimiter: { lastCall: number; minInterval: number };
  /** Tail of the promise chain that hands out request slots one at a time. */
  private rateLimitQueue: Promise<void> = Promise.resolve();

  constructor() {
    const redisUrl = process.env.REDIS_URL;
    // Without REDIS_URL the client falls back to its own default connection.
    this.redis = redisUrl ? new Redis(redisUrl) : new Redis();
    this.rateLimiter = { lastCall: 0, minInterval: 1100 }; // 1 req/sec for Nominatim
  }

  /**
   * Geocodes an address, consulting the cache first.
   *
   * Provider answers are cached, including "address not found". Transport
   * and HTTP failures are NOT cached, so a temporary outage does not blank
   * out an address for the whole cache lifetime.
   *
   * @param address Free-text address.
   * @returns The geocoded components; an all-null result with confidence 0
   *   when the address is unknown or the provider could not be reached.
   */
  async geocode(address: string): Promise<GeocodedResult> {
    // 1. Check cache
    const cacheKey = `geocode:${this.hashAddress(address)}`;
    const cached = await this.redis.get(cacheKey);
    if (cached) {
      try {
        return JSON.parse(cached) as GeocodedResult;
      } catch {
        // Corrupt cache entry: ignore it and refresh from the provider.
      }
    }

    // 2. Rate limiting
    await this.enforceRateLimit();

    // 3. Call geocoding API
    const result = await this.callNominatim(address);
    if (result === null) {
      return this.emptyResult();
    }

    // 4. Cache result (30 days)
    await this.redis.setex(cacheKey, GeocodingService.CACHE_TTL_SECONDS, JSON.stringify(result));
    return result;
  }

  /**
   * Performs the Nominatim lookup.
   *
   * @returns The mapped result, an empty result when the provider answered
   *   with no matches, or `null` when the request itself failed (network
   *   error, non-2xx status, unexpected payload).
   */
  private async callNominatim(address: string): Promise<GeocodedResult | null> {
    const params = new URLSearchParams({
      q: `${address}, Jalisco, Mexico`,
      format: 'json',
      addressdetails: '1',
      limit: '1',
    });
    try {
      const response = await fetch(`${this.nominatimUrl}?${params}`, {
        headers: {
          'User-Agent': 'InmobiliariaAnalytics/1.0',
        },
      });
      if (!response.ok) {
        console.error(`Geocoding error: HTTP ${response.status}`);
        return null;
      }
      const data: unknown = await response.json();
      if (!Array.isArray(data)) {
        console.error('Geocoding error: unexpected response payload');
        return null;
      }
      if (data.length === 0) {
        return this.emptyResult();
      }

      const result = data[0];
      const addr = result.address || {};
      const lat = Number.parseFloat(result.lat);
      const lng = Number.parseFloat(result.lon);
      const hasCoordinates = Number.isFinite(lat) && Number.isFinite(lng);
      return {
        street: addr.road || addr.street || null,
        neighborhood: addr.suburb || addr.neighbourhood || null,
        municipality: addr.city || addr.town || addr.municipality || null,
        state: addr.state || null,
        postalCode: addr.postcode || null,
        coordinates: hasCoordinates ? { lat, lng } : null,
        confidence: this.calculateConfidence(result),
      };
    } catch (error) {
      console.error('Geocoding error:', error);
      return null;
    }
  }

  /**
   * Derives a 0-100 confidence from the provider's `importance`, adding 20
   * points when the match is a house or building.
   */
  private calculateConfidence(result: any): number {
    const importance = Number(result.importance) || 0;
    const type = result.type;
    let confidence = importance * 100;
    // Bonus for a precise match type.
    if (type === 'house' || type === 'building') {
      confidence += 20;
    }
    return Math.round(Math.min(100, Math.max(0, confidence)));
  }

  /**
   * Waits until at least `minInterval` ms have passed since the previous
   * request slot. Callers are queued on a promise chain, so concurrent
   * `geocode` calls are released one by one instead of all at once.
   */
  private enforceRateLimit(): Promise<void> {
    const slot = this.rateLimitQueue.then(async () => {
      const elapsed = Date.now() - this.rateLimiter.lastCall;
      if (elapsed < this.rateLimiter.minInterval) {
        await new Promise<void>(resolve =>
          setTimeout(resolve, this.rateLimiter.minInterval - elapsed)
        );
      }
      this.rateLimiter.lastCall = Date.now();
    });
    this.rateLimitQueue = slot;
    return slot;
  }

  /** Cache key component: MD5 of the trimmed, lower-cased address. */
  private hashAddress(address: string): string {
    const crypto = require('crypto');
    return crypto.createHash('md5').update(address.toLowerCase().trim()).digest('hex');
  }

  /** Result used when the address is unknown. */
  private emptyResult(): GeocodedResult {
    return {
      street: null,
      neighborhood: null,
      municipality: null,
      state: null,
      postalCode: null,
      coordinates: null,
      confidence: 0,
    };
  }
}
```
---
## 6. Detector de Duplicados
```typescript
// src/etl/services/deduplication.service.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';
/** An existing property that may describe the same real-world listing. */
interface DuplicateCandidate {
// id of the existing row in `properties`
id: string;
// similarity score; higher means more likely the same property
similarity: number;
// names of the attributes that were found to agree
matchedFields: string[];
}
/**
 * Finds listings from other sources that describe the same property and
 * merges confirmed duplicates.
 */
export class DeduplicationService {
  private db: Pool;

  /**
   * @param db Optional connection pool to reuse; by default a new pool is
   *   created from `DATABASE_URL`.
   */
  constructor(db?: Pool) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
  }

  /**
   * Returns duplicate candidates for `property`, best match first. Each
   * existing property appears at most once, with its highest score.
   */
  async findDuplicates(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    const bestById = new Map<string, DuplicateCandidate>();

    // 1. Near-exact match from another source (same spot, price and title).
    const exactMatch = await this.findExactMatch(property);
    if (exactMatch) {
      bestById.set(exactMatch.id, { ...exactMatch, similarity: 1.0 });
    }

    // 2. Fuzzy matching on listing characteristics.
    for (const candidate of await this.findFuzzyMatches(property)) {
      const known = bestById.get(candidate.id);
      if (!known || candidate.similarity > known.similarity) {
        bestById.set(candidate.id, candidate);
      }
    }

    return [...bestById.values()].sort((a, b) => b.similarity - a.similarity);
  }

  /**
   * Looks for the same property published by a different source: same type,
   * price within 5%, within 100 m, and then title similarity above 0.8 at
   * less than 50 m. Requires coordinates.
   */
  private async findExactMatch(property: NormalizedProperty): Promise<DuplicateCandidate | null> {
    const coordinates = property.location.coordinates;
    if (!coordinates) return null;

    const query = `
      SELECT id, source, source_id, title, price,
        ST_Distance(
          coordinates::geography,
          ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography
        ) AS distance_meters
      FROM properties
      WHERE source != $3
        AND merged_into IS NULL
        AND price BETWEEN $4 * 0.95 AND $4 * 1.05
        AND property_type = $5
        AND ST_DWithin(
          coordinates::geography,
          ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography,
          100 -- 100 metres
        )
      ORDER BY distance_meters
      LIMIT 5
    `;
    const result = await this.db.query(query, [
      coordinates.lng,
      coordinates.lat,
      property.source,
      property.price,
      property.propertyType,
    ]);

    for (const row of result.rows) {
      const titleSimilarity = this.calculateTextSimilarity(property.title, row.title);
      if (titleSimilarity > 0.8 && Number(row.distance_meters) < 50) {
        return {
          id: row.id,
          similarity: 0.95,
          matchedFields: ['coordinates', 'price', 'title', 'property_type'],
        };
      }
    }
    return null;
  }

  /**
   * Scores active listings from other sources in the same neighbourhood, of
   * the same type and within 10% of the price; keeps those above 0.75.
   */
  private async findFuzzyMatches(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    // An unknown neighbourhood would match every row that also lacks one.
    if (!property.location.neighborhood) return [];

    const query = `
      SELECT id, title, price, bedrooms, bathrooms, constructed_area,
        neighborhood, coordinates
      FROM properties
      WHERE source != $1
        AND neighborhood = $2
        AND property_type = $3
        AND price BETWEEN $4 * 0.9 AND $4 * 1.1
        AND status = 'active'
      LIMIT 20
    `;
    const result = await this.db.query(query, [
      property.source,
      property.location.neighborhood,
      property.propertyType,
      property.price,
    ]);

    const candidates: DuplicateCandidate[] = [];
    for (const row of result.rows) {
      const similarity = this.calculatePropertySimilarity(property, row);
      if (similarity > 0.75) {
        candidates.push({
          id: row.id,
          similarity,
          matchedFields: this.getMatchedFields(property, row),
        });
      }
    }
    return candidates;
  }

  /**
   * Converts a database value to a finite number. DECIMAL/NUMERIC columns
   * arrive from the driver as strings, so they must be coerced before
   * comparing with `===`.
   */
  private toNumber(value: unknown): number | null {
    if (value === null || value === undefined || value === '') return null;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /**
   * Weighted similarity in [0, 1]: price 0.3, constructed area 0.25,
   * bedrooms 0.15, bathrooms 0.15, title 0.15. Components whose data is
   * missing on either side are left out and the rest is re-normalized.
   */
  private calculatePropertySimilarity(prop: NormalizedProperty, candidate: Record<string, unknown>): number {
    let score = 0;
    let totalWeight = 0;

    // Price (weight 0.3); skipped when our price is unknown (0) to avoid 0/0.
    const candidatePrice = this.toNumber(candidate.price);
    if (prop.price > 0 && candidatePrice !== null) {
      const priceDiff = Math.abs(prop.price - candidatePrice) / prop.price;
      score += (1 - Math.min(priceDiff, 1)) * 0.3;
      totalWeight += 0.3;
    }

    // Area (weight 0.25)
    const candidateArea = this.toNumber(candidate.constructed_area);
    if (prop.constructedArea && candidateArea) {
      const areaDiff = Math.abs(prop.constructedArea - candidateArea) / prop.constructedArea;
      score += (1 - Math.min(areaDiff, 1)) * 0.25;
      totalWeight += 0.25;
    }

    // Bedrooms (weight 0.15)
    const candidateBedrooms = this.toNumber(candidate.bedrooms);
    if (prop.bedrooms !== null && candidateBedrooms !== null) {
      score += (prop.bedrooms === candidateBedrooms ? 1 : 0) * 0.15;
      totalWeight += 0.15;
    }

    // Bathrooms (weight 0.15)
    const candidateBathrooms = this.toNumber(candidate.bathrooms);
    if (prop.bathrooms !== null && candidateBathrooms !== null) {
      score += (prop.bathrooms === candidateBathrooms ? 1 : 0) * 0.15;
      totalWeight += 0.15;
    }

    // Title (weight 0.15)
    const titleSim = this.calculateTextSimilarity(prop.title, String(candidate.title ?? ''));
    score += titleSim * 0.15;
    totalWeight += 0.15;

    return totalWeight > 0 ? score / totalWeight : 0;
  }

  /**
   * Jaccard similarity over lower-cased words. Two texts without any words
   * score 0, not 1.
   */
  private calculateTextSimilarity(text1: string, text2: string): number {
    const tokenize = (text: string): Set<string> =>
      new Set((text ?? '').toLowerCase().split(/\s+/).filter(Boolean));
    const words1 = tokenize(text1);
    const words2 = tokenize(text2);
    const union = new Set([...words1, ...words2]);
    if (union.size === 0) return 0;
    const shared = [...words1].filter(word => words2.has(word)).length;
    return shared / union.size;
  }

  /** Lists which attributes agree; an attribute unknown on either side never counts. */
  private getMatchedFields(prop: NormalizedProperty, candidate: Record<string, unknown>): string[] {
    const matched: string[] = [];

    const candidatePrice = this.toNumber(candidate.price);
    if (prop.price > 0 && candidatePrice !== null &&
        Math.abs(prop.price - candidatePrice) / prop.price < 0.05) {
      matched.push('price');
    }
    if (prop.bedrooms !== null && prop.bedrooms === this.toNumber(candidate.bedrooms)) {
      matched.push('bedrooms');
    }
    if (prop.bathrooms !== null && prop.bathrooms === this.toNumber(candidate.bathrooms)) {
      matched.push('bathrooms');
    }
    if (prop.location.neighborhood && prop.location.neighborhood === candidate.neighborhood) {
      matched.push('neighborhood');
    }
    return matched;
  }

  /**
   * Folds `duplicateIds` into `primaryId` inside one transaction: records an
   * alias per duplicate (using the columns the aliases table actually has),
   * re-points aliases that referenced a duplicate, and marks the duplicates
   * as merged. The primary id itself is ignored if present in the list.
   *
   * @throws Re-throws any database error after rolling back.
   */
  async mergeProperties(
    primaryId: string,
    duplicateIds: string[]
  ): Promise<void> {
    const targets = [...new Set(duplicateIds)].filter(id => id !== primaryId);
    if (targets.length === 0) return;

    const client = await this.db.connect();
    try {
      await client.query('BEGIN');

      // Record each duplicate's source identity as an alias of the primary.
      await client.query(`
        INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url, merged_at)
        SELECT $1, source, source_id, source_url, NOW()
        FROM properties
        WHERE id = ANY($2)
        ON CONFLICT DO NOTHING
      `, [primaryId, targets]);

      // Aliases that pointed at a duplicate now belong to the primary.
      await client.query(`
        UPDATE property_aliases
        SET primary_id = $1
        WHERE primary_id = ANY($2)
      `, [primaryId, targets]);

      // Mark duplicates as merged.
      await client.query(`
        UPDATE properties
        SET status = 'merged', merged_into = $1
        WHERE id = ANY($2)
      `, [primaryId, targets]);

      await client.query('COMMIT');
    } catch (error) {
      // Keep the original failure even if the rollback itself fails.
      await client.query('ROLLBACK').catch(() => undefined);
      throw error;
    } finally {
      client.release();
    }
  }
}
```
---
## 7. Loader (Carga a Base de Datos)
```typescript
// src/etl/loaders/property.loader.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';
import { DeduplicationService } from '../services/deduplication.service';
/**
 * Writes normalized properties to the database: refreshes listings that are
 * already known, links cross-source duplicates as aliases, and inserts the
 * rest.
 */
export class PropertyLoader {
  private db: Pool;
  private deduper: DeduplicationService;

  /**
   * @param db Optional connection pool; defaults to a new pool on `DATABASE_URL`.
   * @param deduper Optional deduplication service; defaults to a new instance.
   */
  constructor(db?: Pool, deduper?: DeduplicationService) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
    this.deduper = deduper ?? new DeduplicationService();
  }

  /**
   * Loads one property.
   *
   * @returns `updated` when the (source, sourceId) pair already has a row,
   *   `duplicate` when the listing is (or was previously linked as) an alias
   *   of another property, `inserted` otherwise. `id` is the affected row.
   */
  async load(property: NormalizedProperty): Promise<{ action: 'inserted' | 'updated' | 'duplicate'; id: string }> {
    // 1. Already stored under the same source + sourceId?
    const existing = await this.findExisting(property.source, property.sourceId);
    if (existing) {
      await this.update(existing.id, property);
      return { action: 'updated', id: existing.id };
    }

    // 2. Already linked as an alias on a previous run? Then skip the costly
    //    duplicate search and avoid re-inserting it if its score drifted.
    const aliasedTo = await this.findAliasPrimary(property.source, property.sourceId);
    if (aliasedTo) {
      await this.linkDuplicate(aliasedTo, property);
      return { action: 'duplicate', id: aliasedTo };
    }

    // 3. Look for duplicates published by other sources.
    const duplicates = await this.deduper.findDuplicates(property);
    if (duplicates.length > 0 && duplicates[0].similarity > 0.9) {
      await this.linkDuplicate(duplicates[0].id, property);
      return { action: 'duplicate', id: duplicates[0].id };
    }

    // 4. Insert as a new property.
    const id = await this.insert(property);
    return { action: 'inserted', id };
  }

  /** Finds the row stored for this exact source listing, if any. */
  private async findExisting(source: string, sourceId: string): Promise<{ id: string } | null> {
    const result = await this.db.query(
      'SELECT id FROM properties WHERE source = $1 AND source_id = $2',
      [source, sourceId]
    );
    return result.rows[0] || null;
  }

  /** Returns the primary property this source listing is an alias of, if any. */
  private async findAliasPrimary(source: string, sourceId: string): Promise<string | null> {
    const result = await this.db.query(
      'SELECT primary_id FROM property_aliases WHERE alias_source = $1 AND alias_source_id = $2',
      [source, sourceId]
    );
    return result.rows[0]?.primary_id ?? null;
  }

  /**
   * Inserts the property and returns its id. If another worker inserted the
   * same (source, source_id) in the meantime, the existing row's
   * `last_seen_at` is refreshed and its id returned instead of failing on
   * the unique constraint.
   */
  private async insert(property: NormalizedProperty): Promise<string> {
    const query = `
      INSERT INTO properties (
        id, source, source_id, source_url,
        title, description, property_type, transaction_type,
        price, currency, price_per_sqm,
        land_area, constructed_area,
        bedrooms, bathrooms, parking_spaces, floors, year_built,
        raw_address, street, neighborhood, municipality, state, postal_code, country,
        coordinates, geocode_confidence,
        images, virtual_tour, video,
        amenities,
        agent_name, agent_phone, agent_email, agent_agency,
        first_seen_at, last_seen_at, published_at, status,
        data_quality_score, missing_fields, data_warnings
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
        $11, $12, $13, $14, $15, $16, $17, $18,
        $19, $20, $21, $22, $23, $24, $25,
        ST_SetSRID(ST_MakePoint($26, $27), 4326), $28,
        $29, $30, $31, $32,
        $33, $34, $35, $36,
        $37, $38, $39, $40,
        $41, $42, $43
      )
      ON CONFLICT (source, source_id) DO UPDATE
        SET last_seen_at = EXCLUDED.last_seen_at
      RETURNING id
    `;
    const coords = property.location.coordinates;
    const result = await this.db.query(query, [
      property.id,
      property.source,
      property.sourceId,
      property.sourceUrl,
      property.title,
      property.description,
      property.propertyType,
      property.transactionType,
      property.price,
      property.currency,
      property.pricePerSqm,
      property.landArea,
      property.constructedArea,
      property.bedrooms,
      property.bathrooms,
      property.parkingSpaces,
      property.floors,
      property.yearBuilt,
      property.location.rawAddress,
      property.location.street,
      property.location.neighborhood,
      property.location.municipality,
      property.location.state,
      property.location.postalCode,
      property.location.country,
      // `??` keeps a legitimate 0 coordinate; only missing values become NULL.
      coords?.lng ?? null,
      coords?.lat ?? null,
      property.location.geocodeConfidence,
      JSON.stringify(property.images),
      property.virtualTour,
      property.video,
      property.amenities,
      property.agent.name,
      property.agent.phone,
      property.agent.email,
      property.agent.agency,
      property.firstSeenAt,
      property.lastSeenAt,
      property.publishedAt,
      property.status,
      property.dataQuality.score,
      property.dataQuality.missingFields,
      property.dataQuality.warnings,
    ]);
    return result.rows[0].id;
  }

  /** Refreshes the volatile fields of a known listing and its timestamps. */
  private async update(id: string, property: NormalizedProperty): Promise<void> {
    const query = `
      UPDATE properties SET
        title = $2,
        description = $3,
        price = $4,
        price_per_sqm = $5,
        last_seen_at = NOW(),
        updated_at = NOW(),
        data_quality_score = $6,
        images = $7
      WHERE id = $1
    `;
    await this.db.query(query, [
      id,
      property.title,
      property.description,
      property.price,
      property.pricePerSqm,
      property.dataQuality.score,
      JSON.stringify(property.images),
    ]);
  }

  /** Records the listing as an alias of `existingId` and bumps the primary's last-seen time. */
  private async linkDuplicate(existingId: string, property: NormalizedProperty): Promise<void> {
    // Register as alias.
    await this.db.query(`
      INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url)
      VALUES ($1, $2, $3, $4)
      ON CONFLICT DO NOTHING
    `, [existingId, property.source, property.sourceId, property.sourceUrl]);
    // Update last_seen of the primary.
    await this.db.query(`
      UPDATE properties SET last_seen_at = NOW() WHERE id = $1
    `, [existingId]);
  }
}
```
---
## 8. Esquema de Base de Datos
```sql
-- Main properties table: one row per listing, keyed by an internal id and
-- unique per (source, source_id).
-- NOTE(review): GEOMETRY(Point, 4326) requires the PostGIS extension to be installed.
CREATE TABLE properties (
-- Identity and origin of the listing
id VARCHAR(32) PRIMARY KEY,
source VARCHAR(50) NOT NULL,
source_id VARCHAR(100) NOT NULL,
source_url TEXT NOT NULL,
-- Basic information
title VARCHAR(500) NOT NULL,
description TEXT,
property_type VARCHAR(50) NOT NULL,
transaction_type VARCHAR(20) NOT NULL,
-- Price
price DECIMAL(15,2) NOT NULL,
currency VARCHAR(3) DEFAULT 'MXN',
price_per_sqm DECIMAL(10,2),
-- Surface in square metres
land_area DECIMAL(10,2),
constructed_area DECIMAL(10,2),
-- Features (bathrooms is decimal so half bathrooms can be stored)
bedrooms SMALLINT,
bathrooms DECIMAL(3,1),
parking_spaces SMALLINT,
floors SMALLINT,
year_built SMALLINT,
-- Location
raw_address TEXT,
street VARCHAR(200),
neighborhood VARCHAR(100),
municipality VARCHAR(100),
state VARCHAR(50),
postal_code VARCHAR(10),
country VARCHAR(50) DEFAULT 'Mexico',
-- WGS 84 point (SRID 4326)
coordinates GEOMETRY(Point, 4326),
geocode_confidence SMALLINT,
-- Media
images JSONB DEFAULT '[]',
virtual_tour TEXT,
video TEXT,
amenities TEXT[],
-- Publisher / agent contact
agent_name VARCHAR(200),
agent_phone VARCHAR(20),
agent_email VARCHAR(200),
agent_agency VARCHAR(200),
-- Lifecycle
first_seen_at TIMESTAMP NOT NULL,
last_seen_at TIMESTAMP NOT NULL,
published_at TIMESTAMP,
status VARCHAR(20) DEFAULT 'active',
-- Set when the row was folded into another property by deduplication
merged_into VARCHAR(32) REFERENCES properties(id),
-- Data quality
data_quality_score SMALLINT,
missing_fields TEXT[],
data_warnings TEXT[],
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW(),
UNIQUE(source, source_id)
);
-- Indexes
CREATE INDEX idx_properties_location ON properties USING GIST(coordinates);
CREATE INDEX idx_properties_neighborhood ON properties(neighborhood);
CREATE INDEX idx_properties_price ON properties(price);
CREATE INDEX idx_properties_type ON properties(property_type);
CREATE INDEX idx_properties_status ON properties(status);
CREATE INDEX idx_properties_last_seen ON properties(last_seen_at);
-- Aliases table (duplicate listings): each row links a listing from some
-- source to the primary property it duplicates.
CREATE TABLE property_aliases (
id SERIAL PRIMARY KEY,
primary_id VARCHAR(32) REFERENCES properties(id),
alias_source VARCHAR(50) NOT NULL,
alias_source_id VARCHAR(100) NOT NULL,
alias_url TEXT,
merged_at TIMESTAMP DEFAULT NOW(),
UNIQUE(alias_source, alias_source_id)
);
-- Price history: one row per recorded price of a property.
CREATE TABLE price_history (
id SERIAL PRIMARY KEY,
property_id VARCHAR(32) REFERENCES properties(id),
price DECIMAL(15,2) NOT NULL,
currency VARCHAR(3) NOT NULL,
recorded_at TIMESTAMP DEFAULT NOW()
);
CREATE INDEX idx_price_history_property ON price_history(property_id);
```
---
## 9. Tests
```typescript
// src/etl/__tests__/normalizer.test.ts
import { PropertyNormalizer } from '../transformers/normalizer';
import { GeocodingService } from '../services/geocoding.service';
jest.mock('../services/geocoding.service');
describe('PropertyNormalizer', () => {
  type RawInput = Parameters<PropertyNormalizer['normalize']>[0];

  let normalizer: PropertyNormalizer;

  /** Builds a minimal raw listing for the given source id plus the fields under test. */
  const rawListing = (sourceId: string, fields: Partial<RawInput> = {}): RawInput => ({
    source: 'test',
    sourceId,
    sourceUrl: `http://test.com/${sourceId}`,
    scrapedAt: new Date(),
    ...fields,
  });

  beforeEach(() => {
    // The geocoder module is auto-mocked; every lookup resolves to the same place.
    const geocoderStub = new GeocodingService();
    (geocoderStub.geocode as jest.Mock).mockResolvedValue({
      street: 'Av. Providencia',
      neighborhood: 'Providencia',
      municipality: 'Guadalajara',
      state: 'Jalisco',
      postalCode: '44630',
      coordinates: { lat: 20.6736, lng: -103.3927 },
      confidence: 85,
    });
    normalizer = new PropertyNormalizer(geocoderStub);
  });

  describe('parsePrice', () => {
    it('should parse MXN price correctly', async () => {
      const { price, currency } = await normalizer.normalize(
        rawListing('123', { precio: '$4,500,000 MXN' }),
      );
      expect(price).toBe(4500000);
      expect(currency).toBe('MXN');
    });

    it('should parse USD price correctly', async () => {
      const { price, currency } = await normalizer.normalize(
        rawListing('124', { precio: '$350,000 USD' }),
      );
      expect(price).toBe(350000);
      expect(currency).toBe('USD');
    });
  });

  describe('parseAreas', () => {
    it('should parse both land and constructed areas', async () => {
      const { constructedArea, landArea } = await normalizer.normalize(
        rawListing('125', { superficie: '180 m2 construccion, 250 m2 terreno' }),
      );
      expect(constructedArea).toBe(180);
      expect(landArea).toBe(250);
    });
  });

  describe('detectPropertyType', () => {
    it('should detect departamento', async () => {
      const { propertyType } = await normalizer.normalize(
        rawListing('126', { titulo: 'Hermoso departamento en Providencia' }),
      );
      expect(propertyType).toBe('departamento');
    });
  });
});
```
---
## 10. Metricas y Monitoreo
```typescript
// ETL pipeline metrics
// NOTE(review): Counter and Histogram are not imported in this snippet —
// presumably they come from the project's metrics client; confirm the import.
export const etlMetrics = {
// Counters
// Properties pulled out of a source, labelled by source.
properties_extracted_total: new Counter({
name: 'etl_properties_extracted_total',
help: 'Total properties extracted',
labelNames: ['source'],
}),
// Properties that went through normalization, by source and detected type.
properties_normalized_total: new Counter({
name: 'etl_properties_normalized_total',
help: 'Total properties normalized',
labelNames: ['source', 'property_type'],
}),
// Outcome of the load step.
properties_loaded_total: new Counter({
name: 'etl_properties_loaded_total',
help: 'Total properties loaded',
labelNames: ['action'], // inserted, updated, duplicate
}),
// Geocoding lookups by outcome.
geocoding_requests_total: new Counter({
name: 'etl_geocoding_requests_total',
help: 'Total geocoding requests',
labelNames: ['status'], // success, error, cache_hit
}),
// Histograms
// Time spent normalizing one property, in seconds.
normalization_duration_seconds: new Histogram({
name: 'etl_normalization_duration_seconds',
help: 'Time to normalize a property',
buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
}),
// Distribution of data-quality scores; buckets span the 0-100 score range.
data_quality_score: new Histogram({
name: 'etl_data_quality_score',
help: 'Data quality scores',
buckets: [20, 40, 60, 80, 100],
}),
};
```
---
**Siguiente:** [ET-IA-007-proxies.md](./ET-IA-007-proxies.md)