---
id: "ET-SCR-002"
title: "Especificacion Tecnica - Pipeline ETL y Normalizacion"
type: "Technical Specification"
epic: "IAI-007"
status: "Draft"
version: "1.0"
project: "inmobiliaria-analytics"
created_date: "2026-01-04"
updated_date: "2026-01-04"
---

# ET-SCR-002: Pipeline ETL y Normalizacion

---

## 1. Resumen

Pipeline de Extract-Transform-Load para procesar datos crudos de propiedades scrapeadas, normalizarlos a un esquema unificado, enriquecerlos con geocoding y detectar duplicados.

---

## 2. Arquitectura del Pipeline

```
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   EXTRACT    │────▶│  TRANSFORM   │────▶│     LOAD     │
│              │     │              │     │              │
│ - Raw HTML   │     │ - Parse      │     │ - Validate   │
│ - JSON APIs  │     │ - Normalize  │     │ - Dedupe     │
│ - Sitemap    │     │ - Geocode    │     │ - Upsert     │
└──────────────┘     └──────────────┘     └──────────────┘
       │                    │                    │
       ▼                    ▼                    ▼
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   staging    │     │   enriched   │     │  properties  │
│    _raw      │     │   _staging   │     │   (final)    │
└──────────────┘     └──────────────┘     └──────────────┘
```

---

## 3.
Esquema de Datos

### 3.1 Raw Data (Entrada)

```typescript
/** One scraped listing exactly as an extractor produced it (all values still raw text). */
interface RawPropertyData {
  source: string;
  sourceId: string;
  sourceUrl: string;
  scrapedAt: Date;
  rawHtml?: string;
  rawJson?: Record<string, unknown>;

  // Extracted fields (may vary by source)
  titulo?: string;
  precio?: string;
  ubicacion?: string;
  superficie?: string;
  recamaras?: string;
  banos?: string;
  descripcion?: string;
  imagenes?: string[];
  amenidades?: string[];
  contacto?: {
    nombre?: string;
    telefono?: string;
    email?: string;
  };
}
```

### 3.2 Normalized Data (Salida)

```typescript
/** Unified listing shape produced by the normalizer and persisted by the loader. */
interface NormalizedProperty {
  // Identifiers
  id: string; // internal id (32 hex chars derived from source + sourceId)
  sourceId: string;
  source: string;
  sourceUrl: string;

  // Basic information
  title: string;
  description: string;
  propertyType: PropertyType;
  transactionType: TransactionType;

  // Pricing
  price: number;
  currency: 'MXN' | 'USD';
  pricePerSqm: number | null;

  // Surface
  landArea: number | null; // m2 of land
  constructedArea: number | null; // m2 built

  // Features
  bedrooms: number | null;
  bathrooms: number | null;
  parkingSpaces: number | null;
  floors: number | null;
  yearBuilt: number | null;

  // Location
  location: {
    rawAddress: string;
    street: string | null;
    neighborhood: string; // colonia
    municipality: string; // municipio
    state: string;
    postalCode: string | null;
    country: string;
    coordinates: {
      lat: number;
      lng: number;
    } | null;
    geocodeConfidence: number;
  };

  // Media
  images: PropertyImage[];
  virtualTour: string | null;
  video: string | null;

  // Amenities
  amenities: string[];

  // Contact
  agent: {
    name: string | null;
    phone: string | null;
    email: string | null;
    agency: string | null;
  };

  // Metadata
  firstSeenAt: Date;
  lastSeenAt: Date;
  publishedAt: Date | null;
  status: PropertyStatus;

  // Data quality
  dataQuality: {
    score: number; // 0-100
    missingFields: string[];
    warnings: string[];
  };
}

enum PropertyType {
  CASA = 'casa',
  DEPARTAMENTO = 'departamento',
  TERRENO = 'terreno',
  LOCAL_COMERCIAL = 'local_comercial',
  OFICINA = 'oficina',
  BODEGA = 'bodega',
  EDIFICIO = 'edificio',
  OTRO = 'otro'
}

enum TransactionType {
  VENTA = 'venta',
  RENTA = 'renta',
  TRASPASO = 'traspaso'
}

enum PropertyStatus {
  ACTIVE = 'active',
  SOLD = 'sold',
  RENTED = 'rented',
  INACTIVE = 'inactive',
  REMOVED = 'removed',
  // Written by DeduplicationService.mergeProperties for rows folded into another listing.
  MERGED = 'merged'
}

interface PropertyImage {
  url: string;
  thumbnailUrl: string | null;
  order: number;
  isMain: boolean;
}
```

---

## 4. Implementacion del Pipeline

### 4.1 Extractor Base

```typescript
// src/etl/extractors/base.extractor.ts
import { RawPropertyData } from '../types';

/**
 * Contract shared by every per-portal extractor, plus small text helpers.
 * Subclasses return partial records; fields they cannot find are simply omitted.
 */
export abstract class BaseExtractor {
  abstract source: string;

  abstract extractFromHtml(html: string, url: string): Partial<RawPropertyData>;
  abstract extractFromJson(json: unknown, url: string): Partial<RawPropertyData>;

  /** Collapses every whitespace run to a single space and trims; nullish input yields ''. */
  protected cleanText(text: string | null | undefined): string {
    if (!text) return '';
    // \s already covers \n, \r and \t, so one pass is enough.
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Returns every number found in `text`, treating commas as thousands separators.
   * A match must start with a digit, so a stray "," can no longer produce NaN.
   */
  protected extractNumbers(text: string): number[] {
    const matches = text.match(/\d[\d,]*(?:\.\d+)?/g) ?? [];
    return matches.map(m => parseFloat(m.replace(/,/g, '')));
  }
}
```

### 4.2 Extractor Inmuebles24

```typescript
// src/etl/extractors/inmuebles24.extractor.ts
import * as cheerio from 'cheerio';
import { BaseExtractor } from './base.extractor';
import { RawPropertyData } from '../types';

/** Extractor for inmuebles24 listing pages and their embedded JSON-LD. */
export class Inmuebles24Extractor extends BaseExtractor {
  source = 'inmuebles24';

  /** Reads the listing fields from the page markup using the portal's CSS selectors. */
  extractFromHtml(html: string, url: string): Partial<RawPropertyData> {
    const $ = cheerio.load(html);

    return {
      source: this.source,
      sourceUrl: url,
      sourceId: this.extractSourceId(url),
      titulo: this.cleanText($('h1.title-type-sup').text()),
      precio: this.cleanText($('.price-value').text()),
      ubicacion: this.cleanText($('.location-container').text()),
      superficie: this.extractSuperficie($),
      recamaras: this.extractFeature($, 'recamaras'),
      banos: this.extractFeature($, 'banos'),
      descripcion: this.cleanText($('.description-content').text()),
      imagenes: this.extractImages($),
      amenidades: this.extractAmenidades($),
      contacto: {
        nombre: this.cleanText($('.publisher-name').text()),
        // `telefono` is typed as an optional string, so a missing attribute is
        // reported as undefined rather than null.
        telefono: $('[data-phone]').attr('data-phone') || undefined,
      },
    };
  }

  /**
   * Maps a JSON-LD `RealEstateListing` (or internal API payload) to raw fields.
   * Anything that is not such an object yields an empty record instead of throwing.
   */
  extractFromJson(json: unknown, url: string): Partial<RawPropertyData> {
    if (typeof json !== 'object' || json === null) return {};

    const listing = json as Record<string, unknown>;
    if (listing['@type'] !== 'RealEstateListing') return {};

    const offers = listing.offers as { price?: unknown } | null | undefined;
    const price = offers?.price;

    return {
      source: this.source,
      sourceUrl: url,
      sourceId: listing.identifier != null ? String(listing.identifier) : undefined,
      titulo: typeof listing.name === 'string' ? listing.name : undefined,
      precio: price != null ? String(price) : undefined,
      // ... map remaining fields
    };
  }

  /** Numeric listing id taken from a `/propiedades/<id>` URL segment, or '' when absent. */
  private extractSourceId(url: string): string {
    const match = url.match(/propiedades\/(\d+)/);
    return match ? match[1] : '';
  }

  private extractSuperficie($: cheerio.CheerioAPI): string {
    return this.cleanText($('.surface-container').text());
  }

  private extractFeature($: cheerio.CheerioAPI, feature: string): string {
    return this.cleanText($(`.feature-${feature}`).text());
  }

  /** Gallery image URLs in document order, preferring `src` over `data-src`. */
  private extractImages($: cheerio.CheerioAPI): string[] {
    const images: string[] = [];
    $('img.gallery-image').each((_, el) => {
      const src = $(el).attr('src') || $(el).attr('data-src');
      if (src) images.push(src);
    });
    return images;
  }

  private extractAmenidades($: cheerio.CheerioAPI): string[] {
    const amenities: string[] = [];
    $('.amenity-item').each((_, el) => {
      amenities.push(this.cleanText($(el).text()));
    });
    return amenities;
  }
}
```

### 4.3 Transformador/Normalizador

```typescript
// src/etl/transformers/normalizer.ts
import { createHash } from 'crypto';
import {
  RawPropertyData,
  NormalizedProperty,
  PropertyType,
  PropertyStatus,
  TransactionType,
} from '../types';
import { GeocodingService } from '../services/geocoding.service';

/** Longest title stored; matches the `title VARCHAR(500)` column. */
const MAX_TITLE_LENGTH = 500;

/** Sale prices in MXN below this amount are flagged as suspicious. */
const MIN_PLAUSIBLE_SALE_PRICE_MXN = 100000;

/** Raw amenity keyword (lower-case substring) -> canonical label. First matching key wins. */
const AMENITY_LABELS: Record<string, string> = {
  'alberca': 'Alberca',
  'piscina': 'Alberca',
  'jardin': 'Jardin',
  'gym': 'Gimnasio',
  'gimnasio': 'Gimnasio',
  'roof': 'Roof Garden',
  'terraza': 'Terraza',
  'seguridad': 'Seguridad 24/7',
  'vigilancia': 'Seguridad 24/7',
  'estacionamiento': 'Estacionamiento',
  'cochera': 'Estacionamiento',
};

/** Municipalities recognised by the manual address fallback (accent-free spelling). */
const KNOWN_MUNICIPALITIES = [
  'Guadalajara', 'Zapopan', 'Tlaquepaque', 'Tonala',
  'Tlajomulco', 'El Salto', 'Ixtlahuacan',
] as const;

/**
 * Ordered keyword rules for property-type detection; the first match wins.
 * Whole-word patterns avoid false hits such as "localizado" -> local or "escasa" -> casa.
 */
const PROPERTY_TYPE_RULES: ReadonlyArray<readonly [RegExp, PropertyType]> = [
  [/\b(?:departamento|depto)s?\b/, PropertyType.DEPARTAMENTO],
  [/\bcasas?\b/, PropertyType.CASA],
  [/\b(?:terreno|lote)s?\b/, PropertyType.TERRENO],
  [/\b(?:local(?:es)?|comercial(?:es)?)\b/, PropertyType.LOCAL_COMERCIAL],
  [/\boficinas?\b/, PropertyType.OFICINA],
  [/\bbodegas?\b/, PropertyType.BODEGA],
  [/\bedificios?\b/, PropertyType.EDIFICIO],
];

/** Removes combining accents so "Tonalá" compares equal to "Tonala". */
const stripAccents = (text: string): string =>
  text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');

/** Parses a number that may use commas as thousands separators ("1,200" -> 1200). */
const parseGroupedNumber = (text: string): number => parseFloat(text.replace(/,/g, ''));

/**
 * Turns a raw scraped listing into a `NormalizedProperty`: parses price and
 * areas, classifies the listing, geocodes the address and scores data quality.
 */
export class PropertyNormalizer {
  constructor(private geocoder: GeocodingService) {}

  /**
   * Normalizes one raw listing.
   *
   * @param raw Listing as produced by an extractor.
   * @returns The normalized listing. Missing numeric fields are `null`; a
   *   missing price is reported as 0 and reflected in `dataQuality`.
   */
  async normalize(raw: RawPropertyData): Promise<NormalizedProperty> {
    const id = this.generateId(raw);
    const price = this.parsePrice(raw.precio);
    const areas = this.parseAreas(raw.superficie);
    const bedrooms = this.parseNumber(raw.recamaras);
    const location = await this.normalizeLocation(raw.ubicacion);

    // Only meaningful when both the price and the built area are known.
    const pricePerSqm =
      price.amount > 0 && areas.constructed
        ? Math.round(price.amount / areas.constructed)
        : null;

    const normalized: NormalizedProperty = {
      id,
      // Listings whose id could not be extracted reuse the hash, so that
      // UNIQUE(source, source_id) still tells them apart.
      sourceId: raw.sourceId || id,
      source: raw.source,
      sourceUrl: raw.sourceUrl,

      title: this.normalizeTitle(raw.titulo),
      description: raw.descripcion || '',
      propertyType: this.detectPropertyType(raw),
      transactionType: this.detectTransactionType(raw),

      price: price.amount,
      currency: price.currency,
      pricePerSqm,

      landArea: areas.land,
      constructedArea: areas.constructed,

      // bedrooms is stored as SMALLINT; bathrooms may be fractional ("2.5").
      bedrooms: bedrooms === null ? null : Math.trunc(bedrooms),
      bathrooms: this.parseNumber(raw.banos),
      parkingSpaces: this.extractParkingSpaces(raw),
      floors: null,
      yearBuilt: null,

      location,

      images: this.normalizeImages(raw.imagenes),
      virtualTour: null,
      video: null,

      amenities: this.normalizeAmenities(raw.amenidades),

      agent: {
        name: raw.contacto?.nombre || null,
        phone: this.normalizePhone(raw.contacto?.telefono),
        email: raw.contacto?.email || null,
        agency: null,
      },

      firstSeenAt: raw.scrapedAt,
      lastSeenAt: raw.scrapedAt,
      publishedAt: null,
      status: PropertyStatus.ACTIVE,

      dataQuality: this.calculateDataQuality(raw),
    };

    return normalized;
  }

  /** Collapses whitespace and caps the title at the column length; missing titles become ''. */
  private normalizeTitle(title?: string): string {
    const cleaned = (title ?? '').replace(/\s+/g, ' ').trim();
    return cleaned.slice(0, MAX_TITLE_LENGTH);
  }

  /**
   * Parses a display price such as "$4,500,000 MXN".
   * "$" alone is ambiguous (pesos use it too), so USD requires an explicit
   * "USD" marker or "$" together with "dll". A missing price yields 0 MXN.
   */
  private parsePrice(priceStr?: string): { amount: number; currency: 'MXN' | 'USD' } {
    if (!priceStr) return { amount: 0, currency: 'MXN' };

    const isUsd = /usd/i.test(priceStr) || (priceStr.includes('$') && /dll/i.test(priceStr));
    const amount = parseFloat(priceStr.replace(/[^\d.]/g, '')) || 0;

    return { amount, currency: isUsd ? 'USD' : 'MXN' };
  }

  /**
   * Extracts built and land area (m2) from free text such as
   * "180 m2 construccion, 250 m2 terreno". Thousands separators are accepted.
   */
  private parseAreas(superficieStr?: string): { land: number | null; constructed: number | null } {
    const result = { land: null as number | null, constructed: null as number | null };
    if (!superficieStr) return result;

    const constMatch = superficieStr.match(/(\d[\d,]*(?:\.\d+)?)\s*m[2²]?\s*const/i);
    const landMatch = superficieStr.match(/(\d[\d,]*(?:\.\d+)?)\s*m[2²]?\s*(?:terr|lote)/i);

    if (constMatch) result.constructed = parseGroupedNumber(constMatch[1]);
    if (landMatch) result.land = parseGroupedNumber(landMatch[1]);

    // A single unlabeled number is assumed to be built area (houses/apartments).
    if (result.constructed === null && result.land === null) {
      // Drop the unit first so the "2" in "m2" is not counted as a second number.
      const withoutUnits = superficieStr.replace(/m\s*[2²]/gi, ' ');
      const numbers = withoutUnits.match(/\d[\d,]*(?:\.\d+)?/g);
      if (numbers && numbers.length === 1) {
        result.constructed = parseGroupedNumber(numbers[0]);
      }
    }

    return result;
  }

  /**
   * Geocodes the address and fills the gaps with manual parsing.
   * Fields the geocoder leaves empty fall back to the colonia/municipio found
   * in the raw text before the 'Guadalajara' default is applied.
   */
  private async normalizeLocation(rawAddress?: string): Promise<NormalizedProperty['location']> {
    const defaultLocation: NormalizedProperty['location'] = {
      rawAddress: rawAddress || '',
      street: null,
      neighborhood: '',
      municipality: '',
      state: 'Jalisco',
      postalCode: null,
      country: 'Mexico',
      coordinates: null,
      geocodeConfidence: 0,
    };

    if (!rawAddress) return defaultLocation;

    try {
      const geocoded = await this.geocoder.geocode(rawAddress);

      return {
        rawAddress,
        street: geocoded.street,
        neighborhood: geocoded.neighborhood || this.extractColonia(rawAddress),
        municipality:
          geocoded.municipality || this.extractMunicipio(rawAddress) || 'Guadalajara',
        state: geocoded.state || 'Jalisco',
        postalCode: geocoded.postalCode,
        country: 'Mexico',
        coordinates: geocoded.coordinates,
        geocodeConfidence: geocoded.confidence,
      };
    } catch (error) {
      // Fallback: manual parsing only.
      console.warn('Geocoding failed, falling back to manual address parsing:', error);
      return {
        ...defaultLocation,
        neighborhood: this.extractColonia(rawAddress),
        municipality: this.extractMunicipio(rawAddress),
      };
    }
  }

  /** Colonia name after "Col." / "Colonia" up to the next comma, or '' when not present. */
  private extractColonia(address: string): string {
    // Common patterns: "Col. Providencia", "Colonia Americana".
    // \b keeps words that merely end in "col" (e.g. "Caracol") from matching.
    const match = address.match(/\b(?:col\.?|colonia)\s+([^,]+)/i);
    return match ? match[1].trim() : '';
  }

  /** First known municipality mentioned in the address (accent-insensitive), or ''. */
  private extractMunicipio(address: string): string {
    const haystack = stripAccents(address).toLowerCase();
    return KNOWN_MUNICIPALITIES.find(mun => haystack.includes(mun.toLowerCase())) ?? '';
  }

  /**
   * Classifies the listing from its text. The title is checked first because
   * descriptions often mention other property types in passing.
   */
  private detectPropertyType(raw: RawPropertyData): PropertyType {
    for (const text of [raw.titulo, raw.descripcion]) {
      if (!text) continue;
      const lower = text.toLowerCase();
      const rule = PROPERTY_TYPE_RULES.find(([pattern]) => pattern.test(lower));
      if (rule) return rule[1];
    }
    return PropertyType.OTRO;
  }

  /** Rent/transfer keywords in the title or URL decide the type; everything else is a sale. */
  private detectTransactionType(raw: RawPropertyData): TransactionType {
    const text = `${raw.titulo ?? ''} ${raw.sourceUrl}`.toLowerCase();

    if (text.includes('renta') || text.includes('alquiler')) {
      return TransactionType.RENTA;
    }
    if (text.includes('traspaso')) {
      return TransactionType.TRASPASO;
    }
    return TransactionType.VENTA;
  }

  /** Reduces a Mexican phone number to its 10 national digits when the shape is recognised. */
  private normalizePhone(phone?: string | null): string | null {
    if (!phone) return null;

    const digits = phone.replace(/\D/g, '');

    if (digits.length === 10) return digits;
    // 52 is the country code; drop it to keep the national number.
    if (digits.length === 12 && digits.startsWith('52')) return digits.substring(2);

    return digits || null;
  }

  /** Wraps image URLs with ordering metadata; the first image is the main one. */
  private normalizeImages(images?: string[]): NormalizedProperty['images'] {
    if (!images || images.length === 0) return [];

    return images.map((rawUrl, index) => {
      const url = this.normalizeImageUrl(rawUrl);
      return {
        url,
        // Derived from the normalized URL so the thumbnail is HTTPS as well.
        thumbnailUrl: this.generateThumbnailUrl(url),
        order: index,
        isMain: index === 0,
      };
    });
  }

  /** Forces HTTPS on image URLs. */
  private normalizeImageUrl(url: string): string {
    return url.replace(/^http:/, 'https:');
  }

  /** Thumbnail URL convention (depends on the CDN in use). */
  private generateThumbnailUrl(url: string): string {
    return url.replace('/images/', '/thumbnails/');
  }

  /** Maps amenities to canonical labels and removes duplicates; unknown ones are kept as-is. */
  private normalizeAmenities(amenities?: string[]): string[] {
    if (!amenities) return [];

    const normalized = new Set<string>();
    const keywords = Object.keys(AMENITY_LABELS);

    for (const amenity of amenities) {
      const lower = amenity.toLowerCase().trim();
      if (!lower) continue; // ignore empty entries left over from scraping
      const key = keywords.find(k => lower.includes(k));
      normalized.add(key ? AMENITY_LABELS[key] : amenity);
    }

    return Array.from(normalized);
  }

  /**
   * First number found in the text, decimals included ("2.5 banos" -> 2.5).
   * Returns null when the text holds no digits.
   */
  private parseNumber(str?: string): number | null {
    if (!str) return null;
    const match = str.match(/\d+(?:\.\d+)?/);
    return match ? parseFloat(match[0]) : null;
  }

  /** Parking count from phrases like "2 estacionamientos" in the description or amenities. */
  private extractParkingSpaces(raw: RawPropertyData): number | null {
    const text = `${raw.descripcion ?? ''} ${raw.amenidades?.join(' ') ?? ''}`;
    const match = text.match(/(\d+)\s*(estacionamiento|cochera|parking)/i);
    return match ? parseInt(match[1], 10) : null;
  }

  /**
   * Stable 32-hex-char id derived from the source and its listing id.
   * Falls back to the URL when the listing id is empty, so distinct listings
   * never collapse onto the same hash.
   */
  private generateId(raw: RawPropertyData): string {
    const key = raw.sourceId || raw.sourceUrl;
    return createHash('sha256')
      .update(`${raw.source}:${key}`)
      .digest('hex')
      .substring(0, 32);
  }

  /**
   * Scores completeness from 0 to 100: -20 per missing required field,
   * -5 per missing optional field and -10 per warning.
   */
  private calculateDataQuality(raw: RawPropertyData): NormalizedProperty['dataQuality'] {
    const requiredFields = ['titulo', 'precio', 'ubicacion', 'superficie'] as const;
    const optionalFields = ['recamaras', 'banos', 'descripcion', 'imagenes'] as const;

    // Empty strings and empty arrays both count as missing.
    const isMissing = (field: keyof RawPropertyData): boolean => {
      const value = raw[field];
      return Array.isArray(value) ? value.length === 0 : !value;
    };

    const missingRequired = requiredFields.filter(isMissing);
    const missingOptional = optionalFields.filter(isMissing);

    const warnings: string[] = [];

    // Rentals and USD listings are legitimately below the threshold, so only
    // MXN sales are checked. parsePrice keeps the decimal point intact.
    if (raw.precio && this.detectTransactionType(raw) === TransactionType.VENTA) {
      const price = this.parsePrice(raw.precio);
      if (price.currency === 'MXN' && price.amount < MIN_PLAUSIBLE_SALE_PRICE_MXN) {
        warnings.push('Precio sospechosamente bajo');
      }
    }

    // No images at all is already reported as a missing field.
    if (raw.imagenes && raw.imagenes.length > 0 && raw.imagenes.length < 3) {
      warnings.push('Pocas imagenes');
    }

    const score = Math.max(0, 100
      - (missingRequired.length * 20)
      - (missingOptional.length * 5)
      - (warnings.length * 10)
    );

    return {
      score,
      missingFields: [...missingRequired, ...missingOptional],
      warnings,
    };
  }
}
```

---

## 5. Servicio de Geocoding

```typescript
// src/etl/services/geocoding.service.ts
import { createHash } from 'crypto';
import { Redis } from 'ioredis';

interface GeocodedResult {
  street: string | null;
  neighborhood: string | null;
  municipality: string | null;
  state: string | null;
  postalCode: string | null;
  coordinates: { lat: number; lng: number } | null;
  confidence: number;
}

/** Subset of a Nominatim search hit that this service reads. */
interface NominatimHit {
  lat: string;
  lon: string;
  importance?: number;
  type?: string;
  address?: Record<string, string | undefined>;
}

/** Successful lookups are cached for 30 days. */
const HIT_TTL_SECONDS = 60 * 60 * 24 * 30;
/** "No match" answers are cached for one day only, so later map updates are picked up. */
const MISS_TTL_SECONDS = 60 * 60 * 24;

/**
 * Address geocoder backed by Nominatim, with a Redis cache and a client-side
 * rate limit of roughly one request per second.
 */
export class GeocodingService {
  private redis: Redis;
  private nominatimUrl = 'https://nominatim.openstreetmap.org/search';
  private rateLimiter: { lastCall: number; minInterval: number };

  /**
   * @param redis Optional client to reuse (useful in tests); by default one is
   *   created from `REDIS_URL`, or with the library defaults when it is unset.
   */
  constructor(redis?: Redis) {
    const redisUrl = process.env.REDIS_URL;
    this.redis = redis ?? (redisUrl ? new Redis(redisUrl) : new Redis());
    this.rateLimiter = { lastCall: 0, minInterval: 1100 }; // 1 req/sec for Nominatim
  }

  /**
   * Geocodes an address, serving from cache when possible.
   *
   * @returns The geocoded fields; an all-null result with confidence 0 when the
   *   address has no match or the API call failed. API failures never throw
   *   and are never cached.
   */
  async geocode(address: string): Promise<GeocodedResult> {
    // 1. Check cache
    const cacheKey = `geocode:${this.hashAddress(address)}`;
    const cached = await this.redis.get(cacheKey);
    if (cached) {
      return JSON.parse(cached) as GeocodedResult;
    }

    // 2. Rate limiting
    await this.enforceRateLimit();

    // 3. Call geocoding API
    const result = await this.callNominatim(address);
    if (result === null) {
      // Transient failure: answer with an empty result but do not cache it,
      // otherwise a short outage would blank this address for 30 days.
      return this.emptyResult();
    }

    // 4. Cache result
    const ttl = result.coordinates ? HIT_TTL_SECONDS : MISS_TTL_SECONDS;
    await this.redis.setex(cacheKey, ttl, JSON.stringify(result));

    return result;
  }

  /**
   * Queries Nominatim for the best match within Jalisco.
   *
   * @returns The mapped hit, an empty result when nothing matched, or `null`
   *   when the request itself failed (network error or non-2xx status).
   */
  private async callNominatim(address: string): Promise<GeocodedResult | null> {
    const params = new URLSearchParams({
      q: `${address}, Jalisco, Mexico`,
      format: 'json',
      addressdetails: '1',
      limit: '1',
    });

    try {
      const response = await fetch(`${this.nominatimUrl}?${params}`, {
        headers: {
          'User-Agent': 'InmobiliariaAnalytics/1.0',
        },
      });

      if (!response.ok) {
        console.error(`Geocoding error: HTTP ${response.status}`);
        return null;
      }

      const data: unknown = await response.json();
      if (!Array.isArray(data) || data.length === 0) {
        return this.emptyResult();
      }

      const hit = data[0] as NominatimHit;
      const addr = hit.address ?? {};
      const lat = parseFloat(hit.lat);
      const lng = parseFloat(hit.lon);

      return {
        street: addr.road || addr.street || null,
        neighborhood: addr.suburb || addr.neighbourhood || null,
        municipality: addr.city || addr.town || addr.municipality || null,
        state: addr.state || null,
        postalCode: addr.postcode || null,
        coordinates: Number.isFinite(lat) && Number.isFinite(lng) ? { lat, lng } : null,
        confidence: this.calculateConfidence(hit),
      };
    } catch (error) {
      console.error('Geocoding error:', error);
      return null;
    }
  }

  /** Confidence 0-100 from Nominatim's `importance`, with a bonus for building-level hits. */
  private calculateConfidence(result: NominatimHit): number {
    let confidence = (result.importance || 0) * 100;

    // Bonus for precise result types
    if (result.type === 'house' || result.type === 'building') {
      confidence = Math.min(100, confidence + 20);
    }

    return Math.round(confidence);
  }

  /**
   * Waits until this call's reserved slot. The slot is claimed synchronously,
   * before awaiting, so concurrent callers queue one interval apart instead of
   * all waking up at the same instant.
   */
  private async enforceRateLimit(): Promise<void> {
    const now = Date.now();
    const slot = Math.max(now, this.rateLimiter.lastCall + this.rateLimiter.minInterval);
    this.rateLimiter.lastCall = slot;

    if (slot > now) {
      await new Promise<void>(resolve => setTimeout(resolve, slot - now));
    }
  }

  /** Cache key component: MD5 of the trimmed, lower-cased address. */
  private hashAddress(address: string): string {
    return createHash('md5').update(address.toLowerCase().trim()).digest('hex');
  }

  private emptyResult(): GeocodedResult {
    return {
      street: null,
      neighborhood: null,
      municipality: null,
      state: null,
      postalCode: null,
      coordinates: null,
      confidence: 0,
    };
  }
}
```

---

## 6. Detector de Duplicados

```typescript
// src/etl/services/deduplication.service.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';

interface DuplicateCandidate {
  id: string;
  similarity: number;
  matchedFields: string[];
}

/**
 * Columns read from candidate rows. node-postgres returns DECIMAL/NUMERIC
 * columns as strings, hence the `string | number` unions.
 */
interface CandidateRow {
  id: string;
  title: string;
  price: string | number;
  bedrooms?: number | null;
  bathrooms?: string | number | null;
  constructed_area?: string | number | null;
  neighborhood?: string | null;
  distance_meters?: number;
}

/** Finds listings from other sources that describe the same property, and merges them. */
export class DeduplicationService {
  private db: Pool;

  /** @param db Optional pool to share; by default one is created from `DATABASE_URL`. */
  constructor(db?: Pool) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
  }

  /**
   * Returns duplicate candidates from other sources, best match first.
   * Each existing property appears at most once.
   */
  async findDuplicates(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    const byId = new Map<string, DuplicateCandidate>();

    // 1. Near-exact match: same place, price, type and title on another source
    const exactMatch = await this.findExactMatch(property);
    if (exactMatch) {
      byId.set(exactMatch.id, { ...exactMatch, similarity: 1.0 });
    }

    // 2. Fuzzy matching on listing features; never overrides the exact match
    for (const candidate of await this.findFuzzyMatches(property)) {
      if (!byId.has(candidate.id)) byId.set(candidate.id, candidate);
    }

    return [...byId.values()].sort((a, b) => b.similarity - a.similarity);
  }

  /**
   * Looks for the same property on a different source: within 100 m, price
   * within 5%, same type, and a title similarity above 0.8 at under 50 m.
   * Requires coordinates; returns null without them.
   */
  private async findExactMatch(property: NormalizedProperty): Promise<DuplicateCandidate | null> {
    const coords = property.location.coordinates;
    if (!coords) return null;

    const query = `
      SELECT
        id, source, source_id, title, price,
        ST_Distance(
          coordinates::geography,
          ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography
        ) as distance_meters
      FROM properties
      WHERE source != $3
        AND price BETWEEN $4 * 0.95 AND $4 * 1.05
        AND property_type = $5
        AND ST_DWithin(
          coordinates::geography,
          ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography,
          100  -- 100 meters
        )
      ORDER BY distance_meters
      LIMIT 5
    `;

    const result = await this.db.query(query, [
      coords.lng,
      coords.lat,
      property.source,
      property.price,
      property.propertyType,
    ]);

    for (const row of result.rows as CandidateRow[]) {
      const titleSimilarity = this.calculateTextSimilarity(property.title, row.title);
      if (titleSimilarity > 0.8 && Number(row.distance_meters) < 50) {
        return {
          id: row.id,
          similarity: 0.95,
          matchedFields: ['coordinates', 'price', 'title', 'property_type'],
        };
      }
    }

    return null;
  }

  /**
   * Scores active listings from other sources in the same neighborhood, of the
   * same type and within 10% of the price; keeps those above 0.75.
   */
  private async findFuzzyMatches(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    // Without a neighborhood or a price the filter would match unrelated rows
    // (every listing with an empty neighborhood, or a 0..0 price range).
    if (!property.location.neighborhood || property.price <= 0) return [];

    const query = `
      SELECT
        id, title, price, bedrooms, bathrooms,
        constructed_area, neighborhood, coordinates
      FROM properties
      WHERE source != $1
        AND neighborhood = $2
        AND property_type = $3
        AND price BETWEEN $4 * 0.9 AND $4 * 1.1
        AND status = 'active'
      LIMIT 20
    `;

    const result = await this.db.query(query, [
      property.source,
      property.location.neighborhood,
      property.propertyType,
      property.price,
    ]);

    const candidates: DuplicateCandidate[] = [];

    for (const row of result.rows as CandidateRow[]) {
      const similarity = this.calculatePropertySimilarity(property, row);
      if (similarity > 0.75) {
        candidates.push({
          id: row.id,
          similarity,
          matchedFields: this.getMatchedFields(property, row),
        });
      }
    }

    return candidates;
  }

  /**
   * Weighted similarity in [0, 1]. Components whose data is missing on either
   * side are left out and the remaining weights are renormalized.
   */
  private calculatePropertySimilarity(prop: NormalizedProperty, candidate: CandidateRow): number {
    let score = 0;
    let totalWeight = 0;
    const add = (weight: number, similarity: number): void => {
      score += similarity * weight;
      totalWeight += weight;
    };

    // Price (weight 0.3); skipped for an unknown price to avoid dividing by zero
    const candidatePrice = Number(candidate.price);
    if (prop.price > 0 && Number.isFinite(candidatePrice)) {
      const priceDiff = Math.abs(prop.price - candidatePrice) / prop.price;
      add(0.3, 1 - Math.min(priceDiff, 1));
    }

    // Constructed area (weight 0.25)
    const candidateArea = candidate.constructed_area == null ? NaN : Number(candidate.constructed_area);
    if (prop.constructedArea && candidateArea > 0) {
      const areaDiff = Math.abs(prop.constructedArea - candidateArea) / prop.constructedArea;
      add(0.25, 1 - Math.min(areaDiff, 1));
    }

    // Bedrooms (weight 0.15)
    if (prop.bedrooms !== null && candidate.bedrooms != null) {
      add(0.15, prop.bedrooms === Number(candidate.bedrooms) ? 1 : 0);
    }

    // Bathrooms (weight 0.15); the column is DECIMAL, so compare numerically
    if (prop.bathrooms !== null && candidate.bathrooms != null) {
      add(0.15, prop.bathrooms === Number(candidate.bathrooms) ? 1 : 0);
    }

    // Title (weight 0.15)
    add(0.15, this.calculateTextSimilarity(prop.title, candidate.title));

    return totalWeight > 0 ? score / totalWeight : 0;
  }

  /** Jaccard similarity over lower-cased words; 0 when both texts are empty. */
  private calculateTextSimilarity(text1: string, text2: string): number {
    const tokenize = (text: string): Set<string> =>
      new Set(text.toLowerCase().split(/\s+/).filter(Boolean));

    const words1 = tokenize(text1);
    const words2 = tokenize(text2);
    const union = new Set([...words1, ...words2]);
    if (union.size === 0) return 0;

    const shared = [...words1].filter(word => words2.has(word)).length;
    return shared / union.size;
  }

  /** Names of the fields that agree; a field unknown on both sides is not a match. */
  private getMatchedFields(prop: NormalizedProperty, candidate: CandidateRow): string[] {
    const matched: string[] = [];

    if (prop.price > 0 && Math.abs(prop.price - Number(candidate.price)) / prop.price < 0.05) {
      matched.push('price');
    }
    if (prop.bedrooms !== null && candidate.bedrooms != null
        && prop.bedrooms === Number(candidate.bedrooms)) {
      matched.push('bedrooms');
    }
    if (prop.bathrooms !== null && candidate.bathrooms != null
        && prop.bathrooms === Number(candidate.bathrooms)) {
      matched.push('bathrooms');
    }
    if (prop.location.neighborhood && prop.location.neighborhood === candidate.neighborhood) {
      matched.push('neighborhood');
    }

    return matched;
  }

  /**
   * Folds `duplicateIds` into `primaryId` inside one transaction: each
   * duplicate is recorded in `property_aliases` and marked as merged.
   * The primary itself is ignored if it appears in the list.
   *
   * @throws Re-throws any database error after rolling back.
   */
  async mergeProperties(
    primaryId: string,
    duplicateIds: string[]
  ): Promise<void> {
    const toMerge = duplicateIds.filter(id => id !== primaryId);
    if (toMerge.length === 0) return;

    const client = await this.db.connect();

    try {
      await client.query('BEGIN');

      // Record aliases using the columns property_aliases actually has; the
      // source fields are copied from the duplicate rows themselves.
      await client.query(`
        INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url, merged_at)
        SELECT $1::varchar, source, source_id, source_url, NOW()
        FROM properties
        WHERE id = ANY($2::varchar[])
        ON CONFLICT (alias_source, alias_source_id) DO NOTHING
      `, [primaryId, toMerge]);

      // Mark duplicates as merged
      await client.query(`
        UPDATE properties
        SET status = 'merged', merged_into = $1
        WHERE id = ANY($2)
      `, [primaryId, toMerge]);

      await client.query('COMMIT');
    } catch (error) {
      await client.query('ROLLBACK');
      throw error;
    } finally {
      client.release();
    }
  }
}
```

---

## 7.
Loader (Carga a Base de Datos)

```typescript
// src/etl/loaders/property.loader.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';
import { DeduplicationService } from '../services/deduplication.service';

/**
 * Persists normalized listings: updates listings already known for the same
 * source, links cross-source duplicates as aliases, and inserts the rest.
 */
export class PropertyLoader {
  private db: Pool;
  private deduper: DeduplicationService;

  /**
   * @param db Optional pool to reuse; by default one is created from `DATABASE_URL`.
   * @param deduper Optional deduplication service (useful in tests).
   */
  constructor(db?: Pool, deduper?: DeduplicationService) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
    this.deduper = deduper ?? new DeduplicationService();
  }

  /**
   * Loads one listing.
   *
   * @returns The action taken and the id of the affected `properties` row
   *   (for `duplicate`, the id of the existing primary listing).
   */
  async load(property: NormalizedProperty): Promise<{ action: 'inserted' | 'updated' | 'duplicate'; id: string }> {
    // 1. Already known for this source + sourceId?
    const existing = await this.findExisting(property.source, property.sourceId);

    if (existing) {
      await this.update(existing.id, property);
      return { action: 'updated', id: existing.id };
    }

    // 2. Look for the same property on other sources
    const [bestMatch] = await this.deduper.findDuplicates(property);

    if (bestMatch && bestMatch.similarity > 0.9) {
      // Duplicate: link it to the existing listing instead of inserting
      await this.linkDuplicate(bestMatch.id, property);
      return { action: 'duplicate', id: bestMatch.id };
    }

    // 3. Insert as a new property
    const id = await this.insert(property);
    return { action: 'inserted', id };
  }

  private async findExisting(source: string, sourceId: string): Promise<{ id: string } | null> {
    const result = await this.db.query(
      'SELECT id FROM properties WHERE source = $1 AND source_id = $2',
      [source, sourceId]
    );
    return result.rows[0] || null;
  }

  /** Inserts the full listing and returns its id. */
  private async insert(property: NormalizedProperty): Promise<string> {
    // 42 columns bound to 43 parameters: `coordinates` consumes $26 (lng) and $27 (lat).
    const query = `
      INSERT INTO properties (
        id, source, source_id, source_url,
        title, description, property_type, transaction_type,
        price, currency, price_per_sqm,
        land_area, constructed_area,
        bedrooms, bathrooms, parking_spaces, floors, year_built,
        raw_address, street, neighborhood, municipality, state,
        postal_code, country, coordinates, geocode_confidence,
        images, virtual_tour, video, amenities,
        agent_name, agent_phone, agent_email, agent_agency,
        first_seen_at, last_seen_at, published_at, status,
        data_quality_score, missing_fields, data_warnings
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13,
        $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25,
        ST_SetSRID(ST_MakePoint($26, $27), 4326), $28,
        $29, $30, $31, $32, $33, $34, $35, $36,
        $37, $38, $39, $40, $41, $42, $43
      )
      RETURNING id
    `;

    const coords = property.location.coordinates;

    const result = await this.db.query(query, [
      property.id,
      property.source,
      property.sourceId,
      property.sourceUrl,
      property.title,
      property.description,
      property.propertyType,
      property.transactionType,
      property.price,
      property.currency,
      property.pricePerSqm,
      property.landArea,
      property.constructedArea,
      property.bedrooms,
      property.bathrooms,
      property.parkingSpaces,
      property.floors,
      property.yearBuilt,
      property.location.rawAddress,
      property.location.street,
      property.location.neighborhood,
      property.location.municipality,
      property.location.state,
      property.location.postalCode,
      property.location.country,
      // `??` keeps a legitimate 0 coordinate; only a missing point becomes NULL.
      coords?.lng ?? null,
      coords?.lat ?? null,
      property.location.geocodeConfidence,
      JSON.stringify(property.images),
      property.virtualTour,
      property.video,
      property.amenities,
      property.agent.name,
      property.agent.phone,
      property.agent.email,
      property.agent.agency,
      property.firstSeenAt,
      property.lastSeenAt,
      property.publishedAt,
      property.status,
      property.dataQuality.score,
      property.dataQuality.missingFields,
      property.dataQuality.warnings,
    ]);

    return result.rows[0].id;
  }

  /**
   * Refreshes the volatile fields of a listing that was seen again.
   * Currency is written together with the price so the pair stays coherent.
   */
  private async update(id: string, property: NormalizedProperty): Promise<void> {
    // NOTE(review): nothing writes to price_history yet; if price changes are
    // meant to be tracked, this looks like the place to record them - confirm.
    const query = `
      UPDATE properties SET
        title = $2,
        description = $3,
        price = $4,
        currency = $5,
        price_per_sqm = $6,
        last_seen_at = NOW(),
        updated_at = NOW(),
        data_quality_score = $7,
        images = $8
      WHERE id = $1
    `;

    await this.db.query(query, [
      id,
      property.title,
      property.description,
      property.price,
      property.currency,
      property.pricePerSqm,
      property.dataQuality.score,
      JSON.stringify(property.images),
    ]);
  }

  /** Records the listing as an alias of an existing property and refreshes its last-seen time. */
  private async linkDuplicate(existingId: string, property: NormalizedProperty): Promise<void> {
    // Register as alias
    await this.db.query(`
      INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url)
      VALUES ($1, $2, $3, $4)
      ON CONFLICT DO NOTHING
    `, [existingId, property.source, property.sourceId, property.sourceUrl]);

    // Refresh last_seen of the primary listing
    await this.db.query(`
      UPDATE properties SET last_seen_at = NOW() WHERE id = $1
    `, [existingId]);
  }
}
```

---

## 8.
Esquema de Base de Datos

```sql
-- GEOMETRY columns, GIST spatial indexes and the ST_* functions need PostGIS
CREATE EXTENSION IF NOT EXISTS postgis;

-- Main properties table
CREATE TABLE properties (
    id VARCHAR(32) PRIMARY KEY,
    source VARCHAR(50) NOT NULL,
    source_id VARCHAR(100) NOT NULL,
    source_url TEXT NOT NULL,

    title VARCHAR(500) NOT NULL,
    description TEXT,
    property_type VARCHAR(50) NOT NULL,
    transaction_type VARCHAR(20) NOT NULL,

    price DECIMAL(15,2) NOT NULL,
    currency VARCHAR(3) DEFAULT 'MXN',
    price_per_sqm DECIMAL(10,2),

    land_area DECIMAL(10,2),
    constructed_area DECIMAL(10,2),

    bedrooms SMALLINT,
    bathrooms DECIMAL(3,1),
    parking_spaces SMALLINT,
    floors SMALLINT,
    year_built SMALLINT,

    raw_address TEXT,
    street VARCHAR(200),
    neighborhood VARCHAR(100),
    municipality VARCHAR(100),
    state VARCHAR(50),
    postal_code VARCHAR(10),
    country VARCHAR(50) DEFAULT 'Mexico',
    coordinates GEOMETRY(Point, 4326),
    geocode_confidence SMALLINT,

    images JSONB DEFAULT '[]',
    virtual_tour TEXT,
    video TEXT,
    amenities TEXT[],

    agent_name VARCHAR(200),
    agent_phone VARCHAR(20),
    agent_email VARCHAR(200),
    agent_agency VARCHAR(200),

    first_seen_at TIMESTAMP NOT NULL,
    last_seen_at TIMESTAMP NOT NULL,
    published_at TIMESTAMP,
    status VARCHAR(20) DEFAULT 'active',
    merged_into VARCHAR(32) REFERENCES properties(id),

    data_quality_score SMALLINT,
    missing_fields TEXT[],
    data_warnings TEXT[],

    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW(),

    UNIQUE(source, source_id)
);

-- Indexes
CREATE INDEX idx_properties_location ON properties USING GIST(coordinates);
CREATE INDEX idx_properties_neighborhood ON properties(neighborhood);
CREATE INDEX idx_properties_price ON properties(price);
CREATE INDEX idx_properties_type ON properties(property_type);
CREATE INDEX idx_properties_status ON properties(status);
CREATE INDEX idx_properties_last_seen ON properties(last_seen_at);

-- Aliases table (duplicate listings linked to a primary property)
CREATE TABLE property_aliases (
    id SERIAL PRIMARY KEY,
    -- An alias is meaningless without the property it points to
    primary_id VARCHAR(32) NOT NULL REFERENCES properties(id),
    alias_source VARCHAR(50) NOT NULL,
    alias_source_id VARCHAR(100) NOT NULL,
    alias_url TEXT,
    merged_at TIMESTAMP DEFAULT NOW(),

    UNIQUE(alias_source, alias_source_id)
);

CREATE INDEX idx_property_aliases_primary ON property_aliases(primary_id);

-- Price history
CREATE TABLE price_history (
    id SERIAL PRIMARY KEY,
    property_id VARCHAR(32) REFERENCES properties(id),
    price DECIMAL(15,2) NOT NULL,
    currency VARCHAR(3) NOT NULL,
    recorded_at TIMESTAMP DEFAULT NOW()
);

CREATE INDEX idx_price_history_property ON price_history(property_id);
```

---

## 9. Tests

```typescript
// src/etl/__tests__/normalizer.test.ts
import { PropertyNormalizer } from '../transformers/normalizer';
import { GeocodingService } from '../services/geocoding.service';
import { RawPropertyData } from '../types';

jest.mock('../services/geocoding.service');

/** Builds a minimal raw listing; each test overrides only the field under test. */
const makeRaw = (sourceId: string, fields: Partial<RawPropertyData> = {}): RawPropertyData => ({
  source: 'test',
  sourceId,
  sourceUrl: `http://test.com/${sourceId}`,
  scrapedAt: new Date(),
  ...fields,
});

describe('PropertyNormalizer', () => {
  let normalizer: PropertyNormalizer;

  beforeEach(() => {
    // The module is auto-mocked above, so no Redis connection is opened.
    const mockGeocoder = new GeocodingService();
    (mockGeocoder.geocode as jest.Mock).mockResolvedValue({
      street: 'Av. Providencia',
      neighborhood: 'Providencia',
      municipality: 'Guadalajara',
      state: 'Jalisco',
      postalCode: '44630',
      coordinates: { lat: 20.6736, lng: -103.3927 },
      confidence: 85,
    });
    normalizer = new PropertyNormalizer(mockGeocoder);
  });

  describe('parsePrice', () => {
    it('should parse MXN price correctly', async () => {
      const result = await normalizer.normalize(makeRaw('123', { precio: '$4,500,000 MXN' }));

      expect(result.price).toBe(4500000);
      expect(result.currency).toBe('MXN');
    });

    it('should parse USD price correctly', async () => {
      const result = await normalizer.normalize(makeRaw('124', { precio: '$350,000 USD' }));

      expect(result.price).toBe(350000);
      expect(result.currency).toBe('USD');
    });

    it('should report a missing price as 0 MXN', async () => {
      const result = await normalizer.normalize(makeRaw('127'));

      expect(result.price).toBe(0);
      expect(result.currency).toBe('MXN');
    });
  });

  describe('parseAreas', () => {
    it('should parse both land and constructed areas', async () => {
      const result = await normalizer.normalize(
        makeRaw('125', { superficie: '180 m2 construccion, 250 m2 terreno' })
      );

      expect(result.constructedArea).toBe(180);
      expect(result.landArea).toBe(250);
    });
  });

  describe('detectPropertyType', () => {
    it('should detect departamento', async () => {
      const result = await normalizer.normalize(
        makeRaw('126', { titulo: 'Hermoso departamento en Providencia' })
      );

      expect(result.propertyType).toBe('departamento');
    });
  });
});
```

---

## 10. Metricas y Monitoreo

```typescript
// ETL pipeline metrics
// NOTE(review): Counter and Histogram are not imported in this snippet; they are
// presumably the prom-client classes - confirm and add the import in the real module.
export const etlMetrics = {
  // Counters
  properties_extracted_total: new Counter({
    name: 'etl_properties_extracted_total',
    help: 'Total properties extracted',
    labelNames: ['source'],
  }),

  properties_normalized_total: new Counter({
    name: 'etl_properties_normalized_total',
    help: 'Total properties normalized',
    labelNames: ['source', 'property_type'],
  }),

  properties_loaded_total: new Counter({
    name: 'etl_properties_loaded_total',
    help: 'Total properties loaded',
    labelNames: ['action'], // inserted, updated, duplicate
  }),

  geocoding_requests_total: new Counter({
    name: 'etl_geocoding_requests_total',
    help: 'Total geocoding requests',
    labelNames: ['status'], // success, error, cache_hit
  }),

  // Histograms
  normalization_duration_seconds: new Histogram({
    name: 'etl_normalization_duration_seconds',
    help: 'Time to normalize a property',
    buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
  }),

  data_quality_score: new Histogram({
    name: 'etl_data_quality_score',
    help: 'Data quality scores',
    buckets: [20, 40, 60, 80, 100],
  }),
};
```

---

**Siguiente:** [ET-IA-007-proxies.md](./ET-IA-007-proxies.md)