inmobiliaria-analytics/docs/01-fase-alcance-inicial/IAI-007-webscraper/especificaciones/ET-SCR-002-etl.md
rckrdmrd f570727617 feat: Documentation and orchestration updates
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 05:35:40 -06:00

38 KiB

id title type epic status version project created_date updated_date
ET-SCR-002 Especificacion Tecnica - Pipeline ETL y Normalizacion Technical Specification IAI-007 Draft 1.0 inmobiliaria-analytics 2026-01-04 2026-01-04

ET-SCR-002: Pipeline ETL y Normalizacion


1. Resumen

Pipeline ETL (Extract-Transform-Load) que procesa los datos crudos de propiedades obtenidos mediante scraping, los normaliza a un esquema unificado, los enriquece con geocodificación y detecta duplicados entre fuentes.


2. Arquitectura del Pipeline

┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   EXTRACT    │────▶│  TRANSFORM   │────▶│    LOAD      │
│              │     │              │     │              │
│ - Raw HTML   │     │ - Parse      │     │ - Validate   │
│ - JSON APIs  │     │ - Normalize  │     │ - Dedupe     │
│ - Sitemap    │     │ - Geocode    │     │ - Upsert     │
└──────────────┘     └──────────────┘     └──────────────┘
       │                    │                    │
       ▼                    ▼                    ▼
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│   staging    │     │   enriched   │     │  properties  │
│   _raw       │     │   _staging   │     │  (final)     │
└──────────────┘     └──────────────┘     └──────────────┘

3. Esquema de Datos

3.1 Raw Data (Entrada)

/**
 * Raw listing record produced by the extract stage, before normalization.
 *
 * Apart from the provenance fields, every value is optional free text copied
 * from the source page; parsing into numbers and enums is done by the
 * normalizer.
 */
interface RawPropertyData {
  source: string; // portal identifier; Inmuebles24Extractor sets 'inmuebles24'
  sourceId: string; // listing id within the source
  sourceUrl: string;
  scrapedAt: Date; // the normalizer copies this into firstSeenAt and lastSeenAt
  rawHtml?: string;
  rawJson?: Record<string, unknown>;

  // Extracted fields (may vary by source)
  titulo?: string;
  precio?: string; // unparsed price text, e.g. '$4,500,000 MXN'
  ubicacion?: string; // free-text address handed to the geocoder
  superficie?: string; // unparsed area text, e.g. '180 m2 construccion, 250 m2 terreno'
  recamaras?: string;
  banos?: string;
  descripcion?: string;
  imagenes?: string[]; // image URLs in page order
  amenidades?: string[];
  contacto?: {
    nombre?: string;
    telefono?: string;
    email?: string;
  };
}

3.2 Normalized Data (Salida)

/**
 * Unified property record produced by the transform stage and persisted by
 * the loader into the `properties` table.
 */
interface NormalizedProperty {
  // Identifiers
  id: string;  // internal id; the normalizer derives a 32-char hex digest from source + sourceId
  sourceId: string;
  source: string;
  sourceUrl: string;

  // Basic information
  title: string;
  description: string;
  propertyType: PropertyType;
  transactionType: TransactionType;

  // Prices
  price: number;
  currency: 'MXN' | 'USD';
  pricePerSqm: number | null; // price divided by constructedArea, when that area is known

  // Surface
  landArea: number | null;        // land area in m2
  constructedArea: number | null;  // built area in m2

  // Features
  bedrooms: number | null;
  bathrooms: number | null;
  parkingSpaces: number | null;
  floors: number | null;
  yearBuilt: number | null;

  // Location
  location: {
    rawAddress: string; // address text exactly as scraped
    street: string | null;
    neighborhood: string;  // "colonia"
    municipality: string;  // "municipio"
    state: string;
    postalCode: string | null;
    country: string;
    coordinates: {
      lat: number;
      lng: number;
    } | null;
    geocodeConfidence: number; // the normalizer uses 0 when no geocoding result is available
  };

  // Media
  images: PropertyImage[];
  virtualTour: string | null;
  video: string | null;

  // Amenities
  amenities: string[];

  // Contact
  agent: {
    name: string | null;
    phone: string | null;
    email: string | null;
    agency: string | null;
  };

  // Metadata
  firstSeenAt: Date;
  lastSeenAt: Date;
  publishedAt: Date | null;
  status: PropertyStatus;

  // Data quality
  dataQuality: {
    score: number;  // 0-100
    missingFields: string[]; // names of raw fields that were absent
    warnings: string[];
  };
}

/** Kind of property; the string values are what gets stored in `property_type`. */
enum PropertyType {
  CASA = 'casa',
  DEPARTAMENTO = 'departamento',
  TERRENO = 'terreno',
  LOCAL_COMERCIAL = 'local_comercial',
  OFICINA = 'oficina',
  BODEGA = 'bodega',
  EDIFICIO = 'edificio',
  OTRO = 'otro' // fallback when no keyword matches
}

/** Commercial operation offered by the listing. */
enum TransactionType {
  VENTA = 'venta', // the normalizer falls back to this when no rental/transfer keyword is found
  RENTA = 'renta',
  TRASPASO = 'traspaso'
}

/**
 * Lifecycle state of a listing.
 *
 * NOTE(review): DeduplicationService.mergeProperties writes status 'merged',
 * which is not a member of this enum — confirm whether it should be added.
 */
enum PropertyStatus {
  ACTIVE = 'active', // value the normalizer assigns to every record it produces
  SOLD = 'sold',
  RENTED = 'rented',
  INACTIVE = 'inactive',
  REMOVED = 'removed'
}

/** One gallery image of a property, as serialized into the `images` JSONB column. */
interface PropertyImage {
  url: string;
  thumbnailUrl: string | null;
  order: number; // zero-based position in the scraped gallery
  isMain: boolean; // the normalizer marks the first image (order 0) as main
}

4. Implementacion del Pipeline

4.1 Extractor Base

// src/etl/extractors/base.extractor.ts
import { RawPropertyData } from '../types';

/**
 * Base class for source-specific extractors. Subclasses turn a scraped page
 * (HTML or JSON) into a partial RawPropertyData and can reuse the text helpers
 * defined here.
 */
export abstract class BaseExtractor {
  /** Identifier of the portal handled by the subclass. */
  abstract source: string;

  /** Builds a partial raw record from a listing page's HTML. */
  abstract extractFromHtml(html: string, url: string): Partial<RawPropertyData>;

  /** Builds a partial raw record from a JSON payload (JSON-LD or an internal API). */
  abstract extractFromJson(json: unknown, url: string): Partial<RawPropertyData>;

  /**
   * Collapses every run of whitespace into a single space and trims the ends.
   *
   * @param text Text to clean; null, undefined and '' all yield ''.
   * @returns The cleaned text.
   */
  protected cleanText(text: string | null | undefined): string {
    if (!text) return '';
    // \s already covers \n, \r and \t, so a single pass is enough.
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extracts every number found in `text`, treating commas as thousands
   * separators ("4,500,000.50" -> 4500000.5).
   *
   * @param text Free text that may contain numbers.
   * @returns The numbers in order of appearance; never contains NaN.
   */
  protected extractNumbers(text: string): number[] {
    // Each match must start with a digit so punctuation such as the comma in
    // "3 rec, 2 banos" is not captured and parsed into NaN.
    const matches = text.match(/\d[\d,]*(?:\.\d+)?/g) ?? [];
    return matches
      .map(token => parseFloat(token.replace(/,/g, '')))
      .filter(value => Number.isFinite(value));
  }
}

4.2 Extractor Inmuebles24

// src/etl/extractors/inmuebles24.extractor.ts
import * as cheerio from 'cheerio';
import { BaseExtractor } from './base.extractor';
import { RawPropertyData } from '../types';

/**
 * Extractor for inmuebles24 listings. Reads listing fields from the page HTML
 * through CSS selectors, or from a JSON-LD `RealEstateListing` object.
 */
export class Inmuebles24Extractor extends BaseExtractor {
  source = 'inmuebles24';

  /**
   * Extracts the raw listing fields from a listing page.
   *
   * @param html Full HTML of the listing page.
   * @param url URL the page was fetched from; the source id is parsed from it.
   * @returns The fields that could be read; text fields are '' when the
   *   selector matches nothing.
   */
  extractFromHtml(html: string, url: string): Partial<RawPropertyData> {
    const $ = cheerio.load(html);

    return {
      source: this.source,
      sourceUrl: url,
      sourceId: this.extractSourceId(url),
      titulo: this.cleanText($('h1.title-type-sup').text()),
      precio: this.cleanText($('.price-value').text()),
      ubicacion: this.cleanText($('.location-container').text()),

      superficie: this.extractSuperficie($),
      recamaras: this.extractFeature($, 'recamaras'),
      banos: this.extractFeature($, 'banos'),

      descripcion: this.cleanText($('.description-content').text()),

      imagenes: this.extractImages($),
      amenidades: this.extractAmenidades($),

      contacto: {
        nombre: this.cleanText($('.publisher-name').text()),
        // undefined (not null) keeps the value assignable to `telefono?: string`.
        telefono: $('[data-phone]').attr('data-phone') || undefined,
      },
    };
  }

  /**
   * Extracts the raw listing fields from JSON-LD or an internal API payload.
   *
   * @param json Parsed JSON of unknown shape.
   * @param url URL the payload belongs to; used as fallback for the source id.
   * @returns The mapped fields, or {} when the payload is not an object of
   *   `@type` 'RealEstateListing'.
   */
  extractFromJson(json: unknown, url: string): Partial<RawPropertyData> {
    if (typeof json !== 'object' || json === null) return {};

    const data = json as Record<string, unknown>;
    if (data['@type'] !== 'RealEstateListing') return {};

    const offers =
      typeof data.offers === 'object' && data.offers !== null
        ? (data.offers as Record<string, unknown>)
        : undefined;
    const price = offers?.price;

    return {
      source: this.source,
      sourceUrl: url,
      // The identifier may be numeric or absent in the payload; fall back to the URL.
      sourceId: data.identifier != null ? String(data.identifier) : this.extractSourceId(url),
      titulo: typeof data.name === 'string' ? data.name : undefined,
      precio: price != null ? String(price) : undefined,
      // ... map the remaining fields
    };
  }

  /** Returns the numeric id following `propiedades/` in the URL, or '' when absent. */
  private extractSourceId(url: string): string {
    const match = url.match(/propiedades\/(\d+)/);
    return match ? match[1] : '';
  }

  /** Returns the cleaned text of the surface container. */
  private extractSuperficie($: cheerio.CheerioAPI): string {
    return this.cleanText($('.surface-container').text());
  }

  /** Returns the cleaned text of the `.feature-<feature>` element. */
  private extractFeature($: cheerio.CheerioAPI, feature: string): string {
    return this.cleanText($(`.feature-${feature}`).text());
  }

  /** Collects gallery image URLs, preferring `src` over `data-src`. */
  private extractImages($: cheerio.CheerioAPI): string[] {
    const images: string[] = [];
    $('img.gallery-image').each((_, el) => {
      const src = $(el).attr('src') || $(el).attr('data-src');
      if (src) images.push(src);
    });
    return images;
  }

  /** Collects the non-empty amenity labels listed on the page. */
  private extractAmenidades($: cheerio.CheerioAPI): string[] {
    const amenities: string[] = [];
    $('.amenity-item').each((_, el) => {
      const label = this.cleanText($(el).text());
      if (label) amenities.push(label);
    });
    return amenities;
  }
}

4.3 Transformador/Normalizador

// src/etl/transformers/normalizer.ts
import { createHash } from 'crypto';
import {
  RawPropertyData,
  NormalizedProperty,
  PropertyStatus,
  PropertyType,
  TransactionType,
} from '../types';
import { GeocodingService } from '../services/geocoding.service';

/**
 * Converts a RawPropertyData record into the unified NormalizedProperty shape:
 * parses price and areas, geocodes the address, infers property and
 * transaction type from the listing text and scores data quality.
 */
export class PropertyNormalizer {
  /** Longest title kept; matches the VARCHAR(500) `title` column. */
  private static readonly MAX_TITLE_LENGTH = 500;

  /** "<number> <area unit> [de] constr..." — built area. */
  private static readonly CONSTRUCTED_AREA =
    /(\d[\d,]*(?:\.\d+)?)\s*m(?:etros(?:\s+cuadrados)?|ts?)?\.?\s*[2²]?\s*(?:de\s+)?constr?/i;

  /** "<number> <area unit> [de] terr.../lote" — land area. */
  private static readonly LAND_AREA =
    /(\d[\d,]*(?:\.\d+)?)\s*m(?:etros(?:\s+cuadrados)?|ts?)?\.?\s*[2²]?\s*(?:de\s+)?(?:terr|lote)/i;

  /** Area unit including its exponent (m2, m², mts2, ...). */
  private static readonly AREA_UNIT = /m(?:etros(?:\s+cuadrados)?|ts?)?\.?\s*[2²]/gi;

  /** Accent-free lowercase keyword -> canonical amenity label. */
  private static readonly AMENITY_LABELS: Readonly<Record<string, string>> = {
    'alberca': 'Alberca',
    'piscina': 'Alberca',
    'jardin': 'Jardin',
    'gym': 'Gimnasio',
    'gimnasio': 'Gimnasio',
    'roof': 'Roof Garden',
    'terraza': 'Terraza',
    'seguridad': 'Seguridad 24/7',
    'vigilancia': 'Seguridad 24/7',
    'estacionamiento': 'Estacionamiento',
    'cochera': 'Estacionamiento',
  };

  constructor(private geocoder: GeocodingService) {}

  /**
   * Normalizes one raw listing.
   *
   * @param raw Record produced by an extractor.
   * @returns The normalized property. A failing geocoder does not reject the
   *   promise; the location falls back to manual address parsing.
   */
  async normalize(raw: RawPropertyData): Promise<NormalizedProperty> {
    const price = this.parsePrice(raw.precio);
    const areas = this.parseAreas(raw.superficie);
    const location = await this.normalizeLocation(raw.ubicacion);
    const bedrooms = this.parseNumber(raw.recamaras);

    const normalized: NormalizedProperty = {
      id: this.generateId(raw),
      sourceId: raw.sourceId,
      source: raw.source,
      sourceUrl: raw.sourceUrl,

      title: this.normalizeTitle(raw.titulo),
      description: raw.descripcion || '',
      propertyType: this.detectPropertyType(raw),
      transactionType: this.detectTransactionType(raw),

      price: price.amount,
      currency: price.currency,
      // An amount of 0 means "unknown price", so no ratio is reported for it.
      pricePerSqm: areas.constructed && price.amount > 0
        ? Math.round(price.amount / areas.constructed)
        : null,

      landArea: areas.land,
      constructedArea: areas.constructed,

      // Bedrooms are stored as SMALLINT; bathrooms may be fractional (2.5).
      bedrooms: bedrooms === null ? null : Math.trunc(bedrooms),
      bathrooms: this.parseNumber(raw.banos),
      parkingSpaces: this.extractParkingSpaces(raw),
      floors: null,
      yearBuilt: null,

      location,

      images: this.normalizeImages(raw.imagenes),
      virtualTour: null,
      video: null,

      amenities: this.normalizeAmenities(raw.amenidades),

      agent: {
        name: raw.contacto?.nombre || null,
        phone: this.normalizePhone(raw.contacto?.telefono),
        email: raw.contacto?.email || null,
        agency: null,
      },

      firstSeenAt: raw.scrapedAt,
      lastSeenAt: raw.scrapedAt,
      publishedAt: null,
      status: PropertyStatus.ACTIVE,

      dataQuality: this.calculateDataQuality(raw),
    };

    return normalized;
  }

  /** Collapses whitespace in the title and caps it at MAX_TITLE_LENGTH characters. */
  private normalizeTitle(title?: string): string {
    return (title ?? '')
      .replace(/\s+/g, ' ')
      .trim()
      .slice(0, PropertyNormalizer.MAX_TITLE_LENGTH);
  }

  /**
   * Parses a price label such as '$4,500,000 MXN' or '$350,000 USD'.
   *
   * @returns The amount (0 when missing or unparseable) and the currency,
   *   which defaults to MXN.
   */
  private parsePrice(priceStr?: string): { amount: number; currency: 'MXN' | 'USD' } {
    if (!priceStr) return { amount: 0, currency: 'MXN' };

    const currency = /usd|us\$|\b(?:dll|dlls|dls)\b|d[oó]lares/i.test(priceStr) ? 'USD' : 'MXN';

    // Commas are thousands separators. Several dots ("4.500.000") can only be
    // thousands separators too; a single dot is kept as the decimal point.
    let cleaned = priceStr.replace(/[^\d.,]/g, '').replace(/,/g, '');
    if ((cleaned.match(/\./g) ?? []).length > 1) {
      cleaned = cleaned.replace(/\./g, '');
    }

    const amount = parseFloat(cleaned);
    return { amount: Number.isFinite(amount) ? amount : 0, currency };
  }

  /**
   * Parses surface text such as '180 m2 construccion, 250 m2 terreno'.
   *
   * @returns Land and built areas in m2; null for whatever is not stated.
   */
  private parseAreas(superficieStr?: string): { land: number | null; constructed: number | null } {
    const result = { land: null as number | null, constructed: null as number | null };
    if (!superficieStr) return result;

    const toNumber = (token: string): number => parseFloat(token.replace(/,/g, ''));

    const constMatch = superficieStr.match(PropertyNormalizer.CONSTRUCTED_AREA);
    const landMatch = superficieStr.match(PropertyNormalizer.LAND_AREA);

    if (constMatch) result.constructed = toNumber(constMatch[1]);
    if (landMatch) result.land = toNumber(landMatch[1]);

    // A single unlabeled figure is assumed to be the built area (houses/apartments).
    if (result.constructed === null && result.land === null) {
      // Drop the unit first so the "2" in "m2" is not counted as a second number.
      const withoutUnits = superficieStr.replace(PropertyNormalizer.AREA_UNIT, ' ');
      const numbers = withoutUnits.match(/\d[\d,]*(?:\.\d+)?/g);
      if (numbers && numbers.length === 1) {
        result.constructed = toNumber(numbers[0]);
      }
    }

    return result;
  }

  /**
   * Geocodes the raw address and fills the gaps with manual parsing.
   *
   * @param rawAddress Free-text address from the listing.
   * @returns The structured location; when there is no address or the geocoder
   *   throws, coordinates are null and confidence is 0.
   */
  private async normalizeLocation(rawAddress?: string): Promise<NormalizedProperty['location']> {
    const defaultLocation: NormalizedProperty['location'] = {
      rawAddress: rawAddress || '',
      street: null,
      neighborhood: '',
      municipality: '',
      state: 'Jalisco',
      postalCode: null,
      country: 'Mexico',
      coordinates: null,
      geocodeConfidence: 0,
    };

    if (!rawAddress) return defaultLocation;

    try {
      const geocoded = await this.geocoder.geocode(rawAddress);

      return {
        rawAddress,
        street: geocoded.street,
        neighborhood: geocoded.neighborhood || this.extractColonia(rawAddress),
        // Prefer what the address itself says before assuming Guadalajara.
        municipality: geocoded.municipality || this.extractMunicipio(rawAddress) || 'Guadalajara',
        state: geocoded.state || 'Jalisco',
        postalCode: geocoded.postalCode,
        country: 'Mexico',
        coordinates: geocoded.coordinates,
        geocodeConfidence: geocoded.confidence,
      };
    } catch (error: unknown) {
      // Best effort: a geocoder outage must not drop the listing.
      console.warn('Geocoding failed, falling back to manual parsing:', error);
      return {
        ...defaultLocation,
        neighborhood: this.extractColonia(rawAddress),
        municipality: this.extractMunicipio(rawAddress),
      };
    }
  }

  /** Extracts the neighborhood from patterns like "Col. Providencia" or "Colonia Americana". */
  private extractColonia(address: string): string {
    const match = address.match(/\b(?:col\.?|colonia)\s+([^,]+)/i);
    return match ? match[1].trim() : '';
  }

  /** Returns the first known municipality mentioned in the address (accent-insensitive), or ''. */
  private extractMunicipio(address: string): string {
    const municipios = [
      'Guadalajara', 'Zapopan', 'Tlaquepaque', 'Tonala',
      'Tlajomulco', 'El Salto', 'Ixtlahuacan'
    ];

    const haystack = this.stripAccents(address.toLowerCase());
    return municipios.find(mun => haystack.includes(mun.toLowerCase())) ?? '';
  }

  /**
   * Infers the property type from whole-word keywords. The title is checked
   * before the description so incidental mentions in the body ("terreno ideal
   * para construir casa") do not override the headline.
   */
  private detectPropertyType(raw: RawPropertyData): PropertyType {
    // Built per call: the enum values must not be read at class-definition time.
    // Order matters: specific types come before the generic "local/comercial".
    const rules: Array<[RegExp, PropertyType]> = [
      [/\b(?:departamentos?|deptos?|dptos?)\b/, PropertyType.DEPARTAMENTO],
      [/\bcasas?\b/, PropertyType.CASA],
      [/\b(?:terrenos?|lotes?)\b/, PropertyType.TERRENO],
      [/\boficinas?\b/, PropertyType.OFICINA],
      [/\bbodegas?\b/, PropertyType.BODEGA],
      [/\bedificios?\b/, PropertyType.EDIFICIO],
      [/\b(?:local(?:es)?|comercial(?:es)?)\b/, PropertyType.LOCAL_COMERCIAL],
    ];

    for (const text of [raw.titulo, raw.descripcion]) {
      if (!text) continue;
      const haystack = this.stripAccents(text.toLowerCase());
      const hit = rules.find(([pattern]) => pattern.test(haystack));
      if (hit) return hit[1];
    }

    return PropertyType.OTRO;
  }

  /** Infers rent/transfer from keywords in the title or URL; defaults to sale. */
  private detectTransactionType(raw: RawPropertyData): TransactionType {
    const text = `${raw.titulo ?? ''} ${raw.sourceUrl ?? ''}`.toLowerCase();

    if (text.includes('renta') || text.includes('alquiler')) {
      return TransactionType.RENTA;
    }
    if (text.includes('traspaso')) {
      return TransactionType.TRASPASO;
    }

    return TransactionType.VENTA;
  }

  /**
   * Reduces a Mexican phone number to its 10 national digits when it carries a
   * 52 or 521 country prefix; otherwise returns whatever digits were found.
   */
  private normalizePhone(phone?: string | null): string | null {
    if (!phone) return null;

    const cleaned = phone.replace(/\D/g, '');

    if (cleaned.length === 10) {
      return cleaned;
    }
    if (cleaned.length === 12 && cleaned.startsWith('52')) {
      return cleaned.substring(2);
    }
    if (cleaned.length === 13 && cleaned.startsWith('521')) {
      return cleaned.substring(3);
    }

    return cleaned || null;
  }

  /** Builds the ordered image list; the first image is flagged as main. */
  private normalizeImages(images?: string[]): NormalizedProperty['images'] {
    if (!images || images.length === 0) return [];

    return images.map((rawUrl, index) => {
      const url = this.normalizeImageUrl(rawUrl);
      return {
        url,
        // Derived from the normalized URL so the thumbnail is HTTPS as well.
        thumbnailUrl: this.generateThumbnailUrl(url),
        order: index,
        isMain: index === 0,
      };
    });
  }

  /** Forces HTTPS, including protocol-relative ("//host/...") URLs. */
  private normalizeImageUrl(url: string): string {
    const trimmed = url.trim();
    if (trimmed.startsWith('//')) return `https:${trimmed}`;
    return trimmed.replace(/^http:/, 'https:');
  }

  /** Derives the thumbnail URL; the path convention depends on the CDN in use. */
  private generateThumbnailUrl(url: string): string {
    return url.replace('/images/', '/thumbnails/');
  }

  /**
   * Maps amenity labels to canonical names (accent- and case-insensitive) and
   * removes duplicates; unknown labels are kept trimmed.
   */
  private normalizeAmenities(amenities?: string[]): string[] {
    if (!amenities) return [];

    const normalized = new Set<string>();
    const mapping = PropertyNormalizer.AMENITY_LABELS;

    for (const amenity of amenities) {
      const label = amenity.trim();
      if (!label) continue;

      const lower = this.stripAccents(label.toLowerCase());
      const key = Object.keys(mapping).find(k => lower.includes(k));
      normalized.add(key ? mapping[key] : label);
    }

    return Array.from(normalized);
  }

  /**
   * Parses the first number in the text, keeping decimals ("2.5 banos" -> 2.5).
   *
   * @returns The number, or null when the text holds none.
   */
  private parseNumber(str?: string): number | null {
    if (!str) return null;
    const match = str.match(/\d+(?:[.,]\d+)?/);
    if (!match) return null;
    const num = parseFloat(match[0].replace(',', '.'));
    return Number.isFinite(num) ? num : null;
  }

  /** Looks for a parking-space count in the description and amenities. */
  private extractParkingSpaces(raw: RawPropertyData): number | null {
    const text = this.stripAccents(
      `${raw.descripcion ?? ''} ${(raw.amenidades ?? []).join(' ')}`.toLowerCase()
    );

    // "2 estacionamientos", "2 cajones de estacionamiento", "cochera para 2 autos"
    const match =
      text.match(/(\d+)\s*(?:cajon(?:es)?\s+de\s+)?(?:estacionamientos?|cocheras?|parking)/) ??
      text.match(/(?:estacionamiento|cochera)\s+para\s+(\d+)/);

    return match ? parseInt(match[1], 10) : null;
  }

  /** Derives a stable 32-char hex id from source + sourceId. */
  private generateId(raw: RawPropertyData): string {
    const input = `${raw.source}:${raw.sourceId}`;
    return createHash('sha256').update(input).digest('hex').substring(0, 32);
  }

  /**
   * Scores the raw record from 0 to 100: -20 per missing required field, -5
   * per missing optional field and -10 per warning.
   */
  private calculateDataQuality(raw: RawPropertyData): NormalizedProperty['dataQuality'] {
    const requiredFields = ['titulo', 'precio', 'ubicacion', 'superficie'];
    const optionalFields = ['recamaras', 'banos', 'descripcion', 'imagenes'];

    // An empty array (e.g. no gallery images found) counts as missing.
    const isMissing = (field: string): boolean => {
      const value = raw[field as keyof RawPropertyData];
      return Array.isArray(value) ? value.length === 0 : !value;
    };

    const missingRequired = requiredFields.filter(isMissing);
    const missingOptional = optionalFields.filter(isMissing);

    const warnings: string[] = [];

    // Rentals are routinely below the threshold, so only sales/transfers are checked.
    // The amount comes from parsePrice so decimals are not read as extra digits.
    const isRental = this.detectTransactionType(raw) === TransactionType.RENTA;
    if (raw.precio && !isRental && this.parsePrice(raw.precio).amount < 100000) {
      warnings.push('Precio sospechosamente bajo');
    }
    if (raw.imagenes && raw.imagenes.length > 0 && raw.imagenes.length < 3) {
      warnings.push('Pocas imagenes');
    }

    const score = Math.max(0, 100
      - (missingRequired.length * 20)
      - (missingOptional.length * 5)
      - (warnings.length * 10)
    );

    return {
      score,
      missingFields: [...missingRequired, ...missingOptional],
      warnings,
    };
  }

  /** Removes diacritics so "Tonalá" and "Jardín" match their ASCII keywords. */
  private stripAccents(text: string): string {
    return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  }
}

5. Servicio de Geocoding

// src/etl/services/geocoding.service.ts
import { Redis } from 'ioredis';

/**
 * Structured address returned by GeocodingService.geocode. Every address part
 * is null when the geocoder could not resolve it.
 */
interface GeocodedResult {
  street: string | null;
  neighborhood: string | null;
  municipality: string | null;
  state: string | null;
  postalCode: string | null;
  coordinates: { lat: number; lng: number } | null;
  confidence: number; // rounded score derived from Nominatim's importance; 0 for an empty result
}

/**
 * Geocodes free-text addresses through Nominatim, caching results in Redis
 * and spacing outgoing requests to respect the 1 request/second limit.
 */
export class GeocodingService {
  /** Successful lookups are cached for 30 days. */
  private static readonly HIT_TTL_SECONDS = 60 * 60 * 24 * 30;
  /** "No match" answers are cached for one day so they are retried sooner. */
  private static readonly MISS_TTL_SECONDS = 60 * 60 * 24;

  private redis: Redis;
  private nominatimUrl = 'https://nominatim.openstreetmap.org/search';
  private rateLimiter: { lastCall: number; minInterval: number };
  /** Tail of the promise chain that serializes rate-limit waits. */
  private rateLimitQueue: Promise<void> = Promise.resolve();

  constructor() {
    const redisUrl = process.env.REDIS_URL;
    // Without REDIS_URL the client falls back to its default connection.
    this.redis = redisUrl ? new Redis(redisUrl) : new Redis();
    this.rateLimiter = { lastCall: 0, minInterval: 1100 }; // 1 req/sec for Nominatim
  }

  /**
   * Resolves an address to structured parts and coordinates.
   *
   * @param address Free-text address; ', Jalisco, Mexico' is appended to the query.
   * @returns The cached or freshly geocoded result. When Nominatim has no match
   *   or the request fails, an empty result (all nulls, confidence 0) is returned.
   */
  async geocode(address: string): Promise<GeocodedResult> {
    // 1. Check cache
    const cacheKey = `geocode:${this.hashAddress(address)}`;
    const cached = await this.redis.get(cacheKey);
    if (cached) {
      try {
        return JSON.parse(cached) as GeocodedResult;
      } catch {
        // Corrupt cache entry: fall through and refresh it.
      }
    }

    // 2. Rate limiting
    await this.enforceRateLimit();

    // 3. Call geocoding API
    const result = await this.callNominatim(address);

    // A transient failure (network error, HTTP error) must not be cached,
    // otherwise the address would stay unresolved until the entry expires.
    if (result === null) return this.emptyResult();

    // 4. Cache result
    const ttl = result.coordinates
      ? GeocodingService.HIT_TTL_SECONDS
      : GeocodingService.MISS_TTL_SECONDS;
    await this.redis.setex(cacheKey, ttl, JSON.stringify(result));

    return result;
  }

  /**
   * Queries Nominatim for the best match.
   *
   * @returns The parsed result, an empty result when nothing matched, or null
   *   when the request itself failed.
   */
  private async callNominatim(address: string): Promise<GeocodedResult | null> {
    const params = new URLSearchParams({
      q: `${address}, Jalisco, Mexico`,
      format: 'json',
      addressdetails: '1',
      limit: '1',
    });

    try {
      const response = await fetch(`${this.nominatimUrl}?${params}`, {
        headers: {
          'User-Agent': 'InmobiliariaAnalytics/1.0',
        },
      });

      if (!response.ok) {
        console.error(`Geocoding error: HTTP ${response.status}`);
        return null;
      }

      const data: unknown = await response.json();

      if (!Array.isArray(data) || data.length === 0) {
        return this.emptyResult();
      }

      const place = data[0] as {
        lat?: string;
        lon?: string;
        importance?: unknown;
        type?: unknown;
        address?: Record<string, string | undefined>;
      };
      const addr = place.address ?? {};

      const lat = parseFloat(String(place.lat));
      const lng = parseFloat(String(place.lon));
      const hasCoordinates = Number.isFinite(lat) && Number.isFinite(lng);

      return {
        street: addr.road || addr.street || null,
        neighborhood: addr.suburb || addr.neighbourhood || null,
        municipality: addr.city || addr.town || addr.municipality || null,
        state: addr.state || null,
        postalCode: addr.postcode || null,
        coordinates: hasCoordinates ? { lat, lng } : null,
        confidence: this.calculateConfidence(place),
      };
    } catch (error: unknown) {
      console.error('Geocoding error:', error);
      return null;
    }
  }

  /**
   * Derives a 0-100 confidence from Nominatim's `importance`, with a +20 bonus
   * for precise result types (house, building).
   */
  private calculateConfidence(result: { importance?: unknown; type?: unknown }): number {
    const importance = typeof result.importance === 'number' ? result.importance : 0;

    let confidence = importance * 100;

    // Bonus for a precise result type
    if (result.type === 'house' || result.type === 'building') {
      confidence += 20;
    }

    return Math.round(Math.min(100, Math.max(0, confidence)));
  }

  /**
   * Waits until at least `minInterval` ms have passed since the previous
   * request. Waits are chained so concurrent callers are released one at a
   * time instead of all waking up together.
   */
  private enforceRateLimit(): Promise<void> {
    const turn = this.rateLimitQueue.then(async () => {
      const elapsed = Date.now() - this.rateLimiter.lastCall;

      if (elapsed < this.rateLimiter.minInterval) {
        await new Promise<void>(resolve =>
          setTimeout(resolve, this.rateLimiter.minInterval - elapsed)
        );
      }

      this.rateLimiter.lastCall = Date.now();
    });

    this.rateLimitQueue = turn;
    return turn;
  }

  /** Cache-key hash of the lowercased, trimmed address. */
  private hashAddress(address: string): string {
    const crypto = require('crypto');
    return crypto.createHash('md5').update(address.toLowerCase().trim()).digest('hex');
  }

  /** Result used when nothing could be resolved. */
  private emptyResult(): GeocodedResult {
    return {
      street: null,
      neighborhood: null,
      municipality: null,
      state: null,
      postalCode: null,
      coordinates: null,
      confidence: 0,
    };
  }
}

6. Detector de Duplicados

// src/etl/services/deduplication.service.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';

/** An existing property suspected to be the same real-world listing. */
interface DuplicateCandidate {
  id: string; // id of the existing row in `properties`
  similarity: number; // 0..1; findDuplicates sorts candidates by this, highest first
  matchedFields: string[]; // names of the fields that agreed
}

/**
 * Finds listings from other sources that describe the same property, using a
 * geographic near-exact match plus weighted fuzzy matching on attributes, and
 * merges confirmed duplicates.
 */
export class DeduplicationService {
  private db: Pool;

  /**
   * @param db Optional pool to reuse; by default a pool is created from
   *   DATABASE_URL.
   */
  constructor(db?: Pool) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
  }

  /**
   * Returns duplicate candidates from other sources, best match first.
   * Each existing property appears at most once, with its highest similarity.
   */
  async findDuplicates(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    const byId = new Map<string, DuplicateCandidate>();
    const consider = (candidate: DuplicateCandidate): void => {
      const known = byId.get(candidate.id);
      if (!known || candidate.similarity > known.similarity) {
        byId.set(candidate.id, candidate);
      }
    };

    // 1. Near-exact match against another source (location + price + title)
    const exactMatch = await this.findExactMatch(property);
    if (exactMatch) {
      consider({ ...exactMatch, similarity: 1.0 });
    }

    // 2. Fuzzy matching on attributes
    for (const candidate of await this.findFuzzyMatches(property)) {
      consider(candidate);
    }

    return [...byId.values()].sort((a, b) => b.similarity - a.similarity);
  }

  /**
   * Looks for the same property published by a different source: same type,
   * price within 5%, within 50 m and with a very similar title.
   */
  private async findExactMatch(property: NormalizedProperty): Promise<DuplicateCandidate | null> {
    const coordinates = property.location.coordinates;
    // Without coordinates or a real price there is nothing reliable to match on.
    if (!coordinates || !(property.price > 0)) return null;

    const query = `
      SELECT id, source, source_id, title, price,
             ST_Distance(
               coordinates::geography,
               ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography
             ) as distance_meters
      FROM properties
      WHERE source != $3
        AND merged_into IS NULL
        AND price BETWEEN $4 * 0.95 AND $4 * 1.05
        AND property_type = $5
        AND ST_DWithin(
          coordinates::geography,
          ST_SetSRID(ST_MakePoint($1, $2), 4326)::geography,
          100  -- 100 meters
        )
      ORDER BY distance_meters
      LIMIT 5
    `;

    const result = await this.db.query(query, [
      coordinates.lng,
      coordinates.lat,
      property.source,
      property.price,
      property.propertyType,
    ]);

    for (const row of result.rows) {
      const titleSimilarity = this.calculateTextSimilarity(property.title, row.title ?? '');
      if (titleSimilarity > 0.8 && Number(row.distance_meters) < 50) {
        return {
          id: row.id,
          similarity: 0.95,
          matchedFields: ['coordinates', 'price', 'title', 'property_type'],
        };
      }
    }

    return null;
  }

  /**
   * Scores active listings from other sources in the same neighborhood, of the
   * same type and within 10% of the price; keeps those above 0.75.
   */
  private async findFuzzyMatches(property: NormalizedProperty): Promise<DuplicateCandidate[]> {
    // An empty neighborhood would match every row lacking one; price 0 means unknown.
    if (!property.location.neighborhood || !(property.price > 0)) return [];

    const query = `
      SELECT id, title, price, bedrooms, bathrooms, constructed_area,
             neighborhood, coordinates
      FROM properties
      WHERE source != $1
        AND neighborhood = $2
        AND property_type = $3
        AND price BETWEEN $4 * 0.9 AND $4 * 1.1
        AND status = 'active'
      LIMIT 20
    `;

    const result = await this.db.query(query, [
      property.source,
      property.location.neighborhood,
      property.propertyType,
      property.price,
    ]);

    const candidates: DuplicateCandidate[] = [];

    for (const row of result.rows) {
      const similarity = this.calculatePropertySimilarity(property, row);

      if (similarity > 0.75) {
        candidates.push({
          id: row.id,
          similarity,
          matchedFields: this.getMatchedFields(property, row),
        });
      }
    }

    return candidates;
  }

  /**
   * Weighted similarity in 0..1 over price (0.3), built area (0.25), bedrooms
   * (0.15), bathrooms (0.15) and title (0.15). Attributes missing on either
   * side are left out and the remaining weights are renormalized.
   *
   * @param candidate Database row; NUMERIC columns may arrive as strings and
   *   are coerced with Number().
   */
  private calculatePropertySimilarity(prop: NormalizedProperty, candidate: any): number {
    let score = 0;
    let totalWeight = 0;

    // Price (weight 0.3)
    const candidatePrice = Number(candidate.price);
    if (prop.price > 0 && Number.isFinite(candidatePrice)) {
      const priceDiff = Math.abs(prop.price - candidatePrice) / prop.price;
      score += (1 - Math.min(priceDiff, 1)) * 0.3;
      totalWeight += 0.3;
    }

    // Area (weight 0.25)
    const candidateArea = Number(candidate.constructed_area);
    if (prop.constructedArea && candidate.constructed_area != null && candidateArea > 0) {
      const areaDiff = Math.abs(prop.constructedArea - candidateArea) / prop.constructedArea;
      score += (1 - Math.min(areaDiff, 1)) * 0.25;
      totalWeight += 0.25;
    }

    // Bedrooms (weight 0.15)
    if (prop.bedrooms != null && candidate.bedrooms != null) {
      score += (prop.bedrooms === Number(candidate.bedrooms) ? 1 : 0) * 0.15;
      totalWeight += 0.15;
    }

    // Bathrooms (weight 0.15)
    if (prop.bathrooms != null && candidate.bathrooms != null) {
      score += (prop.bathrooms === Number(candidate.bathrooms) ? 1 : 0) * 0.15;
      totalWeight += 0.15;
    }

    // Title (weight 0.15)
    const titleSim = this.calculateTextSimilarity(prop.title, candidate.title ?? '');
    score += titleSim * 0.15;
    totalWeight += 0.15;

    return totalWeight > 0 ? score / totalWeight : 0;
  }

  /**
   * Jaccard similarity over lowercased words.
   *
   * @returns 0..1; 0 when neither text contains a word.
   */
  private calculateTextSimilarity(text1: string, text2: string): number {
    const tokenize = (text: string): Set<string> =>
      new Set(text.toLowerCase().split(/\s+/).filter(Boolean));

    const words1 = tokenize(text1);
    const words2 = tokenize(text2);

    const union = new Set([...words1, ...words2]);
    if (union.size === 0) return 0;

    const shared = [...words1].filter(word => words2.has(word)).length;
    return shared / union.size;
  }

  /** Lists the attributes on which both records carry a value and agree. */
  private getMatchedFields(prop: NormalizedProperty, candidate: any): string[] {
    const matched: string[] = [];

    const candidatePrice = Number(candidate.price);
    if (
      prop.price > 0 &&
      Number.isFinite(candidatePrice) &&
      Math.abs(prop.price - candidatePrice) / prop.price < 0.05
    ) {
      matched.push('price');
    }
    if (prop.bedrooms != null && candidate.bedrooms != null &&
        prop.bedrooms === Number(candidate.bedrooms)) {
      matched.push('bedrooms');
    }
    if (prop.bathrooms != null && candidate.bathrooms != null &&
        prop.bathrooms === Number(candidate.bathrooms)) {
      matched.push('bathrooms');
    }
    if (prop.location.neighborhood && prop.location.neighborhood === candidate.neighborhood) {
      matched.push('neighborhood');
    }

    return matched;
  }

  /**
   * Merges duplicates into a primary property inside one transaction: records
   * each duplicate's source identity as an alias of the primary and marks the
   * duplicate rows as merged.
   *
   * @param primaryId Id of the property that survives.
   * @param duplicateIds Ids of the rows to fold into it; the primary itself is ignored.
   * @throws Rethrows any database error after rolling back.
   */
  async mergeProperties(
    primaryId: string,
    duplicateIds: string[]
  ): Promise<void> {
    const ids = [...new Set(duplicateIds)].filter(id => id !== primaryId);
    if (ids.length === 0) return;

    const client = await this.db.connect();

    try {
      await client.query('BEGIN');

      // Create the alias records. property_aliases is keyed by the alias'
      // source identity, so it is copied from the rows being merged.
      await client.query(`
        INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url, merged_at)
        SELECT $1::varchar, source, source_id, source_url, NOW()
        FROM properties
        WHERE id = ANY($2::varchar[])
        ON CONFLICT (alias_source, alias_source_id) DO NOTHING
      `, [primaryId, ids]);

      // Mark duplicates as merged
      await client.query(`
        UPDATE properties
        SET status = 'merged', merged_into = $1
        WHERE id = ANY($2::varchar[])
      `, [primaryId, ids]);

      await client.query('COMMIT');
    } catch (error) {
      await client.query('ROLLBACK');
      throw error;
    } finally {
      client.release();
    }
  }
}

7. Loader (Carga a Base de Datos)

// src/etl/loaders/property.loader.ts
import { Pool } from 'pg';
import { NormalizedProperty } from '../types';
import { DeduplicationService } from '../services/deduplication.service';

/**
 * Load stage: upserts normalized properties into `properties`, refreshing
 * listings already known for the same source and linking cross-source
 * duplicates as aliases instead of inserting them again.
 */
export class PropertyLoader {
  private db: Pool;
  private deduper: DeduplicationService;

  /**
   * @param db Optional pool to reuse; by default one is created from DATABASE_URL.
   * @param deduper Optional deduplication service; by default a new one is created.
   */
  constructor(db?: Pool, deduper?: DeduplicationService) {
    this.db = db ?? new Pool({ connectionString: process.env.DATABASE_URL });
    this.deduper = deduper ?? new DeduplicationService();
  }

  /**
   * Persists one property.
   *
   * @returns 'updated' when the same source + sourceId already existed,
   *   'duplicate' when it was linked to an existing property from another
   *   source (similarity above 0.9), otherwise 'inserted'; `id` is the id of
   *   the affected row.
   */
  async load(property: NormalizedProperty): Promise<{ action: 'inserted' | 'updated' | 'duplicate'; id: string }> {
    // 1. Check whether it already exists by source + sourceId
    const existing = await this.findExisting(property.source, property.sourceId);

    if (existing) {
      await this.update(existing.id, property);
      return { action: 'updated', id: existing.id };
    }

    // 2. Look for duplicates from other sources
    const duplicates = await this.deduper.findDuplicates(property);

    if (duplicates.length > 0 && duplicates[0].similarity > 0.9) {
      // It is a duplicate: link it to the existing property
      await this.linkDuplicate(duplicates[0].id, property);
      return { action: 'duplicate', id: duplicates[0].id };
    }

    // 3. Insert the new property
    const id = await this.insert(property);
    return { action: 'inserted', id };
  }

  /** Returns the row id for a source + sourceId pair, or null when unknown. */
  private async findExisting(source: string, sourceId: string): Promise<{ id: string } | null> {
    const result = await this.db.query(
      'SELECT id FROM properties WHERE source = $1 AND source_id = $2',
      [source, sourceId]
    );
    return result.rows[0] || null;
  }

  /**
   * Inserts the property and returns its id. If another worker inserted the
   * same source + source_id in the meantime, the existing row is touched and
   * its id returned instead of failing on the unique constraint.
   */
  private async insert(property: NormalizedProperty): Promise<string> {
    const query = `
      INSERT INTO properties (
        id, source, source_id, source_url,
        title, description, property_type, transaction_type,
        price, currency, price_per_sqm,
        land_area, constructed_area,
        bedrooms, bathrooms, parking_spaces, floors, year_built,
        raw_address, street, neighborhood, municipality, state, postal_code, country,
        coordinates, geocode_confidence,
        images, virtual_tour, video,
        amenities,
        agent_name, agent_phone, agent_email, agent_agency,
        first_seen_at, last_seen_at, published_at, status,
        data_quality_score, missing_fields, data_warnings
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
        $11, $12, $13, $14, $15, $16, $17, $18,
        $19, $20, $21, $22, $23, $24, $25,
        ST_SetSRID(ST_MakePoint($26, $27), 4326), $28,
        $29, $30, $31, $32,
        $33, $34, $35, $36,
        $37, $38, $39, $40,
        $41, $42, $43
      )
      ON CONFLICT (source, source_id) DO UPDATE SET
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
      RETURNING id
    `;

    const coords = property.location.coordinates;

    const result = await this.db.query(query, [
      property.id,
      property.source,
      property.sourceId,
      property.sourceUrl,
      property.title,
      property.description,
      property.propertyType,
      property.transactionType,
      property.price,
      property.currency,
      property.pricePerSqm,
      property.landArea,
      property.constructedArea,
      property.bedrooms,
      property.bathrooms,
      property.parkingSpaces,
      property.floors,
      property.yearBuilt,
      property.location.rawAddress,
      property.location.street,
      property.location.neighborhood,
      property.location.municipality,
      property.location.state,
      property.location.postalCode,
      property.location.country,
      // ?? keeps a legitimate 0 coordinate; only a missing value becomes NULL.
      coords?.lng ?? null,
      coords?.lat ?? null,
      property.location.geocodeConfidence,
      JSON.stringify(property.images),
      property.virtualTour,
      property.video,
      property.amenities,
      property.agent.name,
      property.agent.phone,
      property.agent.email,
      property.agent.agency,
      property.firstSeenAt,
      property.lastSeenAt,
      property.publishedAt,
      property.status,
      property.dataQuality.score,
      property.dataQuality.missingFields,
      property.dataQuality.warnings,
    ]);

    return result.rows[0].id;
  }

  /** Refreshes the mutable fields of an already known listing. */
  private async update(id: string, property: NormalizedProperty): Promise<void> {
    const query = `
      UPDATE properties SET
        title = $2,
        description = $3,
        price = $4,
        price_per_sqm = $5,
        last_seen_at = NOW(),
        updated_at = NOW(),
        data_quality_score = $6,
        images = $7
      WHERE id = $1
    `;

    await this.db.query(query, [
      id,
      property.title,
      property.description,
      property.price,
      property.pricePerSqm,
      property.dataQuality.score,
      JSON.stringify(property.images),
    ]);
  }

  /** Records the listing as an alias of an existing property and bumps its last_seen_at. */
  private async linkDuplicate(existingId: string, property: NormalizedProperty): Promise<void> {
    // Register as alias
    await this.db.query(`
      INSERT INTO property_aliases (primary_id, alias_source, alias_source_id, alias_url)
      VALUES ($1, $2, $3, $4)
      ON CONFLICT DO NOTHING
    `, [existingId, property.source, property.sourceId, property.sourceUrl]);

    // Update last_seen of the primary property
    await this.db.query(`
      UPDATE properties SET last_seen_at = NOW() WHERE id = $1
    `, [existingId]);
  }
}

8. Esquema de Base de Datos

-- Main properties table: one row per normalized listing.
CREATE TABLE properties (
  id VARCHAR(32) PRIMARY KEY, -- 32-char hex digest generated by the normalizer
  source VARCHAR(50) NOT NULL,
  source_id VARCHAR(100) NOT NULL,
  source_url TEXT NOT NULL,

  title VARCHAR(500) NOT NULL,
  description TEXT,
  property_type VARCHAR(50) NOT NULL,
  transaction_type VARCHAR(20) NOT NULL,

  price DECIMAL(15,2) NOT NULL,
  currency VARCHAR(3) DEFAULT 'MXN',
  price_per_sqm DECIMAL(10,2),

  land_area DECIMAL(10,2), -- m2
  constructed_area DECIMAL(10,2), -- m2

  bedrooms SMALLINT,
  bathrooms DECIMAL(3,1),
  parking_spaces SMALLINT,
  floors SMALLINT,
  year_built SMALLINT,

  raw_address TEXT,
  street VARCHAR(200),
  neighborhood VARCHAR(100),
  municipality VARCHAR(100),
  state VARCHAR(50),
  postal_code VARCHAR(10),
  country VARCHAR(50) DEFAULT 'Mexico',
  coordinates GEOMETRY(Point, 4326), -- the loader writes ST_MakePoint(lng, lat)
  geocode_confidence SMALLINT,

  images JSONB DEFAULT '[]', -- the loader stores the JSON-serialized image list here
  virtual_tour TEXT,
  video TEXT,

  amenities TEXT[],

  agent_name VARCHAR(200),
  agent_phone VARCHAR(20),
  agent_email VARCHAR(200),
  agent_agency VARCHAR(200),

  first_seen_at TIMESTAMP NOT NULL,
  last_seen_at TIMESTAMP NOT NULL,
  published_at TIMESTAMP,
  status VARCHAR(20) DEFAULT 'active',
  merged_into VARCHAR(32) REFERENCES properties(id), -- set by mergeProperties on duplicate rows

  data_quality_score SMALLINT,
  missing_fields TEXT[],
  data_warnings TEXT[],

  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW(),

  UNIQUE(source, source_id) -- the loader looks up existing listings by this pair
);

-- Indexes
-- Spatial index on the point column used by the ST_DWithin duplicate search.
CREATE INDEX idx_properties_location ON properties USING GIST(coordinates);
-- Columns filtered on by the fuzzy duplicate search.
CREATE INDEX idx_properties_neighborhood ON properties(neighborhood);
CREATE INDEX idx_properties_price ON properties(price);
CREATE INDEX idx_properties_type ON properties(property_type);
CREATE INDEX idx_properties_status ON properties(status);
CREATE INDEX idx_properties_last_seen ON properties(last_seen_at);

-- Aliases table (duplicate listings linked to a primary property).
-- NOTE(review): DeduplicationService.mergeProperties inserts into an alias_id
-- column that is not declared here — confirm which side is correct.
CREATE TABLE property_aliases (
  id SERIAL PRIMARY KEY,
  primary_id VARCHAR(32) REFERENCES properties(id), -- the property that is kept
  alias_source VARCHAR(50) NOT NULL,
  alias_source_id VARCHAR(100) NOT NULL,
  alias_url TEXT,
  merged_at TIMESTAMP DEFAULT NOW(),

  UNIQUE(alias_source, alias_source_id)
);

-- Price history: one row per recorded price of a property.
-- NOTE(review): no code shown in this document writes to this table — confirm
-- where price changes are recorded.
CREATE TABLE price_history (
  id SERIAL PRIMARY KEY,
  property_id VARCHAR(32) REFERENCES properties(id),
  price DECIMAL(15,2) NOT NULL,
  currency VARCHAR(3) NOT NULL,
  recorded_at TIMESTAMP DEFAULT NOW()
);

-- Supports looking up the history of a single property.
CREATE INDEX idx_price_history_property ON price_history(property_id);

9. Tests

// src/etl/__tests__/normalizer.test.ts
import { PropertyNormalizer } from '../transformers/normalizer';
import { GeocodingService } from '../services/geocoding.service';

jest.mock('../services/geocoding.service');

describe('PropertyNormalizer', () => {
  /** Fresh canned geocoder answer; rebuilt per test so no state is shared. */
  const geocodeFixture = () => ({
    street: 'Av. Providencia',
    neighborhood: 'Providencia',
    municipality: 'Guadalajara',
    state: 'Jalisco',
    postalCode: '44630',
    coordinates: { lat: 20.6736, lng: -103.3927 },
    confidence: 85,
  });

  /**
   * Builds a minimal raw listing for the 'test' source, then appends the
   * source-specific fields exercised by the calling test.
   */
  const buildRaw = <T extends object>(sourceId: string, fields: T) => ({
    source: 'test',
    sourceId,
    sourceUrl: `http://test.com/${sourceId}`,
    scrapedAt: new Date(),
    ...fields,
  });

  let normalizer: PropertyNormalizer;

  beforeEach(() => {
    // GeocodingService is auto-mocked by jest.mock, so geocode is a jest.fn.
    const geocoderStub = new GeocodingService();
    (geocoderStub.geocode as jest.Mock).mockResolvedValue(geocodeFixture());

    normalizer = new PropertyNormalizer(geocoderStub);
  });

  describe('parsePrice', () => {
    it('should parse MXN price correctly', async () => {
      const { price, currency } = await normalizer.normalize(
        buildRaw('123', { precio: '$4,500,000 MXN' }),
      );

      expect(price).toBe(4500000);
      expect(currency).toBe('MXN');
    });

    it('should parse USD price correctly', async () => {
      const { price, currency } = await normalizer.normalize(
        buildRaw('124', { precio: '$350,000 USD' }),
      );

      expect(price).toBe(350000);
      expect(currency).toBe('USD');
    });
  });

  describe('parseAreas', () => {
    it('should parse both land and constructed areas', async () => {
      const { constructedArea, landArea } = await normalizer.normalize(
        buildRaw('125', { superficie: '180 m2 construccion, 250 m2 terreno' }),
      );

      expect(constructedArea).toBe(180);
      expect(landArea).toBe(250);
    });
  });

  describe('detectPropertyType', () => {
    it('should detect departamento', async () => {
      const { propertyType } = await normalizer.normalize(
        buildRaw('126', { titulo: 'Hermoso departamento en Providencia' }),
      );

      expect(propertyType).toBe('departamento');
    });
  });
});

10. Metricas y Monitoreo

// ETL pipeline metrics.
// Each metric is built as its own constant (in the same order as before) and
// then grouped under the exported `etlMetrics` object.

// --- Counters ---

/** Properties extracted, labelled by source. */
const propertiesExtractedTotal = new Counter({
  name: 'etl_properties_extracted_total',
  help: 'Total properties extracted',
  labelNames: ['source'],
});

/** Properties normalized, labelled by source and property type. */
const propertiesNormalizedTotal = new Counter({
  name: 'etl_properties_normalized_total',
  help: 'Total properties normalized',
  labelNames: ['source', 'property_type'],
});

/** Properties loaded; `action` is one of: inserted, updated, duplicate. */
const propertiesLoadedTotal = new Counter({
  name: 'etl_properties_loaded_total',
  help: 'Total properties loaded',
  labelNames: ['action'],
});

/** Geocoding requests; `status` is one of: success, error, cache_hit. */
const geocodingRequestsTotal = new Counter({
  name: 'etl_geocoding_requests_total',
  help: 'Total geocoding requests',
  labelNames: ['status'],
});

// --- Histograms ---

/** Time spent normalizing a single property, in seconds. */
const normalizationDurationSeconds = new Histogram({
  name: 'etl_normalization_duration_seconds',
  help: 'Time to normalize a property',
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

/** Distribution of data quality scores, bucketed in steps of 20 up to 100. */
const dataQualityScore = new Histogram({
  name: 'etl_data_quality_score',
  help: 'Data quality scores',
  buckets: [20, 40, 60, 80, 100],
});

export const etlMetrics = {
  properties_extracted_total: propertiesExtractedTotal,
  properties_normalized_total: propertiesNormalizedTotal,
  properties_loaded_total: propertiesLoadedTotal,
  geocoding_requests_total: geocodingRequestsTotal,
  normalization_duration_seconds: normalizationDurationSeconds,
  data_quality_score: dataQualityScore,
};

Siguiente: ET-IA-007-proxies.md