--- id: "ET-SCR-001" title: "Especificacion Tecnica: Motor de Scraping" type: "Technical Specification" epic: "IAI-007" status: "Draft" project: "inmobiliaria-analytics" version: "1.0.0" created_date: "2026-01-04" updated_date: "2026-01-04" --- # ET-SCR-001: Especificacion Tecnica del Motor de Scraping --- ## Resumen Esta especificacion define la arquitectura e implementacion del motor de web scraping con capacidades anti-detection para extraer datos de portales inmobiliarios protegidos por Cloudflare. --- ## Stack Tecnologico ```yaml runtime: Node.js 20 LTS language: TypeScript 5.x dependencias: scraping: - playwright: "^1.40.0" - playwright-extra: "^4.3.0" - puppeteer-extra-plugin-stealth: "^2.11.0" - cheerio: "^1.0.0" queue: - bullmq: "^5.0.0" - ioredis: "^5.3.0" http: - axios: "^1.6.0" - https-proxy-agent: "^7.0.0" utils: - pino: "^8.0.0" - zod: "^3.22.0" - date-fns: "^3.0.0" testing: - vitest: "^1.0.0" - msw: "^2.0.0" ``` --- ## Arquitectura de Componentes ``` ┌─────────────────────────────────────────────────────────────────┐ │ SCRAPER SERVICE │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ Scheduler │───▶│ Job Queue │───▶│ Workers │ │ │ │ (Cron/API) │ │ (BullMQ) │ │ (N=2-4) │ │ │ └──────────────┘ └──────────────┘ └──────┬───────┘ │ │ │ │ │ ┌──────────────────────┼──────────┐ │ │ │ ▼ │ │ │ ┌──────────────┐ ┌────┴─────────┐ ┌──────────────┐ │ │ │ │ Proxy │◀───│ Browser │───▶│ Parser │ │ │ │ │ Manager │ │ Engine │ │ (Cheerio) │ │ │ │ └──────────────┘ │ (Playwright) │ └──────┬───────┘ │ │ │ └──────────────┘ │ │ │ │ ▼ │ │ │ ┌──────────────┐ │ │ │ │ Normalizer │ │ │ │ └──────┬───────┘ │ │ │ │ │ │ │ Scraper Core │ │ │ │ └────────────────────┼────────────┘ │ │ ▼ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ Metrics │◀───│ Storage │───▶│ PostgreSQL │ │ │ │ (Prometheus) │ │ (S3/Local) │ │ │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Estructura de Codigo ``` apps/scraper/ ├── src/ │ ├── index.ts # Entry point │ ├── config/ │ │ ├── index.ts │ │ ├── sources.config.ts # Configuracion por fuente │ │ └── schedules.config.ts │ │ │ ├── core/ │ │ ├── browser/ │ │ │ ├── browser-manager.ts │ │ │ ├── stealth-config.ts │ │ │ └── page-utils.ts │ │ │ │ │ ├── proxy/ │ │ │ ├── proxy-pool.ts │ │ │ ├── proxy-rotator.ts │ │ │ └── proxy-health.ts │ │ │ │ │ ├── queue/ │ │ │ ├── job-queue.ts │ │ │ ├── job-processor.ts │ │ │ └── job-types.ts │ │ │ │ │ └── rate-limiter/ │ │ └── adaptive-limiter.ts │ │ │ ├── scrapers/ │ │ ├── base-scraper.ts # Clase base abstracta │ │ ├── inmuebles24/ │ │ │ ├── scraper.ts │ │ │ ├── selectors.ts │ │ │ └── mappings.ts │ │ ├── vivanuncios/ │ │ │ ├── scraper.ts │ │ │ ├── selectors.ts │ │ │ └── mappings.ts │ │ └── segundamano/ │ │ └── ... │ │ │ ├── etl/ │ │ ├── extractor.ts │ │ ├── transformer.ts │ │ ├── normalizer.ts │ │ ├── geocoder.ts │ │ └── deduplicator.ts │ │ │ ├── storage/ │ │ ├── raw-storage.ts # S3/MinIO │ │ └── property-repository.ts │ │ │ ├── monitoring/ │ │ ├── metrics.ts │ │ ├── alerts.ts │ │ └── health-check.ts │ │ │ ├── api/ │ │ ├── routes/ │ │ │ ├── jobs.routes.ts │ │ │ ├── stats.routes.ts │ │ │ └── proxies.routes.ts │ │ └── server.ts │ │ │ └── types/ │ ├── job.types.ts │ ├── property.types.ts │ └── proxy.types.ts │ ├── tests/ │ ├── unit/ │ ├── integration/ │ └── e2e/ │ ├── Dockerfile ├── docker-compose.yml ├── package.json └── tsconfig.json ``` --- ## Implementacion del Browser Engine ### Browser Manager ```typescript // src/core/browser/browser-manager.ts import { chromium, Browser, BrowserContext, Page } from 'playwright'; import { addExtra } from 'playwright-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; export class BrowserManager { private browser: Browser | null = null; private contexts: Map = new Map(); async initialize(): Promise { const chromiumExtra = addExtra(chromium); chromiumExtra.use(StealthPlugin()); this.browser = await chromiumExtra.launch({ headless: true, args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--disable-gpu', '--window-size=1920,1080', ], }); } async createContext( sessionId: string, proxy?: ProxyConfig ): Promise { if (!this.browser) throw new Error('Browser not initialized'); const context = await this.browser.newContext({ viewport: { width: 1920, height: 1080 }, userAgent: this.getRandomUserAgent(), locale: 'es-MX', timezoneId: 'America/Mexico_City', proxy: proxy ? { server: `${proxy.address}:${proxy.port}`, username: proxy.username, password: proxy.password, } : undefined, }); // Anti-detection patches await this.applyStealthPatches(context); this.contexts.set(sessionId, context); return context; } private async applyStealthPatches(context: BrowserContext): Promise { await context.addInitScript(() => { // Hide webdriver Object.defineProperty(navigator, 'webdriver', { get: () => undefined, }); // Mock plugins Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5], }); // Mock languages Object.defineProperty(navigator, 'languages', { get: () => ['es-MX', 'es', 'en-US', 'en'], }); }); } private getRandomUserAgent(): string { const userAgents = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0', // ... more user agents ]; return userAgents[Math.floor(Math.random() * userAgents.length)]; } async close(): Promise { for (const context of this.contexts.values()) { await context.close(); } if (this.browser) { await this.browser.close(); } } } ``` ### Human-like Behavior ```typescript // src/core/browser/page-utils.ts import { Page } from 'playwright'; export class PageUtils { static async humanScroll(page: Page): Promise { const scrollHeight = await page.evaluate(() => document.body.scrollHeight); let currentPosition = 0; while (currentPosition < scrollHeight) { const scrollAmount = Math.random() * 300 + 100; currentPosition += scrollAmount; await page.evaluate((y) => window.scrollTo(0, y), currentPosition); await this.randomDelay(100, 300); } } static async humanClick(page: Page, selector: string): Promise { const element = await page.$(selector); if (!element) throw new Error(`Element not found: ${selector}`); const box = await element.boundingBox(); if (!box) throw new Error(`Element not visible: ${selector}`); // Move to element with slight randomness const x = box.x + box.width / 2 + (Math.random() * 10 - 5); const y = box.y + box.height / 2 + (Math.random() * 10 - 5); await page.mouse.move(x, y, { steps: 10 }); await this.randomDelay(50, 150); await page.mouse.click(x, y); } static async randomDelay(min: number, max: number): Promise { const delay = Math.floor(Math.random() * (max - min + 1)) + min; await new Promise(resolve => setTimeout(resolve, delay)); } static async waitForCloudflare(page: Page): Promise { // Wait for Cloudflare challenge to complete try { await page.waitForSelector('#challenge-running', { state: 'hidden', timeout: 30000, }); } catch { // No challenge present, continue } // Additional wait for JS to fully load await page.waitForLoadState('networkidle'); } } ``` --- ## Base Scraper Implementation ```typescript // src/scrapers/base-scraper.ts import { Page, BrowserContext } from 'playwright'; import { BrowserManager } from '../core/browser/browser-manager'; import { ProxyPool } from '../core/proxy/proxy-pool'; import { PageUtils } from '../core/browser/page-utils'; import { Logger } from 'pino'; export interface ScrapingResult { success: boolean; properties: RawProperty[]; errors: ScrapingError[]; stats: ScrapingStats; } export abstract class BaseScraper { protected browserManager: BrowserManager; protected proxyPool: ProxyPool; protected logger: Logger; protected context: BrowserContext | null = null; protected page: Page | null = null; abstract readonly source: string; abstract readonly baseUrl: string; constructor( browserManager: BrowserManager, proxyPool: ProxyPool, logger: Logger ) { this.browserManager = browserManager; this.proxyPool = proxyPool; this.logger = logger.child({ source: this.source }); } async scrape(config: ScrapingConfig): Promise { const stats: ScrapingStats = { pagesScraped: 0, propertiesFound: 0, errors: 0, startedAt: new Date(), }; const properties: RawProperty[] = []; const errors: ScrapingError[] = []; try { await this.initSession(); for (const city of config.targetCities) { for (const type of config.propertyTypes) { const result = await this.scrapeListings(city, type, config); properties.push(...result.properties); errors.push(...result.errors); stats.pagesScraped += result.pagesScraped; } } stats.propertiesFound = properties.length; stats.errors = errors.length; stats.completedAt = new Date(); return { success: true, properties, errors, stats }; } catch (error) { this.logger.error({ error }, 'Scraping failed'); return { success: false, properties, errors: [...errors, { type: 'fatal', message: String(error) }], stats, }; } finally { await this.closeSession(); } } protected async initSession(): Promise { const proxy = await this.proxyPool.getProxy(); const sessionId = `${this.source}-${Date.now()}`; this.context = await this.browserManager.createContext(sessionId, proxy); this.page = await this.context.newPage(); // Set default timeout this.page.setDefaultTimeout(30000); } protected async closeSession(): Promise { if (this.page) await this.page.close(); if (this.context) await this.context.close(); } protected async navigateWithRetry( url: string, maxRetries: number = 3 ): Promise { for (let attempt = 1; attempt <= maxRetries; attempt++) { try { await this.page!.goto(url, { waitUntil: 'domcontentloaded' }); await PageUtils.waitForCloudflare(this.page!); return; } catch (error) { this.logger.warn({ url, attempt, error }, 'Navigation failed, retrying'); if (attempt === maxRetries) throw error; // Rotate proxy on failure await this.rotateProxy(); await PageUtils.randomDelay(2000, 5000); } } } protected async rotateProxy(): Promise { const newProxy = await this.proxyPool.getProxy(); await this.closeSession(); const sessionId = `${this.source}-${Date.now()}`; this.context = await this.browserManager.createContext(sessionId, newProxy); this.page = await this.context.newPage(); } // Abstract methods to be implemented by each source protected abstract scrapeListings( city: string, propertyType: string, config: ScrapingConfig ): Promise; protected abstract parsePropertyDetail( page: Page ): Promise; protected abstract getListingUrl( city: string, propertyType: string, page: number ): string; } ``` --- ## Proxy Pool Implementation ```typescript // src/core/proxy/proxy-pool.ts import { Redis } from 'ioredis'; export interface ProxyConfig { id: string; address: string; port: number; username?: string; password?: string; type: 'residential' | 'datacenter' | 'mobile'; country: string; status: 'active' | 'cooling' | 'banned'; successRate: number; lastUsedAt?: Date; coolingUntil?: Date; } export class ProxyPool { private redis: Redis; private readonly POOL_KEY = 'proxy:pool'; private readonly COOLING_KEY = 'proxy:cooling'; constructor(redis: Redis) { this.redis = redis; } async getProxy(): Promise { // Get all active proxies const proxies = await this.getActiveProxies(); if (proxies.length === 0) { throw new Error('No active proxies available'); } // Weighted selection based on success rate const selected = this.weightedSelection(proxies); // Mark as used await this.markUsed(selected.id); return selected; } private async getActiveProxies(): Promise { const all = await this.redis.hgetall(this.POOL_KEY); const now = Date.now(); return Object.values(all) .map(p => JSON.parse(p) as ProxyConfig) .filter(p => { if (p.status === 'banned') return false; if (p.status === 'cooling' && p.coolingUntil) { return new Date(p.coolingUntil).getTime() < now; } return p.status === 'active'; }); } private weightedSelection(proxies: ProxyConfig[]): ProxyConfig { // Higher success rate = higher weight const totalWeight = proxies.reduce((sum, p) => sum + p.successRate, 0); let random = Math.random() * totalWeight; for (const proxy of proxies) { random -= proxy.successRate; if (random <= 0) return proxy; } return proxies[0]; } async markUsed(proxyId: string): Promise { const proxy = await this.getProxy(proxyId); if (proxy) { proxy.lastUsedAt = new Date(); await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy)); } } async markSuccess(proxyId: string): Promise { const proxy = await this.getProxyById(proxyId); if (proxy) { // Update success rate with exponential moving average proxy.successRate = proxy.successRate * 0.9 + 1 * 0.1; await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy)); } } async markFailure(proxyId: string, errorType: string): Promise { const proxy = await this.getProxyById(proxyId); if (!proxy) return; // Update success rate proxy.successRate = proxy.successRate * 0.9 + 0 * 0.1; if (errorType === 'rate_limit') { // Put in cooling for 1 hour proxy.status = 'cooling'; proxy.coolingUntil = new Date(Date.now() + 3600000); } else if (errorType === 'banned') { proxy.status = 'banned'; } await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy)); } async getStats(): Promise { const all = await this.redis.hgetall(this.POOL_KEY); const proxies = Object.values(all).map(p => JSON.parse(p) as ProxyConfig); return { total: proxies.length, active: proxies.filter(p => p.status === 'active').length, cooling: proxies.filter(p => p.status === 'cooling').length, banned: proxies.filter(p => p.status === 'banned').length, avgSuccessRate: proxies.reduce((sum, p) => sum + p.successRate, 0) / proxies.length, }; } } ``` --- ## Job Queue Implementation ```typescript // src/core/queue/job-queue.ts import { Queue, Worker, Job } from 'bullmq'; import { Redis } from 'ioredis'; export interface ScrapingJobData { id: string; type: 'full_scan' | 'incremental' | 'targeted' | 'refresh'; source: string; config: ScrapingConfig; createdBy?: string; } export class JobQueue { private queue: Queue; private worker: Worker; private redis: Redis; constructor(redis: Redis, processor: JobProcessor) { this.redis = redis; this.queue = new Queue('scraping', { connection: redis, defaultJobOptions: { attempts: 3, backoff: { type: 'exponential', delay: 5000, }, removeOnComplete: 100, removeOnFail: 50, }, }); this.worker = new Worker( 'scraping', async (job: Job) => { return processor.process(job); }, { connection: redis, concurrency: 2, } ); this.setupEventHandlers(); } private setupEventHandlers(): void { this.worker.on('completed', (job, result) => { console.log(`Job ${job.id} completed`, result); }); this.worker.on('failed', (job, error) => { console.error(`Job ${job?.id} failed`, error); }); this.worker.on('progress', (job, progress) => { console.log(`Job ${job.id} progress: ${progress}%`); }); } async addJob(data: ScrapingJobData): Promise> { return this.queue.add(data.type, data, { jobId: data.id, }); } async scheduleJob( data: ScrapingJobData, cron: string ): Promise { await this.queue.add(data.type, data, { repeat: { pattern: cron }, jobId: `${data.id}-scheduled`, }); } async pauseJob(jobId: string): Promise { const job = await this.queue.getJob(jobId); if (job) { await job.updateProgress({ status: 'paused' }); } } async getJobStatus(jobId: string): Promise { const job = await this.queue.getJob(jobId); if (!job) return null; const state = await job.getState(); return { id: job.id!, state, progress: job.progress, data: job.data, attemptsMade: job.attemptsMade, failedReason: job.failedReason, }; } } ``` --- ## API Endpoints ```typescript // src/api/routes/jobs.routes.ts import { Router } from 'express'; import { z } from 'zod'; const CreateJobSchema = z.object({ type: z.enum(['full_scan', 'incremental', 'targeted', 'refresh']), source: z.string(), config: z.object({ targetCities: z.array(z.string()).optional(), propertyTypes: z.array(z.string()).optional(), maxPages: z.number().optional(), delayMs: z.object({ min: z.number(), max: z.number(), }).optional(), }), }); export function createJobsRouter(jobQueue: JobQueue): Router { const router = Router(); // Create new job router.post('/', async (req, res) => { const parsed = CreateJobSchema.safeParse(req.body); if (!parsed.success) { return res.status(400).json({ error: parsed.error }); } const jobId = `job-${Date.now()}`; const job = await jobQueue.addJob({ id: jobId, ...parsed.data, }); res.status(201).json({ id: job.id, status: 'queued', }); }); // List jobs router.get('/', async (req, res) => { const jobs = await jobQueue.getJobs(req.query); res.json({ jobs }); }); // Get job status router.get('/:id', async (req, res) => { const status = await jobQueue.getJobStatus(req.params.id); if (!status) { return res.status(404).json({ error: 'Job not found' }); } res.json(status); }); // Pause job router.post('/:id/pause', async (req, res) => { await jobQueue.pauseJob(req.params.id); res.json({ status: 'paused' }); }); // Resume job router.post('/:id/resume', async (req, res) => { await jobQueue.resumeJob(req.params.id); res.json({ status: 'resumed' }); }); // Cancel job router.delete('/:id', async (req, res) => { await jobQueue.cancelJob(req.params.id); res.status(204).send(); }); return router; } ``` --- ## Docker Configuration ```yaml # docker-compose.yml version: '3.8' services: scraper: build: context: . dockerfile: Dockerfile environment: - NODE_ENV=production - REDIS_URL=redis://redis:6379 - DATABASE_URL=postgresql://user:pass@postgres:5432/inmobiliaria - S3_ENDPOINT=http://minio:9000 - S3_BUCKET=raw-data depends_on: - redis - postgres - minio deploy: replicas: 2 resources: limits: memory: 2G cpus: '1' redis: image: redis:7-alpine volumes: - redis-data:/data postgres: image: postgres:16-alpine environment: POSTGRES_DB: inmobiliaria POSTGRES_USER: user POSTGRES_PASSWORD: pass volumes: - postgres-data:/var/lib/postgresql/data minio: image: minio/minio command: server /data --console-address ":9001" environment: MINIO_ROOT_USER: minioadmin MINIO_ROOT_PASSWORD: minioadmin volumes: - minio-data:/data volumes: redis-data: postgres-data: minio-data: ``` --- ## Metricas Prometheus ```typescript // src/monitoring/metrics.ts import { Registry, Counter, Histogram, Gauge } from 'prom-client'; export const register = new Registry(); export const metrics = { propertiesScraped: new Counter({ name: 'scraper_properties_total', help: 'Total properties scraped', labelNames: ['source', 'status'], registers: [register], }), requestDuration: new Histogram({ name: 'scraper_request_duration_seconds', help: 'Duration of scraping requests', labelNames: ['source'], buckets: [0.1, 0.5, 1, 2, 5, 10, 30], registers: [register], }), activeJobs: new Gauge({ name: 'scraper_active_jobs', help: 'Number of active scraping jobs', labelNames: ['source'], registers: [register], }), proxyPoolSize: new Gauge({ name: 'scraper_proxy_pool_size', help: 'Size of proxy pool by status', labelNames: ['status'], registers: [register], }), errorsTotal: new Counter({ name: 'scraper_errors_total', help: 'Total scraping errors', labelNames: ['source', 'error_type'], registers: [register], }), }; ``` --- ## Testing Strategy ```typescript // tests/integration/inmuebles24.test.ts import { describe, it, expect, beforeAll, afterAll } from 'vitest'; import { BrowserManager } from '../../src/core/browser/browser-manager'; import { Inmuebles24Scraper } from '../../src/scrapers/inmuebles24/scraper'; describe('Inmuebles24 Scraper', () => { let browserManager: BrowserManager; let scraper: Inmuebles24Scraper; beforeAll(async () => { browserManager = new BrowserManager(); await browserManager.initialize(); scraper = new Inmuebles24Scraper(browserManager, mockProxyPool, mockLogger); }); afterAll(async () => { await browserManager.close(); }); it('should extract property listings from search page', async () => { const result = await scraper.scrape({ targetCities: ['guadalajara'], propertyTypes: ['casas'], maxPages: 1, }); expect(result.success).toBe(true); expect(result.properties.length).toBeGreaterThan(0); expect(result.properties[0]).toHaveProperty('source_id'); expect(result.properties[0]).toHaveProperty('price'); }); it('should handle Cloudflare challenge', async () => { // Test with mock that returns challenge page // Verify scraper waits and retries }); it('should rotate proxy on failure', async () => { // Test proxy rotation logic }); }); ``` --- ## Criterios de Aceptacion Tecnicos - [ ] Bot detection tests pass (bot.sannysoft.com) - [ ] Scraper extracts 500+ properties without block - [ ] Request latency p95 < 10s - [ ] Memory usage < 500MB per worker - [ ] CPU usage < 50% average - [ ] Error rate < 5% - [ ] All unit tests pass - [ ] Integration tests pass --- **Documento:** Especificacion Tecnica Motor Scraping **Version:** 1.0.0 **Autor:** Tech Lead **Fecha:** 2026-01-04