inmobiliaria-analytics/docs/01-fase-alcance-inicial/IAI-007-webscraper/especificaciones/ET-SCR-001-scraper.md
rckrdmrd f570727617 feat: Documentation and orchestration updates
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 05:35:40 -06:00

27 KiB

---
id: ET-SCR-001
title: "Especificacion Tecnica: Motor de Scraping"
type: Technical Specification
epic: IAI-007
status: Draft
project: inmobiliaria-analytics
version: 1.0.0
created_date: 2026-01-04
updated_date: 2026-01-04
---

ET-SCR-001: Especificacion Tecnica del Motor de Scraping


Resumen

Esta especificacion define la arquitectura e implementacion del motor de web scraping con capacidades anti-detection para extraer datos de portales inmobiliarios protegidos por Cloudflare.


Stack Tecnologico

runtime: Node.js 20 LTS
language: TypeScript 5.x

dependencias:
  scraping:
    - playwright: "^1.40.0"
    - playwright-extra: "^4.3.0"
    - puppeteer-extra-plugin-stealth: "^2.11.0"
    - cheerio: "^1.0.0"

  queue:
    - bullmq: "^5.0.0"
    - ioredis: "^5.3.0"

  http:
    - axios: "^1.6.0"
    - express: "^4.18.0"
    - https-proxy-agent: "^7.0.0"

  utils:
    - pino: "^8.0.0"
    - prom-client: "^15.0.0"
    - zod: "^3.22.0"
    - date-fns: "^3.0.0"

  testing:
    - vitest: "^1.0.0"
    - msw: "^2.0.0"

Arquitectura de Componentes

┌─────────────────────────────────────────────────────────────────┐
│                        SCRAPER SERVICE                          │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐      │
│  │   Scheduler  │───▶│  Job Queue   │───▶│   Workers    │      │
│  │  (Cron/API)  │    │  (BullMQ)    │    │  (N=2-4)     │      │
│  └──────────────┘    └──────────────┘    └──────┬───────┘      │
│                                                  │              │
│                           ┌──────────────────────┼──────────┐   │
│                           │                      ▼          │   │
│  ┌──────────────┐    ┌────┴─────────┐    ┌──────────────┐   │   │
│  │    Proxy     │◀───│   Browser    │───▶│   Parser     │   │   │
│  │   Manager    │    │   Engine     │    │  (Cheerio)   │   │   │
│  └──────────────┘    │ (Playwright) │    └──────┬───────┘   │   │
│                      └──────────────┘           │           │   │
│                                                 ▼           │   │
│                                         ┌──────────────┐    │   │
│                                         │ Normalizer   │    │   │
│                                         └──────┬───────┘    │   │
│                                                │            │   │
│                      Scraper Core              │            │   │
│                           └────────────────────┼────────────┘   │
│                                                ▼                │
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐      │
│  │   Metrics    │◀───│   Storage    │───▶│  PostgreSQL  │      │
│  │ (Prometheus) │    │  (S3/Local)  │    │              │      │
│  └──────────────┘    └──────────────┘    └──────────────┘      │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Estructura de Codigo

apps/scraper/
├── src/
│   ├── index.ts                 # Entry point
│   ├── config/
│   │   ├── index.ts
│   │   ├── sources.config.ts    # Configuracion por fuente
│   │   └── schedules.config.ts
│   │
│   ├── core/
│   │   ├── browser/
│   │   │   ├── browser-manager.ts
│   │   │   ├── stealth-config.ts
│   │   │   └── page-utils.ts
│   │   │
│   │   ├── proxy/
│   │   │   ├── proxy-pool.ts
│   │   │   ├── proxy-rotator.ts
│   │   │   └── proxy-health.ts
│   │   │
│   │   ├── queue/
│   │   │   ├── job-queue.ts
│   │   │   ├── job-processor.ts
│   │   │   └── job-types.ts
│   │   │
│   │   └── rate-limiter/
│   │       └── adaptive-limiter.ts
│   │
│   ├── scrapers/
│   │   ├── base-scraper.ts       # Clase base abstracta
│   │   ├── inmuebles24/
│   │   │   ├── scraper.ts
│   │   │   ├── selectors.ts
│   │   │   └── mappings.ts
│   │   ├── vivanuncios/
│   │   │   ├── scraper.ts
│   │   │   ├── selectors.ts
│   │   │   └── mappings.ts
│   │   └── segundamano/
│   │       └── ...
│   │
│   ├── etl/
│   │   ├── extractor.ts
│   │   ├── transformer.ts
│   │   ├── normalizer.ts
│   │   ├── geocoder.ts
│   │   └── deduplicator.ts
│   │
│   ├── storage/
│   │   ├── raw-storage.ts        # S3/MinIO
│   │   └── property-repository.ts
│   │
│   ├── monitoring/
│   │   ├── metrics.ts
│   │   ├── alerts.ts
│   │   └── health-check.ts
│   │
│   ├── api/
│   │   ├── routes/
│   │   │   ├── jobs.routes.ts
│   │   │   ├── stats.routes.ts
│   │   │   └── proxies.routes.ts
│   │   └── server.ts
│   │
│   └── types/
│       ├── job.types.ts
│       ├── property.types.ts
│       └── proxy.types.ts
│
├── tests/
│   ├── unit/
│   ├── integration/
│   └── e2e/
│
├── Dockerfile
├── docker-compose.yml
├── package.json
└── tsconfig.json

Implementacion del Browser Engine

Browser Manager

// src/core/browser/browser-manager.ts
import { chromium, Browser, BrowserContext, Page } from 'playwright';
import { addExtra } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

/**
 * Owns the single stealth-patched Chromium instance and one isolated
 * BrowserContext per scraping session (keyed by session id).
 */
export class BrowserManager {
  private browser: Browser | null = null;
  private contexts: Map<string, BrowserContext> = new Map();

  /** Launches Chromium with the stealth plugin. Must run before createContext. */
  async initialize(): Promise<void> {
    const chromiumExtra = addExtra(chromium);
    chromiumExtra.use(StealthPlugin());

    this.browser = await chromiumExtra.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--window-size=1920,1080',
      ],
    });
  }

  /**
   * Creates an isolated context for one session: MX locale/timezone, random
   * user agent, optional authenticated proxy, plus init-script patches.
   * @throws when initialize() has not been called.
   */
  async createContext(
    sessionId: string,
    proxy?: ProxyConfig
  ): Promise<BrowserContext> {
    if (!this.browser) throw new Error('Browser not initialized');

    // FIX: a repeated sessionId used to overwrite the map entry without
    // closing the previous context, leaking it until process exit.
    const stale = this.contexts.get(sessionId);
    if (stale) {
      this.contexts.delete(sessionId);
      await stale.close();
    }

    const context = await this.browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: this.getRandomUserAgent(),
      locale: 'es-MX',
      timezoneId: 'America/Mexico_City',
      proxy: proxy ? {
        server: `${proxy.address}:${proxy.port}`,
        username: proxy.username,
        password: proxy.password,
      } : undefined,
    });

    // Anti-detection patches
    await this.applyStealthPatches(context);

    this.contexts.set(sessionId, context);
    return context;
  }

  /** Closes and unregisters a single session context; no-op for unknown ids. */
  async closeContext(sessionId: string): Promise<void> {
    const context = this.contexts.get(sessionId);
    if (context) {
      this.contexts.delete(sessionId);
      await context.close();
    }
  }

  /** Init script run in every page before site JS: masks automation signals. */
  private async applyStealthPatches(context: BrowserContext): Promise<void> {
    await context.addInitScript(() => {
      // Hide webdriver
      Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
      });

      // Mock plugins
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
      });

      // Mock languages
      Object.defineProperty(navigator, 'languages', {
        get: () => ['es-MX', 'es', 'en-US', 'en'],
      });
    });
  }

  /** Uniformly random desktop UA per context; keep this list fresh. */
  private getRandomUserAgent(): string {
    const userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
      // ... more user agents
    ];
    return userAgents[Math.floor(Math.random() * userAgents.length)];
  }

  /** Closes every context and the browser; the manager is reusable afterwards. */
  async close(): Promise<void> {
    for (const context of this.contexts.values()) {
      await context.close();
    }
    // FIX: previously left stale handles behind, so reuse after close()
    // touched closed contexts and a closed browser.
    this.contexts.clear();
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }
}

Human-like Behavior

// src/core/browser/page-utils.ts
import { Page } from 'playwright';

/**
 * Static helpers that make automated interaction look human: stepwise
 * scrolling, jittered mouse movement and randomized pauses.
 */
export class PageUtils {
  /** Scrolls to the bottom of the page in small, randomly sized steps. */
  static async humanScroll(page: Page): Promise<void> {
    const totalHeight = await page.evaluate(() => document.body.scrollHeight);

    for (let offset = 0; offset < totalHeight; ) {
      // Advance 100-400px per step, pausing briefly between steps.
      offset += Math.random() * 300 + 100;
      await page.evaluate((y) => window.scrollTo(0, y), offset);
      await this.randomDelay(100, 300);
    }
  }

  /** Moves to and clicks the center of `selector` with ±5px jitter. */
  static async humanClick(page: Page, selector: string): Promise<void> {
    const element = await page.$(selector);
    if (!element) throw new Error(`Element not found: ${selector}`);

    const box = await element.boundingBox();
    if (!box) throw new Error(`Element not visible: ${selector}`);

    // Aim at the element center, offset randomly on each axis.
    const jitter = () => Math.random() * 10 - 5;
    const targetX = box.x + box.width / 2 + jitter();
    const targetY = box.y + box.height / 2 + jitter();

    await page.mouse.move(targetX, targetY, { steps: 10 });
    await this.randomDelay(50, 150);
    await page.mouse.click(targetX, targetY);
  }

  /** Resolves after a uniformly random delay in [min, max] milliseconds. */
  static async randomDelay(min: number, max: number): Promise<void> {
    const ms = min + Math.floor(Math.random() * (max - min + 1));
    await new Promise(resolve => setTimeout(resolve, ms));
  }

  /** Blocks until any Cloudflare interstitial finishes and the page settles. */
  static async waitForCloudflare(page: Page): Promise<void> {
    try {
      // The interstitial exposes #challenge-running while its JS executes.
      await page.waitForSelector('#challenge-running', {
        state: 'hidden',
        timeout: 30000,
      });
    } catch {
      // Selector never appeared or timed out: assume no challenge, continue.
    }

    // Let outstanding requests drain so lazy content exists before parsing.
    await page.waitForLoadState('networkidle');
  }
}

Base Scraper Implementation

// src/scrapers/base-scraper.ts
import { Page, BrowserContext } from 'playwright';
import { BrowserManager } from '../core/browser/browser-manager';
import { ProxyPool } from '../core/proxy/proxy-pool';
import { PageUtils } from '../core/browser/page-utils';
import { Logger } from 'pino';

/** Aggregate outcome of one scraper run across all cities/property types. */
export interface ScrapingResult {
  success: boolean; // false only when a fatal error aborted the run
  properties: RawProperty[]; // raw listings collected before normalization
  errors: ScrapingError[]; // non-fatal per-page failures (plus a 'fatal' entry on abort)
  stats: ScrapingStats; // page counts and timing for the run
}

/**
 * Template for all portal scrapers: owns the browser-session/proxy lifecycle
 * and the city x property-type iteration. Subclasses supply the
 * portal-specific listing URL, listing extraction and detail parsing.
 */
export abstract class BaseScraper {
  protected browserManager: BrowserManager;
  protected proxyPool: ProxyPool;
  protected context: BrowserContext | null = null;
  protected page: Page | null = null;

  private readonly baseLogger: Logger;
  private childLogger: Logger | null = null;

  /** Portal identifier (e.g. 'inmuebles24'); provided by each subclass. */
  abstract readonly source: string;
  abstract readonly baseUrl: string;

  constructor(
    browserManager: BrowserManager,
    proxyPool: ProxyPool,
    logger: Logger
  ) {
    this.browserManager = browserManager;
    this.proxyPool = proxyPool;
    // FIX: `this.source` is an abstract property that the subclass initializes
    // AFTER this base constructor returns, so the previous
    // `logger.child({ source: this.source })` bound `source: undefined`.
    // Keep the base logger and build the child lazily instead.
    this.baseLogger = logger;
  }

  /** Child logger tagged with the concrete scraper's source; built on first use. */
  protected get logger(): Logger {
    if (!this.childLogger) {
      this.childLogger = this.baseLogger.child({ source: this.source });
    }
    return this.childLogger;
  }

  /**
   * Runs a full scrape over every configured city and property type.
   * Never throws: fatal errors surface as `success: false` plus a 'fatal'
   * entry in `errors`. The browser session is always torn down.
   */
  async scrape(config: ScrapingConfig): Promise<ScrapingResult> {
    const stats: ScrapingStats = {
      pagesScraped: 0,
      propertiesFound: 0,
      errors: 0,
      startedAt: new Date(),
    };

    const properties: RawProperty[] = [];
    const errors: ScrapingError[] = [];

    try {
      await this.initSession();

      for (const city of config.targetCities) {
        for (const type of config.propertyTypes) {
          const result = await this.scrapeListings(city, type, config);
          properties.push(...result.properties);
          errors.push(...result.errors);
          stats.pagesScraped += result.pagesScraped;
        }
      }

      stats.propertiesFound = properties.length;
      stats.errors = errors.length;
      stats.completedAt = new Date();

      return { success: true, properties, errors, stats };

    } catch (error) {
      this.logger.error({ error }, 'Scraping failed');
      // FIX: record partial progress and end time on the failure path too;
      // previously stats were returned half-filled (no completedAt, errors=0).
      stats.propertiesFound = properties.length;
      stats.errors = errors.length + 1; // +1 for the fatal error below
      stats.completedAt = new Date();
      return {
        success: false,
        properties,
        errors: [...errors, { type: 'fatal', message: String(error) }],
        stats,
      };
    } finally {
      await this.closeSession();
    }
  }

  /** Opens a fresh browser context + page through a pool-selected proxy. */
  protected async initSession(): Promise<void> {
    const proxy = await this.proxyPool.getProxy();
    const sessionId = `${this.source}-${Date.now()}`;

    this.context = await this.browserManager.createContext(sessionId, proxy);
    this.page = await this.context.newPage();

    // Applies to all subsequent navigations/selectors on this page.
    this.page.setDefaultTimeout(30000);
  }

  /** Closes page and context; safe to call repeatedly or with no open session. */
  protected async closeSession(): Promise<void> {
    if (this.page) await this.page.close();
    if (this.context) await this.context.close();
    // Clear references so later calls cannot reuse closed handles.
    this.page = null;
    this.context = null;
  }

  /**
   * Navigates to `url`, waiting out any Cloudflare challenge.
   * Rotates the proxy between attempts.
   * @throws the last navigation error after `maxRetries` failed attempts.
   */
  protected async navigateWithRetry(
    url: string,
    maxRetries: number = 3
  ): Promise<void> {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
      try {
        await this.page!.goto(url, { waitUntil: 'domcontentloaded' });
        await PageUtils.waitForCloudflare(this.page!);
        return;
      } catch (error) {
        this.logger.warn({ url, attempt, error }, 'Navigation failed, retrying');

        if (attempt === maxRetries) throw error;

        // Fresh exit IP plus a randomized pause before the next attempt.
        await this.rotateProxy();
        await PageUtils.randomDelay(2000, 5000);
      }
    }
  }

  /** Tears down the current session and opens a new one on a fresh proxy. */
  protected async rotateProxy(): Promise<void> {
    const newProxy = await this.proxyPool.getProxy();
    await this.closeSession();

    const sessionId = `${this.source}-${Date.now()}`;
    this.context = await this.browserManager.createContext(sessionId, newProxy);
    this.page = await this.context.newPage();
    // FIX: re-apply the default timeout; the old page's setting died with it.
    this.page.setDefaultTimeout(30000);
  }

  // Abstract methods to be implemented by each source

  /** Walks the listing pages for one city/type and collects raw properties. */
  protected abstract scrapeListings(
    city: string,
    propertyType: string,
    config: ScrapingConfig
  ): Promise<ListingResult>;

  /** Extracts one property's fields from its detail page. */
  protected abstract parsePropertyDetail(
    page: Page
  ): Promise<RawProperty>;

  /** Builds the portal-specific URL for a listing results page. */
  protected abstract getListingUrl(
    city: string,
    propertyType: string,
    page: number
  ): string;
}

Proxy Pool Implementation

// src/core/proxy/proxy-pool.ts
import { Redis } from 'ioredis';

/** One proxy endpoint tracked by the pool (stored as JSON in a Redis hash). */
export interface ProxyConfig {
  id: string; // stable key within the Redis pool hash
  address: string; // host or IP, without scheme
  port: number;
  username?: string; // credentials, when the provider requires auth
  password?: string;
  type: 'residential' | 'datacenter' | 'mobile';
  country: string; // exit-node country; presumably an ISO code — TODO confirm format
  status: 'active' | 'cooling' | 'banned';
  successRate: number; // EMA in [0, 1]; doubles as the selection weight
  lastUsedAt?: Date;
  coolingUntil?: Date; // when status === 'cooling': time the proxy becomes eligible again
}

/**
 * Redis-backed pool of scraping proxies with success-rate-weighted selection,
 * cooling periods for rate-limited endpoints and permanent bans.
 * Each proxy is a JSON value in a single Redis hash (POOL_KEY).
 */
export class ProxyPool {
  private redis: Redis;
  private readonly POOL_KEY = 'proxy:pool';
  private readonly COOLING_KEY = 'proxy:cooling';

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Picks a usable proxy, favoring higher success rates, and stamps it used.
   * @throws when every proxy is banned or still cooling.
   */
  async getProxy(): Promise<ProxyConfig> {
    // Get all active proxies
    const proxies = await this.getActiveProxies();

    if (proxies.length === 0) {
      throw new Error('No active proxies available');
    }

    // Weighted selection based on success rate
    const selected = this.weightedSelection(proxies);

    // Mark as used
    await this.markUsed(selected.id);

    return selected;
  }

  /** Loads one proxy record by id, or null when the id is unknown. */
  private async getProxyById(proxyId: string): Promise<ProxyConfig | null> {
    // FIX: this method was called by markSuccess/markFailure but never defined.
    const raw = await this.redis.hget(this.POOL_KEY, proxyId);
    return raw ? (JSON.parse(raw) as ProxyConfig) : null;
  }

  /** Returns proxies that are active, or whose cooling period has elapsed. */
  private async getActiveProxies(): Promise<ProxyConfig[]> {
    const all = await this.redis.hgetall(this.POOL_KEY);
    const now = Date.now();

    return Object.values(all)
      .map(p => JSON.parse(p) as ProxyConfig)
      .filter(p => {
        if (p.status === 'banned') return false;
        if (p.status === 'cooling' && p.coolingUntil) {
          return new Date(p.coolingUntil).getTime() < now;
        }
        return p.status === 'active';
      });
  }

  /** Samples one proxy with probability proportional to its success rate. */
  private weightedSelection(proxies: ProxyConfig[]): ProxyConfig {
    // Higher success rate = higher weight
    const totalWeight = proxies.reduce((sum, p) => sum + p.successRate, 0);
    let random = Math.random() * totalWeight;

    for (const proxy of proxies) {
      random -= proxy.successRate;
      if (random <= 0) return proxy;
    }

    // All weights zero (or FP drift): fall back to the first candidate.
    return proxies[0];
  }

  /** Stamps the proxy's lastUsedAt. */
  async markUsed(proxyId: string): Promise<void> {
    // FIX: previously called this.getProxy(proxyId) — getProxy takes no
    // argument, selects a *different* proxy, and itself calls markUsed
    // (mutual recursion). Fetch the specific record instead.
    const proxy = await this.getProxyById(proxyId);
    if (proxy) {
      proxy.lastUsedAt = new Date();
      await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
    }
  }

  /** Folds a success into the proxy's EMA success rate (alpha = 0.1). */
  async markSuccess(proxyId: string): Promise<void> {
    const proxy = await this.getProxyById(proxyId);
    if (proxy) {
      // Update success rate with exponential moving average
      proxy.successRate = proxy.successRate * 0.9 + 1 * 0.1;
      await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
    }
  }

  /**
   * Folds a failure into the EMA and applies the matching penalty:
   * rate limits cool the proxy for one hour; bans retire it permanently.
   */
  async markFailure(proxyId: string, errorType: string): Promise<void> {
    const proxy = await this.getProxyById(proxyId);
    if (!proxy) return;

    // Update success rate
    proxy.successRate = proxy.successRate * 0.9 + 0 * 0.1;

    if (errorType === 'rate_limit') {
      // Put in cooling for 1 hour
      proxy.status = 'cooling';
      proxy.coolingUntil = new Date(Date.now() + 3600000);
    } else if (errorType === 'banned') {
      proxy.status = 'banned';
    }

    await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
  }

  /** Aggregate pool counts; avgSuccessRate is 0 for an empty pool (was NaN). */
  async getStats(): Promise<ProxyPoolStats> {
    const all = await this.redis.hgetall(this.POOL_KEY);
    const proxies = Object.values(all).map(p => JSON.parse(p) as ProxyConfig);

    const totalRate = proxies.reduce((sum, p) => sum + p.successRate, 0);
    return {
      total: proxies.length,
      active: proxies.filter(p => p.status === 'active').length,
      cooling: proxies.filter(p => p.status === 'cooling').length,
      banned: proxies.filter(p => p.status === 'banned').length,
      // FIX: guard the empty pool; the old division produced NaN.
      avgSuccessRate: proxies.length === 0 ? 0 : totalRate / proxies.length,
    };
  }
}

Job Queue Implementation

// src/core/queue/job-queue.ts
import { Queue, Worker, Job } from 'bullmq';
import { Redis } from 'ioredis';

/** Payload carried by every queued scraping job. */
export interface ScrapingJobData {
  id: string; // also used as the BullMQ jobId, so duplicates are deduplicated
  type: 'full_scan' | 'incremental' | 'targeted' | 'refresh';
  source: string; // portal identifier — presumably matches a scraper's `source`
  config: ScrapingConfig; // cities, property types, paging and delay settings
  createdBy?: string; // user/system that requested the job, when known
}

/**
 * Wraps the BullMQ queue + worker pair used for scraping jobs, with retry
 * and backoff defaults, plus the lifecycle operations the jobs API routes
 * rely on (list / pause / resume / cancel / status).
 */
export class JobQueue {
  private queue: Queue<ScrapingJobData>;
  private worker: Worker<ScrapingJobData>;
  private redis: Redis;

  constructor(redis: Redis, processor: JobProcessor) {
    this.redis = redis;

    this.queue = new Queue('scraping', {
      connection: redis,
      defaultJobOptions: {
        attempts: 3,
        backoff: {
          type: 'exponential',
          delay: 5000, // 5s, then 10s, then 20s between attempts
        },
        // Keep a bounded history so Redis does not grow without limit.
        removeOnComplete: 100,
        removeOnFail: 50,
      },
    });

    this.worker = new Worker(
      'scraping',
      async (job: Job<ScrapingJobData>) => {
        return processor.process(job);
      },
      {
        connection: redis,
        concurrency: 2, // matches the 2-4 worker budget in the architecture
      }
    );

    this.setupEventHandlers();
  }

  private setupEventHandlers(): void {
    // NOTE(review): console logging here; consider routing through pino for
    // consistency with the rest of the service.
    this.worker.on('completed', (job, result) => {
      console.log(`Job ${job.id} completed`, result);
    });

    this.worker.on('failed', (job, error) => {
      console.error(`Job ${job?.id} failed`, error);
    });

    this.worker.on('progress', (job, progress) => {
      console.log(`Job ${job.id} progress: ${progress}%`);
    });
  }

  /** Enqueues a job immediately; `data.id` doubles as the BullMQ jobId. */
  async addJob(data: ScrapingJobData): Promise<Job<ScrapingJobData>> {
    return this.queue.add(data.type, data, {
      jobId: data.id,
    });
  }

  /** Registers a repeating job on the given cron pattern. */
  async scheduleJob(
    data: ScrapingJobData,
    cron: string
  ): Promise<void> {
    await this.queue.add(data.type, data, {
      repeat: { pattern: cron },
      jobId: `${data.id}-scheduled`,
    });
  }

  /**
   * Lists jobs, optionally filtered by BullMQ state names.
   * FIX: the jobs router calls this, but it did not exist.
   */
  async getJobs(
    filter: { states?: string[] } = {}
  ): Promise<Job<ScrapingJobData>[]> {
    const states = filter.states ?? ['waiting', 'active', 'delayed', 'completed', 'failed'];
    // The strings above are valid BullMQ job states; align with its JobType union.
    return this.queue.getJobs(states as Parameters<Queue['getJobs']>[0]);
  }

  /**
   * Flags a job as paused via its progress payload.
   * NOTE: BullMQ cannot pause a single job; the processor must poll this
   * flag between pages and yield cooperatively.
   */
  async pauseJob(jobId: string): Promise<void> {
    const job = await this.queue.getJob(jobId);
    if (job) {
      await job.updateProgress({ status: 'paused' });
    }
  }

  /**
   * Clears the cooperative pause flag set by pauseJob.
   * FIX: the jobs router calls this, but it did not exist.
   */
  async resumeJob(jobId: string): Promise<void> {
    const job = await this.queue.getJob(jobId);
    if (job) {
      await job.updateProgress({ status: 'running' });
    }
  }

  /**
   * Removes a job from the queue entirely; no-op for unknown ids.
   * FIX: the jobs router calls this, but it did not exist.
   */
  async cancelJob(jobId: string): Promise<void> {
    const job = await this.queue.getJob(jobId);
    if (job) {
      await job.remove();
    }
  }

  /** Snapshot of a job's BullMQ state, or null when the id is unknown. */
  async getJobStatus(jobId: string): Promise<JobStatus | null> {
    const job = await this.queue.getJob(jobId);
    if (!job) return null;

    const state = await job.getState();
    return {
      id: job.id!,
      state,
      progress: job.progress,
      data: job.data,
      attemptsMade: job.attemptsMade,
      failedReason: job.failedReason,
    };
  }

  /** Gracefully stops the worker (waits for the active job) and the queue. */
  async close(): Promise<void> {
    await this.worker.close();
    await this.queue.close();
  }
}

API Endpoints

// src/api/routes/jobs.routes.ts
import { Router } from 'express';
import { z } from 'zod';

// Request-body validation for POST /jobs. Mirrors ScrapingJobData minus `id`,
// which the server generates itself.
const CreateJobSchema = z.object({
  type: z.enum(['full_scan', 'incremental', 'targeted', 'refresh']),
  source: z.string(),
  config: z.object({
    targetCities: z.array(z.string()).optional(),
    propertyTypes: z.array(z.string()).optional(),
    maxPages: z.number().optional(),
    delayMs: z.object({ // bounds for the randomized inter-request delays
      min: z.number(),
      max: z.number(),
    }).optional(),
  }),
});

/**
 * REST routes for managing scraping jobs.
 * FIX: every handler now forwards async failures to the Express error
 * middleware via next(err) — Express 4 does not catch rejected async
 * handlers on its own, so a queue/Redis error used to become an
 * unhandled promise rejection and a hung request.
 */
export function createJobsRouter(jobQueue: JobQueue): Router {
  const router = Router();

  // Create new job
  router.post('/', async (req, res, next) => {
    try {
      const parsed = CreateJobSchema.safeParse(req.body);
      if (!parsed.success) {
        return res.status(400).json({ error: parsed.error });
      }

      // FIX: suffix with randomness so two requests in the same millisecond
      // cannot collide on the BullMQ jobId.
      const jobId = `job-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
      const job = await jobQueue.addJob({
        id: jobId,
        ...parsed.data,
      });

      res.status(201).json({
        id: job.id,
        status: 'queued',
      });
    } catch (err) {
      next(err);
    }
  });

  // List jobs
  router.get('/', async (req, res, next) => {
    try {
      const jobs = await jobQueue.getJobs(req.query);
      res.json({ jobs });
    } catch (err) {
      next(err);
    }
  });

  // Get job status
  router.get('/:id', async (req, res, next) => {
    try {
      const status = await jobQueue.getJobStatus(req.params.id);
      if (!status) {
        return res.status(404).json({ error: 'Job not found' });
      }
      res.json(status);
    } catch (err) {
      next(err);
    }
  });

  // Pause job (cooperative — see JobQueue.pauseJob)
  router.post('/:id/pause', async (req, res, next) => {
    try {
      await jobQueue.pauseJob(req.params.id);
      res.json({ status: 'paused' });
    } catch (err) {
      next(err);
    }
  });

  // Resume job
  router.post('/:id/resume', async (req, res, next) => {
    try {
      await jobQueue.resumeJob(req.params.id);
      res.json({ status: 'resumed' });
    } catch (err) {
      next(err);
    }
  });

  // Cancel job
  router.delete('/:id', async (req, res, next) => {
    try {
      await jobQueue.cancelJob(req.params.id);
      res.status(204).send();
    } catch (err) {
      next(err);
    }
  });

  return router;
}

Docker Configuration

# docker-compose.yml
version: '3.8'

services:
  scraper:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      - NODE_ENV=production
      - REDIS_URL=redis://redis:6379
      - DATABASE_URL=postgresql://user:pass@postgres:5432/inmobiliaria
      - S3_ENDPOINT=http://minio:9000
      - S3_BUCKET=raw-data
    depends_on:
      - redis
      - postgres
      - minio
    deploy:
      replicas: 2
      resources:
        limits:
          memory: 2G
          cpus: '1'

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data

  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_DB: inmobiliaria
      POSTGRES_USER: user
      POSTGRES_PASSWORD: pass
    volumes:
      - postgres-data:/var/lib/postgresql/data

  minio:
    image: minio/minio
    command: server /data --console-address ":9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio-data:/data

volumes:
  redis-data:
  postgres-data:
  minio-data:

Metricas Prometheus

// src/monitoring/metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client';

// Shared Prometheus registry for all scraper metrics — presumably served by a
// /metrics endpoint elsewhere; confirm the wiring in api/server.ts.
export const register = new Registry();

// All scraper metrics, registered on the shared registry above.
export const metrics = {
  // Listings processed, partitioned by portal and outcome.
  propertiesScraped: new Counter({
    name: 'scraper_properties_total',
    help: 'Total properties scraped',
    labelNames: ['source', 'status'],
    registers: [register],
  }),

  // Per-request latency; buckets sized for the <10s p95 acceptance criterion.
  requestDuration: new Histogram({
    name: 'scraper_request_duration_seconds',
    help: 'Duration of scraping requests',
    labelNames: ['source'],
    buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
    registers: [register],
  }),

  // Currently running jobs per portal.
  activeJobs: new Gauge({
    name: 'scraper_active_jobs',
    help: 'Number of active scraping jobs',
    labelNames: ['source'],
    registers: [register],
  }),

  // Proxy counts by status (active / cooling / banned).
  proxyPoolSize: new Gauge({
    name: 'scraper_proxy_pool_size',
    help: 'Size of proxy pool by status',
    labelNames: ['status'],
    registers: [register],
  }),

  // Failures partitioned by portal and error class.
  errorsTotal: new Counter({
    name: 'scraper_errors_total',
    help: 'Total scraping errors',
    labelNames: ['source', 'error_type'],
    registers: [register],
  }),
};

Testing Strategy

// tests/integration/inmuebles24.test.ts
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { BrowserManager } from '../../src/core/browser/browser-manager';
import { Inmuebles24Scraper } from '../../src/scrapers/inmuebles24/scraper';

// NOTE(review): `mockProxyPool` and `mockLogger` are referenced below but never
// defined in this spec — they are assumed to be shared test fixtures; they must
// be defined (or imported) before this suite can run.
describe('Inmuebles24 Scraper', () => {
  let browserManager: BrowserManager;
  let scraper: Inmuebles24Scraper;

  beforeAll(async () => {
    // Launches a real stealth browser: this is an integration test, not a unit test.
    browserManager = new BrowserManager();
    await browserManager.initialize();
    scraper = new Inmuebles24Scraper(browserManager, mockProxyPool, mockLogger);
  });

  afterAll(async () => {
    await browserManager.close();
  });

  it('should extract property listings from search page', async () => {
    // Hits the live site (1 city, 1 page) — requires network access and working
    // proxies; expect flakiness whenever the portal changes its layout.
    const result = await scraper.scrape({
      targetCities: ['guadalajara'],
      propertyTypes: ['casas'],
      maxPages: 1,
    });

    expect(result.success).toBe(true);
    expect(result.properties.length).toBeGreaterThan(0);
    expect(result.properties[0]).toHaveProperty('source_id');
    expect(result.properties[0]).toHaveProperty('price');
  });

  it('should handle Cloudflare challenge', async () => {
    // Test with mock that returns challenge page
    // Verify scraper waits and retries
  });

  it('should rotate proxy on failure', async () => {
    // Test proxy rotation logic
  });
});

Criterios de Aceptacion Tecnicos

  • Bot detection tests pass (bot.sannysoft.com)
  • Scraper extracts 500+ properties without block
  • Request latency p95 < 10s
  • Memory usage < 500MB per worker
  • CPU usage < 50% average
  • Error rate < 5%
  • All unit tests pass
  • Integration tests pass

Documento: Especificacion Tecnica Motor Scraping Version: 1.0.0 Autor: Tech Lead Fecha: 2026-01-04