🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
966 lines
27 KiB
Markdown
966 lines
27 KiB
Markdown
---
id: "ET-SCR-001"
title: "Especificacion Tecnica: Motor de Scraping"
type: "Technical Specification"
epic: "IAI-007"
status: "Draft"
project: "inmobiliaria-analytics"
version: "1.0.0"
created_date: "2026-01-04"
updated_date: "2026-01-04"
---
|
|
|
|
# ET-SCR-001: Especificacion Tecnica del Motor de Scraping
|
|
|
|
---
|
|
|
|
## Resumen
|
|
|
|
Esta especificacion define la arquitectura e implementacion del motor de web scraping con capacidades anti-detection para extraer datos de portales inmobiliarios protegidos por Cloudflare.
|
|
|
|
---
|
|
|
|
## Stack Tecnologico
|
|
|
|
```yaml
|
|
runtime: Node.js 20 LTS
|
|
language: TypeScript 5.x
|
|
|
|
dependencias:
|
|
scraping:
|
|
- playwright: "^1.40.0"
|
|
- playwright-extra: "^4.3.0"
|
|
- puppeteer-extra-plugin-stealth: "^2.11.0"
|
|
- cheerio: "^1.0.0"
|
|
|
|
queue:
|
|
- bullmq: "^5.0.0"
|
|
- ioredis: "^5.3.0"
|
|
|
|
http:
|
|
- axios: "^1.6.0"
|
|
- https-proxy-agent: "^7.0.0"
|
|
|
|
utils:
|
|
- pino: "^8.0.0"
|
|
- zod: "^3.22.0"
|
|
- date-fns: "^3.0.0"
|
|
|
|
testing:
|
|
- vitest: "^1.0.0"
|
|
- msw: "^2.0.0"
|
|
```
|
|
|
|
---
|
|
|
|
## Arquitectura de Componentes
|
|
|
|
```
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ SCRAPER SERVICE │
|
|
├─────────────────────────────────────────────────────────────────┤
|
|
│ │
|
|
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
|
│ │ Scheduler │───▶│ Job Queue │───▶│ Workers │ │
|
|
│ │ (Cron/API) │ │ (BullMQ) │ │ (N=2-4) │ │
|
|
│ └──────────────┘ └──────────────┘ └──────┬───────┘ │
|
|
│ │ │
|
|
│ ┌──────────────────────┼──────────┐ │
|
|
│ │ ▼ │ │
|
|
│ ┌──────────────┐ ┌────┴─────────┐ ┌──────────────┐ │ │
|
|
│ │ Proxy │◀───│ Browser │───▶│ Parser │ │ │
|
|
│ │ Manager │ │ Engine │ │ (Cheerio) │ │ │
|
|
│ └──────────────┘ │ (Playwright) │ └──────┬───────┘ │ │
|
|
│ └──────────────┘ │ │ │
|
|
│ ▼ │ │
|
|
│ ┌──────────────┐ │ │
|
|
│ │ Normalizer │ │ │
|
|
│ └──────┬───────┘ │ │
|
|
│ │ │ │
|
|
│ Scraper Core │ │ │
|
|
│ └────────────────────┼────────────┘ │
|
|
│ ▼ │
|
|
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
|
│ │ Metrics │◀───│ Storage │───▶│ PostgreSQL │ │
|
|
│ │ (Prometheus) │ │ (S3/Local) │ │ │ │
|
|
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
|
│ │
|
|
└─────────────────────────────────────────────────────────────────┘
|
|
```
|
|
|
|
---
|
|
|
|
## Estructura de Codigo
|
|
|
|
```
|
|
apps/scraper/
|
|
├── src/
|
|
│ ├── index.ts # Entry point
|
|
│ ├── config/
|
|
│ │ ├── index.ts
|
|
│ │ ├── sources.config.ts # Configuracion por fuente
|
|
│ │ └── schedules.config.ts
|
|
│ │
|
|
│ ├── core/
|
|
│ │ ├── browser/
|
|
│ │ │ ├── browser-manager.ts
|
|
│ │ │ ├── stealth-config.ts
|
|
│ │ │ └── page-utils.ts
|
|
│ │ │
|
|
│ │ ├── proxy/
|
|
│ │ │ ├── proxy-pool.ts
|
|
│ │ │ ├── proxy-rotator.ts
|
|
│ │ │ └── proxy-health.ts
|
|
│ │ │
|
|
│ │ ├── queue/
|
|
│ │ │ ├── job-queue.ts
|
|
│ │ │ ├── job-processor.ts
|
|
│ │ │ └── job-types.ts
|
|
│ │ │
|
|
│ │ └── rate-limiter/
|
|
│ │ └── adaptive-limiter.ts
|
|
│ │
|
|
│ ├── scrapers/
|
|
│ │ ├── base-scraper.ts # Clase base abstracta
|
|
│ │ ├── inmuebles24/
|
|
│ │ │ ├── scraper.ts
|
|
│ │ │ ├── selectors.ts
|
|
│ │ │ └── mappings.ts
|
|
│ │ ├── vivanuncios/
|
|
│ │ │ ├── scraper.ts
|
|
│ │ │ ├── selectors.ts
|
|
│ │ │ └── mappings.ts
|
|
│ │ └── segundamano/
|
|
│ │ └── ...
|
|
│ │
|
|
│ ├── etl/
|
|
│ │ ├── extractor.ts
|
|
│ │ ├── transformer.ts
|
|
│ │ ├── normalizer.ts
|
|
│ │ ├── geocoder.ts
|
|
│ │ └── deduplicator.ts
|
|
│ │
|
|
│ ├── storage/
|
|
│ │ ├── raw-storage.ts # S3/MinIO
|
|
│ │ └── property-repository.ts
|
|
│ │
|
|
│ ├── monitoring/
|
|
│ │ ├── metrics.ts
|
|
│ │ ├── alerts.ts
|
|
│ │ └── health-check.ts
|
|
│ │
|
|
│ ├── api/
|
|
│ │ ├── routes/
|
|
│ │ │ ├── jobs.routes.ts
|
|
│ │ │ ├── stats.routes.ts
|
|
│ │ │ └── proxies.routes.ts
|
|
│ │ └── server.ts
|
|
│ │
|
|
│ └── types/
|
|
│ ├── job.types.ts
|
|
│ ├── property.types.ts
|
|
│ └── proxy.types.ts
|
|
│
|
|
├── tests/
|
|
│ ├── unit/
|
|
│ ├── integration/
|
|
│ └── e2e/
|
|
│
|
|
├── Dockerfile
|
|
├── docker-compose.yml
|
|
├── package.json
|
|
└── tsconfig.json
|
|
```
|
|
|
|
---
|
|
|
|
## Implementacion del Browser Engine
|
|
|
|
### Browser Manager
|
|
|
|
```typescript
|
|
// src/core/browser/browser-manager.ts
|
|
import { chromium, Browser, BrowserContext, Page } from 'playwright';
|
|
import { addExtra } from 'playwright-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
|
|
export class BrowserManager {
|
|
private browser: Browser | null = null;
|
|
private contexts: Map<string, BrowserContext> = new Map();
|
|
|
|
async initialize(): Promise<void> {
|
|
const chromiumExtra = addExtra(chromium);
|
|
chromiumExtra.use(StealthPlugin());
|
|
|
|
this.browser = await chromiumExtra.launch({
|
|
headless: true,
|
|
args: [
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-dev-shm-usage',
|
|
'--disable-accelerated-2d-canvas',
|
|
'--disable-gpu',
|
|
'--window-size=1920,1080',
|
|
],
|
|
});
|
|
}
|
|
|
|
async createContext(
|
|
sessionId: string,
|
|
proxy?: ProxyConfig
|
|
): Promise<BrowserContext> {
|
|
if (!this.browser) throw new Error('Browser not initialized');
|
|
|
|
const context = await this.browser.newContext({
|
|
viewport: { width: 1920, height: 1080 },
|
|
userAgent: this.getRandomUserAgent(),
|
|
locale: 'es-MX',
|
|
timezoneId: 'America/Mexico_City',
|
|
proxy: proxy ? {
|
|
server: `${proxy.address}:${proxy.port}`,
|
|
username: proxy.username,
|
|
password: proxy.password,
|
|
} : undefined,
|
|
});
|
|
|
|
// Anti-detection patches
|
|
await this.applyStealthPatches(context);
|
|
|
|
this.contexts.set(sessionId, context);
|
|
return context;
|
|
}
|
|
|
|
private async applyStealthPatches(context: BrowserContext): Promise<void> {
|
|
await context.addInitScript(() => {
|
|
// Hide webdriver
|
|
Object.defineProperty(navigator, 'webdriver', {
|
|
get: () => undefined,
|
|
});
|
|
|
|
// Mock plugins
|
|
Object.defineProperty(navigator, 'plugins', {
|
|
get: () => [1, 2, 3, 4, 5],
|
|
});
|
|
|
|
// Mock languages
|
|
Object.defineProperty(navigator, 'languages', {
|
|
get: () => ['es-MX', 'es', 'en-US', 'en'],
|
|
});
|
|
});
|
|
}
|
|
|
|
private getRandomUserAgent(): string {
|
|
const userAgents = [
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
|
|
// ... more user agents
|
|
];
|
|
return userAgents[Math.floor(Math.random() * userAgents.length)];
|
|
}
|
|
|
|
async close(): Promise<void> {
|
|
for (const context of this.contexts.values()) {
|
|
await context.close();
|
|
}
|
|
if (this.browser) {
|
|
await this.browser.close();
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
### Human-like Behavior
|
|
|
|
```typescript
|
|
// src/core/browser/page-utils.ts
|
|
import { Page } from 'playwright';
|
|
|
|
export class PageUtils {
|
|
static async humanScroll(page: Page): Promise<void> {
|
|
const scrollHeight = await page.evaluate(() => document.body.scrollHeight);
|
|
let currentPosition = 0;
|
|
|
|
while (currentPosition < scrollHeight) {
|
|
const scrollAmount = Math.random() * 300 + 100;
|
|
currentPosition += scrollAmount;
|
|
|
|
await page.evaluate((y) => window.scrollTo(0, y), currentPosition);
|
|
await this.randomDelay(100, 300);
|
|
}
|
|
}
|
|
|
|
static async humanClick(page: Page, selector: string): Promise<void> {
|
|
const element = await page.$(selector);
|
|
if (!element) throw new Error(`Element not found: ${selector}`);
|
|
|
|
const box = await element.boundingBox();
|
|
if (!box) throw new Error(`Element not visible: ${selector}`);
|
|
|
|
// Move to element with slight randomness
|
|
const x = box.x + box.width / 2 + (Math.random() * 10 - 5);
|
|
const y = box.y + box.height / 2 + (Math.random() * 10 - 5);
|
|
|
|
await page.mouse.move(x, y, { steps: 10 });
|
|
await this.randomDelay(50, 150);
|
|
await page.mouse.click(x, y);
|
|
}
|
|
|
|
static async randomDelay(min: number, max: number): Promise<void> {
|
|
const delay = Math.floor(Math.random() * (max - min + 1)) + min;
|
|
await new Promise(resolve => setTimeout(resolve, delay));
|
|
}
|
|
|
|
static async waitForCloudflare(page: Page): Promise<void> {
|
|
// Wait for Cloudflare challenge to complete
|
|
try {
|
|
await page.waitForSelector('#challenge-running', {
|
|
state: 'hidden',
|
|
timeout: 30000,
|
|
});
|
|
} catch {
|
|
// No challenge present, continue
|
|
}
|
|
|
|
// Additional wait for JS to fully load
|
|
await page.waitForLoadState('networkidle');
|
|
}
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Base Scraper Implementation
|
|
|
|
```typescript
|
|
// src/scrapers/base-scraper.ts
|
|
import { Page, BrowserContext } from 'playwright';
|
|
import { BrowserManager } from '../core/browser/browser-manager';
|
|
import { ProxyPool } from '../core/proxy/proxy-pool';
|
|
import { PageUtils } from '../core/browser/page-utils';
|
|
import { Logger } from 'pino';
|
|
|
|
/** Aggregate outcome of one BaseScraper.scrape() run. */
export interface ScrapingResult {
  // True when the run finished without a fatal error
  // (per-page errors may still be present in `errors`).
  success: boolean;
  // Raw listings accumulated across all cities/types.
  properties: RawProperty[];
  // Errors collected during the run, including a final 'fatal' entry on abort.
  errors: ScrapingError[];
  // Counters and timing for the run.
  stats: ScrapingStats;
}
|
|
|
|
export abstract class BaseScraper {
|
|
protected browserManager: BrowserManager;
|
|
protected proxyPool: ProxyPool;
|
|
protected logger: Logger;
|
|
protected context: BrowserContext | null = null;
|
|
protected page: Page | null = null;
|
|
|
|
abstract readonly source: string;
|
|
abstract readonly baseUrl: string;
|
|
|
|
constructor(
|
|
browserManager: BrowserManager,
|
|
proxyPool: ProxyPool,
|
|
logger: Logger
|
|
) {
|
|
this.browserManager = browserManager;
|
|
this.proxyPool = proxyPool;
|
|
this.logger = logger.child({ source: this.source });
|
|
}
|
|
|
|
async scrape(config: ScrapingConfig): Promise<ScrapingResult> {
|
|
const stats: ScrapingStats = {
|
|
pagesScraped: 0,
|
|
propertiesFound: 0,
|
|
errors: 0,
|
|
startedAt: new Date(),
|
|
};
|
|
|
|
const properties: RawProperty[] = [];
|
|
const errors: ScrapingError[] = [];
|
|
|
|
try {
|
|
await this.initSession();
|
|
|
|
for (const city of config.targetCities) {
|
|
for (const type of config.propertyTypes) {
|
|
const result = await this.scrapeListings(city, type, config);
|
|
properties.push(...result.properties);
|
|
errors.push(...result.errors);
|
|
stats.pagesScraped += result.pagesScraped;
|
|
}
|
|
}
|
|
|
|
stats.propertiesFound = properties.length;
|
|
stats.errors = errors.length;
|
|
stats.completedAt = new Date();
|
|
|
|
return { success: true, properties, errors, stats };
|
|
|
|
} catch (error) {
|
|
this.logger.error({ error }, 'Scraping failed');
|
|
return {
|
|
success: false,
|
|
properties,
|
|
errors: [...errors, { type: 'fatal', message: String(error) }],
|
|
stats,
|
|
};
|
|
} finally {
|
|
await this.closeSession();
|
|
}
|
|
}
|
|
|
|
protected async initSession(): Promise<void> {
|
|
const proxy = await this.proxyPool.getProxy();
|
|
const sessionId = `${this.source}-${Date.now()}`;
|
|
|
|
this.context = await this.browserManager.createContext(sessionId, proxy);
|
|
this.page = await this.context.newPage();
|
|
|
|
// Set default timeout
|
|
this.page.setDefaultTimeout(30000);
|
|
}
|
|
|
|
protected async closeSession(): Promise<void> {
|
|
if (this.page) await this.page.close();
|
|
if (this.context) await this.context.close();
|
|
}
|
|
|
|
protected async navigateWithRetry(
|
|
url: string,
|
|
maxRetries: number = 3
|
|
): Promise<void> {
|
|
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
try {
|
|
await this.page!.goto(url, { waitUntil: 'domcontentloaded' });
|
|
await PageUtils.waitForCloudflare(this.page!);
|
|
return;
|
|
} catch (error) {
|
|
this.logger.warn({ url, attempt, error }, 'Navigation failed, retrying');
|
|
|
|
if (attempt === maxRetries) throw error;
|
|
|
|
// Rotate proxy on failure
|
|
await this.rotateProxy();
|
|
await PageUtils.randomDelay(2000, 5000);
|
|
}
|
|
}
|
|
}
|
|
|
|
protected async rotateProxy(): Promise<void> {
|
|
const newProxy = await this.proxyPool.getProxy();
|
|
await this.closeSession();
|
|
|
|
const sessionId = `${this.source}-${Date.now()}`;
|
|
this.context = await this.browserManager.createContext(sessionId, newProxy);
|
|
this.page = await this.context.newPage();
|
|
}
|
|
|
|
// Abstract methods to be implemented by each source
|
|
protected abstract scrapeListings(
|
|
city: string,
|
|
propertyType: string,
|
|
config: ScrapingConfig
|
|
): Promise<ListingResult>;
|
|
|
|
protected abstract parsePropertyDetail(
|
|
page: Page
|
|
): Promise<RawProperty>;
|
|
|
|
protected abstract getListingUrl(
|
|
city: string,
|
|
propertyType: string,
|
|
page: number
|
|
): string;
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Proxy Pool Implementation
|
|
|
|
```typescript
|
|
// src/core/proxy/proxy-pool.ts
|
|
import { Redis } from 'ioredis';
|
|
|
|
/** A single proxy endpoint plus its health/rotation bookkeeping. */
export interface ProxyConfig {
  id: string;
  // Host (IP or DNS name), without scheme.
  address: string;
  port: number;
  // Credentials, when the provider requires authentication.
  username?: string;
  password?: string;
  type: 'residential' | 'datacenter' | 'mobile';
  country: string;
  // 'cooling' = temporarily rested after a rate limit; 'banned' = never reused.
  status: 'active' | 'cooling' | 'banned';
  // Exponential moving average of request outcomes; used as a selection weight.
  successRate: number;
  lastUsedAt?: Date;
  // When a 'cooling' proxy becomes eligible for selection again.
  coolingUntil?: Date;
}
|
|
|
|
export class ProxyPool {
|
|
private redis: Redis;
|
|
private readonly POOL_KEY = 'proxy:pool';
|
|
private readonly COOLING_KEY = 'proxy:cooling';
|
|
|
|
constructor(redis: Redis) {
|
|
this.redis = redis;
|
|
}
|
|
|
|
async getProxy(): Promise<ProxyConfig> {
|
|
// Get all active proxies
|
|
const proxies = await this.getActiveProxies();
|
|
|
|
if (proxies.length === 0) {
|
|
throw new Error('No active proxies available');
|
|
}
|
|
|
|
// Weighted selection based on success rate
|
|
const selected = this.weightedSelection(proxies);
|
|
|
|
// Mark as used
|
|
await this.markUsed(selected.id);
|
|
|
|
return selected;
|
|
}
|
|
|
|
private async getActiveProxies(): Promise<ProxyConfig[]> {
|
|
const all = await this.redis.hgetall(this.POOL_KEY);
|
|
const now = Date.now();
|
|
|
|
return Object.values(all)
|
|
.map(p => JSON.parse(p) as ProxyConfig)
|
|
.filter(p => {
|
|
if (p.status === 'banned') return false;
|
|
if (p.status === 'cooling' && p.coolingUntil) {
|
|
return new Date(p.coolingUntil).getTime() < now;
|
|
}
|
|
return p.status === 'active';
|
|
});
|
|
}
|
|
|
|
private weightedSelection(proxies: ProxyConfig[]): ProxyConfig {
|
|
// Higher success rate = higher weight
|
|
const totalWeight = proxies.reduce((sum, p) => sum + p.successRate, 0);
|
|
let random = Math.random() * totalWeight;
|
|
|
|
for (const proxy of proxies) {
|
|
random -= proxy.successRate;
|
|
if (random <= 0) return proxy;
|
|
}
|
|
|
|
return proxies[0];
|
|
}
|
|
|
|
async markUsed(proxyId: string): Promise<void> {
|
|
const proxy = await this.getProxy(proxyId);
|
|
if (proxy) {
|
|
proxy.lastUsedAt = new Date();
|
|
await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
|
|
}
|
|
}
|
|
|
|
async markSuccess(proxyId: string): Promise<void> {
|
|
const proxy = await this.getProxyById(proxyId);
|
|
if (proxy) {
|
|
// Update success rate with exponential moving average
|
|
proxy.successRate = proxy.successRate * 0.9 + 1 * 0.1;
|
|
await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
|
|
}
|
|
}
|
|
|
|
async markFailure(proxyId: string, errorType: string): Promise<void> {
|
|
const proxy = await this.getProxyById(proxyId);
|
|
if (!proxy) return;
|
|
|
|
// Update success rate
|
|
proxy.successRate = proxy.successRate * 0.9 + 0 * 0.1;
|
|
|
|
if (errorType === 'rate_limit') {
|
|
// Put in cooling for 1 hour
|
|
proxy.status = 'cooling';
|
|
proxy.coolingUntil = new Date(Date.now() + 3600000);
|
|
} else if (errorType === 'banned') {
|
|
proxy.status = 'banned';
|
|
}
|
|
|
|
await this.redis.hset(this.POOL_KEY, proxyId, JSON.stringify(proxy));
|
|
}
|
|
|
|
async getStats(): Promise<ProxyPoolStats> {
|
|
const all = await this.redis.hgetall(this.POOL_KEY);
|
|
const proxies = Object.values(all).map(p => JSON.parse(p) as ProxyConfig);
|
|
|
|
return {
|
|
total: proxies.length,
|
|
active: proxies.filter(p => p.status === 'active').length,
|
|
cooling: proxies.filter(p => p.status === 'cooling').length,
|
|
banned: proxies.filter(p => p.status === 'banned').length,
|
|
avgSuccessRate: proxies.reduce((sum, p) => sum + p.successRate, 0) / proxies.length,
|
|
};
|
|
}
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Job Queue Implementation
|
|
|
|
```typescript
|
|
// src/core/queue/job-queue.ts
|
|
import { Queue, Worker, Job } from 'bullmq';
|
|
import { Redis } from 'ioredis';
|
|
|
|
/** Payload carried by each BullMQ scraping job. */
export interface ScrapingJobData {
  // Unique job id; also used as the BullMQ jobId (see JobQueue.addJob).
  id: string;
  // Scan strategy — presumably full_scan = all pages, incremental = new
  // listings only, targeted = specific areas, refresh = re-visit known
  // listings; confirm against the job processor.
  type: 'full_scan' | 'incremental' | 'targeted' | 'refresh';
  // Portal identifier, e.g. 'inmuebles24'.
  source: string;
  config: ScrapingConfig;
  // Optional user/service that requested the job.
  createdBy?: string;
}
|
|
|
|
export class JobQueue {
|
|
private queue: Queue<ScrapingJobData>;
|
|
private worker: Worker<ScrapingJobData>;
|
|
private redis: Redis;
|
|
|
|
constructor(redis: Redis, processor: JobProcessor) {
|
|
this.redis = redis;
|
|
|
|
this.queue = new Queue('scraping', {
|
|
connection: redis,
|
|
defaultJobOptions: {
|
|
attempts: 3,
|
|
backoff: {
|
|
type: 'exponential',
|
|
delay: 5000,
|
|
},
|
|
removeOnComplete: 100,
|
|
removeOnFail: 50,
|
|
},
|
|
});
|
|
|
|
this.worker = new Worker(
|
|
'scraping',
|
|
async (job: Job<ScrapingJobData>) => {
|
|
return processor.process(job);
|
|
},
|
|
{
|
|
connection: redis,
|
|
concurrency: 2,
|
|
}
|
|
);
|
|
|
|
this.setupEventHandlers();
|
|
}
|
|
|
|
private setupEventHandlers(): void {
|
|
this.worker.on('completed', (job, result) => {
|
|
console.log(`Job ${job.id} completed`, result);
|
|
});
|
|
|
|
this.worker.on('failed', (job, error) => {
|
|
console.error(`Job ${job?.id} failed`, error);
|
|
});
|
|
|
|
this.worker.on('progress', (job, progress) => {
|
|
console.log(`Job ${job.id} progress: ${progress}%`);
|
|
});
|
|
}
|
|
|
|
async addJob(data: ScrapingJobData): Promise<Job<ScrapingJobData>> {
|
|
return this.queue.add(data.type, data, {
|
|
jobId: data.id,
|
|
});
|
|
}
|
|
|
|
async scheduleJob(
|
|
data: ScrapingJobData,
|
|
cron: string
|
|
): Promise<void> {
|
|
await this.queue.add(data.type, data, {
|
|
repeat: { pattern: cron },
|
|
jobId: `${data.id}-scheduled`,
|
|
});
|
|
}
|
|
|
|
async pauseJob(jobId: string): Promise<void> {
|
|
const job = await this.queue.getJob(jobId);
|
|
if (job) {
|
|
await job.updateProgress({ status: 'paused' });
|
|
}
|
|
}
|
|
|
|
async getJobStatus(jobId: string): Promise<JobStatus | null> {
|
|
const job = await this.queue.getJob(jobId);
|
|
if (!job) return null;
|
|
|
|
const state = await job.getState();
|
|
return {
|
|
id: job.id!,
|
|
state,
|
|
progress: job.progress,
|
|
data: job.data,
|
|
attemptsMade: job.attemptsMade,
|
|
failedReason: job.failedReason,
|
|
};
|
|
}
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## API Endpoints
|
|
|
|
```typescript
|
|
// src/api/routes/jobs.routes.ts
|
|
import { Router } from 'express';
|
|
import { z } from 'zod';
|
|
|
|
// Request-body schema for POST /jobs; validated with zod before enqueueing.
const CreateJobSchema = z.object({
  type: z.enum(['full_scan', 'incremental', 'targeted', 'refresh']),
  source: z.string(),
  config: z.object({
    // All filters optional — omitted fields presumably fall back to the
    // source's defaults at processing time; confirm against the processor.
    targetCities: z.array(z.string()).optional(),
    propertyTypes: z.array(z.string()).optional(),
    maxPages: z.number().optional(),
    // Per-request delay window in milliseconds.
    delayMs: z.object({
      min: z.number(),
      max: z.number(),
    }).optional(),
  }),
});
|
|
|
|
export function createJobsRouter(jobQueue: JobQueue): Router {
|
|
const router = Router();
|
|
|
|
// Create new job
|
|
router.post('/', async (req, res) => {
|
|
const parsed = CreateJobSchema.safeParse(req.body);
|
|
if (!parsed.success) {
|
|
return res.status(400).json({ error: parsed.error });
|
|
}
|
|
|
|
const jobId = `job-${Date.now()}`;
|
|
const job = await jobQueue.addJob({
|
|
id: jobId,
|
|
...parsed.data,
|
|
});
|
|
|
|
res.status(201).json({
|
|
id: job.id,
|
|
status: 'queued',
|
|
});
|
|
});
|
|
|
|
// List jobs
|
|
router.get('/', async (req, res) => {
|
|
const jobs = await jobQueue.getJobs(req.query);
|
|
res.json({ jobs });
|
|
});
|
|
|
|
// Get job status
|
|
router.get('/:id', async (req, res) => {
|
|
const status = await jobQueue.getJobStatus(req.params.id);
|
|
if (!status) {
|
|
return res.status(404).json({ error: 'Job not found' });
|
|
}
|
|
res.json(status);
|
|
});
|
|
|
|
// Pause job
|
|
router.post('/:id/pause', async (req, res) => {
|
|
await jobQueue.pauseJob(req.params.id);
|
|
res.json({ status: 'paused' });
|
|
});
|
|
|
|
// Resume job
|
|
router.post('/:id/resume', async (req, res) => {
|
|
await jobQueue.resumeJob(req.params.id);
|
|
res.json({ status: 'resumed' });
|
|
});
|
|
|
|
// Cancel job
|
|
router.delete('/:id', async (req, res) => {
|
|
await jobQueue.cancelJob(req.params.id);
|
|
res.status(204).send();
|
|
});
|
|
|
|
return router;
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Docker Configuration
|
|
|
|
```yaml
|
|
# docker-compose.yml
|
|
version: '3.8'
|
|
|
|
services:
|
|
scraper:
|
|
build:
|
|
context: .
|
|
dockerfile: Dockerfile
|
|
environment:
|
|
- NODE_ENV=production
|
|
- REDIS_URL=redis://redis:6379
|
|
- DATABASE_URL=postgresql://user:pass@postgres:5432/inmobiliaria
|
|
- S3_ENDPOINT=http://minio:9000
|
|
- S3_BUCKET=raw-data
|
|
depends_on:
|
|
- redis
|
|
- postgres
|
|
- minio
|
|
deploy:
|
|
replicas: 2
|
|
resources:
|
|
limits:
|
|
memory: 2G
|
|
cpus: '1'
|
|
|
|
redis:
|
|
image: redis:7-alpine
|
|
volumes:
|
|
- redis-data:/data
|
|
|
|
postgres:
|
|
image: postgres:16-alpine
|
|
environment:
|
|
POSTGRES_DB: inmobiliaria
|
|
POSTGRES_USER: user
|
|
POSTGRES_PASSWORD: pass
|
|
volumes:
|
|
- postgres-data:/var/lib/postgresql/data
|
|
|
|
minio:
|
|
image: minio/minio
|
|
command: server /data --console-address ":9001"
|
|
environment:
|
|
MINIO_ROOT_USER: minioadmin
|
|
MINIO_ROOT_PASSWORD: minioadmin
|
|
volumes:
|
|
- minio-data:/data
|
|
|
|
volumes:
|
|
redis-data:
|
|
postgres-data:
|
|
minio-data:
|
|
```
|
|
|
|
---
|
|
|
|
## Metricas Prometheus
|
|
|
|
```typescript
|
|
// src/monitoring/metrics.ts
|
|
import { Registry, Counter, Histogram, Gauge } from 'prom-client';
|
|
|
|
// Single registry so the /metrics endpoint can expose every scraper
// instrument in one scrape.
export const register = new Registry();

// Prometheus instruments for the scraper service.
export const metrics = {
  // Count of properties extracted, labeled by portal and outcome.
  propertiesScraped: new Counter({
    name: 'scraper_properties_total',
    help: 'Total properties scraped',
    labelNames: ['source', 'status'],
    registers: [register],
  }),

  // Latency distribution of scraping requests per portal.
  requestDuration: new Histogram({
    name: 'scraper_request_duration_seconds',
    help: 'Duration of scraping requests',
    labelNames: ['source'],
    buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
    registers: [register],
  }),

  // Jobs currently being processed, per portal.
  activeJobs: new Gauge({
    name: 'scraper_active_jobs',
    help: 'Number of active scraping jobs',
    labelNames: ['source'],
    registers: [register],
  }),

  // Proxy pool composition (active / cooling / banned).
  proxyPoolSize: new Gauge({
    name: 'scraper_proxy_pool_size',
    help: 'Size of proxy pool by status',
    labelNames: ['status'],
    registers: [register],
  }),

  // Error counter by portal and error category.
  errorsTotal: new Counter({
    name: 'scraper_errors_total',
    help: 'Total scraping errors',
    labelNames: ['source', 'error_type'],
    registers: [register],
  }),
};
|
|
```
|
|
|
|
---
|
|
|
|
## Testing Strategy
|
|
|
|
```typescript
|
|
// tests/integration/inmuebles24.test.ts
|
|
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
|
import { BrowserManager } from '../../src/core/browser/browser-manager';
|
|
import { Inmuebles24Scraper } from '../../src/scrapers/inmuebles24/scraper';
|
|
|
|
describe('Inmuebles24 Scraper', () => {
|
|
let browserManager: BrowserManager;
|
|
let scraper: Inmuebles24Scraper;
|
|
|
|
beforeAll(async () => {
|
|
browserManager = new BrowserManager();
|
|
await browserManager.initialize();
|
|
scraper = new Inmuebles24Scraper(browserManager, mockProxyPool, mockLogger);
|
|
});
|
|
|
|
afterAll(async () => {
|
|
await browserManager.close();
|
|
});
|
|
|
|
it('should extract property listings from search page', async () => {
|
|
const result = await scraper.scrape({
|
|
targetCities: ['guadalajara'],
|
|
propertyTypes: ['casas'],
|
|
maxPages: 1,
|
|
});
|
|
|
|
expect(result.success).toBe(true);
|
|
expect(result.properties.length).toBeGreaterThan(0);
|
|
expect(result.properties[0]).toHaveProperty('source_id');
|
|
expect(result.properties[0]).toHaveProperty('price');
|
|
});
|
|
|
|
it('should handle Cloudflare challenge', async () => {
|
|
// Test with mock that returns challenge page
|
|
// Verify scraper waits and retries
|
|
});
|
|
|
|
it('should rotate proxy on failure', async () => {
|
|
// Test proxy rotation logic
|
|
});
|
|
});
|
|
```
|
|
|
|
---
|
|
|
|
## Criterios de Aceptacion Tecnicos
|
|
|
|
- [ ] Bot detection tests pass (bot.sannysoft.com)
|
|
- [ ] Scraper extracts 500+ properties without block
|
|
- [ ] Request latency p95 < 10s
|
|
- [ ] Memory usage < 500MB per worker
|
|
- [ ] CPU usage < 50% average
|
|
- [ ] Error rate < 5%
|
|
- [ ] All unit tests pass
|
|
- [ ] Integration tests pass
|
|
|
|
---
|
|
|
|
**Documento:** Especificacion Tecnica Motor Scraping
|
|
**Version:** 1.0.0
|
|
**Autor:** Tech Lead
|
|
**Fecha:** 2026-01-04
|