---
id: "ET-SCR-003"
title: "Especificacion Tecnica - Gestion de Pool de Proxies"
type: "Technical Specification"
epic: "IAI-007"
status: "Draft"
version: "1.0"
project: "inmobiliaria-analytics"
created_date: "2026-01-04"
updated_date: "2026-01-04"
---
|
|
|
|
# ET-SCR-003: Gestion de Pool de Proxies
|
|
|
|
---
|
|
|
|
## 1. Resumen
|
|
|
|
Sistema de gestion de proxies residenciales y datacenter para rotacion automatica, evitar bloqueos IP, y mantener tasas de exito altas en el scraping.
|
|
|
|
---
|
|
|
|
## 2. Arquitectura del Sistema de Proxies
|
|
|
|
```
┌─────────────────────────────────────────────────────────────┐
│                        PROXY MANAGER                        │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐       │
│  │ Residential  │  │  Datacenter  │  │    Mobile    │       │
│  │    Pool      │  │    Pool      │  │    Pool      │       │
│  │  (Premium)   │  │   (Backup)   │  │  (Reserved)  │       │
│  └──────┬───────┘  └──────┬───────┘  └──────┬───────┘       │
│         │                 │                 │               │
│         └────────────┬────┴─────────────────┘               │
│                      │                                      │
│              ┌───────▼───────┐                              │
│              │   Selector    │                              │
│              │    Engine     │                              │
│              └───────┬───────┘                              │
│                      │                                      │
│    ┌─────────────────┼─────────────────┐                    │
│    │                 │                 │                    │
│    ▼                 ▼                 ▼                    │
│ ┌──────┐          ┌──────┐        ┌──────────┐              │
│ │Health│          │ Geo  │        │ Cooldown │              │
│ │Check │          │Filter│        │ Manager  │              │
│ └──────┘          └──────┘        └──────────┘              │
│                                                             │
└─────────────────────────────────────────────────────────────┘
                               │
                               ▼
                        ┌─────────────┐
                        │   Browser   │
                        │   Manager   │
                        └─────────────┘
```
|
|
|
|
---
|
|
|
|
## 3. Proveedores de Proxies
|
|
|
|
### 3.1 Configuracion de Proveedores
|
|
|
|
```yaml
|
|
# config/proxies.yml
# Provider definitions, ordered by priority (1 = preferred).
providers:
  brightdata:
    type: residential
    priority: 1
    endpoint: "brd.superproxy.io"
    port: 22225
    username: "${BRIGHTDATA_USER}"
    password: "${BRIGHTDATA_PASS}"
    geo:
      country: "mx"
      city: "guadalajara"
    sticky_session: true
    session_duration: 600  # seconds (10 minutes)
    monthly_bandwidth: "100GB"
    cost_per_gb: 15  # USD

  smartproxy:
    type: residential
    priority: 2
    endpoint: "mx.smartproxy.com"
    port: 10000
    username: "${SMARTPROXY_USER}"
    password: "${SMARTPROXY_PASS}"
    geo:
      country: "mx"
    rotation: "per_request"
    monthly_bandwidth: "50GB"
    cost_per_gb: 12

  datacenter_pool:
    type: datacenter
    priority: 3
    proxies:
      - host: "proxy1.example.com"
        port: 3128
      - host: "proxy2.example.com"
        port: 3128
    auth:
      username: "${DC_PROXY_USER}"
      password: "${DC_PROXY_PASS}"
    cost_per_request: 0.001

settings:
  default_provider: "brightdata"
  # Names must match the keys under `providers`.
  fallback_chain: ["brightdata", "smartproxy", "datacenter_pool"]
  max_failures_before_switch: 3
  cooldown_after_block: 300  # seconds (5 minutes)
  health_check_interval: 60  # seconds (1 minute)
|
|
```
|
|
|
|
### 3.2 Tipos de Proxy y Uso
|
|
|
|
| Tipo | Uso Principal | Costo | Tasa Exito |
|------|--------------|-------|------------|
| Residential | Sitios con anti-bot agresivo | Alto | 95%+ |
| Datacenter | Sitios simples, backup | Bajo | 70-80% |
| Mobile | Casos especiales, Cloudflare | Muy Alto | 98%+ |
|
|
|
|
---
|
|
|
|
## 4. Implementacion
|
|
|
|
### 4.1 Interfaz de Proxy
|
|
|
|
```typescript
|
|
// src/proxy/types.ts
|
|
/** Schemes a proxy endpoint can be addressed with. */
type ProxyProtocol = 'http' | 'https' | 'socks5';

/**
 * Connection details for a single proxy endpoint.
 */
export interface ProxyConfig {
  /** Hostname or IP address of the proxy. */
  host: string;
  /** TCP port of the proxy. */
  port: number;
  /** Optional user name for proxy authentication. */
  username?: string;
  /** Optional password for proxy authentication. */
  password?: string;
  /** Scheme used when addressing the proxy. */
  protocol: ProxyProtocol;
}
|
|
|
|
/** Category of network a proxy exits from. */
type ProxyKind = 'residential' | 'datacenter' | 'mobile';

/** Lifecycle state of a pooled proxy. */
type ProxyStatus = 'active' | 'cooling' | 'blocked' | 'inactive';

/** Geographic location attributed to a proxy exit node. */
interface ProxyGeo {
  country: string;
  city?: string;
  region?: string;
}

/**
 * A proxy endpoint together with the bookkeeping the pool keeps about it:
 * identity, origin, location, usage metrics and current state.
 */
export interface ProxyWithMetadata extends ProxyConfig {
  /** Pool-wide identifier of this proxy. */
  id: string;
  /** Name of the provider that supplies this proxy. */
  provider: string;
  type: ProxyKind;
  geo: ProxyGeo;

  // Metrics
  stats: ProxyStats;

  // State
  status: ProxyStatus;
  /** When the proxy was last handed out, or null if never. */
  lastUsed: Date | null;
  /** End of the current cooldown window, or null when none is set. */
  cooldownUntil: Date | null;
}
|
|
|
|
/**
 * Usage counters and timing figures tracked per proxy.
 */
export interface ProxyStats {
  /** Number of requests reported for this proxy, successful or not. */
  totalRequests: number;
  /** Requests reported as successful. */
  successfulRequests: number;
  /** Requests reported as failed. */
  failedRequests: number;
  /** Failures that were classified as blocks. */
  blockedRequests: number;
  /** Average request latency, in milliseconds. */
  avgLatencyMs: number;
  /** Bandwidth consumed, in megabytes. */
  bandwidthUsedMb: number;
  /** Time of the most recent success, or null if none was recorded. */
  lastSuccess: Date | null;
  /** Time of the most recent failure, or null if none was recorded. */
  lastFailure: Date | null;
}
|
|
|
|
/**
 * Outcome of a pool lookup: the proxy that was picked and, for sticky
 * sessions, the session identifier to pass back on later calls.
 */
export interface ProxySelection {
  proxy: ProxyWithMetadata;
  sessionId?: string;
}

/**
 * Options a provider accepts when asked for a proxy.
 * Mirrors the options taken by `BrightDataProvider.getProxy`.
 */
export interface ProxyProviderOptions {
  country?: string;
  city?: string;
  sessionId?: string;
  sticky?: boolean;
}

/**
 * Contract implemented by provider adapters. The provider modules import
 * this name from this file, so it has to be declared here.
 * NOTE(review): the shape is derived from BrightDataProvider only — confirm
 * it also fits SmartProxyProvider and DatacenterProxyProvider.
 */
export interface ProxyProvider {
  getProxy(options?: ProxyProviderOptions): ProxyConfig;
}
|
|
```
|
|
|
|
### 4.2 Proxy Pool Manager
|
|
|
|
```typescript
|
|
// src/proxy/pool-manager.ts
|
|
import { Redis } from 'ioredis';
|
|
import { ProxyWithMetadata, ProxyConfig, ProxySelection } from './types';
|
|
import { ProxyHealthChecker } from './health-checker';
|
|
import { Logger } from '../utils/logger';
|
|
|
|
/**
 * Redis-backed proxy pool: picks a proxy for a target domain, tracks
 * success/failure statistics and applies per-domain blocks and cooldowns.
 *
 * Redis key layout used by this class:
 * - `proxy:<id>`                  hash   proxy record (host, port, status, lastUsed, ...)
 * - `proxy:stats:<id>`            hash   counters and timestamps
 * - `proxy:latency:<id>`          list   most recent latencies (ms)
 * - `proxy:blocked:<id>:<domain>` string expiring per-domain block marker
 * - `proxy:session:<sessionId>`   string proxy id bound to a sticky session
 */
export class ProxyPoolManager {
  /** Matches proxy record keys only (`proxy:<id>`), not the sub-keys listed above. */
  private static readonly PROXY_KEY_PATTERN = /^proxy:[a-z0-9]+$/;
  /** How long a proxy stays blocked for a single domain. */
  private static readonly DOMAIN_BLOCK_MINUTES = 30;
  /** Number of simultaneously blocked domains that triggers a general cooldown. */
  private static readonly GLOBAL_COOLDOWN_DOMAIN_THRESHOLD = 3;
  /** Length of the general cooldown. */
  private static readonly GLOBAL_COOLDOWN_MS = 60 * 60 * 1000;
  /** Lifetime of a sticky session (10 minutes). */
  private static readonly STICKY_SESSION_TTL_SECONDS = 600;
  /** Number of latency samples kept for the rolling average. */
  private static readonly LATENCY_WINDOW = 100;
  /** Batch-size hint for SCAN. */
  private static readonly SCAN_COUNT = 200;

  private redis: Redis;
  private healthChecker: ProxyHealthChecker;
  private logger: Logger;
  private providers: Map<string, ProxyProvider>;

  constructor() {
    // Without REDIS_URL, ioredis falls back to its default local connection.
    const redisUrl = process.env.REDIS_URL;
    this.redis = redisUrl ? new Redis(redisUrl) : new Redis();
    this.healthChecker = new ProxyHealthChecker();
    this.logger = new Logger('ProxyPool');
    this.providers = new Map();

    this.initializeProviders();
  }

  /**
   * Registers the provider adapters under the names used in
   * `config/proxies.yml` (`fallback_chain`).
   * NOTE(review): the provider classes are not imported in this snippet;
   * they must be brought into scope by the module.
   */
  private initializeProviders(): void {
    // Bright Data
    this.providers.set('brightdata', new BrightDataProvider({
      endpoint: process.env.BRIGHTDATA_ENDPOINT!,
      username: process.env.BRIGHTDATA_USER!,
      password: process.env.BRIGHTDATA_PASS!,
    }));

    // SmartProxy
    this.providers.set('smartproxy', new SmartProxyProvider({
      endpoint: process.env.SMARTPROXY_ENDPOINT!,
      username: process.env.SMARTPROXY_USER!,
      password: process.env.SMARTPROXY_PASS!,
    }));

    // Datacenter pool — key matches `datacenter_pool` in the fallback chain.
    this.providers.set('datacenter_pool', new DatacenterProxyProvider({
      proxies: JSON.parse(process.env.DC_PROXIES || '[]'),
    }));
  }

  /**
   * Selects a proxy for `targetDomain`.
   *
   * @param options.targetDomain  Domain the proxy will be used against.
   * @param options.preferredType Restrict candidates to this proxy type.
   * @param options.requireFresh  Penalise proxies used in the last 30 minutes.
   * @param options.stickySession Reuse (or create) a sticky session.
   * @param options.sessionId     Existing sticky session to reuse.
   * @returns The chosen proxy and, for sticky sessions, the session id.
   * @throws Error when no proxy is available for the domain.
   */
  async getProxy(options: {
    targetDomain: string;
    preferredType?: 'residential' | 'datacenter' | 'mobile';
    requireFresh?: boolean;
    stickySession?: boolean;
    sessionId?: string;
  }): Promise<ProxySelection> {
    const { targetDomain, preferredType, requireFresh, stickySession, sessionId } = options;

    // 1. Reuse the active sticky session, if there is one.
    if (stickySession && sessionId) {
      const existingProxy = await this.getStickySession(sessionId);
      if (existingProxy) {
        return { proxy: existingProxy, sessionId };
      }
    }

    // 2. Collect candidates.
    const candidates = await this.getCandidates({
      domain: targetDomain,
      type: preferredType,
      excludeCooling: true,
      excludeBlocked: true,
    });

    if (candidates.length === 0) {
      throw new Error(`No proxies available for ${targetDomain}`);
    }

    // 3. Pick the best one.
    const selected = this.selectBestProxy(candidates, {
      requireFresh,
      domain: targetDomain,
    });

    // 4. Open a sticky session when requested.
    let newSessionId = sessionId;
    if (stickySession) {
      newSessionId = await this.createStickySession(selected);
    }

    // 5. Record the usage.
    await this.markUsed(selected.id);

    this.logger.debug(`Selected proxy ${selected.id} for ${targetDomain}`);

    return { proxy: selected, sessionId: newSessionId };
  }

  /**
   * Returns the proxies eligible for `options.domain`.
   *
   * - `active` proxies are eligible.
   * - `cooling` proxies are eligible once `cooldownUntil` has passed, or
   *   when `excludeCooling` is false.
   * - Any other status is never eligible.
   * - With `excludeBlocked`, proxies holding a `proxy:blocked:<id>:<domain>`
   *   marker are dropped.
   */
  private async getCandidates(options: {
    domain: string;
    type?: string;
    excludeCooling: boolean;
    excludeBlocked: boolean;
  }): Promise<ProxyWithMetadata[]> {
    const allProxies = await this.getAllProxies();
    const now = new Date();

    const eligible = allProxies.filter(proxy => {
      if (options.type && proxy.type !== options.type) {
        return false;
      }
      if (proxy.status === 'active') {
        return true;
      }
      if (proxy.status === 'cooling') {
        const cooldownExpired = proxy.cooldownUntil !== null && proxy.cooldownUntil <= now;
        return cooldownExpired || !options.excludeCooling;
      }
      return false;
    });

    if (!options.excludeBlocked || eligible.length === 0) {
      return eligible;
    }

    // The block marker expires on its own, so existence means "still blocked".
    const blockedFlags = await Promise.all(
      eligible.map(proxy => this.redis.exists(`proxy:blocked:${proxy.id}:${options.domain}`)),
    );
    return eligible.filter((_, index) => !blockedFlags[index]);
  }

  /**
   * Scores every candidate and returns one of the three best at random, so
   * that the same proxy is not picked every time.
   *
   * @throws Error when `candidates` is empty.
   */
  private selectBestProxy(
    candidates: ProxyWithMetadata[],
    options: { requireFresh?: boolean; domain: string }
  ): ProxyWithMetadata {
    const ranked = candidates
      .map(proxy => ({ proxy, score: this.scoreProxy(proxy, options.requireFresh === true) }))
      .sort((a, b) => b.score - a.score);

    const topN = ranked.slice(0, 3);
    const choice = topN[Math.floor(Math.random() * topN.length)];
    if (!choice) {
      throw new Error(`No proxies available for ${options.domain}`);
    }
    return choice.proxy;
  }

  /** Computes the selection score of a proxy; higher is better. */
  private scoreProxy(proxy: ProxyWithMetadata, requireFresh: boolean): number {
    let score = 100;

    if (proxy.lastUsed) {
      const minutesSinceUse = (Date.now() - proxy.lastUsed.getTime()) / 60000;
      // Penalise recent use, fading out over five minutes.
      if (minutesSinceUse < 5) {
        score -= (5 - minutesSinceUse) * 10;
      }
      // Heavier penalty when the caller asked for a fresh proxy.
      if (requireFresh && minutesSinceUse < 30) {
        score -= 50;
      }
    }

    // Bonus for success rate; proxies without history count as 50%.
    const { totalRequests, successfulRequests, avgLatencyMs } = proxy.stats;
    const successRate = totalRequests > 0 ? successfulRequests / totalRequests : 0.5;
    score += successRate * 20;

    // Penalise slow proxies.
    if (avgLatencyMs > 2000) {
      score -= 10;
    }

    // Residential proxies are preferred.
    if (proxy.type === 'residential') {
      score += 15;
    }

    return score;
  }

  /** Records a successful request and folds its latency into the rolling average. */
  async reportSuccess(proxyId: string, domain: string, latencyMs: number): Promise<void> {
    const key = `proxy:stats:${proxyId}`;

    await this.redis.multi()
      .hincrby(key, 'totalRequests', 1)
      .hincrby(key, 'successfulRequests', 1)
      .hset(key, 'lastSuccess', Date.now().toString())
      .exec();

    await this.updateAvgLatency(proxyId, latencyMs);

    this.logger.debug(`Proxy ${proxyId} success on ${domain} (${latencyMs}ms)`);
  }

  /**
   * Records a failed request. Each failure increments `failedRequests`
   * exactly once; blocks additionally increment `blockedRequests` and
   * trigger the block handling.
   */
  async reportFailure(
    proxyId: string,
    domain: string,
    error: Error,
    isBlock: boolean = false
  ): Promise<void> {
    const key = `proxy:stats:${proxyId}`;

    const tx = this.redis.multi()
      .hincrby(key, 'totalRequests', 1)
      .hincrby(key, 'failedRequests', 1);
    if (isBlock) {
      tx.hincrby(key, 'blockedRequests', 1);
    }
    await tx.hset(key, 'lastFailure', Date.now().toString()).exec();

    if (isBlock) {
      await this.handleBlock(proxyId, domain);
    }

    this.logger.warn(`Proxy ${proxyId} failed on ${domain}: ${error.message}`);
  }

  /**
   * Marks the proxy as blocked for `domain` and, once it is blocked on
   * enough domains at the same time, puts it into a general cooldown.
   */
  private async handleBlock(proxyId: string, domain: string): Promise<void> {
    const ttlSeconds = ProxyPoolManager.DOMAIN_BLOCK_MINUTES * 60;
    const blockedUntil = Date.now() + ttlSeconds * 1000;

    await this.redis.set(
      `proxy:blocked:${proxyId}:${domain}`,
      blockedUntil.toString(),
      'EX',
      ttlSeconds
    );

    const blockedDomains = await this.scanKeys(`proxy:blocked:${proxyId}:*`);

    if (blockedDomains.length >= ProxyPoolManager.GLOBAL_COOLDOWN_DOMAIN_THRESHOLD) {
      await this.redis.hset(
        `proxy:${proxyId}`,
        'status', 'cooling',
        'cooldownUntil', (Date.now() + ProxyPoolManager.GLOBAL_COOLDOWN_MS).toString()
      );

      this.logger.warn(`Proxy ${proxyId} put in cooling (blocked on ${blockedDomains.length} domains)`);
    }
  }

  /** Resolves the proxy bound to a sticky session, or null if it expired. */
  private async getStickySession(sessionId: string): Promise<ProxyWithMetadata | null> {
    const proxyId = await this.redis.get(`proxy:session:${sessionId}`);
    if (!proxyId) return null;

    return this.getProxyById(proxyId);
  }

  /** Binds a new session id to `proxy` for the sticky-session lifetime. */
  private async createStickySession(proxy: ProxyWithMetadata): Promise<string> {
    const sessionId = `sess_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;

    await this.redis.setex(
      `proxy:session:${sessionId}`,
      ProxyPoolManager.STICKY_SESSION_TTL_SECONDS,
      proxy.id
    );

    return sessionId;
  }

  /** Stamps the proxy record with the current time. */
  private async markUsed(proxyId: string): Promise<void> {
    await this.redis.hset(`proxy:${proxyId}`, 'lastUsed', Date.now().toString());
  }

  /**
   * Adds a latency sample and rewrites `avgLatencyMs` from the most recent
   * samples. Negative or non-finite samples are ignored so they cannot
   * corrupt the average.
   */
  private async updateAvgLatency(proxyId: string, latencyMs: number): Promise<void> {
    if (!Number.isFinite(latencyMs) || latencyMs < 0) {
      return;
    }

    const key = `proxy:latency:${proxyId}`;

    await this.redis.lpush(key, latencyMs.toString());
    await this.redis.ltrim(key, 0, ProxyPoolManager.LATENCY_WINDOW - 1);

    const samples = (await this.redis.lrange(key, 0, -1))
      .map(Number)
      .filter(Number.isFinite);
    if (samples.length === 0) {
      return;
    }
    const avg = samples.reduce((sum, value) => sum + value, 0) / samples.length;

    await this.redis.hset(`proxy:stats:${proxyId}`, 'avgLatencyMs', Math.round(avg).toString());
  }

  /** Loads every proxy record found in Redis. */
  private async getAllProxies(): Promise<ProxyWithMetadata[]> {
    const keys = await this.scanKeys('proxy:*');
    const ids = keys
      .filter(key => ProxyPoolManager.PROXY_KEY_PATTERN.test(key))
      .map(key => key.slice('proxy:'.length));

    const loaded = await Promise.all(ids.map(id => this.getProxyById(id)));
    return loaded.filter((proxy): proxy is ProxyWithMetadata => proxy !== null);
  }

  /** Loads one proxy record together with its stats hash; null if absent. */
  private async getProxyById(id: string): Promise<ProxyWithMetadata | null> {
    const [data, stats] = await Promise.all([
      this.redis.hgetall(`proxy:${id}`),
      this.redis.hgetall(`proxy:stats:${id}`),
    ]);
    if (!data.host) return null;
    return this.parseProxyData({ ...data, id: data.id || id }, stats);
  }

  /**
   * Converts the raw Redis hashes into a typed proxy record.
   *
   * @param data      Fields of `proxy:<id>`.
   * @param statsHash Fields of `proxy:stats:<id>`; they take precedence over
   *                  a JSON `stats` field embedded in `data`.
   */
  private parseProxyData(
    data: Record<string, string>,
    statsHash: Record<string, string> = {}
  ): ProxyWithMetadata {
    return {
      id: data.id,
      host: data.host,
      port: parseInt(data.port, 10),
      username: data.username,
      password: data.password,
      protocol: data.protocol as ProxyWithMetadata['protocol'],
      provider: data.provider,
      type: data.type as ProxyWithMetadata['type'],
      geo: JSON.parse(data.geo || '{}'),
      stats: this.parseStats(data.stats, statsHash),
      status: data.status as ProxyWithMetadata['status'],
      lastUsed: data.lastUsed ? new Date(parseInt(data.lastUsed, 10)) : null,
      cooldownUntil: data.cooldownUntil ? new Date(parseInt(data.cooldownUntil, 10)) : null,
    };
  }

  /** Builds the stats object; missing counters become 0 and missing dates null. */
  private parseStats(
    embeddedJson: string | undefined,
    statsHash: Record<string, string>
  ): ProxyWithMetadata['stats'] {
    const embedded = JSON.parse(embeddedJson || '{}') as Record<string, unknown>;
    const rawOf = (field: string): unknown => statsHash[field] ?? embedded[field];

    const numberOf = (field: string): number => {
      const raw = rawOf(field);
      const value = Number(raw);
      return raw != null && Number.isFinite(value) ? value : 0;
    };
    const dateOf = (field: string): Date | null => {
      const raw = rawOf(field);
      if (raw == null || raw === '') return null;
      const epoch = Number(raw);
      const parsed = Number.isFinite(epoch) ? new Date(epoch) : new Date(String(raw));
      return Number.isNaN(parsed.getTime()) ? null : parsed;
    };

    return {
      totalRequests: numberOf('totalRequests'),
      successfulRequests: numberOf('successfulRequests'),
      failedRequests: numberOf('failedRequests'),
      blockedRequests: numberOf('blockedRequests'),
      avgLatencyMs: numberOf('avgLatencyMs'),
      bandwidthUsedMb: numberOf('bandwidthUsedMb'),
      lastSuccess: dateOf('lastSuccess'),
      lastFailure: dateOf('lastFailure'),
    };
  }

  /**
   * Collects all keys matching `pattern` with incremental SCAN calls instead
   * of a single blocking KEYS call. SCAN may return a key more than once,
   * hence the Set.
   */
  private async scanKeys(pattern: string): Promise<string[]> {
    const found = new Set<string>();
    let cursor = '0';
    do {
      const [nextCursor, batch] = await this.redis.scan(
        cursor,
        'MATCH',
        pattern,
        'COUNT',
        ProxyPoolManager.SCAN_COUNT
      );
      for (const key of batch) {
        found.add(key);
      }
      cursor = nextCursor;
    } while (cursor !== '0');
    return [...found];
  }
}
|
|
```
|
|
|
|
### 4.3 Health Checker
|
|
|
|
```typescript
|
|
// src/proxy/health-checker.ts
|
|
import { ProxyWithMetadata } from './types';
|
|
import fetch from 'node-fetch';
|
|
import { HttpsProxyAgent } from 'https-proxy-agent';
|
|
|
|
/**
 * Probes proxies by fetching a public "what is my IP" endpoint through them.
 */
export class ProxyHealthChecker {
  /** Upper bound for a single probe, including reading the body. */
  private static readonly REQUEST_TIMEOUT_MS = 10000;
  /** Proxies probed in parallel by `checkBatch`. */
  private static readonly BATCH_CONCURRENCY = 10;

  // Only the first URL is probed at the moment.
  private testUrls = [
    'https://httpbin.org/ip',
    'https://api.ipify.org?format=json',
    'https://www.google.com.mx',
  ];

  /**
   * Sends one request through `proxy` and reports whether it succeeded.
   *
   * @returns `healthy` plus the elapsed time; `detectedIp` is the exit IP
   *          echoed by the test endpoint, when available; `error` carries
   *          the HTTP status or exception message on failure.
   * NOTE(review): the agent is an HttpsProxyAgent, so `socks5` proxies are
   * presumably always reported unhealthy — confirm and add a SOCKS agent if
   * those proxies are in use.
   */
  async checkProxy(proxy: ProxyWithMetadata): Promise<{
    healthy: boolean;
    latencyMs: number;
    detectedIp: string | null;
    error?: string;
  }> {
    const agent = new HttpsProxyAgent(this.buildProxyUrl(proxy));

    // Abort-based timeout: works regardless of whether the fetch
    // implementation still honours a `timeout` option.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), ProxyHealthChecker.REQUEST_TIMEOUT_MS);

    const startTime = Date.now();
    const elapsed = (): number => Date.now() - startTime;

    try {
      const response = await fetch(this.testUrls[0], {
        agent,
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
      });

      if (!response.ok) {
        return {
          healthy: false,
          latencyMs: elapsed(),
          detectedIp: null,
          error: `HTTP ${response.status}`,
        };
      }

      const data = await response.json() as { origin?: string; ip?: string };

      return {
        healthy: true,
        latencyMs: elapsed(),
        detectedIp: data.origin || data.ip || null,
      };
    } catch (error: unknown) {
      return {
        healthy: false,
        latencyMs: elapsed(),
        detectedIp: null,
        error: error instanceof Error ? error.message : String(error),
      };
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Checks many proxies, a fixed number at a time.
   *
   * @returns Map from proxy id to its `healthy` flag.
   */
  async checkBatch(proxies: ProxyWithMetadata[]): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const chunk of this.chunkArray(proxies, ProxyHealthChecker.BATCH_CONCURRENCY)) {
      const outcomes = await Promise.all(chunk.map(proxy => this.checkProxy(proxy)));
      chunk.forEach((proxy, index) => {
        results.set(proxy.id, outcomes[index].healthy);
      });
    }

    return results;
  }

  /**
   * Builds the proxy URL. Credentials are percent-encoded so that characters
   * such as `@`, `:` or `/` in a user name or password cannot break the URL.
   */
  private buildProxyUrl(proxy: ProxyWithMetadata): string {
    const auth = proxy.username && proxy.password
      ? `${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /** Splits `array` into consecutive slices of at most `size` elements. */
  private chunkArray<T>(array: T[], size: number): T[][] {
    const chunks: T[][] = [];
    for (let i = 0; i < array.length; i += size) {
      chunks.push(array.slice(i, i + size));
    }
    return chunks;
  }
}
|
|
```
|
|
|
|
### 4.4 Bright Data Provider
|
|
|
|
```typescript
|
|
// src/proxy/providers/brightdata.provider.ts
|
|
import { ProxyProvider, ProxyConfig } from '../types';
|
|
|
|
/** Settings for the Bright Data adapter. */
export interface BrightDataProviderConfig {
  /** Host name of the Bright Data super-proxy. */
  endpoint: string;
  /** Base account user name; targeting flags are appended to it. */
  username: string;
  password: string;
  zone?: string;
  /** Super-proxy port; defaults to 22225. */
  port?: number;
}

/**
 * Adapter that produces proxy configurations for Bright Data. Targeting
 * (country, city, session, mobile zone, carrier) is expressed by appending
 * `-key-value` flags to the user name.
 */
export class BrightDataProvider implements ProxyProvider {
  private static readonly DEFAULT_PORT = 22225;

  private config: BrightDataProviderConfig;

  constructor(config: BrightDataProviderConfig) {
    this.config = config;
  }

  /**
   * Builds a proxy configuration.
   *
   * @param options.country   Appended as `-country-<value>`.
   * @param options.city      Appended as `-city-<value>`.
   * @param options.sessionId Appended as `-session-<value>`, only when `sticky` is set.
   * @param options.sticky    Enables the session flag.
   */
  getProxy(options?: {
    country?: string;
    city?: string;
    sessionId?: string;
    sticky?: boolean;
  }): ProxyConfig {
    const flags: string[] = [];

    if (options?.country) {
      flags.push(`-country-${options.country}`);
    }
    if (options?.city) {
      flags.push(`-city-${options.city}`);
    }
    if (options?.sticky && options?.sessionId) {
      flags.push(`-session-${options.sessionId}`);
    }

    return this.buildConfig(this.config.username + flags.join(''));
  }

  /**
   * Residential proxy for a country (and optionally a city). With `sticky`,
   * a fresh session id is generated so consecutive requests share an exit IP.
   */
  async getResidentialProxy(options: {
    country: string;
    city?: string;
    sticky?: boolean;
  }): Promise<ProxyConfig> {
    const sessionId = options.sticky
      ? `sess_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`
      : undefined;

    return this.getProxy({
      country: options.country,
      city: options.city,
      sessionId,
      sticky: options.sticky,
    });
  }

  /** Mobile-zone proxy for a country, optionally pinned to a carrier. */
  async getMobileProxy(options: {
    country: string;
    carrier?: string;
  }): Promise<ProxyConfig> {
    let username = `${this.config.username}-zone-mobile-country-${options.country}`;

    if (options.carrier) {
      username += `-carrier-${options.carrier}`;
    }

    return this.buildConfig(username);
  }

  /** Wraps a fully-built user name into a proxy configuration. */
  private buildConfig(username: string): ProxyConfig {
    return {
      host: this.config.endpoint,
      port: this.config.port ?? BrightDataProvider.DEFAULT_PORT,
      username,
      password: this.config.password,
      protocol: 'http',
    };
  }
}
|
|
```
|
|
|
|
---
|
|
|
|
## 5. Rotacion Inteligente
|
|
|
|
### 5.1 Estrategias de Rotacion
|
|
|
|
```typescript
|
|
// src/proxy/rotation-strategies.ts
|
|
import { ProxyWithMetadata, ProxySelection } from './types';
|
|
import { ProxyPoolManager } from './pool-manager';
|
|
|
|
/**
 * A policy that decides which proxy the next request should use.
 */
export interface RotationStrategy {
  /** Identifier of the strategy. */
  name: string;

  /**
   * Picks the proxy for the next request.
   *
   * @param pool    Pool to draw a new proxy from when rotating.
   * @param context State of the current scraping session.
   */
  selectProxy(
    pool: ProxyPoolManager,
    context: RotationContext,
  ): Promise<ProxySelection>;
}
|
|
|
|
/**
 * Session state handed to a rotation strategy on every selection.
 */
export interface RotationContext {
  /** Domain the next request targets. */
  domain: string;
  /** Number of requests counted so far. */
  requestCount: number;
  /** Proxy used by the previous request, if any. */
  lastProxy?: ProxyWithMetadata;
  /** Start time of the current session, if known. */
  sessionStart?: Date;
}
|
|
|
|
/**
 * Strategy: rotate every N requests.
 *
 * A new proxy is requested whenever `requestCount` is a multiple of `n`
 * (including 0) or when there is no previous proxy; otherwise the previous
 * proxy is kept.
 */
export class EveryNRequestsStrategy implements RotationStrategy {
  name = 'every_n_requests';
  private n: number;

  /**
   * @param n Rotation interval in requests; must be a positive integer.
   * @throws RangeError for zero, negative or fractional values, which would
   *         otherwise silently disable rotation.
   */
  constructor(n: number = 10) {
    if (!Number.isInteger(n) || n < 1) {
      throw new RangeError(`Rotation interval must be a positive integer, got ${n}`);
    }
    this.n = n;
  }

  async selectProxy(
    pool: ProxyPoolManager,
    context: RotationContext
  ): Promise<ProxySelection> {
    const shouldRotate = context.requestCount % this.n === 0;

    if (!shouldRotate && context.lastProxy) {
      return { proxy: context.lastProxy };
    }

    return pool.getProxy({
      targetDomain: context.domain,
      requireFresh: true,
    });
  }
}
|
|
|
|
/**
 * Strategy: rotate by elapsed time.
 *
 * Keeps the previous proxy while the session is younger than the configured
 * interval; otherwise asks the pool for a sticky-session proxy.
 */
export class TimeBasedStrategy implements RotationStrategy {
  name = 'time_based';
  private intervalMs: number;

  /** @param intervalMinutes Minutes a proxy is kept before rotating. */
  constructor(intervalMinutes: number = 10) {
    this.intervalMs = intervalMinutes * 60 * 1000;
  }

  async selectProxy(
    pool: ProxyPoolManager,
    context: RotationContext
  ): Promise<ProxySelection> {
    const { domain, lastProxy, sessionStart } = context;

    // Without a known session start the proxy is always considered stale.
    if (lastProxy && sessionStart) {
      const ageMs = Date.now() - sessionStart.getTime();
      if (ageMs < this.intervalMs) {
        return { proxy: lastProxy };
      }
    }

    return pool.getProxy({
      targetDomain: domain,
      stickySession: true,
    });
  }
}
|
|
|
|
/**
 * Strategy: weighted round robin.
 *
 * Placeholder: weighting by success rate is not implemented yet, so every
 * call simply delegates to the pool's own selection for the domain.
 */
export class WeightedRoundRobinStrategy implements RotationStrategy {
  name = 'weighted_round_robin';
  // Reserved for the round-robin cursor; not read anywhere yet.
  private currentIndex = 0;

  async selectProxy(
    pool: ProxyPoolManager,
    context: RotationContext
  ): Promise<ProxySelection> {
    const { domain } = context;
    return pool.getProxy({ targetDomain: domain });
  }
}
|
|
|
|
/**
 * Strategy: adaptive, driven by request outcomes.
 *
 * Callers report outcomes through `recordSuccess` / `recordFailure`. After
 * two consecutive failures the next selection forces a fresh proxy and
 * resets the counter; otherwise the previous proxy is kept, or a
 * sticky-session proxy is requested when there is none.
 */
export class AdaptiveStrategy implements RotationStrategy {
  name = 'adaptive';
  private failureThreshold = 2;
  private consecutiveFailures = 0;

  async selectProxy(
    pool: ProxyPoolManager,
    context: RotationContext
  ): Promise<ProxySelection> {
    const tooManyFailures = this.consecutiveFailures >= this.failureThreshold;

    if (!tooManyFailures) {
      // Healthy enough: stay on the current proxy when there is one.
      return context.lastProxy
        ? { proxy: context.lastProxy }
        : pool.getProxy({
            targetDomain: context.domain,
            stickySession: true,
          });
    }

    // Too many failures in a row: start over with a fresh proxy.
    this.consecutiveFailures = 0;
    return pool.getProxy({
      targetDomain: context.domain,
      requireFresh: true,
    });
  }

  /** Clears the failure streak. */
  recordSuccess(): void {
    this.consecutiveFailures = 0;
  }

  /** Extends the failure streak by one. */
  recordFailure(): void {
    this.consecutiveFailures += 1;
  }
}
|
|
```
|
|
|
|
---
|
|
|
|
## 6. Integracion con Playwright
|
|
|
|
```typescript
|
|
// src/proxy/playwright-integration.ts
|
|
import { Browser, BrowserContext, Page } from 'playwright';
|
|
import { ProxyPoolManager } from './pool-manager';
|
|
import { ProxyWithMetadata } from './types';
|
|
|
|
/**
 * Glue between the proxy pool and Playwright: creates browser contexts that
 * go through a pooled proxy and feeds request outcomes back to the pool.
 */
export class PlaywrightProxyIntegration {
  private proxyPool: ProxyPoolManager;

  constructor() {
    this.proxyPool = new ProxyPoolManager();
  }

  /**
   * Creates a browser context routed through a sticky-session proxy chosen
   * for `options.domain`.
   *
   * @returns The context, the proxy it uses and the sticky session id.
   * @throws Error when the pool returns no session id.
   */
  async createContextWithProxy(
    browser: Browser,
    options: {
      domain: string;
      preferredType?: 'residential' | 'datacenter';
      userAgent?: string;
    }
  ): Promise<{
    context: BrowserContext;
    proxy: ProxyWithMetadata;
    sessionId: string;
  }> {
    const { proxy, sessionId } = await this.proxyPool.getProxy({
      targetDomain: options.domain,
      preferredType: options.preferredType,
      stickySession: true,
    });

    if (!sessionId) {
      throw new Error(`Proxy pool returned no session id for ${options.domain}`);
    }

    const context = await browser.newContext({
      proxy: {
        server: `${proxy.protocol}://${proxy.host}:${proxy.port}`,
        username: proxy.username,
        password: proxy.password,
      },
      userAgent: options.userAgent || this.getRandomUserAgent(),
      viewport: { width: 1920, height: 1080 },
      locale: 'es-MX',
      timezoneId: 'America/Mexico_City',
    });

    return { context, proxy, sessionId };
  }

  /**
   * Subscribes to the page's network events and reports each outcome to the
   * pool: failed requests, block-like statuses (403/429/503) and successful
   * responses (2xx/3xx).
   *
   * Reporting errors are logged and swallowed so that a Redis hiccup inside
   * an event handler cannot surface as an unhandled rejection.
   *
   * @returns The same `page`, for chaining.
   */
  async wrapPageWithProxyHandling(
    page: Page,
    proxy: ProxyWithMetadata,
    domain: string
  ): Promise<Page> {
    page.on('requestfailed', (request) => {
      this.reportFailedRequest(request.failure(), proxy, domain)
        .catch((err: unknown) => this.logReportingError(proxy, err));
    });

    page.on('response', (response) => {
      this.reportResponse(response, proxy, domain)
        .catch((err: unknown) => this.logReportingError(proxy, err));
    });

    return page;
  }

  /** Reports a network-level failure, flagging it as a block when it looks like one. */
  private async reportFailedRequest(
    failure: { errorText: string } | null,
    proxy: ProxyWithMetadata,
    domain: string
  ): Promise<void> {
    if (!failure) {
      return;
    }
    await this.proxyPool.reportFailure(
      proxy.id,
      domain,
      new Error(failure.errorText),
      this.isBlockError(failure.errorText)
    );
  }

  /**
   * Reports a received response. Latency is measured from request start to
   * response start: at the time of the `response` event the response end is
   * not known yet. Timing values that are unavailable (negative) yield 0.
   */
  private async reportResponse(
    response: {
      status(): number;
      request(): { timing(): { requestStart: number; responseStart: number } };
    },
    proxy: ProxyWithMetadata,
    domain: string
  ): Promise<void> {
    const status = response.status();

    if (status === 403 || status === 429 || status === 503) {
      await this.proxyPool.reportFailure(
        proxy.id,
        domain,
        new Error(`HTTP ${status}`),
        true
      );
      return;
    }

    if (status >= 200 && status < 400) {
      const { requestStart, responseStart } = response.request().timing();
      const latencyMs = requestStart >= 0 && responseStart >= requestStart
        ? responseStart - requestStart
        : 0;
      await this.proxyPool.reportSuccess(proxy.id, domain, latencyMs);
    }
  }

  /** Logs a failure to report an outcome; never throws. */
  private logReportingError(proxy: ProxyWithMetadata, err: unknown): void {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Failed to report outcome for proxy ${proxy.id}: ${reason}`);
  }

  /** Case-insensitive check of an error text against known block markers. */
  private isBlockError(errorText: string): boolean {
    const blockPatterns = [
      'net::ERR_PROXY_CONNECTION_FAILED',
      'net::ERR_TUNNEL_CONNECTION_FAILED',
      'Cloudflare',
      'Access Denied',
      'blocked',
    ];

    const haystack = errorText.toLowerCase();
    return blockPatterns.some(pattern => haystack.includes(pattern.toLowerCase()));
  }

  /** Returns one of a small set of desktop user agents at random. */
  private getRandomUserAgent(): string {
    const userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    ];

    return userAgents[Math.floor(Math.random() * userAgents.length)];
  }
}
|
|
```
|
|
|
|
---
|
|
|
|
## 7. Dashboard de Monitoreo
|
|
|
|
### 7.1 Metricas Prometheus
|
|
|
|
```typescript
|
|
// src/proxy/metrics.ts
|
|
import { Counter, Gauge, Histogram } from 'prom-client';
|
|
|
|
/**
 * Prometheus instruments for the proxy subsystem, grouped by kind.
 * NOTE(review): `success_rate` is a histogram of ratios; confirm that this,
 * rather than a gauge, is what the dashboards expect.
 */
export const proxyMetrics = {
  // --- Counters ---

  /** Requests sent through proxies, by provider, proxy type and outcome. */
  requests_total: new Counter({
    labelNames: ['provider', 'type', 'status'],
    help: 'Total proxy requests',
    name: 'proxy_requests_total',
  }),

  /** Blocks detected, by provider and target domain. */
  blocks_total: new Counter({
    labelNames: ['provider', 'domain'],
    help: 'Total proxy blocks detected',
    name: 'proxy_blocks_total',
  }),

  /** Proxy rotations, by the reason that triggered them. */
  rotations_total: new Counter({
    labelNames: ['reason'],
    help: 'Total proxy rotations',
    name: 'proxy_rotations_total',
  }),

  // --- Gauges ---

  /** Proxies currently active, by provider and proxy type. */
  active_proxies: new Gauge({
    labelNames: ['provider', 'type'],
    help: 'Number of active proxies',
    name: 'proxy_active_count',
  }),

  /** Proxies currently cooling down, by provider. */
  cooling_proxies: new Gauge({
    labelNames: ['provider'],
    help: 'Number of proxies in cooling period',
    name: 'proxy_cooling_count',
  }),

  /** Bandwidth consumed in megabytes, by provider. */
  bandwidth_used_mb: new Gauge({
    labelNames: ['provider'],
    help: 'Bandwidth used in MB',
    name: 'proxy_bandwidth_used_mb',
  }),

  // --- Histograms ---

  /** Request latency in seconds, by provider. */
  latency_seconds: new Histogram({
    buckets: [0.1, 0.5, 1, 2, 5, 10],
    labelNames: ['provider'],
    help: 'Proxy request latency',
    name: 'proxy_latency_seconds',
  }),

  /** Observed success ratios (0.5 to 1.0 buckets), by provider. */
  success_rate: new Histogram({
    buckets: [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1.0],
    labelNames: ['provider'],
    help: 'Proxy success rate',
    name: 'proxy_success_rate',
  }),
};
|
|
```
|
|
|
|
### 7.2 API de Estado
|
|
|
|
```typescript
|
|
// src/proxy/routes.ts
|
|
import { Router } from 'express';
|
|
import { ProxyPoolManager } from './pool-manager';
|
|
|
|
/**
 * HTTP endpoints exposing pool status and manual controls.
 *
 * Every handler forwards failures to Express via `next(err)`; a rejected
 * promise in a bare async handler would otherwise leave the request hanging.
 *
 * NOTE(review): `getPoolStatus`, `getProxyDetails`, `resetProxyStats` and
 * `setCooldown` are not part of the ProxyPoolManager listing in section 4.2
 * and still need to be implemented there.
 */
const router = Router();
const pool = new ProxyPoolManager();

// GET /api/proxies/status — aggregated view of the whole pool.
router.get('/status', async (req, res, next) => {
  try {
    const stats = await pool.getPoolStatus();

    res.json({
      overview: {
        totalProxies: stats.total,
        activeProxies: stats.active,
        coolingProxies: stats.cooling,
        blockedProxies: stats.blocked,
        avgSuccessRate: stats.avgSuccessRate,
        avgLatencyMs: stats.avgLatencyMs,
      },
      byProvider: stats.byProvider,
      byType: stats.byType,
      recentBlocks: stats.recentBlocks,
      bandwidthUsage: stats.bandwidthUsage,
    });
  } catch (err) {
    next(err);
  }
});

// GET /api/proxies/:id — details of one proxy, 404 when unknown.
router.get('/:id', async (req, res, next) => {
  try {
    const proxy = await pool.getProxyDetails(req.params.id);

    if (!proxy) {
      res.status(404).json({ error: 'Proxy not found' });
      return;
    }

    res.json(proxy);
  } catch (err) {
    next(err);
  }
});

// POST /api/proxies/:id/reset — clears the statistics of one proxy.
router.post('/:id/reset', async (req, res, next) => {
  try {
    await pool.resetProxyStats(req.params.id);
    res.json({ success: true });
  } catch (err) {
    next(err);
  }
});

// POST /api/proxies/:id/cooldown — body: { minutes?: number }, default 30.
router.post('/:id/cooldown', async (req, res, next) => {
  try {
    // `req.body` is undefined when no body parser ran for this request.
    const minutes = Number(req.body?.minutes ?? 30);

    if (!Number.isFinite(minutes) || minutes <= 0) {
      res.status(400).json({ error: 'minutes must be a positive number' });
      return;
    }

    await pool.setCooldown(req.params.id, minutes);
    res.json({ success: true });
  } catch (err) {
    next(err);
  }
});

export default router;
|
|
```
|
|
|
|
---
|
|
|
|
## 8. Costos y Presupuesto
|
|
|
|
```yaml
|
|
# config/proxy-budget.yml
monthly_budget:
  total_usd: 500

  # USD per proxy type; the three values add up to total_usd.
  allocation:
    residential: 400  # 80%
    datacenter: 50    # 10%
    mobile: 50        # 10% (reserve)

  # Fractions of the budget at which alerts fire.
  alerts:
    warning_threshold: 0.7   # 70% of the budget
    critical_threshold: 0.9  # 90% of the budget

  actions_on_limit:
    warning:
      - reduce_concurrency
      - prefer_datacenter
    critical:
      - pause_non_essential
      - alert_admin

# Estimated cost per request (USD) for each target site.
cost_per_request:
  inmuebles24: 0.02     # hard site
  metros_cubicos: 0.01  # easy site
  vivanuncios: 0.015    # medium site
|
|
```
|
|
|
|
---
|
|
|
|
## 9. Tests
|
|
|
|
```typescript
|
|
// src/proxy/__tests__/pool-manager.test.ts
|
|
import { ProxyPoolManager } from '../pool-manager';
|
|
import { Redis } from 'ioredis';
|
|
|
|
// In-memory stand-in for ioredis. An automock would return `undefined` from
// every command, so the manager could never find a proxy. Only the commands
// the manager issues are implemented; data is shared between instances, like
// a real server.
jest.mock('ioredis', () => {
  const strings = new Map<string, string>();
  const hashes = new Map<string, Record<string, string>>();
  const lists = new Map<string, string[]>();

  const allKeys = (): string[] => [...strings.keys(), ...hashes.keys(), ...lists.keys()];
  const globToRegExp = (glob: string): RegExp => {
    const escaped = glob.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*');
    return new RegExp('^' + escaped + '$');
  };
  const hashOf = (key: string): Record<string, string> => {
    let hash = hashes.get(key);
    if (!hash) {
      hash = {};
      hashes.set(key, hash);
    }
    return hash;
  };

  class Redis {
    async flushall(): Promise<'OK'> {
      strings.clear();
      hashes.clear();
      lists.clear();
      return 'OK';
    }
    async keys(pattern: string): Promise<string[]> {
      const matcher = globToRegExp(pattern);
      return allKeys().filter(key => matcher.test(key));
    }
    async scan(_cursor: string, _match: string, pattern: string): Promise<[string, string[]]> {
      return ['0', await this.keys(pattern)];
    }
    async exists(key: string): Promise<number> {
      return allKeys().includes(key) ? 1 : 0;
    }
    async get(key: string): Promise<string | null> {
      return strings.get(key) ?? null;
    }
    async set(key: string, value: string): Promise<'OK'> {
      strings.set(key, value);
      return 'OK';
    }
    async setex(key: string, _ttlSeconds: number, value: string): Promise<'OK'> {
      strings.set(key, value);
      return 'OK';
    }
    async hgetall(key: string): Promise<Record<string, string>> {
      return { ...(hashes.get(key) ?? {}) };
    }
    async hset(key: string, ...fieldsAndValues: string[]): Promise<number> {
      const hash = hashOf(key);
      for (let i = 0; i + 1 < fieldsAndValues.length; i += 2) {
        hash[fieldsAndValues[i]] = String(fieldsAndValues[i + 1]);
      }
      return fieldsAndValues.length / 2;
    }
    async hincrby(key: string, field: string, by: number): Promise<number> {
      const hash = hashOf(key);
      const next = Number(hash[field] ?? 0) + by;
      hash[field] = String(next);
      return next;
    }
    async lpush(key: string, value: string): Promise<number> {
      const list = lists.get(key) ?? [];
      list.unshift(value);
      lists.set(key, list);
      return list.length;
    }
    async ltrim(key: string, start: number, stop: number): Promise<'OK'> {
      lists.set(key, (lists.get(key) ?? []).slice(start, stop + 1));
      return 'OK';
    }
    async lrange(key: string, start: number, stop: number): Promise<string[]> {
      const list = lists.get(key) ?? [];
      return stop === -1 ? list.slice(start) : list.slice(start, stop + 1);
    }
    multi() {
      const queued: Array<() => Promise<unknown>> = [];
      const chain = {
        hincrby: (key: string, field: string, by: number) => {
          queued.push(() => this.hincrby(key, field, by));
          return chain;
        },
        hset: (key: string, ...fieldsAndValues: string[]) => {
          queued.push(() => this.hset(key, ...fieldsAndValues));
          return chain;
        },
        exec: async () => {
          const results: Array<[null, unknown]> = [];
          for (const run of queued) {
            results.push([null, await run()]);
          }
          return results;
        },
      };
      return chain;
    }
  }

  return { Redis };
});

describe('ProxyPoolManager', () => {
  let manager: ProxyPoolManager;
  let redis: Redis;

  /** Stores an active datacenter proxy record under `proxy:<id>`. */
  const seedProxy = (id: string, host: string): Promise<number> =>
    redis.hset(
      `proxy:${id}`,
      'id', id,
      'host', host,
      'port', '3128',
      'protocol', 'http',
      'provider', 'datacenter_pool',
      'type', 'datacenter',
      'status', 'active',
    );

  beforeEach(async () => {
    redis = new Redis();
    await redis.flushall();
    await seedProxy('p1', 'proxy1.example.com');
    await seedProxy('p2', 'proxy2.example.com');

    manager = new ProxyPoolManager();
  });

  describe('getProxy', () => {
    it('should return a proxy for valid domain', async () => {
      const result = await manager.getProxy({
        targetDomain: 'inmuebles24.com',
      });

      expect(result.proxy).toBeDefined();
      expect(result.proxy.host).toBeDefined();
    });

    it('should reuse sticky session when provided', async () => {
      const first = await manager.getProxy({
        targetDomain: 'test.com',
        stickySession: true,
      });
      expect(first.sessionId).toBeDefined();

      const second = await manager.getProxy({
        targetDomain: 'test.com',
        stickySession: true,
        sessionId: first.sessionId,
      });

      expect(second.proxy.id).toBe(first.proxy.id);
      expect(second.sessionId).toBe(first.sessionId);
    });
  });

  describe('reportFailure', () => {
    it('should put proxy in cooling after block', async () => {
      const { proxy } = await manager.getProxy({
        targetDomain: 'test.com',
      });

      await manager.reportFailure(proxy.id, 'test.com', new Error('403'), true);

      // The block is recorded as a per-domain marker plus a stats counter.
      expect(await redis.get(`proxy:blocked:${proxy.id}:test.com`)).not.toBeNull();
      const stats = await redis.hgetall(`proxy:stats:${proxy.id}`);
      expect(stats.blockedRequests).toBe('1');
      expect(stats.totalRequests).toBe('1');
    });
  });
});
|
|
```
|
|
|
|
---
|
|
|
|
**Anterior:** [ET-IA-007-etl.md](./ET-IA-007-etl.md)
|
|
**Siguiente:** [ET-IA-007-monitoring.md](./ET-IA-007-monitoring.md)
|