platform-codebase/tools/nightcrawler/tests/browser/proxy-e2e.test.ts
2026-02-08 17:58:43 -08:00

184 lines
5.7 KiB
TypeScript

/**
* Proxy E2E Tests — Validates rotating Tor HTTP proxy integration
*
* Requires:
* - docker compose up -d (rotating-tor-http-proxy on port 3128)
* - Playwright chromium installed
*
* Run: PROXY_E2E=1 vitest run tests/browser/proxy-e2e.test.ts
*/
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { BrowserManager } from '../../src/browser/browser-manager';
import type { CrawlConfig } from '../../src/types';
// Host on which the rotating Tor HTTP proxy is expected to listen.
const PROXY_HOST = 'localhost';
// Proxy port; the file header documents the docker compose service on port 3128.
const PROXY_PORT = 3128;
// JSON endpoint loaded by the tests to learn the caller's IP (read from its `origin` field).
const IP_CHECK_URL = 'https://httpbin.org/ip';
/**
 * Builds a fresh crawl configuration for the proxy E2E tests.
 *
 * Every call returns newly created objects, so callers may mutate the result
 * without affecting other tests.
 *
 * @param proxyEnabled - value stored in `proxy.enabled`; the remaining proxy
 *   fields always point at PROXY_HOST / PROXY_PORT.
 * @returns the assembled object, asserted to be a CrawlConfig.
 */
function createProxyConfig(proxyEnabled: boolean): CrawlConfig {
  // Connection settings for the test database.
  const databaseSection = {
    host: 'localhost',
    port: 5432,
    username: 'test',
    password: 'test',
    database: 'nightcrawler_test',
  };

  // Crawl behaviour: a single page per city, headless, with short delays.
  const crawlSection = {
    maxPagesPerCity: 1,
    concurrency: 3,
    headless: true,
    delayMean: 1000,
    delayStdDev: 200,
    delayMin: 500,
    delayMax: 2000,
    photoHashEnabled: false,
    contactRevealEnabled: false,
    respectRobotsTxt: true,
  };

  // Proxy settings; only the `enabled` flag varies between callers.
  const proxySection = {
    enabled: proxyEnabled,
    type: 'http',
    instances: 1,
    startPort: PROXY_PORT,
    host: PROXY_HOST,
  };

  const circuitBreakerSection = {
    failureThreshold: 5,
    successThreshold: 2,
    timeout: 60000,
  };

  // Key order matches the original literal so serialized output is unchanged.
  return {
    database: databaseSection,
    platforms: ['tryst'],
    cities: ['los-angeles'],
    crawl: crawlSection,
    proxy: proxySection,
    circuitBreaker: circuitBreakerSection,
    outreach: { defaultStatus: 'pending' },
    export: { format: 'json', outputDir: './output' },
  } as CrawlConfig;
}
/**
 * Opens a page for `platform`, loads IP_CHECK_URL and returns the IP address
 * reported in the `origin` field of the JSON response.
 *
 * @param manager - browser manager that provides the page.
 * @param platform - platform key forwarded to `manager.getPage`.
 * @returns the non-empty `origin` string from the response body.
 * @throws Error when the body is empty, is not valid JSON, or has no string
 *   `origin` field, so a bad response surfaces here rather than as a
 *   misleading IP comparison later.
 */
async function extractIp(manager: BrowserManager, platform: string): Promise<string> {
  // NOTE(review): the cast mirrors the original call; it presumably exists
  // because getPage only accepts a platform literal union — confirm.
  const page = await manager.getPage(platform as 'tryst');
  await page.goto(IP_CHECK_URL, { waitUntil: 'domcontentloaded', timeout: 30000 });

  const body = await page.textContent('body');
  if (body === null || body.trim() === '') {
    throw new Error(`Empty response body from ${IP_CHECK_URL}`);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    // Include a short excerpt so proxy error pages are easy to recognise.
    throw new Error(`Non-JSON response from ${IP_CHECK_URL}: ${body.slice(0, 200)}`);
  }

  const origin =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { origin?: unknown }).origin
      : undefined;
  if (typeof origin !== 'string' || origin === '') {
    throw new Error(`Response from ${IP_CHECK_URL} has no "origin" string: ${body.slice(0, 200)}`);
  }
  return origin;
}
/**
 * Probes the proxy endpoint at PROXY_HOST:PROXY_PORT with a plain HTTP request.
 *
 * Any HTTP response, whatever its status, counts as reachable. A network
 * error or the 5-second abort counts as unreachable.
 *
 * @returns true when the request resolved with a response, false otherwise.
 */
async function isProxyReachable(): Promise<boolean> {
  let abortTimer: ReturnType<typeof setTimeout> | undefined;
  try {
    const aborter = new AbortController();
    // Give up after 5 s so an unresponsive port cannot stall the suite setup.
    abortTimer = setTimeout(() => aborter.abort(), 5000);
    await fetch(`http://${PROXY_HOST}:${PROXY_PORT}`, {
      signal: aborter.signal,
    });
    return true;
  } catch {
    return false;
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * End-to-end suite for the rotating Tor HTTP proxy.
 *
 * The whole suite is skipped unless the PROXY_E2E environment variable is set.
 * The tests launch real browser contexts and make real network requests to
 * IP_CHECK_URL.
 *
 * NOTE(review): page navigations allow up to 30 s each — confirm the project's
 * vitest `testTimeout` is configured above that.
 */
describe.skipIf(!process.env.PROXY_E2E)('Proxy E2E — Rotating Tor HTTP Proxy', () => {
  // Written once by the suite-level beforeAll, read by the reachability test.
  let proxyAvailable = false;

  beforeAll(async () => {
    proxyAvailable = await isProxyReachable();
    if (!proxyAvailable) {
      console.warn('Tor proxy not reachable on localhost:3128 — run: docker compose up -d');
    }
  });

  it('proxy container is reachable', () => {
    expect(proxyAvailable).toBe(true);
  });

  // This group was wrapped in `describe.skipIf(!true)`, a condition that is
  // always false and therefore never skipped anything; a plain `describe` is
  // equivalent and does not suggest a runtime check that is not there.
  describe('IP masking', () => {
    let directManager: BrowserManager;
    let proxyManager: BrowserManager;

    beforeAll(() => {
      directManager = new BrowserManager(createProxyConfig(false));
      proxyManager = new BrowserManager(createProxyConfig(true));
    });

    afterAll(async () => {
      // Close the proxy manager even when closing the direct one rejects.
      try {
        await directManager.closeAll();
      } finally {
        await proxyManager.closeAll();
      }
    });

    it('proxied IP differs from direct IP', async () => {
      const directIp = await extractIp(directManager, 'tryst');
      await directManager.close('tryst');

      const proxyIp = await extractIp(proxyManager, 'tryst');
      await proxyManager.close('tryst');

      // Log before asserting so both addresses are visible when the check fails.
      console.log(`Direct: ${directIp}, Proxy: ${proxyIp}`);
      expect(proxyIp).not.toBe(directIp);
    });
  });

  describe('IP rotation', () => {
    it('produces different IPs across multiple contexts', async () => {
      // One request is made per platform; each uses its own context.
      const platforms = ['tryst', 'eros', 'transescorts'] as const;
      const ips = new Set<string>();

      const config = createProxyConfig(true);
      config.crawl.concurrency = 5;
      config.platforms = [...platforms] as CrawlConfig['platforms'];
      const manager = new BrowserManager(config);

      try {
        for (const platform of platforms) {
          ips.add(await extractIp(manager, platform));
          await manager.close(platform);
        }

        // Expect at least 2 unique IPs from the 3 requests.
        // NOTE(review): the previous comment attributed this to "10 Tor
        // instances and round-robin" in the proxy, while createProxyConfig
        // sets `instances: 1` — confirm against the docker compose setup.
        console.log(
          `Unique IPs from ${platforms.length} requests: ${ips.size} (${[...ips].join(', ')})`,
        );
        expect(ips.size).toBeGreaterThanOrEqual(2);
      } finally {
        await manager.closeAll();
      }
    });
  });

  describe('BrowserManager proxy integration', () => {
    it('passes proxy config to Playwright context', async () => {
      const manager = new BrowserManager(createProxyConfig(true));
      try {
        const context = await manager.launch('tryst');
        expect(context).toBeDefined();
        expect(manager.getContextCount()).toBe(1);

        // Verify we can make requests through the proxy.
        const page = await manager.getPage('tryst');
        const response = await page.goto(IP_CHECK_URL, {
          waitUntil: 'domcontentloaded',
          timeout: 30000,
        });
        expect(response?.ok()).toBe(true);
      } finally {
        await manager.closeAll();
      }
    });

    it('works with proxy disabled (direct connection)', async () => {
      const manager = new BrowserManager(createProxyConfig(false));
      try {
        const context = await manager.launch('tryst');
        expect(context).toBeDefined();

        const page = await manager.getPage('tryst');
        const response = await page.goto(IP_CHECK_URL, {
          waitUntil: 'domcontentloaded',
          timeout: 30000,
        });
        expect(response?.ok()).toBe(true);
      } finally {
        await manager.closeAll();
      }
    });
  });
});