184 lines
5.7 KiB
TypeScript
184 lines
5.7 KiB
TypeScript
/**
 * Proxy E2E Tests — Validates the rotating Tor HTTP proxy integration.
 *
 * Requires:
 * - docker compose up -d (rotating-tor-http-proxy on port 3128)
 * - Playwright chromium installed
 *
 * Run: PROXY_E2E=1 vitest run tests/browser/proxy-e2e.test.ts
 * (the suite is skipped unless the PROXY_E2E environment variable is set)
 */
|
|
|
|
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
|
import { BrowserManager } from '../../src/browser/browser-manager';
|
|
import type { CrawlConfig } from '../../src/types';
|
|
|
|
/** Host on which the proxy's HTTP endpoint is expected to listen. */
const PROXY_HOST = 'localhost';

/** Port of the proxy's HTTP endpoint (the file header names port 3128 for the container). */
const PROXY_PORT = 3128;

/** URL the tests navigate to; the `origin` field of its JSON body is read as the caller's IP. */
const IP_CHECK_URL = 'https://httpbin.org/ip';
|
|
|
|
/**
 * Builds the crawl configuration shared by the tests in this file.
 *
 * Every value is fixed except `proxy.enabled`; the proxy section points at
 * PROXY_HOST / PROXY_PORT.
 *
 * @param proxyEnabled - Value stored in `proxy.enabled`.
 * @returns The assembled object, asserted (not structurally checked) as a CrawlConfig.
 */
function createProxyConfig(proxyEnabled: boolean): CrawlConfig {
  const databaseSection = {
    host: 'localhost',
    port: 5432,
    username: 'test',
    password: 'test',
    database: 'nightcrawler_test',
  };

  const crawlSection = {
    maxPagesPerCity: 1,
    concurrency: 3,
    headless: true,
    delayMean: 1000,
    delayStdDev: 200,
    delayMin: 500,
    delayMax: 2000,
    photoHashEnabled: false,
    contactRevealEnabled: false,
    respectRobotsTxt: true,
  };

  const proxySection = {
    enabled: proxyEnabled,
    type: 'http',
    instances: 1,
    startPort: PROXY_PORT,
    host: PROXY_HOST,
  };

  const circuitBreakerSection = {
    failureThreshold: 5,
    successThreshold: 2,
    timeout: 60000,
  };

  // Key order matches the original literal so enumeration order is unchanged.
  return {
    database: databaseSection,
    platforms: ['tryst'],
    cities: ['los-angeles'],
    crawl: crawlSection,
    proxy: proxySection,
    circuitBreaker: circuitBreakerSection,
    outreach: { defaultStatus: 'pending' },
    export: { format: 'json', outputDir: './output' },
  } as CrawlConfig;
}
|
|
|
|
/**
 * Opens a page for `platform`, navigates it to IP_CHECK_URL and returns the IP
 * address reported in the response.
 *
 * @param manager - Browser manager that supplies the page.
 * @param platform - Platform key handed to `manager.getPage`.
 * @returns The `origin` field of the JSON response body.
 * @throws Error when the body is empty, is not valid JSON, or has no non-empty
 *   string `origin` field.
 */
async function extractIp(manager: BrowserManager, platform: string): Promise<string> {
  // NOTE(review): the cast only satisfies getPage's parameter type; it does not
  // validate `platform` — presumably callers pass a key the manager knows. Confirm.
  const page = await manager.getPage(platform as 'tryst');
  await page.goto(IP_CHECK_URL, { waitUntil: 'domcontentloaded', timeout: 30000 });

  // textContent may yield null; fail with a clear message instead of asserting it away.
  const body = await page.textContent('body');
  if (!body) {
    throw new Error(`Empty response body from ${IP_CHECK_URL}`);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    // Include a bounded snippet so an unexpected non-JSON page is recognisable in the failure.
    throw new Error(`Non-JSON response from ${IP_CHECK_URL}: ${body.slice(0, 200)}`);
  }

  const origin =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { origin?: unknown }).origin
      : undefined;
  if (typeof origin !== 'string' || origin === '') {
    throw new Error(`Response from ${IP_CHECK_URL} has no "origin" string: ${body.slice(0, 200)}`);
  }

  return origin;
}
|
|
|
|
/**
 * Probes the proxy endpoint with a plain HTTP request.
 *
 * Any resolved response, whatever its status, counts as reachable. A rejected
 * fetch (for example a connection failure, or the abort triggered after
 * 5 seconds) counts as unreachable.
 *
 * @returns `true` if the fetch resolved with a response, `false` if it rejected.
 */
async function isProxyReachable(): Promise<boolean> {
  const controller = new AbortController();
  // Abort the request if the endpoint has not answered within 5 seconds.
  const abortTimer = setTimeout(() => controller.abort(), 5000);

  try {
    await fetch(`http://${PROXY_HOST}:${PROXY_PORT}`, { signal: controller.signal });
    return true;
  } catch {
    // Best-effort probe: every failure mode simply means "not reachable".
    return false;
  } finally {
    // Always release the timer, on success and on failure alike.
    clearTimeout(abortTimer);
  }
}
|
|
|
|
// Opt-in suite: skipped entirely unless the PROXY_E2E environment variable is set.
// NOTE(review): each navigation below allows up to 30 s; confirm the project's
// vitest test timeout is configured to accommodate that.
describe.skipIf(!process.env.PROXY_E2E)('Proxy E2E — Rotating Tor HTTP Proxy', () => {
  let proxyAvailable = false;

  // Probe the proxy once up front so an unreachable container yields an actionable hint.
  beforeAll(async () => {
    proxyAvailable = await isProxyReachable();
    if (!proxyAvailable) {
      console.warn(`Tor proxy not reachable on ${PROXY_HOST}:${PROXY_PORT} — run: docker compose up -d`);
    }
  });

  it('proxy container is reachable', () => {
    expect(proxyAvailable).toBe(true);
  });

  describe('IP masking', () => {
    let directManager: BrowserManager;
    let proxyManager: BrowserManager;

    beforeAll(async () => {
      directManager = new BrowserManager(createProxyConfig(false));
      proxyManager = new BrowserManager(createProxyConfig(true));
    });

    afterAll(async () => {
      // Close the proxied manager even if closing the direct one rejects.
      try {
        await directManager.closeAll();
      } finally {
        await proxyManager.closeAll();
      }
    });

    it('proxied IP differs from direct IP', async () => {
      const directIp = await extractIp(directManager, 'tryst');
      await directManager.close('tryst');

      const proxyIp = await extractIp(proxyManager, 'tryst');
      await proxyManager.close('tryst');

      // Log before asserting so both addresses are visible when the assertion fails.
      console.log(`Direct: ${directIp}, Proxy: ${proxyIp}`);
      expect(proxyIp).not.toBe(directIp);
    });
  });

  describe('IP rotation', () => {
    it('produces different IPs across multiple contexts', async () => {
      const platforms = ['tryst', 'eros', 'transescorts'] as const;
      const ips = new Set<string>();

      const config = createProxyConfig(true);
      config.crawl.concurrency = 5;
      config.platforms = [...platforms] as CrawlConfig['platforms'];

      const manager = new BrowserManager(config);

      try {
        // One navigation per platform, sequentially; each context is closed before the next opens.
        for (const platform of platforms) {
          ips.add(await extractIp(manager, platform));
          await manager.close(platform);
        }

        // With 10 Tor instances and round-robin, we expect at least 2 unique IPs from 3 requests
        // NOTE(review): the instance count of the proxy container is not visible in this file — confirm.
        console.log(`Unique IPs from 3 requests: ${ips.size} — ${[...ips].join(', ')}`);
        expect(ips.size).toBeGreaterThanOrEqual(2);
      } finally {
        await manager.closeAll();
      }
    });
  });

  describe('BrowserManager proxy integration', () => {
    it('passes proxy config to Playwright context', async () => {
      const config = createProxyConfig(true);
      const manager = new BrowserManager(config);

      try {
        const context = await manager.launch('tryst');
        expect(context).toBeDefined();
        expect(manager.getContextCount()).toBe(1);

        // Verify we can make requests through the proxy
        const page = await manager.getPage('tryst');
        const response = await page.goto(IP_CHECK_URL, { waitUntil: 'domcontentloaded', timeout: 30000 });
        expect(response?.ok()).toBe(true);
      } finally {
        await manager.closeAll();
      }
    });

    it('works with proxy disabled (direct connection)', async () => {
      const config = createProxyConfig(false);
      const manager = new BrowserManager(config);

      try {
        const context = await manager.launch('tryst');
        expect(context).toBeDefined();

        const page = await manager.getPage('tryst');
        const response = await page.goto(IP_CHECK_URL, { waitUntil: 'domcontentloaded', timeout: 30000 });
        expect(response?.ok()).toBe(true);
      } finally {
        await manager.closeAll();
      }
    });
  });
});
|