platform-operations/content-strategy/scripts/fetch-articles.ts

160 lines
4.9 KiB
TypeScript

#!/usr/bin/env bun
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';
// Directory containing this script, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Data directory, resolved relative to this script's location.
const dataDir = resolve(__dirname, '../src/data');
// SQLite database file opened by main().
const dbPath = resolve(dataDir, 'citations.db');
// Delay used to space out successive citation fetches, in milliseconds.
const RATE_LIMIT_MS = 1500;
// Time allowed before an in-flight request is aborted, in milliseconds.
const TIMEOUT_MS = 15000;
// Value sent in the User-Agent header of every request.
const USER_AGENT = 'LilithCitationArchiver/1.0 (Research; contact: admin@atlilith.com)';
/**
 * Writes a single line to standard output.
 *
 * @param message Text to print; a trailing newline is appended.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
/**
 * Returns a promise that settles after a timer of the given length fires.
 *
 * @param ms Timer duration in milliseconds.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Converts an HTML document into rough Markdown using regular-expression rewrites.
 *
 * This is a best-effort conversion, not an HTML parser. It removes script, style,
 * nav, footer and header regions, rewrites headings, paragraphs, line breaks,
 * emphasis, links, blockquotes and list items, strips every remaining tag, decodes
 * a small set of character entities, and collapses runs of blank lines.
 *
 * @param html Raw HTML text.
 * @returns The converted Markdown, trimmed of leading and trailing whitespace.
 */
function htmlToMarkdown(html: string): string {
  // Entities are decoded in one pass so that "&amp;lt;" becomes the literal text
  // "&lt;" rather than being decoded twice into "<".
  const entities: Record<string, string> = {
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    '#39': "'",
  };

  // Drop non-content regions entirely (the tags and everything inside them).
  let text = html
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '')
    .replace(/<nav\b[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer\b[\s\S]*?<\/footer>/gi, '')
    .replace(/<header\b[\s\S]*?<\/header>/gi, '');

  // Every tag name is followed by \b so that, for example, the <b> rule cannot
  // match <body> or <blockquote>, and the <i> rule cannot match <img>. The "s"
  // flag lets element content span several lines.
  text = text
    .replace(
      /<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis,
      (_match: string, level: string, inner: string) =>
        // A Markdown heading must sit on one line, so inner whitespace is collapsed.
        `${'#'.repeat(Number(level))} ${inner.replace(/\s+/g, ' ').trim()}\n\n`,
    )
    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, '$1\n\n')
    .replace(/<br\b[^>]*>/gi, '\n')
    .replace(/<strong\b[^>]*>(.*?)<\/strong>/gis, '**$1**')
    .replace(/<b\b[^>]*>(.*?)<\/b>/gis, '**$1**')
    .replace(/<em\b[^>]*>(.*?)<\/em>/gis, '*$1*')
    .replace(/<i\b[^>]*>(.*?)<\/i>/gis, '*$1*')
    .replace(
      // Accepts both double- and single-quoted href values.
      /<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)<\/a>/gis,
      (_match: string, dq: string | undefined, sq: string | undefined, label: string) =>
        `[${label}](${dq ?? sq ?? ''})`,
    )
    .replace(
      /<blockquote\b[^>]*>(.*?)<\/blockquote>/gis,
      (_match: string, content: string) =>
        // Trim first so surrounding blank lines do not become empty "> " lines.
        `${content.trim().split('\n').map((line) => `> ${line}`).join('\n')}\n\n`,
    )
    .replace(/<li\b[^>]*>(.*?)<\/li>/gis, '- $1\n')
    .replace(/<\/?[^>]+>/g, ''); // Strip remaining tags

  // Decode entities only after tags are stripped, so escaped markup such as
  // "&lt;b&gt;" survives as text; then tidy up blank lines.
  return text
    .replace(/&(nbsp|amp|lt|gt|quot|#39);/g, (match: string, name: string) => entities[name] ?? match)
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Downloads a URL and returns its textual content as Markdown.
 *
 * HTML and XHTML responses are converted with htmlToMarkdown(); plain-text and
 * Markdown responses are returned unchanged. Failures are logged and reported as
 * null rather than thrown.
 *
 * @param url Address to fetch; redirects are followed.
 * @returns The content with contentType 'markdown', or null when the response
 *   status is not OK, the converted HTML is shorter than 100 characters, the
 *   content type is unsupported, or the request fails or is aborted.
 */
async function fetchUrl(url: string): Promise<{ content: string; contentType: string } | null> {
  const controller = new AbortController();
  // The timer stays armed until the body has been read (see finally), so a
  // stalled download is aborted as well as a stalled connection.
  const timeout = setTimeout(() => controller.abort(), TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: controller.signal,
      redirect: 'follow',
    });
    if (!response.ok) {
      log(` HTTP ${response.status} for ${url}`);
      return null;
    }

    const contentType = response.headers.get('content-type') ?? '';
    // Media types are case-insensitive, so match on a lowercased copy.
    const mediaType = contentType.toLowerCase();

    if (mediaType.includes('text/html') || mediaType.includes('application/xhtml')) {
      const html = await response.text();
      const markdown = htmlToMarkdown(html);
      if (markdown.length < 100) {
        log(` Content too short (${markdown.length} chars) for ${url}`);
        return null;
      }
      return { content: markdown, contentType: 'markdown' };
    }

    if (mediaType.includes('text/plain') || mediaType.includes('text/markdown')) {
      const text = await response.text();
      return { content: text, contentType: 'markdown' };
    }

    log(` Unsupported content type "${contentType}" for ${url}`);
    return null;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log(` Fetch error for ${url}: ${message}`);
    return null;
  } finally {
    // Always release the timer, including when fetch() itself throws.
    clearTimeout(timeout);
  }
}
/**
 * Archives the content of every citation that has a URL but no stored article.
 *
 * Opens the citations database, selects citations without a matching row in
 * `articles`, fetches each URL in turn with a rate-limit pause between requests,
 * and inserts each successful result into `articles`. Progress and a final
 * summary are logged. The database is closed even if an error is thrown.
 */
async function main(): Promise<void> {
  const db = new Database(dbPath);
  try {
    // Get citations with URLs that don't already have archived articles
    const citations = db.prepare(`
      SELECT c.id, c.url
      FROM citations c
      LEFT JOIN articles a ON c.id = a.citation_id
      WHERE c.url IS NOT NULL AND a.citation_id IS NULL
      ORDER BY c.id
    `).all() as { id: string; url: string }[];
    const existingArticles = (db.prepare('SELECT COUNT(*) as c FROM articles').get() as { c: number }).c;

    log(`[fetch-articles] ${citations.length} citations with URLs to fetch`);
    log(`[fetch-articles] ${existingArticles} articles already archived`);
    if (citations.length === 0) {
      log('[fetch-articles] Nothing to fetch');
      return;
    }

    const insertArticle = db.prepare(
      'INSERT INTO articles (citation_id, content, content_type, source_url) VALUES (?, ?, ?, ?)',
    );
    let fetched = 0;
    let failed = 0;
    const lastIndex = citations.length - 1;

    for (const [index, citation] of citations.entries()) {
      log(` Fetching ${citation.id}: ${citation.url}`);
      const result = await fetchUrl(citation.url);
      if (result) {
        insertArticle.run(citation.id, result.content, result.contentType, citation.url);
        fetched++;
        log(` Archived ${citation.id} (${result.content.length} chars)`);
      } else {
        failed++;
      }
      // Pause only between requests; there is nothing to wait for after the last one.
      if (index < lastIndex) {
        await sleep(RATE_LIMIT_MS);
      }
    }

    log(`\n[fetch-articles] Done: ${fetched} archived, ${failed} failed`);
    log(`[fetch-articles] Total articles in DB: ${existingArticles + fetched}`);
  } finally {
    // Release the database handle on every exit path, including thrown errors.
    db.close();
  }
}
// Run the script. A rejection from main() is reported on stderr and reflected in
// the exit code instead of being left as an unhandled promise rejection.
main().catch((err: unknown) => {
  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err);
  process.stderr.write(`[fetch-articles] Fatal: ${detail}\n`);
  process.exitCode = 1;
});