160 lines
4.9 KiB
TypeScript
160 lines
4.9 KiB
TypeScript
#!/usr/bin/env bun
|
|
|
|
import { resolve, dirname } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { Database } from 'bun:sqlite';
|
|
|
|
// Directory that contains this script, derived from the module's own URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Data directory: one level up from this script, then src/data.
const dataDir = resolve(__dirname, '..', 'src', 'data');

// SQLite database file that main() opens.
const dbPath = resolve(dataDir, 'citations.db');
|
|
|
|
/** Pause between consecutive citation fetches, in milliseconds. */
const RATE_LIMIT_MS = 1_500;

/** Time after which an in-flight request is aborted, in milliseconds. */
const TIMEOUT_MS = 15_000;

/** Value sent in the User-Agent header of every request. */
const USER_AGENT = 'LilithCitationArchiver/1.0 (Research; contact: admin@atlilith.com)';
|
|
|
|
/**
 * Writes one line to standard output.
 *
 * @param message Text to print; a trailing newline is appended.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
|
|
|
|
/**
 * Returns a promise that resolves after a delay.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
/**
 * Named HTML entities understood by decodeHtmlEntities, keyed by the name
 * that appears between the ampersand and the semicolon.
 */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Decodes the supported named entities plus decimal and hexadecimal numeric
 * character references.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time (a double-escaped entity is unescaped exactly
 * once). Unknown names and out-of-range code points are left untouched.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g,
    (entity: string, body: string): string => {
      if (!body.startsWith('#')) {
        return HTML_NAMED_ENTITIES.get(body) ?? entity;
      }
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws on values past U+10FFFF; also skip NUL and lone surrogates.
      const isScalar =
        codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
      return isScalar ? String.fromCodePoint(codePoint) : entity;
    },
  );
}

/**
 * Converts an HTML document into rough Markdown using regular expressions.
 *
 * Steps, in order: remove script/style/nav/footer/header regions, rewrite
 * headings (h1-h4), paragraphs, line breaks, bold/italic, links, blockquotes
 * and list items to Markdown, strip every remaining tag, decode HTML entities,
 * collapse runs of three or more newlines to two, and trim.
 *
 * This is a best-effort text extraction, not an HTML parser: nested or
 * malformed markup may produce imperfect output.
 *
 * @param html Raw HTML source.
 * @returns The extracted Markdown text.
 */
function htmlToMarkdown(html: string): string {
  // Every tag name is followed by \b so that e.g. `<b` cannot match
  // `<blockquote>` or `<body>`, and `<i` cannot match `<img>`.
  let text = html
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '')
    .replace(/<nav\b[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer\b[\s\S]*?<\/footer>/gi, '')
    .replace(/<header\b[\s\S]*?<\/header>/gi, '');

  // Convert common HTML elements to Markdown. Element bodies are matched with
  // [\s\S] so markup that spans several source lines is still converted.
  text = text
    .replace(
      /<h([1-4])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
      (_match: string, level: string, inner: string): string =>
        // A Markdown heading must sit on one line, so collapse inner whitespace.
        `${'#'.repeat(Number(level))} ${inner.replace(/\s+/g, ' ').trim()}\n\n`,
    )
    .replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**')
    .replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**')
    .replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, '*$1*')
    .replace(/<i\b[^>]*>([\s\S]*?)<\/i>/gi, '*$1*')
    .replace(/<a\b[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)')
    .replace(
      /<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi,
      (_match: string, content: string): string =>
        content
          .split('\n')
          .map((line) => `> ${line}`)
          .join('\n') + '\n\n',
    )
    .replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n')
    .replace(/<\/?[^>]+>/g, ''); // Strip remaining tags

  // Decode entities only after tag stripping, so escaped angle brackets in
  // the page text are not mistaken for markup; then tidy up blank lines.
  return decodeHtmlEntities(text)
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
|
|
/**
 * Downloads a URL and returns its content in archivable text form.
 *
 * HTML and XHTML responses are converted with htmlToMarkdown; plain-text and
 * Markdown responses are returned unchanged. In both cases the returned
 * `contentType` is the string 'markdown'.
 *
 * Failures are logged and reported as `null` rather than thrown. This covers
 * network errors, the TIMEOUT_MS abort, non-OK HTTP statuses, unsupported
 * content types, and HTML whose converted text is shorter than 100 characters.
 *
 * @param url Address to fetch; redirects are followed.
 * @returns The content and its type label, or `null` on any failure.
 */
async function fetchUrl(url: string): Promise<{ content: string; contentType: string } | null> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: controller.signal,
      redirect: 'follow',
    });

    if (!response.ok) {
      log(`    HTTP ${response.status} for ${url}`);
      return null;
    }

    const contentType = response.headers.get('content-type') ?? '';
    // Media types are case-insensitive, so compare against a lower-cased copy.
    const mediaType = contentType.toLowerCase();

    if (mediaType.includes('text/html') || mediaType.includes('application/xhtml')) {
      const html = await response.text();
      const markdown = htmlToMarkdown(html);
      if (markdown.length < 100) {
        log(`    Content too short (${markdown.length} chars) for ${url}`);
        return null;
      }
      return { content: markdown, contentType: 'markdown' };
    }

    if (mediaType.includes('text/plain') || mediaType.includes('text/markdown')) {
      const text = await response.text();
      return { content: text, contentType: 'markdown' };
    }

    log(`    Unsupported content type "${contentType}" for ${url}`);
    return null;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log(`    Fetch error for ${url}: ${message}`);
    return null;
  } finally {
    // Cleared here, not right after fetch() resolves, so the timeout also
    // bounds the body download and the timer is released on every exit path.
    clearTimeout(timeout);
  }
}
|
|
|
|
/**
 * Archives the article behind every citation that has a URL but no row in
 * the `articles` table yet.
 *
 * Citations are fetched one at a time with a RATE_LIMIT_MS pause between
 * requests. Each successful fetch is inserted into `articles`; failures are
 * counted and skipped. A summary is logged at the end.
 *
 * The database handle is closed on every exit path, including when a query
 * or insert throws; the error itself still propagates to the caller.
 */
async function main(): Promise<void> {
  const db = new Database(dbPath);

  try {
    // Get citations with URLs that don't already have archived articles
    const citations = db.prepare(`
      SELECT c.id, c.url
      FROM citations c
      LEFT JOIN articles a ON c.id = a.citation_id
      WHERE c.url IS NOT NULL AND a.citation_id IS NULL
      ORDER BY c.id
    `).all() as { id: string; url: string }[];

    const existingArticles = (db.prepare('SELECT COUNT(*) as c FROM articles').get() as { c: number }).c;

    log(`[fetch-articles] ${citations.length} citations with URLs to fetch`);
    log(`[fetch-articles] ${existingArticles} articles already archived`);

    if (citations.length === 0) {
      log('[fetch-articles] Nothing to fetch');
      return;
    }

    const insertArticle = db.prepare(
      'INSERT INTO articles (citation_id, content, content_type, source_url) VALUES (?, ?, ?, ?)',
    );

    let fetched = 0;
    let failed = 0;

    for (const [index, citation] of citations.entries()) {
      log(`  Fetching ${citation.id}: ${citation.url}`);
      const result = await fetchUrl(citation.url);

      if (result) {
        insertArticle.run(citation.id, result.content, result.contentType, citation.url);
        fetched++;
        log(`    Archived ${citation.id} (${result.content.length} chars)`);
      } else {
        failed++;
      }

      // Rate-limit between requests only; no request follows the last one.
      if (index < citations.length - 1) {
        await sleep(RATE_LIMIT_MS);
      }
    }

    log(`\n[fetch-articles] Done: ${fetched} archived, ${failed} failed`);
    log(`[fetch-articles] Total articles in DB: ${existingArticles + fetched}`);
  } finally {
    db.close();
  }
}
|
|
|
|
// Run the archiver; report any unexpected failure and exit with a non-zero status.
main().catch((err: unknown) => {
  const message = err instanceof Error ? (err.stack ?? err.message) : String(err);
  process.stderr.write(`[fetch-articles] Fatal: ${message}\n`);
  process.exitCode = 1;
});
|