platform-operations/content-strategy/scripts/fetch-articles.ts

#!/usr/bin/env bun

import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';

// Resolve paths relative to this script's own file location.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory that holds the citations database file.
const dataDir = resolve(__dirname, '../src/data');
// SQLite database file opened by main().
const dbPath = resolve(dataDir, 'citations.db');

// Pause, in milliseconds, between consecutive citation fetches.
const RATE_LIMIT_MS = 1500;
// Time, in milliseconds, before a fetch is aborted via its AbortController.
const TIMEOUT_MS = 15000;
// Value sent as the User-Agent header on every fetch.
const USER_AGENT = 'LilithCitationArchiver/1.0 (Research; contact: admin@atlilith.com)';

/** Writes `message` followed by a single newline to standard output. */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}

/** Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Converts an HTML document into a rough Markdown approximation using regex
 * substitutions (no DOM parsing is performed).
 *
 * Steps, in order:
 *  1. Remove `<script>`, `<style>`, `<nav>`, `<footer>` and `<header>` elements
 *     together with their content.
 *  2. Rewrite headings (h1–h4), paragraphs, line breaks, bold/italic, links,
 *     blockquotes and list items to their Markdown equivalents.
 *  3. Strip every remaining tag.
 *  4. Decode a small set of named/numeric entities and collapse runs of three
 *     or more newlines.
 *
 * @param html Raw HTML text.
 * @returns The trimmed Markdown-like text.
 */
function htmlToMarkdown(html: string): string {
  // Every tag name is followed by `\b` so that e.g. `<b` cannot match `<body>`
  // or `<blockquote>`, `<i` cannot match `<img>`, and `<li` cannot match `<link>`.
  let text = html
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '')
    .replace(/<nav\b[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer\b[\s\S]*?<\/footer>/gi, '')
    .replace(/<header\b[\s\S]*?<\/header>/gi, '');

  // Convert common HTML elements to markdown
  text = text
    .replace(/<h1\b[^>]*>(.*?)<\/h1>/gi, '# $1\n\n')
    .replace(/<h2\b[^>]*>(.*?)<\/h2>/gi, '## $1\n\n')
    .replace(/<h3\b[^>]*>(.*?)<\/h3>/gi, '### $1\n\n')
    .replace(/<h4\b[^>]*>(.*?)<\/h4>/gi, '#### $1\n\n')
    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, '$1\n\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<strong\b[^>]*>(.*?)<\/strong>/gi, '**$1**')
    .replace(/<b\b[^>]*>(.*?)<\/b>/gi, '**$1**')
    .replace(/<em\b[^>]*>(.*?)<\/em>/gi, '*$1*')
    .replace(/<i\b[^>]*>(.*?)<\/i>/gi, '*$1*')
    // Accept both double- and single-quoted href values; the whitespace
    // required before `href` keeps attributes such as `data-href` from matching.
    .replace(
      /<a\b[^>]*?\shref=(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)<\/a>/gi,
      (_match, doubleQuoted: string | undefined, singleQuoted: string | undefined, label: string) =>
        `[${label}](${doubleQuoted ?? singleQuoted ?? ''})`,
    )
    .replace(/<blockquote\b[^>]*>(.*?)<\/blockquote>/gis, (_match, content: string) =>
      content.split('\n').map((l: string) => `> ${l}`).join('\n') + '\n\n'
    )
    .replace(/<li\b[^>]*>(.*?)<\/li>/gi, '- $1\n')
    .replace(/<\/?[^>]+>/g, ''); // Strip remaining tags

  // Decode entities, then clean up whitespace. `&amp;` must be decoded LAST:
  // decoding it first would turn `&amp;lt;` into `&lt;` and then into `<`,
  // i.e. decode the text twice.
  text = text
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  return text;
}

/**
 * Downloads `url` and returns its content as Markdown-like text.
 *
 * HTML/XHTML responses are passed through `htmlToMarkdown`; plain-text and
 * Markdown responses are returned verbatim. In both cases the returned
 * `contentType` is the literal string `'markdown'`.
 *
 * @param url Address to fetch; redirects are followed.
 * @returns The converted content, or `null` when the response status is not
 *   OK, the content type is unsupported, converted HTML is shorter than 100
 *   characters, or the request fails or times out. Every `null` outcome is
 *   logged; this function does not throw.
 */
async function fetchUrl(url: string): Promise<{ content: string; contentType: string } | null> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: controller.signal,
      redirect: 'follow',
    });

    if (!response.ok) {
      log(`  HTTP ${response.status} for ${url}`);
      return null;
    }

    const contentType = response.headers.get('content-type') ?? '';
    // Media types are case-insensitive, so compare against a lowercased copy
    // while keeping the original header value for the log message.
    const mediaType = contentType.toLowerCase();

    if (mediaType.includes('text/html') || mediaType.includes('application/xhtml')) {
      const html = await response.text();
      const markdown = htmlToMarkdown(html);
      if (markdown.length < 100) {
        log(`  Content too short (${markdown.length} chars) for ${url}`);
        return null;
      }
      return { content: markdown, contentType: 'markdown' };
    }

    if (mediaType.includes('text/plain') || mediaType.includes('text/markdown')) {
      const text = await response.text();
      return { content: text, contentType: 'markdown' };
    }

    log(`  Unsupported content type "${contentType}" for ${url}`);
    return null;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log(`  Fetch error for ${url}: ${message}`);
    return null;
  } finally {
    // Clear the timer only once the body has been read (or the request has
    // failed): this keeps the timeout in force during `response.text()` and
    // guarantees the timer never outlives the call.
    clearTimeout(timeout);
  }
}

/**
 * Archives article content for every citation that has a URL but no row in
 * `articles` yet.
 *
 * Citations are processed sequentially in `id` order, pausing `RATE_LIMIT_MS`
 * between consecutive fetches. Each successful fetch is inserted into
 * `articles`; failed fetches are only counted. The database handle is closed
 * on every exit path, including when a query or insert throws.
 */
async function main(): Promise<void> {
  const db = new Database(dbPath);

  try {
    // Get citations with URLs that don't already have archived articles
    const citations = db.prepare(`
      SELECT c.id, c.url
      FROM citations c
      LEFT JOIN articles a ON c.id = a.citation_id
      WHERE c.url IS NOT NULL AND a.citation_id IS NULL
      ORDER BY c.id
    `).all() as { id: string; url: string }[];

    const existingArticles = (db.prepare('SELECT COUNT(*) as c FROM articles').get() as { c: number }).c;

    log(`[fetch-articles] ${citations.length} citations with URLs to fetch`);
    log(`[fetch-articles] ${existingArticles} articles already archived`);

    if (citations.length === 0) {
      log('[fetch-articles] Nothing to fetch');
      return;
    }

    const insertArticle = db.prepare(
      'INSERT INTO articles (citation_id, content, content_type, source_url) VALUES (?, ?, ?, ?)',
    );

    let fetched = 0;
    let failed = 0;
    const lastIndex = citations.length - 1;

    for (const [index, citation] of citations.entries()) {
      log(`  Fetching ${citation.id}: ${citation.url}`);
      const result = await fetchUrl(citation.url);

      if (result) {
        insertArticle.run(citation.id, result.content, result.contentType, citation.url);
        fetched++;
        log(`  Archived ${citation.id} (${result.content.length} chars)`);
      } else {
        failed++;
      }

      // Rate-limit between requests only; there is nothing to wait for after
      // the final citation.
      if (index < lastIndex) {
        await sleep(RATE_LIMIT_MS);
      }
    }

    log(`\n[fetch-articles] Done: ${fetched} archived, ${failed} failed`);
    log(`[fetch-articles] Total articles in DB: ${existingArticles + fetched}`);
  } finally {
    // Release the SQLite handle even if a query or insert above throws.
    db.close();
  }
}

// Run the script. A rejection from main() is reported on stderr and turned
// into a non-zero exit code rather than being left as a floating promise.
main().catch((err: unknown) => {
  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err);
  process.stderr.write(`[fetch-articles] Fatal: ${detail}\n`);
  process.exitCode = 1;
});