platform-operations/content-strategy/scripts/parse-citations.ts

#!/usr/bin/env bun

import { readFileSync, readdirSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';

import type {
  CitationAuthor,
  CitationClaim,
  CitationType,
  ThemeId,
} from '../src/types/citations';

// Directory containing this script. It is derived from import.meta.url via
// fileURLToPath + dirname.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory that loadJson reads from and that holds the generated database file.
const dataDir = resolve(__dirname, '../src/data');
// Directory scanned by main() for `*-citations.md` theme source files.
const archiveDir = resolve(__dirname, '../docs/meta/theme-sources');
// SQL schema file executed by initDatabase() after existing tables are dropped.
const schemaPath = resolve(__dirname, 'schema.sql');
// SQLite file opened by initDatabase(); its tables are dropped and rebuilt on each run.
const dbPath = resolve(dataDir, 'citations.db');

// ─── Theme ID mapping from filename stem ───

/**
 * Maps a citation file's stem (its filename without the `-citations.md`
 * suffix) to the theme id recorded for every entry parsed from that file.
 * main() skips, with a warning, any file whose stem is not listed here.
 */
const THEME_MAP: Record<string, ThemeId> = {
  '01-anti-extraction': 'anti-extraction',
  '02-inverse-capitalism': 'inverse-capitalism',
  '03-body-sovereignty': 'body-sovereignty',
  '04-privacy': 'privacy',
  '05-permanent-software': 'permanent-software',
  '06-human-work': 'human-work',
  '07-ai-philosophy': 'ai-philosophy',
  '08-slutology': 'slutology',
  '09-cooperative-future': 'cooperative-future',
  '10-open-source': 'open-source',
};

// ─── Utilities ───

/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug of at most
 * 80 characters.
 *
 * Every run of characters outside `a-z0-9` becomes a single hyphen, and the
 * result never starts or ends with a hyphen.
 *
 * @param text - Text to slugify.
 * @returns The slug; an empty string when `text` has no ASCII letters or digits.
 */
function slugify(text: string): string {
  return (
    text
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+/, '')
      .slice(0, 80)
      // Trim trailing hyphens AFTER truncating: the cut at 80 characters can
      // land right after a separator and would otherwise leave a dangling "-".
      .replace(/-+$/, '')
  );
}

/**
 * Reduces text to a comparison key made only of lowercase ASCII letters and
 * digits; whitespace, punctuation and every other character are removed.
 */
function normalizeForDedup(text: string): string {
  const lowered = text.toLowerCase();
  return lowered.replace(/[^a-z0-9]/g, '');
}

/**
 * Prepares text for word-based matching: lowercases it, removes everything
 * except ASCII letters, digits and whitespace, collapses each whitespace run
 * into one space, and trims both ends.
 */
function normalizeForMatch(text: string): string {
  const wordCharsOnly = text.toLowerCase().replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = wordCharsOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}

/**
 * Returns the first run of four digits found in `yearStr`. When there is
 * none, returns `yearStr` with all whitespace removed.
 */
function getPrimaryYear(yearStr: string): string {
  const fourDigits = /\d{4}/.exec(yearStr);
  if (fourDigits !== null) {
    return fourDigits[0];
  }
  return yearStr.replace(/\s+/g, '').trim();
}

/** Writes `message` followed by a newline to standard output. */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}

/**
 * Reads `filename` from the data directory as UTF-8 and parses it as JSON.
 *
 * The parsed value is cast to `T` without runtime validation, so the caller
 * is responsible for the file actually having that shape.
 */
function loadJson<T>(filename: string): T {
  const absolutePath = resolve(dataDir, filename);
  const text = readFileSync(absolutePath, 'utf-8');
  const parsed = JSON.parse(text) as T;
  return parsed;
}

// ─── Intermediate types ───

/** One citation entry as extracted from a theme source file, before deduplication. */
interface RawEntry {
  /** Source/attribution text of the entry. */
  source: string;
  /** Claim text attributed to the source. */
  claim: string;
  /** Year text as parsed; the paragraph and bullet parsers use 'n.d.' when no year is found. */
  year: string;
  /** Heading text of the section the entry was found under ('' before any heading). */
  category: string;
  /** Theme of the file the entry came from. */
  theme: ThemeId;
  /**
   * Set by the table parser when the source text mentions a calculated or
   * internal figure; the paragraph and bullet parsers always set it to false.
   */
  calculated: boolean;
  /** Only populated by the bullet-format parser, from a URL bullet. */
  url?: string;
}

/** A numbered item read from a file's "Full References" section. */
interface FullRef {
  /** The list number that prefixes the item. */
  number: number;
  /** Item text after the number, trimmed. */
  raw: string;
  /** First http(s) URL found in the text, with trailing punctuation stripped. */
  url?: string;
  /** DOI taken from a doi.org URL or from a `doi:` / `doi/` prefix. */
  doi?: string;
  /** arXiv identifier taken from an arxiv.org/abs URL or from an `arXiv:` / `arXiv/` prefix. */
  arxiv?: string;
}

// ─── Table format detection ───

/**
 * Column layouts recognised in citation tables:
 * - 'variant-a': `# | Claim | …`
 * - 'variant-b': `# | Source | Year | Claim`
 */
type TableFormat = 'variant-a' | 'variant-b';

/**
 * Identifies which table layout a header line announces.
 *
 * @param line - A single line of the markdown file.
 * @returns The detected layout, or null when the line is not a recognised table header.
 */
function detectTableFormat(line: string): TableFormat | null {
  const claimFirstHeader = /\|\s*#\s*\|\s*Claim\s*\|/i;
  const sourceFirstHeader = /\|\s*#\s*\|\s*Source\s*\|\s*Year\s*\|\s*Claim/i;

  if (claimFirstHeader.test(line)) {
    return 'variant-a';
  }
  return sourceFirstHeader.test(line) ? 'variant-b' : null;
}

/**
 * Parses one markdown table data row into its source, claim and year cells.
 *
 * Cells are addressed by position, so interior empty cells are kept in place:
 * dropping them would shift every later column to the left (for example a
 * blank year cell would turn the claim into the year).
 *
 * @param line - A raw line from the markdown file.
 * @param format - Column layout announced by the table's header row.
 * @returns The row's fields, or null when the line is not a numbered data row
 *   (header, separator, fewer than four cells) or lacks a source or claim.
 *   A blank year cell is reported as 'n.d.'.
 */
function parseTableRow(
  line: string,
  format: TableFormat,
): { source: string; claim: string; year: string } | null {
  // Remove only the outer pipes; everything between them is positional.
  const inner = line.trim().replace(/^\|/, '').replace(/\|$/, '');
  const cells = inner.split('|').map((c) => c.trim());
  if (cells.length < 4) return null;

  const [index = '', second = '', third = '', fourth = ''] = cells;
  // Data rows start with a numeric index; this also rejects separator rows.
  if (!/^\d+$/.test(index)) return null;

  const row =
    format === 'variant-a'
      ? { claim: second, source: third, year: fourth }
      : { source: second, year: third, claim: fourth };

  if (row.source === '' || row.claim === '') return null;
  return { source: row.source, claim: row.claim, year: row.year || 'n.d.' };
}

// ─── Parse table-based files (variants A and B) ───

/**
 * Extracts citation entries from a file that stores them in markdown tables.
 *
 * Each `##` heading starts a new category and resets the active table layout;
 * the layout is re-detected from the next table header row. Parsing stops at
 * the first `##` heading that names a trailing section (full references,
 * notes, cross-references, citation format).
 *
 * @param lines - The file content split into lines.
 * @param theme - Theme recorded on every entry.
 * @returns One entry per parsed data row, in file order.
 */
function parseTableFile(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentCategory = '';
  let currentFormat: TableFormat | null = null;

  for (const line of lines) {
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      const heading = h2Match[1].trim();
      if (/full references|notes\b|cross-references|citation format/i.test(heading)) {
        break;
      }
      currentCategory = heading;
      currentFormat = null;
      continue;
    }

    const fmt = detectTableFormat(line);
    if (fmt) {
      currentFormat = fmt;
      continue;
    }

    // Ignore everything until a table header has been seen in this section.
    if (!currentFormat) continue;
    // Header/body separator row such as |---|---|.
    if (/^\|[-\s|]+\|$/.test(line)) continue;

    const row = parseTableRow(line, currentFormat);
    if (!row) continue;

    entries.push({
      source: row.source,
      claim: row.claim,
      year: row.year,
      category: currentCategory,
      theme,
      // Whole-word match: a bare substring test for "internal" also fires on
      // "International" (e.g. "Amnesty International").
      calculated: /\b(?:calculated|internal(?:ly)?)\b/i.test(row.source),
    });
  }

  return entries;
}

// ─── Parse paragraph format (03-body-sovereignty) ───

/**
 * Extracts citation entries from a file written as numbered paragraphs of the
 * form `N. **Author** …`.
 *
 * The category is the most recent `###` heading, or the most recent `##`
 * heading other than the citation-format / cross-references / verification
 * ones (those leave the category unchanged).
 *
 * For each entry the year is the first `(YYYY)`, else the first bare
 * 19xx/2000-2029 year, else 'n.d.'. The claim is the first double-quoted
 * passage if there is one; otherwise it is the text after the year with
 * italic spans, URLs and surrounding punctuation removed.
 *
 * @param lines - The file content split into lines.
 * @param theme - Theme recorded on every entry.
 * @returns One entry per numbered paragraph, in file order.
 */
function parseParagraphFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentCategory = '';

  for (const line of lines) {
    const h3Match = line.match(/^###\s+(.+)/);
    if (h3Match) {
      currentCategory = h3Match[1].trim();
      continue;
    }
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      const heading = h2Match[1].trim();
      if (/citation format|cross-references|verification/i.test(heading)) continue;
      currentCategory = heading;
      continue;
    }

    // Match numbered entries: N. **Author** ...
    const numMatch = line.match(/^(\d+)\.\s+\*\*(.+?)\*\*/);
    if (!numMatch) continue;

    const source = numMatch[2].trim();
    const rest = line.substring(numMatch[0].length).trim();

    // Find year anywhere in the remaining text
    const yearMatch = rest.match(/\((\d{4})\)/) ?? rest.match(/\b(20[0-2]\d|19\d\d)\b/);
    const year = yearMatch ? yearMatch[1] : 'n.d.';

    // Claim: prefer quoted text, else cleaned text after year
    const quoteMatch = rest.match(/"([^"]+)"/);
    let claim: string;
    if (quoteMatch) {
      claim = quoteMatch[1];
    } else {
      let afterYear = rest;
      if (yearMatch) {
        // Use the position the regex actually matched. Searching for the
        // matched text again can land on the same digits inside an earlier,
        // longer number (e.g. "120195 … 2019").
        const yearStart = yearMatch.index ?? 0;
        afterYear = rest.substring(yearStart + yearMatch[0].length);
      }
      afterYear = afterYear
        .replace(/^\)\.\s*/, '')
        .replace(/^[\s.,;:]+/, '')
        .replace(/\*[^*]+\*/g, '')
        .replace(/\s*https?:\/\/\S+/g, '')
        .replace(/^[\s.,;:]+/, '')
        .replace(/[.\s]+$/, '')
        .trim();

      if (!afterYear) {
        // Nothing left after cleaning: fall back to the italic title, or to
        // the first 120 characters of the paragraph.
        const italicMatch = rest.match(/\*([^*]+)\*/);
        afterYear = italicMatch ? italicMatch[1] : rest.substring(0, 120);
      }
      claim = afterYear;
    }

    entries.push({
      source,
      claim,
      year,
      category: currentCategory,
      theme,
      calculated: false,
    });
  }

  return entries;
}

// ─── Parse bullet format (10-open-source) ───

/**
 * Extracts citation entries from a file that lists each source as a group of
 * labelled bullets (Source, Key finding / Quote, URL).
 *
 * A group is emitted when the next group begins (a new Source bullet, a `##`
 * or `###` heading, or a `---` rule) and at end of input, but only if it has
 * both a source and a claim. Parsing stops at a `##` heading containing
 * "verification notes". Lines starting with `|` are ignored.
 *
 * @param lines - The file content split into lines.
 * @param theme - Theme recorded on every entry.
 * @returns The emitted entries, in file order.
 */
function parseBulletFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const collected: RawEntry[] = [];
  let section = '';
  let pending: { source: string; year: string; claim: string; url?: string } = {
    source: '',
    year: '',
    claim: '',
  };

  // Emits the group gathered so far (when complete) and starts a fresh one.
  const emitPending = (): void => {
    if (pending.source !== '' && pending.claim !== '') {
      collected.push({
        source: pending.source,
        claim: pending.claim,
        year: pending.year || 'n.d.',
        category: section,
        theme,
        calculated: false,
        url: pending.url,
      });
    }
    pending = { source: '', year: '', claim: '' };
  };

  for (const line of lines) {
    const h2 = /^##\s+(.+)/.exec(line);
    if (h2 !== null) {
      // The finished group belongs to the section that is ending.
      emitPending();
      if (/verification notes/i.test(h2[1])) break;
      section = h2[1].trim();
      continue;
    }

    // Sub-headings and horizontal rules only close the current group.
    if (/^###\s+(.+)/.test(line) || line.startsWith('---')) {
      emitPending();
      continue;
    }

    // Skip table rows (technology dependency table)
    if (line.startsWith('|')) continue;

    const sourceBullet = /^-\s+\*\*Source\*\*:\s*(.+)/.exec(line);
    if (sourceBullet !== null) {
      emitPending();
      const sourceText = sourceBullet[1].trim();
      pending.source = sourceText;
      const yearHit =
        sourceText.match(/\((\d{4})\)/) ?? sourceText.match(/\b(20[0-2]\d|19\d\d)\b/);
      if (yearHit) pending.year = yearHit[1];
      continue;
    }

    const claimBullet =
      /^-\s+\*\*(?:Key\s+(?:finding|findings|data|property|text|principle|limitation|stat|figures|claim)|Quote)\*\*:\s*(.+)/i.exec(
        line,
      );
    if (claimBullet !== null) {
      // A later claim bullet in the same group replaces an earlier one.
      pending.claim = claimBullet[1].trim().replace(/^"(.+)"$/, '$1');
      continue;
    }

    const urlBullet = /^-\s+\*\*URL\*\*:\s*(https?:\/\/\S+)/.exec(line);
    if (urlBullet !== null) {
      pending.url = urlBullet[1].trim();
    }
  }

  emitPending();
  return collected;
}

// ─── Full references parser ───

/**
 * Collects the numbered items of a file's "Full References" section.
 *
 * The section starts at a `## Full References` heading and ends at the next
 * `##` heading, so numbered lists in later sections (notes, cross-references,
 * …) are not mistaken for references. `###` sub-headings do not end it.
 *
 * @param lines - The file content split into lines.
 * @returns One item per `N. text` line in the section, with any URL, DOI and
 *   arXiv identifier found in the text (trailing punctuation stripped).
 */
function parseFullReferences(lines: string[]): FullRef[] {
  const refs: FullRef[] = [];
  let inFullRefs = false;

  for (const line of lines) {
    // Any level-2 heading either opens the references section or closes it.
    if (/^##\s+/.test(line)) {
      inFullRefs = /^##\s+Full References/i.test(line);
      continue;
    }
    if (!inFullRefs) continue;

    const match = line.match(/^(\d+)\.\s+(.+)/);
    if (!match) continue;

    const num = Number.parseInt(match[1], 10);
    const text = match[2].trim();

    const urlMatch = text.match(/(https?:\/\/\S+)/);
    const url = urlMatch ? urlMatch[1].replace(/[.,;)]+$/, '') : undefined;

    const doiMatch =
      text.match(/https?:\/\/doi\.org\/(\S+)/) ?? text.match(/doi[:/]\s*(10\.\S+)/i);
    const doi = doiMatch ? doiMatch[1].replace(/[.,;)]+$/, '') : undefined;

    const arxivMatch =
      text.match(/https?:\/\/arxiv\.org\/abs\/(\S+)/) ?? text.match(/arXiv[:/]\s*(\S+)/i);
    const arxiv = arxivMatch ? arxivMatch[1].replace(/[.,;)]+$/, '') : undefined;

    refs.push({ number: num, raw: text, url, doi, arxiv });
  }

  return refs;
}

// ─── Source matching: find best full-reference match for a source string ───

/**
 * Finds the full reference that best matches a source string.
 *
 * Scoring, per reference: one point for every source word longer than three
 * characters that occurs in the normalized reference text, plus three points
 * when the leading part of the source (the text before the first `;` or `,`)
 * occurs in it and is longer than four characters after normalization.
 *
 * @param source - Source text of a citation.
 * @param refs - Candidate references.
 * @returns The highest-scoring reference (the earliest one on ties), or null
 *   when no reference scores at least 2.
 */
function findMatchingRef(source: string, refs: FullRef[]): FullRef | null {
  if (refs.length === 0) return null;

  const sourceWords = normalizeForMatch(source)
    .split(' ')
    .filter((w) => w.length > 3);
  if (sourceWords.length === 0) return null;

  // The leading identifier depends only on the source, so it is computed once
  // here instead of once per reference.
  const primaryKey = normalizeForMatch(source.split(/[;,]/)[0] ?? '');
  const usePrimaryKey = primaryKey.length > 4;

  let bestMatch: FullRef | null = null;
  let bestScore = 0;

  for (const ref of refs) {
    const refNorm = normalizeForMatch(ref.raw);
    let score = 0;

    for (const word of sourceWords) {
      if (refNorm.includes(word)) score++;
    }

    // Bonus for matching the leading source identifier
    if (usePrimaryKey && refNorm.includes(primaryKey)) {
      score += 3;
    }

    if (score > bestScore) {
      bestScore = score;
      bestMatch = ref;
    }
  }

  return bestScore >= 2 ? bestMatch : null;
}

// ─── Type inference ───

// Keyword lists used by inferType. Every keyword is lowercase and is matched
// as a whole word or phrase (see keywordPattern), not as a bare substring.
const NEWS_KEYWORDS = [
  'variety',
  'time',
  'npr',
  'nbc',
  'fortune',
  'venturebeat',
  'mit technology review',
  '404 media',
  'the verge',
  'wired',
  'ars technica',
  'techcrunch',
  'business standard',
  'bbc',
  'reuters',
  'new york times',
  'financial times',
  'cnbc',
  'android police',
];
const ACADEMIC_KEYWORDS = [
  'doi',
  'arxiv',
  'journal',
  'et al',
  'university',
  'neurips',
  'usenix',
  'ieee',
  'springer',
  'plos',
  'bmc',
  'sage journals',
];
const POLICY_KEYWORDS = [
  'aclu',
  'gdpr',
  'regulation',
  'act no',
  'amnesty',
  'parliament',
  'government',
  'medecins du monde',
  'prostitution reform',
  'directive',
];
const REPORT_KEYWORDS = [
  'survey',
  'report',
  'census',
  'foundation',
  'coalition',
  'institute',
  'hacking//hustling',
  'hacking hustling',
  'gallup',
  'kff',
  'unfpa',
  'democracy at work',
];
const FILING_KEYWORDS = ['companies house', 'sec filing', 'annual report', 'financial statements'];
const PLATFORM_KEYWORDS = ['terms of service', 'tos', 'documentation', 'api', 'pricing page'];
const INDUSTRY_KEYWORDS = [
  'selecthub',
  'merchant machine',
  'paymentcloud',
  'pitchbook',
  'crunchbase',
  'similarweb',
  'comscore',
  'signalfire',
  'skyquest',
  'octoverse',
  'glassdoor',
  'foxy studios',
];

/** Escapes every character that has a special meaning in a regular expression. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Builds a pattern that matches `keyword` (optionally followed by a plural
 * "s") only when it is not embedded in a longer run of letters or digits, so
 * "time" no longer matches "sometimes" and "api" no longer matches "capital".
 */
function keywordPattern(keyword: string): RegExp {
  return new RegExp(`(?<![a-z0-9])${escapeRegExp(keyword)}s?(?![a-z0-9])`);
}

// Keyword groups in the order they are tried; the first group with a hit wins.
const TYPE_RULES: ReadonlyArray<readonly [CitationType, readonly RegExp[]]> = [
  ['filing', FILING_KEYWORDS.map(keywordPattern)],
  ['academic', ACADEMIC_KEYWORDS.map(keywordPattern)],
  ['policy', POLICY_KEYWORDS.map(keywordPattern)],
  ['platform', PLATFORM_KEYWORDS.map(keywordPattern)],
  ['news', NEWS_KEYWORDS.map(keywordPattern)],
  ['report', REPORT_KEYWORDS.map(keywordPattern)],
  ['industry', INDUSTRY_KEYWORDS.map(keywordPattern)],
];

/**
 * Guesses the citation type from its source and claim text.
 *
 * The source alone decides the 'internal' and 'community' types. Otherwise
 * the keyword groups are tried in the order filing, academic, policy,
 * platform, news, report, industry against the combined source and claim
 * text, and the first group with a matching keyword wins.
 *
 * @param source - Source text of the citation.
 * @param claim - Text of one of its claims.
 * @returns The inferred type; 'industry' when nothing matches.
 */
function inferType(source: string, claim: string): CitationType {
  if (/\bcalculated\b|internal testing|platform measurement|lilith platform internal/i.test(source))
    return 'internal';
  if (/community reviews|forum discussions|reddit/i.test(source)) return 'community';

  // Fold diacritics so that accented spellings ("Médecins du Monde") match
  // the unaccented keywords.
  const combined = `${source} ${claim}`
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase();

  for (const [type, patterns] of TYPE_RULES) {
    if (patterns.some((pattern) => pattern.test(combined))) return type;
  }

  return 'industry';
}

// ─── Author parsing ───

/**
 * Derives an author list from a source string.
 *
 * - "X et al" sources yield the lead author only (an optional ", I." initial
 *   is kept; one trailing comma or period is removed).
 * - Otherwise the text before the first `;` is treated as an institutional
 *   author, unless it is empty or contains a personal initial such as "J. ".
 *
 * @param source - Source text of a citation.
 * @returns A single-element author list, or undefined when no author can be derived.
 */
function parseAuthors(source: string): CitationAuthor[] | undefined {
  const leadAuthor = /^([^,]+(?:,\s*[A-Z]\.?\s*)?)\s*et al/.exec(source);
  if (leadAuthor !== null) {
    const name = leadAuthor[1].trim().replace(/[,.]$/, '');
    return [{ name }];
  }

  const [beforeSemicolon] = source.split(/[;]/);
  const candidate = beforeSemicolon.trim();
  const hasPersonalInitial = /\b[A-Z]\.\s/.test(candidate);
  if (candidate === '' || hasPersonalInitial) {
    return undefined;
  }

  return [{ name: candidate, institutional: candidate }];
}

// ─── Dedup key ───

/**
 * Reduces a source string to the part that identifies the source: the text
 * before the first `;`, without quoted passages, parenthesised passages and
 * italic (`*…*`) spans, and without trailing commas or periods.
 *
 * @param source - Source text of a citation.
 * @returns The identifying part, trimmed.
 */
function getSourceKey(source: string): string {
  const firstBySemicolon = source.split(/\s*;\s*/)[0] ?? '';
  return (
    firstBySemicolon
      .replace(/["'].+?["']/g, '')
      .replace(/\(.+?\)/g, '')
      .replace(/\*[^*]+\*/g, '')
      // Trim first: removing a trailing "(2020)" leaves whitespace behind that
      // would otherwise shield the trailing punctuation from being stripped.
      .trim()
      .replace(/[,.]+$/, '')
      .trim()
  );
}

/**
 * Builds the key under which entries are merged: the normalized source key
 * and the primary year, joined by a double underscore.
 */
function dedupKey(source: string, year: string): string {
  const sourcePart = normalizeForDedup(getSourceKey(source));
  const yearPart = getPrimaryYear(year);
  return [sourcePart, yearPart].join('__');
}

// ─── SQLite initialization ───

/**
 * Opens the SQLite database at `dbPath`, drops every existing user table and
 * trigger, and recreates the schema from the SQL file at `schemaPath`.
 *
 * @returns The open database handle; the caller is responsible for closing it.
 */
function initDatabase(): Database {
  const database = new Database(dbPath);
  const schemaSql = readFileSync(schemaPath, 'utf-8');

  // Drop existing tables for clean rebuild. Foreign-key enforcement is turned
  // off first — presumably so the drop order does not matter.
  database.run('PRAGMA foreign_keys = OFF');

  const listObjects = database.prepare(
    "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'trigger') AND name NOT LIKE 'sqlite_%'",
  );
  const objects = listObjects.all() as { name: string; type: string }[];

  objects.forEach(({ name, type }) => {
    const kind = type === 'trigger' ? 'TRIGGER' : 'TABLE';
    database.run(`DROP ${kind} IF EXISTS "${name}"`);
  });

  database.exec(schemaSql);
  return database;
}

// ─── Main ───

/** Shape of the `contentItems` elements read from library.json. */
interface ContentItem {
  topic: string;
  group: string;
}

/**
 * Entry point. Reads every `*-citations.md` file in the archive directory,
 * parses it with the parser matching its layout, merges entries that share a
 * source and year, rebuilds the SQLite database and prints a summary.
 *
 * The database handle is closed even when an insert or a summary query throws.
 */
function main(): void {
  const library = loadJson<{ contentItems: ContentItem[] }>('library.json');

  // Build map: group -> slugified topic list
  // NOTE(review): this map is never read later in main. It is kept so that
  // the script's behavior (it requires a readable library.json) is unchanged
  // — confirm whether it can be removed.
  const topicsByGroup = new Map<string, string[]>();
  for (const item of library.contentItems) {
    const list = topicsByGroup.get(item.group) ?? [];
    list.push(slugify(item.topic));
    topicsByGroup.set(item.group, list);
  }

  const citationSuffix = '-citations.md';
  const files = readdirSync(archiveDir)
    .filter((f) => f.endsWith(citationSuffix))
    .sort();

  log(`[parse-citations] Found ${files.length} citation files`);

  // ─── Parse all files ───

  const allEntries: RawEntry[] = [];
  const allFullRefs: FullRef[] = [];

  for (const file of files) {
    // Remove the suffix from the END of the name; a plain replace() would
    // remove its first occurrence instead.
    const stem = file.slice(0, -citationSuffix.length);
    const theme = THEME_MAP[stem];
    if (!theme) {
      log(`[parse-citations] Warning: unknown stem "${stem}", skipping`);
      continue;
    }

    const content = readFileSync(resolve(archiveDir, file), 'utf-8');
    const lines = content.split('\n');

    // Detect table format
    const hasTable = lines.some((line) => detectTableFormat(line) !== null);

    let entries: RawEntry[];
    if (hasTable) {
      entries = parseTableFile(lines, theme);
    } else if (theme === 'body-sovereignty') {
      entries = parseParagraphFormat(lines, theme);
    } else if (theme === 'open-source') {
      entries = parseBulletFormat(lines, theme);
    } else {
      log(`[parse-citations] Warning: no parser matched for ${file}`);
      entries = [];
    }

    allEntries.push(...entries);

    const refs = parseFullReferences(lines);
    allFullRefs.push(...refs);

    log(`[parse-citations]   ${theme}: ${entries.length} entries, ${refs.length} full refs`);
  }

  log(`[parse-citations] Total raw entries: ${allEntries.length}`);

  // ─── Deduplicate and merge ───

  interface CitationDraft {
    primarySource: string;
    themes: Set<ThemeId>;
    claims: CitationClaim[];
    year: string;
    calculated: boolean;
    url?: string;
  }

  const citationMap = new Map<string, CitationDraft>();

  for (const entry of allEntries) {
    const key = dedupKey(entry.source, entry.year);
    const draft = citationMap.get(key);

    if (draft) {
      draft.themes.add(entry.theme);
      // Within one citation, claims that differ only in case, spacing or
      // punctuation are stored once.
      const claimNorm = normalizeForDedup(entry.claim);
      const isDuplicate = draft.claims.some((c) => normalizeForDedup(c.text) === claimNorm);
      if (!isDuplicate) {
        draft.claims.push({
          text: entry.claim,
          category: entry.category,
          year: entry.year,
        });
      }
      if (entry.calculated) draft.calculated = true;
      if (entry.url && !draft.url) draft.url = entry.url;
    } else {
      citationMap.set(key, {
        primarySource: entry.source,
        themes: new Set([entry.theme]),
        claims: [{ text: entry.claim, category: entry.category, year: entry.year }],
        year: entry.year,
        calculated: entry.calculated,
        url: entry.url,
      });
    }
  }

  log(`[parse-citations] After dedup: ${citationMap.size} unique citations`);

  // ─── Initialize SQLite and insert ───

  const db = initDatabase();
  try {
    log(`[parse-citations] SQLite database initialized at ${dbPath}`);

    const insertCitation = db.prepare(
      'INSERT INTO citations (id, type, title, year, publisher, url, doi, arxiv, venue, notes, calculated) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
    );
    const insertAuthor = db.prepare(
      'INSERT INTO citation_authors (citation_id, name, institutional, position) VALUES (?, ?, ?, ?)',
    );
    const insertTheme = db.prepare(
      'INSERT INTO citation_themes (citation_id, theme) VALUES (?, ?)',
    );
    const insertClaim = db.prepare(
      'INSERT INTO claims (citation_id, text, category, year) VALUES (?, ?, ?, ?)',
    );

    const usedIds = new Set<string>();

    // Appends -2, -3, … to `base` until the id has not been handed out yet.
    function makeUniqueId(base: string): string {
      let id = base;
      let counter = 2;
      while (usedIds.has(id)) {
        id = `${base}-${counter}`;
        counter++;
      }
      usedIds.add(id);
      return id;
    }

    let totalCitations = 0;
    let totalClaims = 0;

    const insertAll = db.transaction(() => {
      for (const [, draft] of citationMap) {
        const { primarySource, themes, claims, year, calculated } = draft;

        // References from all files are searched, not only the file(s) the
        // citation came from.
        const matchedRef = findMatchingRef(primarySource, allFullRefs);

        const title = primarySource;
        const sourceKey = getSourceKey(primarySource);
        const primaryYear = getPrimaryYear(year);
        const id = makeUniqueId(slugify(`${sourceKey}-${primaryYear}`));
        const type = inferType(primarySource, claims[0]?.text ?? '');
        const authors = parseAuthors(primarySource);
        const url = matchedRef?.url ?? draft.url;
        const doi = matchedRef?.doi;
        const arxiv = matchedRef?.arxiv;

        insertCitation.run(
          id, type, title, year,
          null, // publisher
          url ?? null,
          doi ?? null,
          arxiv ?? null,
          null, // venue
          null, // notes
          calculated ? 1 : 0,
        );

        if (authors) {
          for (let i = 0; i < authors.length; i++) {
            insertAuthor.run(id, authors[i].name, authors[i].institutional ?? null, i);
          }
        }

        for (const theme of [...themes].sort()) {
          insertTheme.run(id, theme);
        }

        for (const claim of claims) {
          insertClaim.run(id, claim.text, claim.category, claim.year);
        }

        totalCitations++;
        totalClaims += claims.length;
      }
    });

    insertAll();

    // ─── Summary ───

    const themeCount = (
      db.prepare('SELECT COUNT(DISTINCT theme) as c FROM citation_themes').get() as { c: number }
    ).c;
    const withUrl = (
      db.prepare('SELECT COUNT(*) as c FROM citations WHERE url IS NOT NULL').get() as { c: number }
    ).c;
    const crossTheme = (
      db.prepare(
        'SELECT COUNT(*) as c FROM (SELECT citation_id FROM citation_themes GROUP BY citation_id HAVING COUNT(*) > 1)',
      ).get() as { c: number }
    ).c;

    log(`[parse-citations] ${totalCitations} citations, ${totalClaims} claims`);
    log(`[parse-citations] Themes: ${themeCount}`);
    log(`[parse-citations] With URL: ${withUrl}`);
    log(`[parse-citations] Cross-theme: ${crossTheme}`);
    log(`[parse-citations] Wrote ${dbPath}`);
  } finally {
    // Release the handle on success and on failure alike.
    db.close();
  }
}

main();