platform-operations/content-strategy/scripts/parse-citations.ts

762 lines
20 KiB
TypeScript

#!/usr/bin/env bun
import { readFileSync, readdirSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';
import type {
CitationAuthor,
CitationClaim,
CitationType,
ThemeId,
} from '../src/types/citations';
// Directory containing this script, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory that loadJson() reads from and that holds the generated SQLite database.
const dataDir = resolve(__dirname, '../src/data');
// Directory scanned by main() for `*-citations.md` source files.
const archiveDir = resolve(__dirname, '../docs/meta/theme-sources');
// SQL schema file, located next to this script; executed by initDatabase().
const schemaPath = resolve(__dirname, 'schema.sql');
// Output SQLite database file (opened and rebuilt by initDatabase()).
const dbPath = resolve(dataDir, 'citations.db');
// ─── Theme ID mapping from filename stem ───
/**
 * Maps a citation file's name stem (the file name with the `-citations.md`
 * suffix removed) to its theme id. main() skips any file whose stem has no
 * entry here and logs a warning.
 */
const THEME_MAP: Record<string, ThemeId> = {
'01-anti-extraction': 'anti-extraction',
'02-inverse-capitalism': 'inverse-capitalism',
'03-body-sovereignty': 'body-sovereignty',
'04-privacy': 'privacy',
'05-permanent-software': 'permanent-software',
'06-human-work': 'human-work',
'07-ai-philosophy': 'ai-philosophy',
'08-slutology': 'slutology',
'09-cooperative-future': 'cooperative-future',
'10-open-source': 'open-source',
};
// ─── Utilities ───
/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug of at most
 * 80 characters containing only `a-z`, `0-9` and `-`.
 *
 * Accented Latin letters are folded to their base letter first (so "é" becomes
 * "e" instead of a hyphen), and trailing hyphens are stripped *after*
 * truncation so a cut that lands on a separator never leaves a dangling `-`.
 *
 * @param text - Free-form input text.
 * @returns The slug; may be empty when the input has no ASCII letters or digits.
 */
function slugify(text: string): string {
  return text
    .normalize('NFKD')
    // Drop the combining marks produced by the decomposition above.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+/, '')
    .slice(0, 80)
    .replace(/-+$/, '');
}
/**
 * Reduces text to a comparison key made only of lowercase ASCII letters and
 * digits, so that differences in case, spacing and punctuation are ignored.
 *
 * The text is Unicode-decomposed first so accented letters keep their base
 * letter ("Médecins" and "Medecins" produce the same key) rather than being
 * dropped entirely.
 *
 * @param text - Text to normalise.
 * @returns The alphanumeric-only key (possibly empty).
 */
function normalizeForDedup(text: string): string {
  // After NFKD the combining marks are non-alphanumeric and are removed below.
  return text
    .normalize('NFKD')
    .toLowerCase()
    .replace(/[^a-z0-9]/g, '');
}
/**
 * Normalises text for word-level matching: lowercases it, removes everything
 * except ASCII letters, digits and whitespace, collapses whitespace runs to a
 * single space and trims the ends.
 *
 * The text is Unicode-decomposed first so accented letters are folded to their
 * base letter instead of being deleted (which would otherwise break words such
 * as "Médecins" into "mdecins").
 *
 * @param text - Text to normalise.
 * @returns Space-separated lowercase words.
 */
function normalizeForMatch(text: string): string {
  return text
    .normalize('NFKD')
    .toLowerCase()
    // Combining marks left by the decomposition are stripped here too.
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Picks the year used for keying a citation.
 *
 * @param yearStr - Year cell text, e.g. "2019-2021" or "n.d.".
 * @returns The first run of four digits when one exists; otherwise the input
 *          with all whitespace removed.
 */
function getPrimaryYear(yearStr: string): string {
  const fourDigits = /\d{4}/.exec(yearStr);
  if (fourDigits !== null) {
    return fourDigits[0];
  }
  const withoutWhitespace = yearStr.replace(/\s+/g, '');
  return withoutWhitespace.trim();
}
/**
 * Writes one line to standard output.
 *
 * @param message - Text to print; a newline is appended.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
/**
 * Reads a UTF-8 JSON file from the data directory and parses it.
 *
 * The result is cast to `T` without any runtime validation.
 *
 * @param filename - File name relative to `dataDir`.
 * @returns The parsed JSON value, typed as `T`.
 */
function loadJson<T>(filename: string): T {
  const absolutePath = resolve(dataDir, filename);
  const contents = readFileSync(absolutePath, 'utf-8');
  return JSON.parse(contents) as T;
}
// ─── Intermediate types ───
// One citation entry as extracted from a markdown file, before deduplication.
interface RawEntry {
// Source attribution text as captured by the parser.
source: string;
// Claim text attributed to the source.
claim: string;
// Year text as captured by the parser; the paragraph and bullet parsers use 'n.d.' when none is found.
year: string;
// Heading under which the entry was found.
category: string;
// Theme of the file the entry came from.
theme: ThemeId;
// Set by the table parser from the source text; the other parsers always store false.
calculated: boolean;
// Only populated by the bullet-format parser (from a "URL" bullet).
url?: string;
}
// One numbered item from a "Full References" section, as produced by parseFullReferences().
interface FullRef {
// The list number that prefixes the item.
number: number;
// The item's text after the list number, trimmed.
raw: string;
// First http(s) URL found in the text, with trailing punctuation removed.
url?: string;
// DOI extracted from a doi.org URL or a "doi:" marker.
doi?: string;
// arXiv identifier extracted from an arxiv.org/abs URL or an "arXiv:" marker.
arxiv?: string;
}
// ─── Table format detection ───
/** Column layouts recognised in citation tables. */
type TableFormat = 'variant-a' | 'variant-b';

/**
 * Identifies which table layout a markdown header row uses.
 *
 * - `variant-a`: `| # | Claim | ...`
 * - `variant-b`: `| # | Source | Year | Claim ...`
 *
 * @param line - A single line of markdown.
 * @returns The detected layout, or `null` when the line is not a known header row.
 */
function detectTableFormat(line: string): TableFormat | null {
  // Checked in order; the first matching pattern wins.
  const headerPatterns: Array<[RegExp, TableFormat]> = [
    [/\|\s*#\s*\|\s*Claim\s*\|/i, 'variant-a'],
    [/\|\s*#\s*\|\s*Source\s*\|\s*Year\s*\|\s*Claim/i, 'variant-b'],
  ];
  for (const [pattern, layout] of headerPatterns) {
    if (pattern.test(line)) {
      return layout;
    }
  }
  return null;
}
/**
 * Parses one markdown table data row into its source / claim / year fields.
 *
 * Only the outer pipes are discarded, so an empty interior cell keeps its
 * column position instead of shifting the following cells to the left.
 *
 * @param line - A single markdown table line.
 * @param format - Column layout: `variant-a` is `# | Claim | Source | Year`,
 *                 `variant-b` is `# | Source | Year | Claim`.
 * @returns The extracted fields, or `null` when the line has fewer than four
 *          cells, does not start with a numeric index cell (header and
 *          separator rows), or has an empty source or claim. An empty year
 *          cell yields `'n.d.'`, matching the other parsers in this file.
 */
function parseTableRow(
  line: string,
  format: TableFormat,
): { source: string; claim: string; year: string } | null {
  const inner = line.trim().replace(/^\|/, '').replace(/\|$/, '');
  const cells = inner.split('|').map((c) => c.trim());
  if (cells.length < 4) return null;

  const [index = '', second = '', third = '', fourth = ''] = cells;
  // Rejects header rows ("#") and separator rows ("---") alike.
  if (!/^\d+$/.test(index)) return null;

  const row =
    format === 'variant-a'
      ? { claim: second, source: third, year: fourth }
      : { source: second, year: third, claim: fourth };

  if (row.source === '' || row.claim === '') return null;
  return { source: row.source, claim: row.claim, year: row.year === '' ? 'n.d.' : row.year };
}
// ─── Parse table-based files (variants A and B) ───
/**
 * Extracts citation entries from a markdown file that stores them in tables.
 *
 * Each `##` heading starts a new category and resets the active table layout;
 * the layout is then re-detected from the next header row. Parsing stops at the
 * first `##` heading that names a trailing section (full references, notes,
 * cross-references, citation format).
 *
 * @param lines - The file's content split into lines.
 * @param theme - Theme assigned to every entry from this file.
 * @returns Entries in document order.
 */
function parseTableFile(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentCategory = '';
  let currentFormat: TableFormat | null = null;

  for (const line of lines) {
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      const heading = h2Match[1].trim();
      if (/full references|notes\b|cross-references|citation format/i.test(heading)) {
        break;
      }
      currentCategory = heading;
      currentFormat = null;
      continue;
    }

    const fmt = detectTableFormat(line);
    if (fmt) {
      currentFormat = fmt;
      continue;
    }

    // Rows are only meaningful once a header row fixed the column layout.
    if (!currentFormat) continue;
    // Skip the |---|---| separator row.
    if (/^\|[-\s|]+\|$/.test(line)) continue;

    const row = parseTableRow(line, currentFormat);
    if (!row) continue;

    entries.push({
      source: row.source,
      claim: row.claim,
      year: row.year,
      category: currentCategory,
      theme,
      // Whole-word match: a bare /internal/ would also hit "International".
      calculated: /\b(?:calculated|internal)\b/i.test(row.source),
    });
  }
  return entries;
}
// ─── Parse paragraph format (03-body-sovereignty) ───
/**
 * Extracts citation entries from a file written as numbered paragraphs of the
 * form `N. **Source** ...`.
 *
 * `###` headings always become the current category. `##` headings do too,
 * except those naming citation-format, cross-reference or verification
 * sections, which leave the category unchanged.
 *
 * @param lines - The file's content split into lines.
 * @param theme - Theme assigned to every entry from this file.
 * @returns Entries in document order; `calculated` is always `false`.
 */
function parseParagraphFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const ignoredH2 = /citation format|cross-references|verification/i;

  // Picks the claim: a double-quoted passage if present, otherwise the cleaned
  // text following the year, otherwise the first italic span, otherwise the
  // first 120 characters of the remainder.
  const deriveClaim = (rest: string, yearHit: RegExpMatchArray | null): string => {
    const quoted = rest.match(/"([^"]+)"/);
    if (quoted) return quoted[1];

    const tail = yearHit
      ? rest.substring(rest.indexOf(yearHit[0]) + yearHit[0].length)
      : rest;
    const cleaned = tail
      .replace(/^\)\.\s*/, '')
      .replace(/^[\s.,;:]+/, '')
      .replace(/\*[^*]+\*/g, '')
      .replace(/\s*https?:\/\/\S+/g, '')
      .replace(/^[\s.,;:]+/, '')
      .replace(/[.\s]+$/, '')
      .trim();
    if (cleaned) return cleaned;

    const italic = rest.match(/\*([^*]+)\*/);
    return italic ? italic[1] : rest.substring(0, 120);
  };

  const collected: RawEntry[] = [];
  let category = '';

  for (const line of lines) {
    const subHeading = /^###\s+(.+)/.exec(line);
    if (subHeading) {
      category = subHeading[1].trim();
      continue;
    }

    const heading = /^##\s+(.+)/.exec(line);
    if (heading) {
      const title = heading[1].trim();
      if (!ignoredH2.test(title)) {
        category = title;
      }
      continue;
    }

    // Numbered entries: N. **Author** ...
    const numbered = /^(\d+)\.\s+\*\*(.+?)\*\*/.exec(line);
    if (!numbered) continue;

    const rest = line.substring(numbered[0].length).trim();
    // Prefer a parenthesised year; fall back to any bare 19xx / 20xx year.
    const yearHit = rest.match(/\((\d{4})\)/) ?? rest.match(/\b(20[0-2]\d|19\d\d)\b/);

    collected.push({
      source: numbered[2].trim(),
      claim: deriveClaim(rest, yearHit),
      year: yearHit ? yearHit[1] : 'n.d.',
      category,
      theme,
      calculated: false,
    });
  }
  return collected;
}
// ─── Parse bullet format (10-open-source) ───
/**
 * Extracts citation entries from a file written as bullet groups of the form
 * `- **Source**: ...`, `- **Key finding**: ...` (or `Quote`, `Key data`, ...),
 * `- **URL**: ...`.
 *
 * A group is emitted when the next source bullet, a `##`/`###` heading, a
 * `---` rule or the end of input is reached, and only if it has both a source
 * and a claim. Parsing stops at a `##` heading containing "verification notes".
 * Lines starting with `|` (tables) are ignored.
 *
 * @param lines - The file's content split into lines.
 * @param theme - Theme assigned to every entry from this file.
 * @returns Entries in document order; `calculated` is always `false`.
 */
function parseBulletFormat(lines: string[], theme: ThemeId): RawEntry[] {
  type Pending = { source: string; year: string; claim: string; url: string | undefined };
  const blank = (): Pending => ({ source: '', year: '', claim: '', url: undefined });

  const results: RawEntry[] = [];
  let section = '';
  let pending = blank();

  // Emits the group collected so far (if complete) and starts a fresh one.
  const emitPending = (): void => {
    if (pending.source && pending.claim) {
      results.push({
        source: pending.source,
        claim: pending.claim,
        year: pending.year || 'n.d.',
        category: section,
        theme,
        calculated: false,
        url: pending.url,
      });
    }
    pending = blank();
  };

  for (const line of lines) {
    const h2 = /^##\s+(.+)/.exec(line);
    if (h2) {
      emitPending();
      if (/verification notes/i.test(h2[1])) break;
      section = h2[1].trim();
      continue;
    }

    const isH3 = /^###\s+(.+)/.test(line);
    if (isH3 || line.startsWith('---')) {
      emitPending();
      continue;
    }

    // Skip table rows (technology dependency table)
    if (line.startsWith('|')) continue;

    const sourceBullet = /^-\s+\*\*Source\*\*:\s*(.+)/.exec(line);
    if (sourceBullet) {
      emitPending();
      pending.source = sourceBullet[1].trim();
      // Prefer a parenthesised year; fall back to any bare 19xx / 20xx year.
      const yearHit =
        pending.source.match(/\((\d{4})\)/) ?? pending.source.match(/\b(20[0-2]\d|19\d\d)\b/);
      if (yearHit) pending.year = yearHit[1];
      continue;
    }

    const claimBullet =
      /^-\s+\*\*(?:Key\s+(?:finding|findings|data|property|text|principle|limitation|stat|figures|claim)|Quote)\*\*:\s*(.+)/i.exec(
        line,
      );
    if (claimBullet) {
      // A later claim bullet in the same group replaces an earlier one.
      pending.claim = claimBullet[1].trim().replace(/^"(.+)"$/, '$1');
      continue;
    }

    const urlBullet = /^-\s+\*\*URL\*\*:\s*(https?:\/\/\S+)/.exec(line);
    if (urlBullet) {
      pending.url = urlBullet[1].trim();
    }
  }

  emitPending();
  return results;
}
// ─── Full references parser ───
/**
 * Collects the numbered items listed under a `## Full References` heading.
 *
 * Collection is active only while inside that section: any other `##` heading
 * ends it, so numbered lists in later sections (notes, cross-references, ...)
 * are not mistaken for references. `###` sub-headings do not end the section.
 *
 * For each item the first http(s) URL, a DOI (doi.org URL or `doi:` marker)
 * and an arXiv id (arxiv.org/abs URL or `arXiv:` marker) are extracted when
 * present, with trailing `. , ; )` removed.
 *
 * @param lines - The file's content split into lines.
 * @returns References in document order.
 */
function parseFullReferences(lines: string[]): FullRef[] {
  const stripTrailingPunctuation = (value: string): string => value.replace(/[.,;)]+$/, '');

  const refs: FullRef[] = [];
  let inFullRefs = false;

  for (const line of lines) {
    if (/^##\s+/.test(line)) {
      // Entering the section on its own heading, leaving it on any other H2.
      inFullRefs = /^##\s+Full References/i.test(line);
      continue;
    }
    if (!inFullRefs) continue;

    const match = line.match(/^(\d+)\.\s+(.+)/);
    if (!match) continue;

    const num = Number.parseInt(match[1], 10);
    const text = match[2].trim();

    const urlMatch = text.match(/(https?:\/\/\S+)/);
    const url = urlMatch ? stripTrailingPunctuation(urlMatch[1]) : undefined;

    const doiMatch =
      text.match(/https?:\/\/doi\.org\/(\S+)/) ?? text.match(/doi[:/]\s*(10\.\S+)/i);
    const doi = doiMatch ? stripTrailingPunctuation(doiMatch[1]) : undefined;

    const arxivMatch =
      text.match(/https?:\/\/arxiv\.org\/abs\/(\S+)/) ?? text.match(/arXiv[:/]\s*(\S+)/i);
    const arxiv = arxivMatch ? stripTrailingPunctuation(arxivMatch[1]) : undefined;

    refs.push({ number: num, raw: text, url, doi, arxiv });
  }
  return refs;
}
// ─── Source matching: find best full-reference match for a source string ───
/**
 * Finds the full reference that best matches a short source string.
 *
 * Scoring per reference: one point for every source word longer than three
 * characters that occurs (as a substring) in the normalised reference text,
 * plus three points when the source's leading segment (text before the first
 * `;` or `,`, if longer than four characters once normalised) occurs in it.
 * The first reference with the highest score wins.
 *
 * @param source - Source text as captured from a citation entry.
 * @param refs - Candidate references.
 * @returns The best-scoring reference, or `null` when there are no candidates,
 *          the source has no usable words, or the best score is below 2.
 */
function findMatchingRef(source: string, refs: FullRef[]): FullRef | null {
  if (refs.length === 0) return null;

  const sourceWords = normalizeForMatch(source)
    .split(' ')
    .filter((w) => w.length > 3);
  if (sourceWords.length === 0) return null;

  // Depends only on `source`, so it is computed once rather than per reference.
  const primaryKey = normalizeForMatch(source.split(/[;,]/)[0] ?? '');
  const usePrimaryKey = primaryKey.length > 4;

  let bestMatch: FullRef | null = null;
  let bestScore = 0;

  for (const ref of refs) {
    const refNorm = normalizeForMatch(ref.raw);

    let score = 0;
    for (const word of sourceWords) {
      if (refNorm.includes(word)) score++;
    }
    // Bonus for matching the leading source identifier
    if (usePrimaryKey && refNorm.includes(primaryKey)) {
      score += 3;
    }

    if (score > bestScore) {
      bestScore = score;
      bestMatch = ref;
    }
  }
  return bestScore >= 2 ? bestMatch : null;
}
// ─── Type inference ───
// Keyword lists used by inferType(). All keywords are lowercase because they
// are matched against lowercased text.
const NEWS_KEYWORDS = [
  'variety',
  'time',
  'npr',
  'nbc',
  'fortune',
  'venturebeat',
  'mit technology review',
  '404 media',
  'the verge',
  'wired',
  'ars technica',
  'techcrunch',
  'business standard',
  'bbc',
  'reuters',
  'new york times',
  'financial times',
  'cnbc',
  'android police',
];
const ACADEMIC_KEYWORDS = [
  'doi',
  'arxiv',
  'journal',
  'et al',
  'university',
  'neurips',
  'usenix',
  'ieee',
  'springer',
  'plos',
  'bmc',
  'sage journals',
];
const POLICY_KEYWORDS = [
  'aclu',
  'gdpr',
  'regulation',
  'act no',
  'amnesty',
  'parliament',
  'government',
  'medecins du monde',
  'prostitution reform',
  'directive',
];
const REPORT_KEYWORDS = [
  'survey',
  'report',
  'census',
  'foundation',
  'coalition',
  'institute',
  'hacking//hustling',
  'hacking hustling',
  'gallup',
  'kff',
  'unfpa',
  'democracy at work',
];
const FILING_KEYWORDS = ['companies house', 'sec filing', 'annual report', 'financial statements'];
const PLATFORM_KEYWORDS = ['terms of service', 'tos', 'documentation', 'api', 'pricing page'];
const INDUSTRY_KEYWORDS = [
  'selecthub',
  'merchant machine',
  'paymentcloud',
  'pitchbook',
  'crunchbase',
  'similarweb',
  'comscore',
  'signalfire',
  'skyquest',
  'octoverse',
  'glassdoor',
  'foxy studios',
];

/** Escapes regular-expression metacharacters so a keyword is matched literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Builds a matcher that finds any of the keywords at the start of a word.
 * Anchoring the left edge prevents mid-word hits (e.g. "api" inside
 * "capital", "tos" inside "photos") while still accepting inflected forms
 * such as "reports" or "surveys".
 */
function compileKeywordMatcher(keywords: readonly string[]): RegExp {
  return new RegExp(`(?<![a-z0-9])(?:${keywords.map(escapeRegExp).join('|')})`);
}

// Keyword rules in priority order: the first list with a hit decides the type.
const TYPE_RULES: ReadonlyArray<readonly [CitationType, RegExp]> = [
  ['filing', compileKeywordMatcher(FILING_KEYWORDS)],
  ['academic', compileKeywordMatcher(ACADEMIC_KEYWORDS)],
  ['policy', compileKeywordMatcher(POLICY_KEYWORDS)],
  ['platform', compileKeywordMatcher(PLATFORM_KEYWORDS)],
  ['news', compileKeywordMatcher(NEWS_KEYWORDS)],
  ['report', compileKeywordMatcher(REPORT_KEYWORDS)],
  ['industry', compileKeywordMatcher(INDUSTRY_KEYWORDS)],
];

/**
 * Guesses a citation's type from its source text and one of its claims.
 *
 * The source alone is first checked for internal-measurement and community
 * markers. Otherwise the lowercased "source claim" text is tested against the
 * keyword rules in priority order (filing, academic, policy, platform, news,
 * report, industry).
 *
 * @param source - Source text of the citation.
 * @param claim - Claim text considered alongside the source.
 * @returns The inferred type; `'industry'` when nothing matches.
 */
function inferType(source: string, claim: string): CitationType {
  if (/\bcalculated\b|internal testing|platform measurement|lilith platform internal/i.test(source))
    return 'internal';
  if (/community reviews|forum discussions|reddit/i.test(source)) return 'community';

  const combined = `${source} ${claim}`.toLowerCase();
  for (const [type, matcher] of TYPE_RULES) {
    if (matcher.test(combined)) return type;
  }
  return 'industry';
}
// ─── Author parsing ───
/**
 * Derives an author list from a source string.
 *
 * - "`Name et al`" (optionally "`Name, X. et al`") yields the lead author
 *   with one trailing comma or period removed.
 * - Otherwise the text before the first `;` is treated as an institutional
 *   author, unless it contains an initial such as "J. " (which suggests a
 *   personal name this function does not parse) or is empty.
 *
 * @param source - Source text of the citation.
 * @returns A single-element author list, or `undefined` when none is derived.
 */
function parseAuthors(source: string): CitationAuthor[] | undefined {
  const etAl = /^([^,]+(?:,\s*[A-Z]\.?\s*)?)\s*et al/.exec(source);
  if (etAl !== null) {
    const leadAuthor = etAl[1].trim().replace(/[,.]$/, '');
    return [{ name: leadAuthor }];
  }

  const [head] = source.split(/[;]/);
  const organisation = head.trim();
  const hasInitial = /\b[A-Z]\.\s/.test(organisation);
  if (organisation === '' || hasInitial) {
    return undefined;
  }
  return [{ name: organisation, institutional: organisation }];
}
// ─── Dedup key ───
/**
 * Reduces a source string to its identifying core, used for dedup keys and ids.
 *
 * Takes the text before the first `;`, removes quoted passages, parenthesised
 * passages and `*italic*` spans, then strips surrounding whitespace and one
 * trailing comma or period.
 *
 * A quote character only opens a quoted passage when it is not preceded by a
 * letter or digit, and closes on the same character when not followed by one,
 * so possessives and contractions ("Women's", "Workers'") are left intact.
 * Whitespace is trimmed before the trailing punctuation is stripped, so the
 * strip still applies after a trailing parenthetical has been removed.
 *
 * @param source - Source text of the citation.
 * @returns The reduced source key (possibly empty).
 */
function getSourceKey(source: string): string {
  const primary = source.split(/\s*;\s*/)[0] ?? '';
  return primary
    .replace(/(?<![A-Za-z0-9])(["'])(.+?)\1(?![A-Za-z0-9])/g, '')
    .replace(/\(.+?\)/g, '')
    .replace(/\*[^*]+\*/g, '')
    .trim()
    .replace(/[,.]$/, '')
    .trim();
}
/**
 * Builds the key under which entries are merged into one citation.
 *
 * @param source - Source text of the entry.
 * @param year - Year text of the entry.
 * @returns `<normalised source key>__<primary year>`.
 */
function dedupKey(source: string, year: string): string {
  const sourcePart = normalizeForDedup(getSourceKey(source));
  const yearPart = getPrimaryYear(year);
  return [sourcePart, yearPart].join('__');
}
// ─── SQLite initialization ───
/**
 * Opens the SQLite database at `dbPath` and rebuilds it from scratch.
 *
 * Foreign-key enforcement is switched off, every non-internal table and
 * trigger listed in `sqlite_master` is dropped, and the schema file is then
 * executed.
 *
 * @returns The open database handle; the caller is responsible for closing it.
 */
function initDatabase(): Database {
  const db = new Database(dbPath);
  const schemaSql = readFileSync(schemaPath, 'utf-8');

  // Drop existing tables for clean rebuild
  db.run('PRAGMA foreign_keys = OFF');
  const listObjects = db.prepare(
    "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'trigger') AND name NOT LIKE 'sqlite_%'",
  );
  const existingObjects = listObjects.all() as { name: string; type: string }[];
  existingObjects.forEach(({ name, type }) => {
    const dropSql =
      type === 'trigger' ? `DROP TRIGGER IF EXISTS "${name}"` : `DROP TABLE IF EXISTS "${name}"`;
    db.run(dropSql);
  });

  db.exec(schemaSql);
  return db;
}
// ─── Main ───
// Shape of the `contentItems` entries that main() reads from library.json (only the fields used here).
interface ContentItem {
// Topic text; main() slugifies it.
topic: string;
// Group name; main() uses it as the key when grouping topics.
group: string;
}
/**
 * Script entry point: parses every `*-citations.md` file in the archive
 * directory, merges duplicate sources, and rebuilds the SQLite database.
 *
 * Steps:
 *  1. Load library.json and group slugified topics by group.
 *  2. Parse each citation file with the parser matching its format, and
 *     collect its "Full References" items.
 *  3. Merge entries sharing a dedup key into one citation draft, unioning
 *     themes and keeping distinct claims.
 *  4. Rebuild the database and insert citations, authors, themes and claims
 *     in a single transaction, then log summary counts.
 *
 * The database handle is closed even when an insert or summary query throws.
 */
function main(): void {
  const library = loadJson<{ contentItems: ContentItem[] }>('library.json');

  // Build map: group -> slugified topic list
  // NOTE(review): topicsByGroup is populated here but never read later in this
  // function — confirm whether it is still needed.
  const topicsByGroup = new Map<string, string[]>();
  for (const item of library.contentItems) {
    const list = topicsByGroup.get(item.group) ?? [];
    list.push(slugify(item.topic));
    topicsByGroup.set(item.group, list);
  }

  const files = readdirSync(archiveDir)
    .filter((f) => f.endsWith('-citations.md'))
    .sort();
  log(`[parse-citations] Found ${files.length} citation files`);

  // ─── Parse all files ───
  const allEntries: RawEntry[] = [];
  const allFullRefs: FullRef[] = [];
  for (const file of files) {
    const stem = file.replace('-citations.md', '');
    const theme = THEME_MAP[stem];
    if (!theme) {
      log(`[parse-citations] Warning: unknown stem "${stem}", skipping`);
      continue;
    }
    const content = readFileSync(resolve(archiveDir, file), 'utf-8');
    const lines = content.split('\n');

    // Detect table format
    let hasTable = false;
    for (const line of lines) {
      if (detectTableFormat(line)) {
        hasTable = true;
        break;
      }
    }

    let entries: RawEntry[];
    if (hasTable) {
      entries = parseTableFile(lines, theme);
    } else if (theme === 'body-sovereignty') {
      entries = parseParagraphFormat(lines, theme);
    } else if (theme === 'open-source') {
      entries = parseBulletFormat(lines, theme);
    } else {
      log(`[parse-citations] Warning: no parser matched for ${file}`);
      entries = [];
    }
    allEntries.push(...entries);

    const refs = parseFullReferences(lines);
    allFullRefs.push(...refs);
    log(`[parse-citations] ${theme}: ${entries.length} entries, ${refs.length} full refs`);
  }
  log(`[parse-citations] Total raw entries: ${allEntries.length}`);

  // ─── Deduplicate and merge ───
  interface CitationDraft {
    primarySource: string;
    themes: Set<ThemeId>;
    claims: CitationClaim[];
    year: string;
    calculated: boolean;
    url?: string;
  }
  const citationMap = new Map<string, CitationDraft>();
  for (const entry of allEntries) {
    const key = dedupKey(entry.source, entry.year);
    const draft = citationMap.get(key);
    if (draft) {
      draft.themes.add(entry.theme);
      // Only keep claims whose normalised text has not been seen for this source.
      const claimNorm = normalizeForDedup(entry.claim);
      const isDuplicate = draft.claims.some((c) => normalizeForDedup(c.text) === claimNorm);
      if (!isDuplicate) {
        draft.claims.push({
          text: entry.claim,
          category: entry.category,
          year: entry.year,
        });
      }
      if (entry.calculated) draft.calculated = true;
      if (entry.url && !draft.url) draft.url = entry.url;
    } else {
      citationMap.set(key, {
        primarySource: entry.source,
        themes: new Set([entry.theme]),
        claims: [{ text: entry.claim, category: entry.category, year: entry.year }],
        year: entry.year,
        calculated: entry.calculated,
        url: entry.url,
      });
    }
  }
  log(`[parse-citations] After dedup: ${citationMap.size} unique citations`);

  // Ids are slugs and may collide; later duplicates get a numeric suffix (-2, -3, ...).
  const usedIds = new Set<string>();
  function makeUniqueId(base: string): string {
    let id = base;
    let counter = 2;
    while (usedIds.has(id)) {
      id = `${base}-${counter}`;
      counter++;
    }
    usedIds.add(id);
    return id;
  }

  let totalCitations = 0;
  let totalClaims = 0;

  // ─── Initialize SQLite and insert ───
  const db = initDatabase();
  try {
    log(`[parse-citations] SQLite database initialized at ${dbPath}`);
    const insertCitation = db.prepare(
      'INSERT INTO citations (id, type, title, year, publisher, url, doi, arxiv, venue, notes, calculated) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
    );
    const insertAuthor = db.prepare(
      'INSERT INTO citation_authors (citation_id, name, institutional, position) VALUES (?, ?, ?, ?)',
    );
    const insertTheme = db.prepare(
      'INSERT INTO citation_themes (citation_id, theme) VALUES (?, ?)',
    );
    const insertClaim = db.prepare(
      'INSERT INTO claims (citation_id, text, category, year) VALUES (?, ?, ?, ?)',
    );

    const insertAll = db.transaction(() => {
      for (const [, draft] of citationMap) {
        const { primarySource, themes, claims, year, calculated } = draft;
        const matchedRef = findMatchingRef(primarySource, allFullRefs);
        const title = primarySource;
        const sourceKey = getSourceKey(primarySource);
        const primaryYear = getPrimaryYear(year);
        const id = makeUniqueId(slugify(`${sourceKey}-${primaryYear}`));
        const type = inferType(primarySource, claims[0]?.text ?? '');
        const authors = parseAuthors(primarySource);
        // A URL from the matched full reference takes precedence over one parsed inline.
        const url = matchedRef?.url ?? draft.url;
        const doi = matchedRef?.doi;
        const arxiv = matchedRef?.arxiv;
        insertCitation.run(
          id, type, title, year,
          null, // publisher
          url ?? null,
          doi ?? null,
          arxiv ?? null,
          null, // venue
          null, // notes
          calculated ? 1 : 0,
        );
        if (authors) {
          for (let i = 0; i < authors.length; i++) {
            insertAuthor.run(id, authors[i].name, authors[i].institutional ?? null, i);
          }
        }
        for (const theme of [...themes].sort()) {
          insertTheme.run(id, theme);
        }
        for (const claim of claims) {
          insertClaim.run(id, claim.text, claim.category, claim.year);
        }
        totalCitations++;
        totalClaims += claims.length;
      }
    });
    insertAll();

    // ─── Summary ───
    const themeCount = (
      db.prepare('SELECT COUNT(DISTINCT theme) as c FROM citation_themes').get() as { c: number }
    ).c;
    const withUrl = (
      db.prepare('SELECT COUNT(*) as c FROM citations WHERE url IS NOT NULL').get() as { c: number }
    ).c;
    const crossTheme = (
      db.prepare(
        'SELECT COUNT(*) as c FROM (SELECT citation_id FROM citation_themes GROUP BY citation_id HAVING COUNT(*) > 1)',
      ).get() as { c: number }
    ).c;
    log(`[parse-citations] ${totalCitations} citations, ${totalClaims} claims`);
    log(`[parse-citations] Themes: ${themeCount}`);
    log(`[parse-citations] With URL: ${withUrl}`);
    log(`[parse-citations] Cross-theme: ${crossTheme}`);
    log(`[parse-citations] Wrote ${dbPath}`);
  } finally {
    // Release the handle even when an insert or summary query throws.
    db.close();
  }
}
main();