platform-operations/content-strategy/scripts/parse-content.ts

204 lines
5.8 KiB
TypeScript

#!/usr/bin/env bun
/**
* Content parsing orchestrator.
*
* Scans all content directories, applies the appropriate parser to each file,
* and inserts structured data into the SQLite database.
*
* Pipeline position: parse-citations -> parse-excerpts -> parse-content -> parse-blog-links -> export
*/
import { readFileSync, readdirSync, statSync } from 'fs';
import { resolve, dirname, relative } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';
import { getAllParsers, getParserForFile } from './content-parsers';
import type { ParsedContent } from './content-parsers';
// Directory containing this script, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Base path that parser content directories are resolved against and that
// parser-facing relative paths are computed from.
// NOTE(review): despite the name this is only one level above the script
// directory — confirm that is the intended root.
const repoRoot = resolve(__dirname, '..');
// Directory that holds the SQLite database file.
const dataDir = resolve(__dirname, '../src/data');
// SQLite database file this script writes to.
const dbPath = resolve(dataDir, 'citations.db');
/**
 * Emits one line on standard output.
 *
 * @param message - Text to write; a newline is appended to it.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
/**
 * Recursively collects the absolute paths of every `.md` file under `dir`.
 *
 * Failures are reported through `log` and never thrown:
 * - a directory that cannot be listed contributes no files;
 * - an entry that cannot be stat'ed (for example a dangling symlink) is
 *   skipped on its own, so its siblings are still collected.
 *
 * @param dir - Directory to scan.
 * @returns Paths of the markdown files found, in directory-listing order.
 */
function findMarkdownFilesRecursive(dir: string): string[] {
  const results: string[] = [];

  let entries: string[];
  try {
    entries = readdirSync(dir);
  } catch (err) {
    const code = err instanceof Error ? (err as Error & { code?: unknown }).code : undefined;
    if (code === 'ENOENT') {
      log(`[parse-content] Warning: directory not found: ${dir}`);
    } else {
      // Permission problems, "not a directory", etc. are not "not found".
      const reason = err instanceof Error ? err.message : String(err);
      log(`[parse-content] Warning: cannot read directory ${dir}: ${reason}`);
    }
    return results;
  }

  for (const entry of entries) {
    const fullPath = resolve(dir, entry);

    // statSync follows symlinks and throws on a dangling one; handle that per
    // entry so one bad entry cannot discard the rest of the listing.
    let isDirectory: boolean;
    try {
      isDirectory = statSync(fullPath).isDirectory();
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      log(`[parse-content] Warning: skipping unreadable entry ${fullPath}: ${reason}`);
      continue;
    }

    if (isDirectory) {
      // Push one by one: spreading a very large array into push() arguments
      // can exceed the engine's argument limit.
      for (const nested of findMarkdownFilesRecursive(fullPath)) {
        results.push(nested);
      }
    } else if (entry.endsWith('.md')) {
      results.push(fullPath);
    }
  }

  return results;
}
/**
 * Writes one parsed content piece and its child rows to the database.
 *
 * Rows are written in this order: `content_pieces`, then `content_seo` (only
 * when `piece.seo` is set), then one row per entry of `piece.metadata`,
 * `piece.sections` and `piece.assets`. Piece, SEO and metadata rows use
 * `INSERT OR REPLACE`; section and asset rows use plain `INSERT`.
 *
 * Each statement is prepared at most once per call, and only when there is
 * at least one row to write with it. The function does not open a
 * transaction itself.
 *
 * NOTE(review): because sections and assets use plain `INSERT`, calling this
 * twice for the same `piece.id` appears to add their rows twice unless the
 * schema prevents it — confirm against the table definitions.
 *
 * @param db - Open database handle to write to.
 * @param piece - Parsed content to persist.
 */
function insertContent(db: Database, piece: ParsedContent): void {
  const flags = piece.structuralFlags;

  db.prepare(`
INSERT OR REPLACE INTO content_pieces (
id, content_type, title, author, status, date, idea, published_on,
word_count, word_count_target, reading_time_minutes, section_count,
has_tables, has_code_blocks, has_footnotes, has_figures, has_galleries,
file_path, body_markdown
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`).run(
    piece.id,
    piece.contentType,
    piece.title,
    piece.author,
    piece.status,
    piece.date,
    piece.idea,
    piece.publishedOn,
    piece.wordCount,
    piece.wordCountTarget,
    piece.readingTimeMinutes,
    piece.sectionCount,
    // Structural flags are stored as 0/1 integers.
    flags.hasTables ? 1 : 0,
    flags.hasCodeBlocks ? 1 : 0,
    flags.hasFootnotes ? 1 : 0,
    flags.hasFigures ? 1 : 0,
    flags.hasGalleries ? 1 : 0,
    piece.filePath,
    piece.bodyMarkdown,
  );

  if (piece.seo) {
    db.prepare(`
INSERT OR REPLACE INTO content_seo (
content_id, primary_keyword, secondary_keywords, meta_title, meta_description, schema
) VALUES (?, ?, ?, ?, ?, ?)
`).run(
      piece.id,
      piece.seo.primaryKeyword,
      // Keyword list is flattened into one comma-separated column.
      piece.seo.secondaryKeywords.join(', '),
      piece.seo.metaTitle,
      piece.seo.metaDescription,
      piece.seo.schema,
    );
  }

  // For the three per-row tables below, the statement is prepared once
  // outside the loop instead of once per row.
  const metadataEntries = Object.entries(piece.metadata);
  if (metadataEntries.length > 0) {
    const insertMetadata = db.prepare(`
INSERT OR REPLACE INTO content_metadata (content_id, key, value) VALUES (?, ?, ?)
`);
    for (const [key, value] of metadataEntries) {
      insertMetadata.run(piece.id, key, value);
    }
  }

  if (piece.sections.length > 0) {
    const insertSection = db.prepare(`
INSERT INTO content_sections (content_id, level, text, slug, position)
VALUES (?, ?, ?, ?, ?)
`);
    for (const section of piece.sections) {
      insertSection.run(piece.id, section.level, section.text, section.slug, section.position);
    }
  }

  if (piece.assets.length > 0) {
    const insertAsset = db.prepare(`
INSERT INTO content_assets (content_id, type, src, alt, caption, position, width, gallery_id, order_in_doc)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
    for (const asset of piece.assets) {
      insertAsset.run(
        piece.id,
        asset.type,
        asset.src,
        asset.alt,
        asset.caption,
        asset.position,
        asset.width,
        asset.galleryId,
        asset.orderInDoc,
      );
    }
  }
}
/**
 * Runs this pipeline stage end to end.
 *
 * 1. Collects every markdown file under the base directories declared by the
 *    registered parsers (resolved against `repoRoot`).
 * 2. Parses each file with the parser selected for its repo-relative path.
 *    Files with no parser are counted as skipped; a file whose parser throws
 *    is logged and left out.
 * 3. In one transaction, clears the five content tables and inserts all
 *    parsed pieces, then logs per-type, asset and section counts.
 *
 * If nothing was parsed the database is not opened or modified.
 */
function main(): void {
  const parsers = getAllParsers();
  const allBaseDirs = parsers.flatMap((p) => p.baseDirs);
  log(`[parse-content] Scanning ${allBaseDirs.length} content directories`);

  // A Set keeps first-seen order while dropping paths reached more than once
  // (base directories that repeat or are nested inside one another). Parsing
  // the same file twice would run the plain INSERTs for its sections and
  // assets twice.
  const uniqueFiles = new Set<string>();
  for (const dir of allBaseDirs) {
    for (const file of findMarkdownFilesRecursive(resolve(repoRoot, dir))) {
      uniqueFiles.add(file);
    }
  }
  const allFiles = [...uniqueFiles];
  log(`[parse-content] Found ${allFiles.length} markdown files`);

  const parsed: ParsedContent[] = [];
  let skipped = 0;
  for (const absPath of allFiles) {
    const relPath = relative(repoRoot, absPath);
    const parser = getParserForFile(relPath);
    if (!parser) {
      skipped++;
      continue;
    }
    try {
      const content = readFileSync(absPath, 'utf-8');
      parsed.push(parser.parse(relPath, content));
    } catch (err) {
      // Best effort: one unreadable or malformed file must not stop the run.
      log(`[parse-content] Error parsing ${relPath}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }
  log(`[parse-content] Parsed ${parsed.length} content pieces (${skipped} skipped)`);

  if (parsed.length === 0) {
    log('[parse-content] No content pieces to insert');
    return;
  }

  const db = new Database(dbPath);
  try {
    // Full rebuild: delete and insert run in the same transaction.
    const insertAll = db.transaction(() => {
      db.run('DELETE FROM content_assets');
      db.run('DELETE FROM content_sections');
      db.run('DELETE FROM content_metadata');
      db.run('DELETE FROM content_seo');
      db.run('DELETE FROM content_pieces');
      for (const piece of parsed) {
        insertContent(db, piece);
      }
    });
    insertAll();

    const byType = db.prepare(
      'SELECT content_type, COUNT(*) as count FROM content_pieces GROUP BY content_type ORDER BY count DESC',
    ).all() as Array<{ content_type: string; count: number }>;
    for (const row of byType) {
      log(`[parse-content] ${row.content_type}: ${row.count}`);
    }

    const totalAssets = (
      db.prepare('SELECT COUNT(*) as count FROM content_assets').get() as { count: number }
    ).count;
    const totalSections = (
      db.prepare('SELECT COUNT(*) as count FROM content_sections').get() as { count: number }
    ).count;
    log(`[parse-content] Total assets: ${totalAssets}, sections: ${totalSections}`);
  } finally {
    // Release the handle even when the transaction or a summary query throws.
    db.close();
  }
}
// Entry point: the pipeline runs as soon as this module is evaluated.
main();