#!/usr/bin/env bun
|
|
|
|
/**
 * Content parsing orchestrator.
 *
 * Scans all content directories, applies the appropriate parser to each file,
 * and inserts structured data into the SQLite database.
 *
 * Pipeline position: parse-citations -> parse-excerpts -> parse-content -> parse-blog-links -> export
 */
|
|
|
|
import { readFileSync, readdirSync, statSync } from 'fs';
|
|
import { resolve, dirname, relative } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { Database } from 'bun:sqlite';
|
|
import { getAllParsers, getParserForFile } from './content-parsers';
|
|
import type { ParsedContent } from './content-parsers';
|
|
|
|
// ES modules expose no built-in __dirname, so derive this script's directory
// from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// The repository root is the parent of the directory holding this script.
const repoRoot = resolve(__dirname, '..');

// Data directory inside the repository, and the SQLite database stored in it.
const dataDir = resolve(repoRoot, 'src', 'data');
const dbPath = resolve(dataDir, 'citations.db');
|
|
|
|
/**
 * Writes one line to standard output.
 *
 * @param message - Text to emit; a newline is appended after it.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
|
|
|
|
/**
 * Recursively collects the paths of all `.md` files beneath a directory.
 *
 * The function never throws for filesystem errors; it logs a warning instead:
 * - If `dir` itself cannot be listed, an empty array is returned.
 * - If a single entry cannot be stat'ed (for example a dangling symlink), only
 *   that entry is skipped and the rest of the directory is still scanned.
 *
 * @param dir - Directory to scan.
 * @returns Paths built with `resolve(dir, entry)` for every file whose name
 *   ends in `.md`, in directory-listing order.
 */
function findMarkdownFilesRecursive(dir: string): string[] {
  let entries: string[];
  try {
    entries = readdirSync(dir);
  } catch (err) {
    const code = err instanceof Error ? (err as Error & { code?: unknown }).code : undefined;
    if (code === undefined || code === 'ENOENT') {
      log(`[parse-content] Warning: directory not found: ${dir}`);
    } else {
      // Report the real cause (permissions, not a directory, ...) rather than
      // claiming the directory is missing.
      const reason = err instanceof Error ? err.message : String(err);
      log(`[parse-content] Warning: cannot read directory ${dir}: ${reason}`);
    }
    return [];
  }

  const results: string[] = [];
  for (const entry of entries) {
    const fullPath = resolve(dir, entry);

    // Stat each entry in isolation so that one bad entry does not discard the
    // remaining entries of this directory.
    let isDirectory: boolean;
    try {
      isDirectory = statSync(fullPath).isDirectory();
    } catch {
      log(`[parse-content] Warning: skipping unreadable entry: ${fullPath}`);
      continue;
    }

    if (isDirectory) {
      results.push(...findMarkdownFilesRecursive(fullPath));
    } else if (entry.endsWith('.md')) {
      results.push(fullPath);
    }
  }

  return results;
}
|
|
|
|
/**
 * Writes one parsed content piece and all of its child rows to the database.
 *
 * Rows written:
 * - `content_pieces`: one row (INSERT OR REPLACE), with structural flags
 *   stored as 0/1 integers.
 * - `content_seo`: one row (INSERT OR REPLACE), only when `piece.seo` is set;
 *   secondary keywords are stored as a single `', '`-joined string.
 * - `content_metadata`: one row per key/value pair (INSERT OR REPLACE).
 * - `content_sections` and `content_assets`: one row per item (plain INSERT).
 *
 * Each child-table statement is prepared once per call rather than once per
 * row, and is not prepared at all when the collection is empty.
 *
 * @param db - Open database handle; this function does not start a transaction.
 * @param piece - Parsed content to persist.
 */
function insertContent(db: Database, piece: ParsedContent): void {
  const flags = piece.structuralFlags;

  db.prepare(`
    INSERT OR REPLACE INTO content_pieces (
      id, content_type, title, author, status, date, idea, published_on,
      word_count, word_count_target, reading_time_minutes, section_count,
      has_tables, has_code_blocks, has_footnotes, has_figures, has_galleries,
      file_path, body_markdown
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).run(
    piece.id,
    piece.contentType,
    piece.title,
    piece.author,
    piece.status,
    piece.date,
    piece.idea,
    piece.publishedOn,
    piece.wordCount,
    piece.wordCountTarget,
    piece.readingTimeMinutes,
    piece.sectionCount,
    flags.hasTables ? 1 : 0,
    flags.hasCodeBlocks ? 1 : 0,
    flags.hasFootnotes ? 1 : 0,
    flags.hasFigures ? 1 : 0,
    flags.hasGalleries ? 1 : 0,
    piece.filePath,
    piece.bodyMarkdown,
  );

  if (piece.seo) {
    db.prepare(`
      INSERT OR REPLACE INTO content_seo (
        content_id, primary_keyword, secondary_keywords, meta_title, meta_description, schema
      ) VALUES (?, ?, ?, ?, ?, ?)
    `).run(
      piece.id,
      piece.seo.primaryKeyword,
      piece.seo.secondaryKeywords.join(', '),
      piece.seo.metaTitle,
      piece.seo.metaDescription,
      piece.seo.schema,
    );
  }

  const metadataEntries = Object.entries(piece.metadata);
  if (metadataEntries.length > 0) {
    const insertMetadata = db.prepare(`
      INSERT OR REPLACE INTO content_metadata (content_id, key, value) VALUES (?, ?, ?)
    `);
    for (const [key, value] of metadataEntries) {
      insertMetadata.run(piece.id, key, value);
    }
  }

  if (piece.sections.length > 0) {
    const insertSection = db.prepare(`
      INSERT INTO content_sections (content_id, level, text, slug, position)
      VALUES (?, ?, ?, ?, ?)
    `);
    for (const section of piece.sections) {
      insertSection.run(piece.id, section.level, section.text, section.slug, section.position);
    }
  }

  if (piece.assets.length > 0) {
    const insertAsset = db.prepare(`
      INSERT INTO content_assets (content_id, type, src, alt, caption, position, width, gallery_id, order_in_doc)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);
    for (const asset of piece.assets) {
      insertAsset.run(
        piece.id,
        asset.type,
        asset.src,
        asset.alt,
        asset.caption,
        asset.position,
        asset.width,
        asset.galleryId,
        asset.orderInDoc,
      );
    }
  }
}
|
|
|
|
/**
 * Runs the content-parsing stage end to end.
 *
 * Steps:
 * 1. Collect the base directories of every registered parser and find all
 *    markdown files beneath them.
 * 2. Parse each file with the parser selected for its repo-relative path.
 *    Files with no parser are counted as skipped; files whose parser throws
 *    are logged and left out.
 * 3. In a single transaction, clear the five content tables and insert every
 *    parsed piece. Content that failed to parse in this run is therefore
 *    absent from the database afterwards.
 * 4. Log per-type counts and asset/section totals.
 *
 * When nothing was parsed the database is not opened or modified.
 */
function main(): void {
  const parsers = getAllParsers();

  // Parsers may declare the same base directory. Resolve and de-duplicate so
  // each directory is scanned (and warned about) at most once.
  const baseDirs = [
    ...new Set(parsers.flatMap((p) => p.baseDirs).map((dir) => resolve(repoRoot, dir))),
  ];

  log(`[parse-content] Scanning ${baseDirs.length} content directories`);

  // De-duplicate files too: nested base directories would otherwise yield the
  // same file twice, and sections/assets use plain INSERT, so a file parsed
  // twice would produce duplicate rows.
  const allFiles = [...new Set(baseDirs.flatMap((dir) => findMarkdownFilesRecursive(dir)))];

  log(`[parse-content] Found ${allFiles.length} markdown files`);

  const parsed: ParsedContent[] = [];
  let skipped = 0;

  for (const absPath of allFiles) {
    const relPath = relative(repoRoot, absPath);
    const parser = getParserForFile(relPath);

    if (!parser) {
      skipped++;
      continue;
    }

    try {
      const content = readFileSync(absPath, 'utf-8');
      parsed.push(parser.parse(relPath, content));
    } catch (err) {
      log(`[parse-content] Error parsing ${relPath}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  log(`[parse-content] Parsed ${parsed.length} content pieces (${skipped} skipped)`);

  if (parsed.length === 0) {
    log('[parse-content] No content pieces to insert');
    return;
  }

  const db = new Database(dbPath);

  // Close the handle even if the transaction or a summary query throws.
  try {
    const insertAll = db.transaction(() => {
      // Child tables are cleared before content_pieces.
      db.run('DELETE FROM content_assets');
      db.run('DELETE FROM content_sections');
      db.run('DELETE FROM content_metadata');
      db.run('DELETE FROM content_seo');
      db.run('DELETE FROM content_pieces');

      for (const piece of parsed) {
        insertContent(db, piece);
      }
    });

    insertAll();

    const byType = db.prepare(
      'SELECT content_type, COUNT(*) as count FROM content_pieces GROUP BY content_type ORDER BY count DESC',
    ).all() as Array<{ content_type: string; count: number }>;

    for (const row of byType) {
      log(`[parse-content] ${row.content_type}: ${row.count}`);
    }

    const totalAssets = (
      db.prepare('SELECT COUNT(*) as count FROM content_assets').get() as { count: number }
    ).count;
    const totalSections = (
      db.prepare('SELECT COUNT(*) as count FROM content_sections').get() as { count: number }
    ).count;

    log(`[parse-content] Total assets: ${totalAssets}, sections: ${totalSections}`);
  } finally {
    db.close();
  }
}
|
|
|
|
// Script entry point: run the parsing stage as soon as this module is executed.
main();
|