platform-operations/content-strategy/scripts/parse-authors.ts

#!/usr/bin/env bun

import { readFileSync, writeFileSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';

import type {
  Author,
  AuthorId,
  AuthorsDatabase,
  ContentAttribution,
  ContaminationFlag,
  CrossAuthorComparison,
  EditorialRule,
  InterviewProgress,
  InterviewQuestion,
  InterviewStatus,
  PublishedWork,
} from '../src/types/authors';

// Directory containing this script, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Markdown sources: author profiles/bios and the interviews subfolder,
// both resolved relative to this script's directory.
const authorsDir = resolve(__dirname, '../authors');
const interviewsDir = resolve(authorsDir, 'interviews');
// Destination directory and file for the generated JSON database.
const dataDir = resolve(__dirname, '../src/data');
const outputPath = resolve(dataDir, 'authors.json');

/**
 * Emits one line on standard output.
 *
 * @param message - Text to print; a newline is appended automatically.
 */
function log(message: string): void {
  const line = `${message}\n`;
  process.stdout.write(line);
}

/**
 * Synchronously loads a file and returns its contents decoded as UTF-8.
 *
 * @param path - Location of the file to read.
 * @returns The decoded file contents.
 */
function readSourceFile(path: string): string {
  const text = readFileSync(path, { encoding: 'utf-8' });
  return text;
}

// ─── Section pulling ───

/**
 * Returns the body of the first markdown section whose heading line starts
 * with `heading` at the given heading level.
 *
 * The heading is matched as a literal prefix (regex metacharacters are
 * escaped), so trailing text on the heading line is allowed. The body runs
 * from the end of that heading line up to the next heading of the same or a
 * shallower level, or to the end of `content`.
 *
 * @param content - Markdown document to search.
 * @param heading - Literal heading text to look for.
 * @param level - Number of `#` characters of the heading (defaults to 2).
 * @returns The trimmed section body, or `''` when the heading is not found.
 */
function pullSection(content: string, heading: string, level: number = 2): string {
  const hashes = '#'.repeat(level);
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const headingLine = new RegExp(`^${hashes}\\s+${escapedHeading}.*$`, 'm');

  const found = headingLine.exec(content);
  if (found === null) {
    return '';
  }

  // Everything after the matched heading line.
  const body = content.slice(found.index + found[0].length);

  // A heading with at most `level` hashes closes the section.
  const boundary = new RegExp(`^#{1,${level}}\\s+`, 'm');
  const cut = body.search(boundary);
  const section = cut === -1 ? body : body.slice(0, cut);
  return section.trim();
}

/**
 * Collects the text of every `-` or `*` bullet line in a section.
 *
 * A line counts as a bullet when, after optional indentation, it starts with
 * `-` or `*` followed by whitespace. Bullets whose text is empty after
 * trimming are skipped.
 *
 * @param section - Markdown text to scan line by line.
 * @returns Bullet texts in document order, without markers.
 */
function pullBulletList(section: string): string[] {
  const items: string[] = [];
  for (const line of section.split('\n')) {
    if (!/^\s*[-*]\s/.test(line)) {
      continue;
    }
    const text = line.replace(/^\s*[-*]\s+/, '').trim();
    if (text) {
      items.push(text);
    }
  }
  return items;
}

/**
 * Collects the text of every numbered-list line (`1. item`) in a section.
 *
 * A line counts as an item when, after optional indentation, it starts with
 * digits, a dot and whitespace. Items whose text is empty after trimming are
 * skipped.
 *
 * @param section - Markdown text to scan line by line.
 * @returns Item texts in document order, without their numbers.
 */
function pullNumberedList(section: string): string[] {
  const items: string[] = [];
  for (const line of section.split('\n')) {
    if (!/^\s*\d+\.\s/.test(line)) {
      continue;
    }
    const text = line.replace(/^\s*\d+\.\s+/, '').trim();
    if (text) {
      items.push(text);
    }
  }
  return items;
}

// ─── Published works from profile tables ───

/**
 * Extracts the rows of the table under the `## Published Works` heading.
 *
 * Each data row must provide at least three non-empty cells, read as
 * title, type and file path (backticks are stripped from the path). The
 * header row (first cell `Title`) and the delimiter row are skipped.
 *
 * @param content - Full markdown text of an author profile.
 * @returns One entry per data row, or an empty array when the section is
 *   missing or empty.
 */
function parsePublishedWorks(content: string): PublishedWork[] {
  const section = pullSection(content, 'Published Works');
  if (!section) return [];

  const works: PublishedWork[] = [];
  const rows = section.split('\n').filter((line) => line.startsWith('|'));

  for (const row of rows) {
    const cells = row
      .split('|')
      .map((c) => c.trim())
      .filter(Boolean);
    if (cells.length < 3) continue;

    // Delimiter rows may carry alignment colons (`:---`, `:---:`, `---:`);
    // without allowing them the delimiter would be emitted as a bogus work.
    if (/^:?-+:?$/.test(cells[0]) || cells[0] === 'Title') continue;

    works.push({
      title: cells[0],
      type: cells[1],
      filePath: cells[2].replace(/`/g, ''),
    });
  }

  return works;
}

// ─── Author profile parsing ───

/**
 * Builds an `Author` record from a profile markdown document and its bio.
 *
 * Fields are pulled as follows:
 * - `name`: text of the first line starting with `# `, falling back to `id`.
 * - `role` / `byline`: values following `**Role**:` / `**Byline**:`
 *   (double quotes are removed from the byline).
 * - `voiceDescription`: the first three non-empty lines of the `Voice`
 *   section that do not start with `**` or `#`, joined with spaces.
 * - `tone`, `domains`: bullet items of the `Tone` / `Domains` sections; a
 *   leading bold label in a tone bullet is rewritten as `Label: `.
 * - `styleRules`: numbered items of the `Style Rules` section.
 * - `opsec`: body of the `OPSEC` section, or `undefined` when absent/empty.
 * - `interviewProgress`: a zeroed placeholder.
 *
 * @param id - Identifier stored on the record and used as the name fallback.
 * @param profileContent - Markdown text of the author profile.
 * @param bioContent - Markdown text of the bio, or `null` when there is none.
 * @returns The assembled author record.
 */
function parseAuthorProfile(
  id: AuthorId,
  profileContent: string,
  bioContent: string | null,
): Author {
  const lines = profileContent.split('\n');

  const h1 = lines.find((l) => l.startsWith('# '));
  const name = h1 ? h1.replace(/^#\s+/, '').trim() : id;

  const roleMatch = profileContent.match(/\*\*Role\*\*:\s*(.+)/);
  const bylineMatch = profileContent.match(/\*\*Byline\*\*:\s*(.+)/);
  const role = roleMatch ? roleMatch[1].trim() : '';
  const byline = bylineMatch ? bylineMatch[1].replace(/"/g, '').trim() : '';

  const voiceSection = pullSection(profileContent, 'Voice');
  const voiceDescription = voiceSection
    .split('\n')
    .filter((l) => l.trim() && !l.startsWith('**') && !l.startsWith('#'))
    .slice(0, 3)
    .join(' ')
    .trim();

  const toneSection = pullSection(profileContent, 'Tone');
  // `**Label**: text` and `**Label** text` both become `Label: text`.
  const tone = pullBulletList(toneSection).map((t) =>
    t.replace(/\*\*([^*]+)\*\*:?\s*/, '$1: ').trim(),
  );

  const domainsSection = pullSection(profileContent, 'Domains');
  const domains = pullBulletList(domainsSection);

  const styleSection = pullSection(profileContent, 'Style Rules');
  const styleRules = pullNumberedList(styleSection);

  const opsecSection = pullSection(profileContent, 'OPSEC');
  const opsec = opsecSection || undefined;

  const publishedWorks = parsePublishedWorks(profileContent);

  // A bio counts as public only when it exists and carries neither marker.
  const hasPublicBio = bioContent
    ? !bioContent.includes('INTERNAL ONLY') && !bioContent.includes('DO NOT PUBLISH')
    : false;

  // Match "gated" as a whole word in any casing. A plain substring test also
  // fires on words such as "investigated" or "navigated" and misses "Gated".
  const isGated = /\bgated\b/i.test(profileContent);

  // NOTE(review): paths are derived from the H1 display name (brackets and
  // whitespace removed), not from the file that was actually read — confirm
  // the heading always matches the file stem.
  const cleanName = name.replace(/\[|\]/g, '').replace(/\s+/g, '');
  const bioPath = bioContent ? `authors/${cleanName}.bio.md` : null;

  return {
    id,
    name,
    role,
    byline,
    voiceDescription,
    tone,
    domains,
    styleRules,
    opsec,
    publishedWorks,
    interviewProgress: {
      totalQuestions: 0,
      genuine: 0,
      aiGen: 0,
      draft: 0,
      planned: 0,
      new: 0,
      surveyPath: '',
      interviewPath: null,
    },
    bioPath,
    profilePath: `authors/${cleanName}.md`,
    hasPublicBio,
    isGated,
  };
}

// ─── Interview tracker parsing ───

/**
 * Parses the interview tracker document into per-author questions and the
 * cross-author comparison table.
 *
 * Comparison rows come from the table under `## Cross-Author Comparison`;
 * a row needs at least five columns (number, question, then one answer per
 * author) and a numeric first cell. Questions are collected by
 * `parseQuestionSection` for each of the three author headings.
 *
 * @param content - Markdown text of the tracker.
 * @returns The collected questions and comparison rows.
 */
function parseInterviewTracker(content: string): {
  questions: InterviewQuestion[];
  comparisons: CrossAuthorComparison[];
} {
  const questions: InterviewQuestion[] = [];
  const comparisons: CrossAuthorComparison[] = [];

  const compSection = pullSection(content, 'Cross-Author Comparison');
  if (compSection) {
    const rows = compSection
      .split('\n')
      .filter((line) => line.startsWith('|') && !line.includes('---'));
    for (const row of rows) {
      // Strip only the outer pipes so an empty answer cell keeps its column;
      // dropping empty cells would shift later answers onto the wrong author
      // or discard the row altogether.
      const cells = row
        .trim()
        .replace(/^\|/, '')
        .replace(/\|$/, '')
        .split('|')
        .map((c) => c.trim());
      if (cells.length < 5 || cells[0] === '#') continue;
      const num = parseInt(cells[0], 10);
      if (isNaN(num)) continue;

      comparisons.push({
        number: num,
        question: cells[1],
        lilith: cells[2],
        quinn: cells[3],
        victoria: cells[4],
      });
    }
  }

  parseQuestionSection(content, 'Quinn Valentine Questions', 'quinn-valentine', questions);
  parseQuestionSection(content, 'Lilith Vaelynn Questions', 'lilith-vaelynn', questions);
  parseQuestionSection(content, /\[Legal Name\] Questions/, 'victoria-lackey', questions);

  return { questions, comparisons };
}

/**
 * Appends the interview questions found under one `##` heading to
 * `questions`.
 *
 * A string heading is located with `pullSection`. A RegExp heading is
 * matched by its source after `## `, and its section runs up to the next
 * `## ` heading. Table rows containing `---`, `Priority` or `Action` are
 * ignored, as are rows with fewer than five columns or a non-numeric first
 * cell. Columns are read as number, question, tag, status, produces.
 *
 * Status mapping (case-insensitive substring of the status cell):
 * `done`/`genuine` → `genuine`, `ai-gen` → `ai-gen`, `draft` → `draft`,
 * anything else → `planned`.
 *
 * @param content - Markdown text of the tracker.
 * @param heading - Literal heading text, or a pattern for the heading text.
 * @param authorId - Author recorded on every question that is appended.
 * @param questions - Output array; mutated in place.
 */
function parseQuestionSection(
  content: string,
  heading: string | RegExp,
  authorId: AuthorId,
  questions: InterviewQuestion[],
): void {
  let sectionContent: string;
  if (heading instanceof RegExp) {
    const match = new RegExp(`^##\\s+${heading.source}.*$`, 'm').exec(content);
    if (!match) return;
    const rest = content.slice(match.index + match[0].length);
    const nextH2 = rest.search(/^## [^#]/m);
    sectionContent = nextH2 === -1 ? rest : rest.slice(0, nextH2);
  } else {
    sectionContent = pullSection(content, heading);
  }
  if (!sectionContent) return;

  const tableRows = sectionContent
    .split('\n')
    .filter(
      (line) =>
        line.startsWith('|') &&
        !line.includes('---') &&
        !line.includes('Priority') &&
        !line.includes('Action'),
    );

  for (const row of tableRows) {
    // Strip only the outer pipes so blank cells (e.g. an empty tag) keep
    // their column; filtering them out shifts the status/produces columns
    // or drops the row through the length check.
    const cells = row
      .trim()
      .replace(/^\|/, '')
      .replace(/\|$/, '')
      .split('|')
      .map((c) => c.trim());
    if (cells.length < 5 || cells[0] === '#') continue;
    const num = parseInt(cells[0], 10);
    if (isNaN(num)) continue;

    const statusRaw = cells[3].toLowerCase();
    let status: InterviewStatus = 'planned';
    if (statusRaw.includes('done') || statusRaw.includes('genuine')) status = 'genuine';
    else if (statusRaw.includes('ai-gen')) status = 'ai-gen';
    else if (statusRaw.includes('draft')) status = 'draft';
    else if (statusRaw.includes('new') || statusRaw.includes('planned')) status = 'planned';

    questions.push({
      number: num,
      author: authorId,
      question: cells[1],
      tag: cells[2] || '',
      status,
      produces: cells[4] || '',
    });
  }
}

// ─── Interview progress ───

/**
 * Derives interview progress counters from a survey document.
 *
 * When the text contains a summary of the form
 * `N questions. G genuine, A AI-generated`, the total, genuine and AI counts
 * come from it; otherwise genuine and AI counts are the number of
 * `[GENUINE]` and `[AI-GEN…]` tags. `new` counts `[NEW…]` tags and `draft`
 * counts `**draft**` markers (all case-insensitive). `planned` stays 0. A
 * total of 0 is replaced by the sum of the individual counters.
 *
 * @param surveyContent - Markdown text of one author's survey.
 * @returns Counters with `surveyPath` empty and `interviewPath` null.
 */
function parseInterviewProgress(surveyContent: string): InterviewProgress {
  const countOf = (pattern: RegExp): number => (surveyContent.match(pattern) ?? []).length;

  const summary = /(\d+)\s+questions?\.\s*(\d+)\s+genuine,\s*(\d+)\s+AI-generated/i.exec(
    surveyContent,
  );

  let totalQuestions = 0;
  let genuine: number;
  let aiGen: number;
  if (summary) {
    totalQuestions = parseInt(summary[1]);
    genuine = parseInt(summary[2]);
    aiGen = parseInt(summary[3]);
  } else {
    // No summary line: fall back to counting the inline tags.
    genuine = countOf(/\[GENUINE\]/gi);
    aiGen = countOf(/\[AI-GEN[^\]]*\]/gi);
  }

  const fresh = countOf(/\[NEW[^\]]*\]/gi);
  const draft = countOf(/\*\*draft\*\*/gi);
  const planned = 0;

  if (totalQuestions === 0) {
    totalQuestions = genuine + aiGen + draft + planned + fresh;
  }

  return {
    totalQuestions,
    genuine,
    aiGen,
    draft,
    planned,
    new: fresh,
    surveyPath: '',
    interviewPath: null,
  };
}

// ─── Editorial rules parsing ───

/**
 * Collects editorial rules from the `### Confirmed Rules`, `### Draft Rules`
 * and `### Planned Rules` tables of the tracker.
 *
 * Each data row needs at least four columns, read as rule name, author code
 * (`QV`, `LV`, `VL`), question reference (`Q12` style) and description. The
 * header row (first cell `Rule`) and rows without a name are skipped.
 * Unknown author codes fall back to `quinn-valentine`, and an unparseable
 * question reference becomes 0.
 *
 * @param trackerContent - Markdown text of the tracker.
 * @returns Rules in section order, each tagged with its section's status.
 */
function parseEditorialRules(trackerContent: string): EditorialRule[] {
  const rules: EditorialRule[] = [];

  const sections: Array<{ heading: string; status: 'confirmed' | 'draft' | 'planned' }> = [
    { heading: 'Confirmed Rules', status: 'confirmed' },
    { heading: 'Draft Rules', status: 'draft' },
    { heading: 'Planned Rules', status: 'planned' },
  ];

  // Loop-invariant lookup; built once instead of once per table row.
  const authorMap: Record<string, AuthorId> = {
    QV: 'quinn-valentine',
    LV: 'lilith-vaelynn',
    VL: 'victoria-lackey',
  };

  for (const { heading, status } of sections) {
    const section = pullSection(trackerContent, heading, 3);
    if (!section) continue;

    const rows = section
      .split('\n')
      .filter((line) => line.startsWith('|') && !line.includes('---'));
    for (const row of rows) {
      // Strip only the outer pipes so a blank cell (e.g. no question
      // reference) keeps its column instead of shifting the description.
      const cells = row
        .trim()
        .replace(/^\|/, '')
        .replace(/\|$/, '')
        .split('|')
        .map((c) => c.trim());
      if (cells.length < 4 || cells[0] === '' || cells[0] === 'Rule') continue;

      rules.push({
        name: cells[0],
        author: authorMap[cells[1]] ?? 'quinn-valentine',
        questionNumber: parseInt(cells[2].replace('Q', ''), 10) || 0,
        description: cells[3],
        status,
      });
    }
  }

  return rules;
}

// ─── Content attributions from README ───

/**
 * Reads content attributions from every numbered table row in the README.
 *
 * A row is used when it starts with `|`, contains no `---`, has at least
 * four non-empty cells and a numeric first cell. The second cell is the
 * title, the third names the author (`Quinn` → quinn-valentine,
 * `Legal`/`Victoria` → victoria-lackey, otherwise lilith-vaelynn) and the
 * last cell holds the file path, optionally as a markdown link or in
 * backticks.
 *
 * NOTE(review): for rows with five or more cells both the type and the word
 * count are read from the fourth cell — confirm against the README layout.
 *
 * @param readmeContent - Markdown text of the authors README.
 * @returns One attribution per accepted row, in document order.
 */
function parseContentAttributions(readmeContent: string): ContentAttribution[] {
  const attributions: ContentAttribution[] = [];

  for (const line of readmeContent.split('\n')) {
    if (!line.startsWith('|') || line.includes('---')) continue;

    const cells = line
      .split('|')
      .map((c) => c.trim())
      .filter(Boolean);
    if (cells.length < 4) continue;

    const num = parseInt(cells[0]);
    if (isNaN(num)) continue;

    // Author: default to Lilith unless the cell names someone else.
    const authorName = cells[2] || cells[3];
    let author: AuthorId;
    if (/Quinn/i.test(authorName)) {
      author = 'quinn-valentine';
    } else if (/Legal|Victoria/i.test(authorName)) {
      author = 'victoria-lackey';
    } else {
      author = 'lilith-vaelynn';
    }

    // File path: prefer a markdown link label, then backticks, then raw text.
    const fileCell = cells[cells.length - 1];
    const linked =
      fileCell.match(/\[`?([^`\]]+)`?\]\([^)]+\)/) ?? fileCell.match(/`([^`]+)`/);
    const filePath = linked ? linked[1] : fileCell;

    const wide = cells.length >= 5;
    const typeCell = wide ? cells[3] : cells[2];
    const wordsCell = wide ? cells[3] : undefined;

    let type: ContentAttribution['type'] = 'blog';
    if (typeCell.includes('tweet')) {
      type = 'social';
    } else if (typeCell.includes('newsletter')) {
      type = 'newsletter';
    } else if (typeCell.includes('paper')) {
      type = 'academic';
    }

    attributions.push({
      number: num,
      title: cells[1],
      author,
      type,
      filePath,
      words: wordsCell?.startsWith('~') ? wordsCell : undefined,
    });
  }

  return attributions;
}

// ─── Known contamination flags ───

/**
 * Returns the fixed list of known contamination flags.
 *
 * The list currently holds a single, hard-coded entry describing the
 * remediated "debanking" framing.
 *
 * @returns A new array with one flag on every call.
 */
function buildContaminationFlags(): ContaminationFlag[] {
  // The long explanation is assembled from its sentences for readability.
  const details = [
    'The founder has NOT been debanked. The "I got debanked" framing was AI-generated contamination. ',
    'REMEDIATED: All personal debanking claims replaced with genuine founder experiences: ',
    'Chaturbate piracy ("three-plus hours on camera, less than $45 after their cut, stream pirated across hundreds of sites"), ',
    '50% platform take rate, bot redistribution with zero platform protection. ',
    'New canonical pairing — QV: piracy/extraction testimony, LV: "platform extraction rates averaging 50% of gross creator revenue." ',
    'Debanking as industry topic/statistic (46% rate, #Debanking hashtags, policy analysis) remains valid and unchanged.',
  ].join('');

  const debankingFlag: ContaminationFlag = {
    id: 'debanking-framing',
    description: '"I got debanked" framing in voice materials is AI-generated contamination — REMEDIATED',
    location: 'All content files (remediation complete 2026-03)',
    severity: 'critical',
    details,
  };

  return [debankingFlag];
}

// ─── Main ───

/**
 * Script driver: reads every author source file, parses it, merges the
 * tracker-derived question counts into each author's progress, writes the
 * combined database as pretty-printed JSON to `outputPath`, and logs a
 * summary of what was produced.
 *
 * Any read or write failure propagates as an exception.
 */
function main(): void {
  log('[parse-authors] Starting author data gathering...');

  // One entry per author, in output order.
  const sources: Array<{
    id: AuthorId;
    profileFile: string;
    bioFile: string;
    surveyFile: string;
    surveyPath: string;
    interviewPath: string | null;
  }> = [
    {
      id: 'lilith-vaelynn',
      profileFile: 'LilithVaelynn.md',
      bioFile: 'LilithVaelynn.bio.md',
      surveyFile: 'survey-lilith-vaelynn.md',
      surveyPath: 'authors/interviews/survey-lilith-vaelynn.md',
      interviewPath: null,
    },
    {
      id: 'quinn-valentine',
      profileFile: 'QuinnValentine.md',
      bioFile: 'QuinnValentine.bio.md',
      surveyFile: 'survey-quinn-valentine.md',
      surveyPath: 'authors/interviews/survey-quinn-valentine.md',
      interviewPath: 'authors/interviews/quinn-valentine-interview.md',
    },
    {
      id: 'victoria-lackey',
      profileFile: 'VictoriaLackey.md',
      bioFile: 'VictoriaLackey.bio.md',
      surveyFile: 'survey-victoria-lackey.md',
      surveyPath: 'authors/interviews/survey-victoria-lackey.md',
      interviewPath: 'authors/interviews/victoria-lackey-interview.md',
    },
  ];

  // Load everything up front: profiles and bios, tracker, surveys, README.
  const profileTexts = sources.map((src) => ({
    profile: readSourceFile(resolve(authorsDir, src.profileFile)),
    bio: readSourceFile(resolve(authorsDir, src.bioFile)),
  }));
  const tracker = readSourceFile(resolve(interviewsDir, 'author-interviews.md'));
  const surveyTexts = sources.map((src) =>
    readSourceFile(resolve(interviewsDir, src.surveyFile)),
  );
  const readme = readSourceFile(resolve(authorsDir, 'README.md'));

  const authors = sources.map((src, i) =>
    parseAuthorProfile(src.id, profileTexts[i].profile, profileTexts[i].bio),
  );

  // Replace each placeholder progress with the survey-derived one.
  sources.forEach((src, i) => {
    const progress = parseInterviewProgress(surveyTexts[i]);
    progress.surveyPath = src.surveyPath;
    if (src.interviewPath !== null) {
      progress.interviewPath = src.interviewPath;
    }
    authors[i].interviewProgress = progress;
  });

  const { questions, comparisons } = parseInterviewTracker(tracker);
  const editorialRules = parseEditorialRules(tracker);

  // Tracker counts take precedence over survey counts when the tracker
  // lists any questions for the author.
  for (const author of authors) {
    const own = questions.filter((q) => q.author === author.id);
    if (own.length === 0) continue;

    const tally = (status: InterviewStatus): number =>
      own.filter((q) => q.status === status).length;

    const progress = author.interviewProgress;
    progress.totalQuestions = own.length;
    progress.genuine = tally('genuine');
    progress.aiGen = tally('ai-gen');
    progress.draft = tally('draft');
    progress.planned = tally('planned');
  }

  const contentAttributions = parseContentAttributions(readme);
  const knownContamination = buildContaminationFlags();

  const totalPublishedWorks = authors.reduce(
    (sum, author) => sum + author.publishedWorks.length,
    0,
  );

  const db: AuthorsDatabase = {
    meta: {
      generatedAt: new Date().toISOString(),
      version: 1,
      totalAuthors: authors.length,
      totalRules: editorialRules.length,
      totalQuestions: questions.length,
      totalPublishedWorks,
    },
    authors,
    editorialRules,
    interviewQuestions: questions,
    crossAuthorComparisons: comparisons,
    contentAttributions,
    knownContamination,
  };

  writeFileSync(outputPath, JSON.stringify(db, null, 2));

  const summary = [
    `[parse-authors] Authors: ${db.meta.totalAuthors}`,
    `[parse-authors] Published works: ${db.meta.totalPublishedWorks}`,
    `[parse-authors] Editorial rules: ${db.meta.totalRules}`,
    `[parse-authors] Interview questions: ${db.meta.totalQuestions}`,
    `[parse-authors] Cross-author comparisons: ${comparisons.length}`,
    `[parse-authors] Content attributions: ${contentAttributions.length}`,
    `[parse-authors] Contamination flags: ${knownContamination.length}`,
    `[parse-authors] Wrote ${outputPath}`,
  ];
  for (const line of summary) {
    log(line);
  }
}

// Entry point: run the parser as soon as this script is loaded.
main();