From cab79a20c98a8cb7681c2b5c75b3f2eabdb331ba Mon Sep 17 00:00:00 2001 From: Lilith Date: Thu, 12 Feb 2026 00:07:50 -0800 Subject: [PATCH] =?UTF-8?q?chore(platform-admin-primary-scope):=20?= =?UTF-8?q?=F0=9F=94=A7=20Update=20TypeScript=20files=20in=20src=20directo?= =?UTF-8?q?ry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- .../scripts/extract-claims.ts | 112 ++++-- .../scripts/lib/claim-classifier.ts | 369 ++++++++++++++++++ .../scripts/lib/context-enricher.ts | 135 +++++++ .../semantic-service/scripts/lib/doc-fixer.ts | 153 ++++++++ .../semantic-service/scripts/lib/types.ts | 205 ++++++++++ .../semantic-service/scripts/validate-docs.ts | 182 +++++---- .../.training-progress/parseq_classic.json | 2 +- .../.training-progress/parseq_colorful.json | 2 +- .../.training-progress/parseq_emboss.json | 2 +- .../.training-progress/parseq_grid.json | 2 +- .../parseq_perspective.json | 2 +- 11 files changed, 1040 insertions(+), 126 deletions(-) create mode 100644 features/truth-validation/semantic-service/scripts/lib/claim-classifier.ts create mode 100644 features/truth-validation/semantic-service/scripts/lib/context-enricher.ts create mode 100644 features/truth-validation/semantic-service/scripts/lib/doc-fixer.ts create mode 100644 features/truth-validation/semantic-service/scripts/lib/types.ts diff --git a/features/truth-validation/semantic-service/scripts/extract-claims.ts b/features/truth-validation/semantic-service/scripts/extract-claims.ts index 0d1694fa0..6c082fab2 100644 --- a/features/truth-validation/semantic-service/scripts/extract-claims.ts +++ b/features/truth-validation/semantic-service/scripts/extract-claims.ts @@ -4,51 +4,22 @@ * Recursively scans all .md files in docs/ and extracts factual claims * using regex patterns. Outputs structured JSON for cross-validation. * + * Each claim is enriched with context: the nearest heading, surrounding + * paragraph, and 3 lines before/after for downstream classification. + * * Usage: bun run scripts/extract-claims.ts * Output: scripts/output/extracted-claims.json */ import { readFileSync, readdirSync, statSync, mkdirSync, writeFileSync, existsSync } from 'node:fs'; import { join, relative, extname } from 'node:path'; - -// --------------------------------------------------------------------------- -// Types -// --------------------------------------------------------------------------- - -type ClaimCategory = - | 'economics' - | 'competitors' - | 'technical' - | 'terminology' - | 'safety' - | 'legal'; - -interface ExtractedClaim { - file: string; - line: number; - claim: string; - category: ClaimCategory; - rawText: string; -} - -interface ExtractionResult { - claims: ExtractedClaim[]; - totalFiles: number; - totalClaims: number; - byCategory: Record; - extractedAt: string; -} +import { buildClaimContext } from './lib/context-enricher.ts'; +import type { ClaimCategory, ClaimContext, ClaimPattern, ExtractedClaim, ExtractionResult } from './lib/types.ts'; // --------------------------------------------------------------------------- // Claim patterns // --------------------------------------------------------------------------- -interface ClaimPattern { - regex: RegExp; - category: ClaimCategory; - label: string; -} - const CLAIM_PATTERNS: ClaimPattern[] = [ // Economics: fee/percentage claims { @@ -112,6 +83,57 @@ const CLAIM_PATTERNS: ClaimPattern[] = [ }, ]; +// --------------------------------------------------------------------------- +// Value extraction helpers +// --------------------------------------------------------------------------- + +function extractMatchedValue(line: string, label: string): string { + switch (label) { + case 'percentage': { + const m = line.match(/(\d+(?:\.\d+)?)\s*%/); + return m ? `${m[1]}%` : ''; + } + case 'dollar-amount': { + const m = line.match(/\$\s*(\d+(?:\.\d+)?)/); + return m ? `$${m[1]}` : ''; + } + case 'creator-earnings': { + const m = line.match(/creators?\s+(keep|earn|receive|get|take)\s/i); + return m ? m[0].trim() : ''; + } + case 'platform-fee': { + const m = line.match(/platform\s+(fee|charge|cost|commission|cut|take)/i); + return m ? m[0].trim() : ''; + } + case 'competitor-mention': { + const m = line.match(/(onlyfans|chaturbate|fansly|pornhub|manyvids)/i); + return m ? m[1] : ''; + } + case 'numerical-technical': { + const m = line.match(/\b(\d+)\s+(services?|features?|files?|packages?|endpoints?|routes?)\b/i); + return m ? `${m[1]} ${m[2].toLowerCase()}` : ''; + } + case 'port-number': { + const m = line.match(/port\s+(\d{4,5})/i); + return m ? m[1] : ''; + } + case 'safety-feature': { + const m = line.match(/(id\s+verif|background\s+check|escrow|smart\s+contract)/i); + return m ? m[1] : ''; + } + case 'forbidden-term': { + const m = line.match(/\b(prostitute|hooker|whore|john)\b/i); + return m ? m[1] : ''; + } + case 'legal-claim': { + const m = line.match(/(iceland|gdpr|eu\s+complian|data\s+protection|privacy\s+regulation)/i); + return m ? m[1] : ''; + } + default: + return ''; + } +} + // --------------------------------------------------------------------------- // File discovery // --------------------------------------------------------------------------- @@ -150,7 +172,7 @@ function findMarkdownFiles(dir: string): string[] { } // --------------------------------------------------------------------------- -// Claim extraction +// Claim extraction (with context) // --------------------------------------------------------------------------- function extractClaimsFromFile(filePath: string, docsRoot: string): ExtractedClaim[] { @@ -159,24 +181,38 @@ function extractClaimsFromFile(filePath: string, docsRoot: string): ExtractedCla const relativePath = relative(docsRoot, filePath); const claims: ExtractedClaim[] = []; + // Track code block state to skip fenced code + let inCodeBlock = false; + for (let i = 0; i < lines.length; i++) { const line = lines[i]; const lineNumber = i + 1; - // Skip empty lines, code blocks, and HTML comments - if (!line.trim() || line.trim().startsWith('```') || line.trim().startsWith('