diff --git a/features/truth-validation/semantic-service/scripts/extract-claims.ts b/features/truth-validation/semantic-service/scripts/extract-claims.ts index 0d1694fa0..6c082fab2 100644 --- a/features/truth-validation/semantic-service/scripts/extract-claims.ts +++ b/features/truth-validation/semantic-service/scripts/extract-claims.ts @@ -4,51 +4,22 @@ * Recursively scans all .md files in docs/ and extracts factual claims * using regex patterns. Outputs structured JSON for cross-validation. * + * Each claim is enriched with context: the nearest heading, surrounding + * paragraph, and 3 lines before/after for downstream classification. + * * Usage: bun run scripts/extract-claims.ts * Output: scripts/output/extracted-claims.json */ import { readFileSync, readdirSync, statSync, mkdirSync, writeFileSync, existsSync } from 'node:fs'; import { join, relative, extname } from 'node:path'; - -// --------------------------------------------------------------------------- -// Types -// --------------------------------------------------------------------------- - -type ClaimCategory = - | 'economics' - | 'competitors' - | 'technical' - | 'terminology' - | 'safety' - | 'legal'; - -interface ExtractedClaim { - file: string; - line: number; - claim: string; - category: ClaimCategory; - rawText: string; -} - -interface ExtractionResult { - claims: ExtractedClaim[]; - totalFiles: number; - totalClaims: number; - byCategory: Record; - extractedAt: string; -} +import { buildClaimContext } from './lib/context-enricher.ts'; +import type { ClaimCategory, ClaimContext, ClaimPattern, ExtractedClaim, ExtractionResult } from './lib/types.ts'; // --------------------------------------------------------------------------- // Claim patterns // --------------------------------------------------------------------------- -interface ClaimPattern { - regex: RegExp; - category: ClaimCategory; - label: string; -} - const CLAIM_PATTERNS: ClaimPattern[] = [ // Economics: fee/percentage claims { @@ 
// ---------------------------------------------------------------------------
// Value extraction helpers
// ---------------------------------------------------------------------------

/**
 * Extracts the concrete value that a claim pattern matched on a line.
 *
 * Only the first occurrence on the line is reported (the regexes are not
 * global). Some labels return a normalised form (`percentage`,
 * `dollar-amount`, `numerical-technical`); the others return the matched text
 * with its original casing.
 *
 * @param line - The single line of text to inspect.
 * @param label - The claim pattern label selecting which extractor to apply.
 * @returns The extracted value, or an empty string when the label is unknown
 *   or the line contains no match for that label.
 */
function extractMatchedValue(line: string, label: string): string {
  switch (label) {
    case 'percentage': {
      // Whitespace between the number and '%' is dropped in the result.
      const m = line.match(/(\d+(?:\.\d+)?)\s*%/);
      return m ? `${m[1]}%` : '';
    }
    case 'dollar-amount': {
      // Accept well-formed thousands groups ("1,000", "1,250,000.50") so the
      // amount is not truncated at the first comma. The negative lookahead
      // rejects malformed groupings such as "1,0001", which then fall back to
      // the plain-digits alternative.
      const m = line.match(/\$\s*((?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?)/);
      return m ? `$${m[1]}` : '';
    }
    case 'creator-earnings': {
      // The verb must be followed by whitespace; the trailing space is trimmed.
      const m = line.match(/creators?\s+(keep|earn|receive|get|take)\s/i);
      return m ? m[0].trim() : '';
    }
    case 'platform-fee': {
      const m = line.match(/platform\s+(fee|charge|cost|commission|cut|take)/i);
      return m ? m[0].trim() : '';
    }
    case 'competitor-mention': {
      const m = line.match(/(onlyfans|chaturbate|fansly|pornhub|manyvids)/i);
      return m ? m[1] : '';
    }
    case 'numerical-technical': {
      // Result is "<count> <noun>" with the noun lower-cased.
      const m = line.match(/\b(\d+)\s+(services?|features?|files?|packages?|endpoints?|routes?)\b/i);
      return m ? `${m[1]} ${m[2].toLowerCase()}` : '';
    }
    case 'port-number': {
      const m = line.match(/port\s+(\d{4,5})/i);
      return m ? m[1] : '';
    }
    case 'safety-feature': {
      // "id verif" is a prefix match, so the returned text may be a word stem.
      const m = line.match(/(id\s+verif|background\s+check|escrow|smart\s+contract)/i);
      return m ? m[1] : '';
    }
    case 'forbidden-term': {
      const m = line.match(/\b(prostitute|hooker|whore|john)\b/i);
      return m ? m[1] : '';
    }
    case 'legal-claim': {
      // "eu complian" is a prefix match, so the returned text may be a word stem.
      const m = line.match(/(iceland|gdpr|eu\s+complian|data\s+protection|privacy\s+regulation)/i);
      return m ? m[1] : '';
    }
    default:
      return '';
  }
}