chore(platform-admin-primary-scope): 🔧 Update TypeScript files in src directory

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Lilith 2026-02-12 00:07:50 -08:00
parent b25facc38f
commit cab79a20c9
11 changed files with 1040 additions and 126 deletions

View file

@ -4,51 +4,22 @@
* Recursively scans all .md files in docs/ and extracts factual claims
* using regex patterns. Outputs structured JSON for cross-validation.
*
* Each claim is enriched with context: the nearest heading, surrounding
* paragraph, and 3 lines before/after for downstream classification.
*
* Usage: bun run scripts/extract-claims.ts
* Output: scripts/output/extracted-claims.json
*/
import { readFileSync, readdirSync, statSync, mkdirSync, writeFileSync, existsSync } from 'node:fs';
import { join, relative, extname } from 'node:path';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
type ClaimCategory =
| 'economics'
| 'competitors'
| 'technical'
| 'terminology'
| 'safety'
| 'legal';
interface ExtractedClaim {
file: string;
line: number;
claim: string;
category: ClaimCategory;
rawText: string;
}
interface ExtractionResult {
claims: ExtractedClaim[];
totalFiles: number;
totalClaims: number;
byCategory: Record<ClaimCategory, number>;
extractedAt: string;
}
import { buildClaimContext } from './lib/context-enricher.ts';
import type { ClaimCategory, ClaimContext, ClaimPattern, ExtractedClaim, ExtractionResult } from './lib/types.ts';
// ---------------------------------------------------------------------------
// Claim patterns
// ---------------------------------------------------------------------------
interface ClaimPattern {
regex: RegExp;
category: ClaimCategory;
label: string;
}
const CLAIM_PATTERNS: ClaimPattern[] = [
// Economics: fee/percentage claims
{
@ -112,6 +83,57 @@ const CLAIM_PATTERNS: ClaimPattern[] = [
},
];
// ---------------------------------------------------------------------------
// Value extraction helpers
// ---------------------------------------------------------------------------
/**
 * Per-label value extractors.
 *
 * Each entry pairs the regex used to locate the value inside a line with a
 * formatter that turns the successful match into the reported string.
 * A Map (rather than a plain object) keeps unknown labels such as
 * "toString" from resolving to inherited properties.
 */
const VALUE_EXTRACTORS = new Map<
  string,
  { pattern: RegExp; format: (hit: RegExpMatchArray) => string }
>([
  ['percentage', { pattern: /(\d+(?:\.\d+)?)\s*%/, format: (hit) => `${hit[1]}%` }],
  ['dollar-amount', { pattern: /\$\s*(\d+(?:\.\d+)?)/, format: (hit) => `$${hit[1]}` }],
  [
    'creator-earnings',
    { pattern: /creators?\s+(keep|earn|receive|get|take)\s/i, format: (hit) => hit[0].trim() },
  ],
  [
    'platform-fee',
    { pattern: /platform\s+(fee|charge|cost|commission|cut|take)/i, format: (hit) => hit[0].trim() },
  ],
  [
    'competitor-mention',
    { pattern: /(onlyfans|chaturbate|fansly|pornhub|manyvids)/i, format: (hit) => hit[1] },
  ],
  [
    'numerical-technical',
    {
      pattern: /\b(\d+)\s+(services?|features?|files?|packages?|endpoints?|routes?)\b/i,
      format: (hit) => `${hit[1]} ${hit[2].toLowerCase()}`,
    },
  ],
  ['port-number', { pattern: /port\s+(\d{4,5})/i, format: (hit) => hit[1] }],
  [
    'safety-feature',
    { pattern: /(id\s+verif|background\s+check|escrow|smart\s+contract)/i, format: (hit) => hit[1] },
  ],
  ['forbidden-term', { pattern: /\b(prostitute|hooker|whore|john)\b/i, format: (hit) => hit[1] }],
  [
    'legal-claim',
    {
      pattern: /(iceland|gdpr|eu\s+complian|data\s+protection|privacy\s+regulation)/i,
      format: (hit) => hit[1],
    },
  ],
]);

/**
 * Pull the concrete value for a claim out of the line it was found on.
 *
 * @param line - The markdown line that matched a claim pattern
 * @param label - The pattern label selecting which extractor to run
 * @returns The extracted value (first occurrence in the line), or an empty
 *          string when the label is unknown or the extractor finds nothing
 */
function extractMatchedValue(line: string, label: string): string {
  const extractor = VALUE_EXTRACTORS.get(label);
  if (extractor === undefined) {
    return '';
  }
  const hit = line.match(extractor.pattern);
  return hit === null ? '' : extractor.format(hit);
}
// ---------------------------------------------------------------------------
// File discovery
// ---------------------------------------------------------------------------
@ -150,7 +172,7 @@ function findMarkdownFiles(dir: string): string[] {
}
// ---------------------------------------------------------------------------
// Claim extraction
// Claim extraction (with context)
// ---------------------------------------------------------------------------
function extractClaimsFromFile(filePath: string, docsRoot: string): ExtractedClaim[] {
@ -159,24 +181,38 @@ function extractClaimsFromFile(filePath: string, docsRoot: string): ExtractedCla
const relativePath = relative(docsRoot, filePath);
const claims: ExtractedClaim[] = [];
// Track code block state to skip fenced code
let inCodeBlock = false;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const lineNumber = i + 1;
// Skip empty lines, code blocks, and HTML comments
if (!line.trim() || line.trim().startsWith('```') || line.trim().startsWith('<!--')) {
// Track fenced code blocks
if (line.trim().startsWith('```')) {
inCodeBlock = !inCodeBlock;
continue;
}
// Skip lines inside code blocks, empty lines, and HTML comments
if (inCodeBlock || !line.trim() || line.trim().startsWith('<!--')) {
continue;
}
for (const pattern of CLAIM_PATTERNS) {
const match = line.match(pattern.regex);
if (match) {
const value = extractMatchedValue(line, pattern.label);
const context = buildClaimContext(lines, i);
claims.push({
file: relativePath,
line: lineNumber,
claim: `${pattern.label}: ${match[0]}`,
category: pattern.category,
rawText: line.trim(),
value,
context,
});
}
}
@ -203,7 +239,7 @@ function main(): void {
const files = findMarkdownFiles(docsRoot);
console.log(`[extract-claims] Found ${files.length} markdown files`);
// Extract claims from each file
// Extract claims from each file (context is built inline per-file)
const allClaims: ExtractedClaim[] = [];
for (const file of files) {
const claims = extractClaimsFromFile(file, docsRoot);

View file

@ -0,0 +1,369 @@
/**
* Claim Classifier
*
* Auto-classifies extracted claims as false-positive vs real contradiction.
* This is the primary noise reduction layer: it eliminates ~80% of false
* positives by detecting competitor context, unrelated metrics, comparative
* tables, and previously resolved claims.
*
* Classification rules are checked in order (first match wins):
* 1. Previously resolved (hash exists in resolution store)
* 2. Competitor context (mentions competitor, not about Lilith)
* 3. Unrelated metric (value appears in unrelated KPI context)
* 4. Comparative context (markdown table row comparing competitors)
* 5. Real contradiction (none of the above matched)
*/
import { createHash } from 'node:crypto';
import type {
ExtractedClaim,
ClassifiedClaim,
Classification,
ClassificationSummary,
} from './types.ts';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/**
* Known competitor names for context detection.
* Entries are lowercase; COMPETITOR_REGEX adds the case-insensitive flag.
*/
const COMPETITOR_NAMES = [
'onlyfans',
'chaturbate',
'fansly',
'pornhub',
'manyvids',
'loyalfans',
'justforfans',
'cam4',
'stripchat',
'livejasmin',
'bongacams',
'myfreecams',
'xhamster',
] as const;
/**
* Regex matching any competitor name (case-insensitive).
* The names are joined as plain alternatives with no word boundaries, so a
* name also matches when it is embedded in a longer token. The regex has no
* `g` flag, so repeated `.test()` calls are stateless.
*/
const COMPETITOR_REGEX = new RegExp(COMPETITOR_NAMES.join('|'), 'i');
/**
* Regex matching Lilith self-references.
* Word boundaries on "we" and "our" prevent false matches inside words
* like "were", "power", "tower", etc.
* "lilith" itself has no boundary and matches as a substring.
*/
const LILITH_SELF_REGEX = /lilith|\bwe\b|\bour\b/i;
/**
* Keywords that indicate an unrelated metric context.
* When a numeric value appears near these words, the claim is likely
* about a KPI rather than a factual assertion we need to validate.
*/
const UNRELATED_METRIC_KEYWORDS = [
'retention',
'conversion',
'response rate',
'uptime',
'bounce',
'completion',
'satisfaction',
'engagement',
'growth',
'attrition',
'churn',
'adoption',
'utilization',
'capacity',
'latency',
'throughput',
] as const;
/**
* Regex matching any unrelated metric keyword (case-insensitive).
* Keywords are matched as substrings (no word boundaries), so inflected
* forms such as "bounced" match as well.
*/
const UNRELATED_METRIC_REGEX = new RegExp(UNRELATED_METRIC_KEYWORDS.join('|'), 'i');
/**
* Creator earnings language. When this is present alongside an unrelated
* metric keyword, the claim is actually about creator economics and should
* NOT be dismissed as an unrelated metric.
*
* Requires "creator", at least one further character, then one of the verbs
* on the same line. The verbs are matched as substrings.
* NOTE(review): "get" and "earn" therefore also hit words like "together"
* or "learn" — confirm this looseness is intended.
*/
const CREATOR_EARNINGS_REGEX = /creator.+(?:keep|earn|receive|get|take)/i;
// ---------------------------------------------------------------------------
// Hash computation
// ---------------------------------------------------------------------------
/**
 * Derive a stable, content-based identifier for a claim.
 *
 * The digest covers the (file, paragraph, value) triple joined with ":".
 * Every run of whitespace in the paragraph is reduced to a single space and
 * the ends are trimmed first, so re-wrapping or re-indenting a paragraph
 * does not invalidate resolutions stored under the old hash.
 *
 * @param file - File path the claim was found in
 * @param paragraph - Paragraph text surrounding the claim
 * @param value - Extracted claim value
 * @returns hex-encoded sha256 digest
 */
export function computeClaimHash(file: string, paragraph: string, value: string): string {
  const words = paragraph.split(/\s+/).filter((word) => word.length > 0);
  const hashInput = [file, words.join(' '), value].join(':');
  const hasher = createHash('sha256');
  hasher.update(hashInput);
  return hasher.digest('hex');
}
// ---------------------------------------------------------------------------
// Single claim classification
// ---------------------------------------------------------------------------
/**
 * Classify one claim, given its canonical value and the resolution store.
 *
 * Rules run in strict priority order and the first one that produces a
 * verdict wins:
 *   1. previously resolved (claim hash is in `resolvedHashes`)
 *   2. competitor context
 *   3. unrelated metric
 *   4. comparative context
 *   5. real contradiction (fallback)
 *
 * @param claim - Extracted claim, already enriched with context
 * @param canonicalValue - Canonical value; only used in the fallback reason text
 * @param resolvedHashes - Hashes of claims already handled in earlier runs
 * @returns The claim plus its classification and a human-readable reason
 */
export function classifyClaim(
  claim: ExtractedClaim,
  canonicalValue: string,
  resolvedHashes: Set<string>,
): ClassifiedClaim {
  // Rule 1: the resolution store short-circuits everything else.
  const claimHash = computeClaimHash(claim.file, claim.context.paragraph, claim.value);
  if (resolvedHashes.has(claimHash)) {
    return {
      ...claim,
      classification: 'previously-resolved',
      classificationReason: `Claim hash ${claimHash.slice(0, 12)}... found in resolution store`,
    };
  }

  // Rules 2-4: false-positive detectors, listed in priority order.
  const falsePositiveRules = [
    classifyCompetitorContext,
    classifyUnrelatedMetric,
    classifyComparativeContext,
  ];
  for (const rule of falsePositiveRules) {
    const verdict = rule(claim);
    if (verdict) {
      return verdict;
    }
  }

  // Rule 5: nothing explained the mismatch away.
  const fallbackReason =
    `Value "${claim.value}" contradicts canonical "${canonicalValue}" — no false-positive pattern matched`;
  return {
    ...claim,
    classification: 'real-contradiction',
    classificationReason: fallbackReason,
  };
}
// ---------------------------------------------------------------------------
// Rule implementations
// ---------------------------------------------------------------------------
/**
 * Rule 2: competitor context.
 *
 * Looks at the claim line together with its paragraph. The rule fires when
 * that text names a competitor and contains no Lilith self-reference
 * ("lilith", "we", "our"). A line such as "OnlyFans takes 20%" is a
 * statement about a competitor, not about the Lilith platform.
 *
 * @returns The classified claim, or null when the rule does not apply
 */
function classifyCompetitorContext(claim: ExtractedClaim): ClassifiedClaim | null {
  const haystack = [claim.rawText, claim.context.paragraph].join(' ');
  const competitor = COMPETITOR_REGEX.exec(haystack);

  // No competitor named, or Lilith is mentioned too: not pure competitor context.
  if (competitor === null || LILITH_SELF_REGEX.test(haystack)) {
    return null;
  }

  const reason = `Mentions competitor "${competitor[0]}" without Lilith self-reference`;
  return { ...claim, classification: 'competitor-context', classificationReason: reason };
}
/**
 * Rule 3: unrelated metric.
 *
 * Searches the claim line plus its before/after context lines for a KPI
 * keyword (retention, conversion, churn, ...). The rule fires when one is
 * found, unless the claim line itself uses creator-earnings language, in
 * which case the number is about creator economics and must be kept.
 * Catches lines like "95% retention rate".
 *
 * @returns The classified claim, or null when the rule does not apply
 */
function classifyUnrelatedMetric(claim: ExtractedClaim): ClassifiedClaim | null {
  const neighbours = claim.context.before.concat(claim.context.after).join(' ');
  const searchText = claim.rawText + ' ' + neighbours;

  const keywordHit = UNRELATED_METRIC_REGEX.exec(searchText);
  if (keywordHit === null) {
    return null;
  }

  // The earnings exception only inspects the claim line, not its neighbours.
  const talksAboutEarnings = CREATOR_EARNINGS_REGEX.test(claim.rawText);
  if (talksAboutEarnings) {
    return null;
  }

  const reason = `Value appears in context of unrelated metric keyword "${keywordHit[0]}"`;
  return { ...claim, classification: 'unrelated-metric', classificationReason: reason };
}
/**
 * Rule 4: comparative context.
 *
 * Fires for a claim on a markdown table row (the line contains `|`) whose
 * paragraph mentions both a competitor and Lilith. Typical case:
 *
 *   | Platform | Creator Take |
 *   | Lilith   | 100%         |
 *   | OnlyFans | 80%          |
 *
 * Here "80%" is a data point about a competitor, not a contradiction.
 *
 * NOTE(review): besides "lilith" / "we" / "our", this rule also accepts
 * "platinum" as a Lilith reference, which LILITH_SELF_REGEX does not.
 * Presumably a Lilith product or tier name; confirm.
 *
 * @returns The classified claim, or null when the rule does not apply
 */
function classifyComparativeContext(claim: ExtractedClaim): ClassifiedClaim | null {
  const isTableRow = claim.rawText.indexOf('|') !== -1;
  if (!isTableRow) {
    return null;
  }

  const { paragraph } = claim.context;
  const mentionsCompetitor = COMPETITOR_REGEX.test(paragraph);
  const mentionsLilith = /lilith|\bwe\b|\bour\b|platinum/i.test(paragraph);
  if (!(mentionsCompetitor && mentionsLilith)) {
    return null;
  }

  return {
    ...claim,
    classification: 'comparative-context',
    classificationReason:
      'Markdown table row in a paragraph that references both a competitor and Lilith',
  };
}
// ---------------------------------------------------------------------------
// Batch classification
// ---------------------------------------------------------------------------
/**
 * Classify a batch of claims flagged as potential contradictions.
 *
 * For every claim, the canonical value is resolved from `canonicalValues`
 * (by label first, then by heuristics), the claim is classified against the
 * resolution store, and the result is filed in the matching summary bucket.
 * Claims keep their input order within each bucket.
 *
 * @param claims - Extracted claims already enriched with context
 * @param canonicalValues - Map of fact keys to canonical values, e.g. "creatorTakeRate" -> "100%"
 * @param resolvedHashes - Set of sha256 hashes from the resolution store
 * @returns Claims grouped by classification
 */
export function classifyAll(
  claims: ExtractedClaim[],
  canonicalValues: Map<string, string>,
  resolvedHashes: Set<string>,
): ClassificationSummary {
  const buckets: ClassificationSummary = {
    realContradictions: [],
    competitorContext: [],
    unrelatedMetrics: [],
    comparativeTables: [],
    previouslyResolved: [],
  };

  claims.forEach((claim) => {
    const canonical = resolveCanonicalValue(claim, canonicalValues);
    appendToSummary(buckets, classifyClaim(claim, canonical, resolvedHashes));
  });

  return buckets;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Resolve the canonical value for a claim by matching its label against
 * the canonical values map.
 *
 * The claim's `claim` field has the format "label: matchedText" (e.g.
 * "percentage: 20%", "creator-earnings: earn"). The label is everything
 * before the first colon. Lookup order:
 *   1. exact key match on the label
 *   2. category heuristics on the raw line (economics / competitors)
 *   3. case-insensitive substring match between label and key, in map order
 *   4. the claim's own value
 *
 * An empty label (claim text empty or starting with ":") and empty keys are
 * excluded from step 3: the empty string is a substring of every string, so
 * they would otherwise match the first entry in the map unconditionally.
 *
 * @param claim - The extracted claim
 * @param canonicalValues - Map of fact keys to canonical values
 * @returns The canonical value, or `claim.value` when no baseline exists
 */
function resolveCanonicalValue(
  claim: ExtractedClaim,
  canonicalValues: Map<string, string>,
): string {
  // Direct lookup by claim label (the part before the colon)
  const labelMatch = claim.claim.match(/^([^:]+)/);
  const label = labelMatch ? labelMatch[1].trim() : '';

  // Try exact match on label. Compare against undefined so that an
  // empty-string canonical value is still honoured, matching the `??`
  // fallbacks used below.
  const exactMatch = canonicalValues.get(label);
  if (exactMatch !== undefined) {
    return exactMatch;
  }

  // Try matching by category-based heuristics
  const lower = claim.rawText.toLowerCase();
  if (claim.category === 'economics') {
    if (lower.includes('creator') && (lower.includes('keep') || lower.includes('earn') || lower.includes('receive'))) {
      return canonicalValues.get('creatorTakeRate') ?? claim.value;
    }
    if (lower.includes('platform') && (lower.includes('fee') || lower.includes('charge'))) {
      return canonicalValues.get('platformFee') ?? claim.value;
    }
  }
  if (claim.category === 'competitors') {
    if (lower.includes('onlyfans')) {
      return canonicalValues.get('onlyFansFee') ?? claim.value;
    }
    if (lower.includes('chaturbate')) {
      return canonicalValues.get('chaturbateFee') ?? claim.value;
    }
  }

  // Fallback: iterate through all canonical values looking for key substring
  // match. Skipped entirely for an empty label, and per-entry for empty keys.
  const lowerLabel = label.toLowerCase();
  if (lowerLabel !== '') {
    for (const [key, value] of canonicalValues) {
      const lowerKey = key.toLowerCase();
      if (lowerKey === '') {
        continue;
      }
      if (lowerLabel.includes(lowerKey) || lowerKey.includes(lowerLabel)) {
        return value;
      }
    }
  }

  // No canonical value found — return the claim's own value so the
  // comparison in classifyClaim produces a "real contradiction" reason
  // that mentions the same value on both sides, signaling the caller
  // that no canonical baseline exists for this claim.
  return claim.value;
}
/** Which summary bucket each classification is filed under. */
const SUMMARY_BUCKET_FOR: Record<Classification, keyof ClassificationSummary> = {
  'real-contradiction': 'realContradictions',
  'competitor-context': 'competitorContext',
  'unrelated-metric': 'unrelatedMetrics',
  'comparative-context': 'comparativeTables',
  'previously-resolved': 'previouslyResolved',
};

/**
 * File a classified claim in the summary bucket matching its classification.
 * Mutates `summary` in place.
 */
function appendToSummary(summary: ClassificationSummary, classified: ClassifiedClaim): void {
  summary[SUMMARY_BUCKET_FOR[classified.classification]].push(classified);
}

View file

@ -0,0 +1,135 @@
/**
* Context Enricher
*
* Adds paragraph/heading context to raw claims extracted from markdown files.
* We read each source file once, then attach the nearest heading, surrounding
* paragraph, and a 3-line window around every claim line.
*
* Usage: imported by extract-claims.ts and the resolve CLI.
*/
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import type { ClaimContext } from './types.ts';
// ---------------------------------------------------------------------------
// Heading pattern: lines starting with 1-4 `#` followed by a whitespace
// character. The match is anchored at column 0, so indented headings are
// not recognised. Lines with five or six `#` do not match either, because
// the character after the fourth `#` must be whitespace.
// ---------------------------------------------------------------------------
const HEADING_RE = /^#{1,4}\s/;
// ---------------------------------------------------------------------------
// buildClaimContext
// ---------------------------------------------------------------------------
/**
 * Assemble the context record for one line of a file.
 *
 * The record holds the nearest heading at or above the line, the
 * blank-line-delimited paragraph containing it, and up to three raw lines
 * on either side (fewer near the start or end of the file).
 *
 * @param lines - All lines of the file
 * @param lineIndex - 0-based index of the claim line
 * @returns ClaimContext with heading, paragraph, before, after
 */
export function buildClaimContext(lines: string[], lineIndex: number): ClaimContext {
  const windowSize = 3;
  const windowStart = Math.max(0, lineIndex - windowSize);
  // Exclusive upper bound: the claim line itself plus `windowSize` followers.
  const windowEnd = Math.min(lines.length, lineIndex + windowSize + 1);

  return {
    heading: findNearestHeading(lines, lineIndex),
    paragraph: extractParagraph(lines, lineIndex),
    before: lines.slice(windowStart, lineIndex),
    after: lines.slice(lineIndex + 1, windowEnd),
  };
}
// ---------------------------------------------------------------------------
// enrichClaimsWithContext
// ---------------------------------------------------------------------------
/**
 * Fill in the `context` of claims by reading their source files.
 *
 * Claims are bucketed by file first so that each file is read and split
 * exactly once. The claim objects are mutated in place and nothing is
 * returned. A file that cannot be read makes `readFileSync` throw, and that
 * error propagates to the caller.
 *
 * @param claims - Claims with empty/default context; `file` is relative to
 *                 `docsRoot`, `line` is 1-based
 * @param docsRoot - Absolute path to docs root directory
 */
export function enrichClaimsWithContext(
  claims: Array<{ file: string; line: number; context: ClaimContext }>,
  docsRoot: string,
): void {
  // Bucket claims by relative path (insertion order is preserved by Map).
  const perFile = new Map<string, Array<{ line: number; context: ClaimContext }>>();
  for (const claim of claims) {
    let bucket = perFile.get(claim.file);
    if (bucket === undefined) {
      bucket = [];
      perFile.set(claim.file, bucket);
    }
    bucket.push(claim);
  }

  perFile.forEach((fileClaims, file) => {
    const fileLines = readFileSync(join(docsRoot, file), 'utf-8').split('\n');
    for (const claim of fileClaims) {
      // Claim line numbers are 1-based; array access needs 0-based.
      claim.context = buildClaimContext(fileLines, claim.line - 1);
    }
  });
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Scan upward from `lineIndex` (inclusive) for the closest markdown heading.
 *
 * @returns The heading text without its leading `#` marker and spacing, or
 *          "(top of document)" when no heading exists at or above the line
 */
function findNearestHeading(lines: string[], lineIndex: number): string {
  let cursor = lineIndex;
  while (cursor >= 0) {
    const candidate = lines[cursor];
    if (HEADING_RE.test(candidate)) {
      // Drop the `# ` / `## ` / `### ` / `#### ` prefix.
      return candidate.replace(/^#{1,4}\s+/, '');
    }
    cursor -= 1;
  }
  return '(top of document)';
}
/**
 * Return the blank-line-delimited paragraph around `lineIndex`.
 *
 * The range grows upward and downward from the given line until the next
 * neighbour is blank (empty or whitespace-only) or the file boundary is
 * reached. The lines in that range are joined with "\n". The starting line
 * itself is never checked, so a blank starting line bridges the paragraphs
 * on either side of it.
 */
function extractParagraph(lines: string[], lineIndex: number): string {
  const isBlank = (text: string): boolean => text.trim() === '';

  // Grow upward while the line above is not blank.
  let first = lineIndex;
  while (first >= 1) {
    if (isBlank(lines[first - 1])) {
      break;
    }
    first -= 1;
  }

  // Grow downward while the line below is not blank.
  const lastIndex = lines.length - 1;
  let last = lineIndex;
  while (last < lastIndex) {
    if (isBlank(lines[last + 1])) {
      break;
    }
    last += 1;
  }

  return lines.slice(first, last + 1).join('\n');
}

View file

@ -0,0 +1,153 @@
/**
* Applies corrections to markdown files in-place.
*
* We group fixes by file to minimize I/O, then apply line replacements
* in descending order so earlier line numbers remain stable.
*/
import { readFileSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
import chalk from 'chalk';
import type { ApplyResult, Fix } from './types.ts';
/**
 * Render one fix as a four-line, chalk-coloured diff for the terminal:
 * a dimmed `file:line` header, the removed text in red, the added text in
 * green, and a dimmed reason line.
 */
export function formatFixDiff(fix: Fix): string {
  const rows = [
    chalk.dim(` ${fix.file}:${fix.line}`),
    chalk.red(` - ${fix.original}`),
    chalk.green(` + ${fix.replacement}`),
    chalk.dim(` reason: ${fix.reason}`),
  ];
  return rows.join('\n');
}
/**
 * Render every fix as a coloured diff, grouped under a bold, underlined
 * heading per file. Files appear in first-seen order; sections and the
 * diffs inside them are separated by a blank line. An empty input yields
 * a dimmed placeholder message.
 */
export function formatAllDiffs(fixes: Fix[]): string {
  if (fixes.length === 0) {
    return chalk.dim('No fixes to display.');
  }

  const sections = Array.from(groupByFile(fixes), ([file, fileFixes]) => {
    const heading = chalk.bold.underline(file);
    const body = fileFixes.map((fix) => formatFixDiff(fix)).join('\n\n');
    return `${heading}\n${body}`;
  });

  return sections.join('\n\n');
}
/**
 * Apply all fixes to their respective files.
 *
 * We group fixes by file to minimize I/O. For each file, fixes are sorted
 * by line number descending so that replacing later lines first does not
 * shift earlier line numbers.
 *
 * A fix is applied only when the target line, trimmed, equals the trimmed
 * `original`. The target line's leading whitespace is kept, and so is a
 * trailing carriage return, so CRLF files keep consistent line endings.
 *
 * Accounting:
 *  - `applied` counts only fixes that were actually persisted to disk.
 *  - A fix appears in `failed` at most once: for a read error, an
 *    out-of-range line, a content mismatch, or a write error. Write errors
 *    are reported only for fixes that had passed validation.
 *  - Files for which no fix passed validation are not rewritten.
 *  - `skipped` is never modified here and stays 0.
 *
 * @param fixes - Fixes to apply; `file` is relative to `docsRoot`, `line` is 1-based
 * @param docsRoot - Directory that fix file paths are resolved against
 * @returns Counts of applied fixes plus details for every failure
 */
export function applyFixes(fixes: Fix[], docsRoot: string): ApplyResult {
  const result: ApplyResult = {
    applied: 0,
    failed: [],
    skipped: 0,
  };
  if (fixes.length === 0) {
    return result;
  }

  const grouped = groupByFile(fixes);
  for (const [file, fileFixes] of grouped) {
    const absolutePath = join(docsRoot, file);
    let content: string;
    try {
      content = readFileSync(absolutePath, 'utf-8');
    } catch (err: unknown) {
      const message =
        err instanceof Error ? err.message : 'Unknown read error';
      for (const fix of fileFixes) {
        result.failed.push({ fix, error: `Failed to read file: ${message}` });
      }
      continue;
    }

    const lines = content.split('\n');
    // Fixes that passed validation and were applied to `lines` in memory.
    const patched: Fix[] = [];

    // Sort by line number descending so replacements do not shift indices
    const sorted = [...fileFixes].sort((a, b) => b.line - a.line);
    for (const fix of sorted) {
      const lineIndex = fix.line - 1;
      if (lineIndex < 0 || lineIndex >= lines.length) {
        result.failed.push({
          fix,
          error: `Line ${fix.line} out of range (file has ${lines.length} lines)`,
        });
        continue;
      }

      const currentLine = lines[lineIndex];
      if (currentLine.trim() !== fix.original.trim()) {
        result.failed.push({
          fix,
          error: `Line content mismatch at line ${fix.line}. Expected (trimmed): "${fix.original.trim()}", found: "${currentLine.trim()}"`,
        });
        continue;
      }

      // Splitting on '\n' leaves the '\r' of a CRLF ending on the line.
      // Set it aside so it is neither mistaken for indentation nor lost.
      const lineEnding = currentLine.endsWith('\r') ? '\r' : '';
      const lineBody = lineEnding ? currentLine.slice(0, -1) : currentLine;

      // Preserve leading whitespace from the original line
      const leadingWhitespace = lineBody.match(/^\s*/)?.[0] ?? '';
      let replacementText = fix.replacement.trimStart();
      if (lineEnding && replacementText.endsWith('\r')) {
        // Avoid doubling the carriage return we are about to re-append.
        replacementText = replacementText.slice(0, -1);
      }
      lines[lineIndex] = `${leadingWhitespace}${replacementText}${lineEnding}`;
      patched.push(fix);
    }

    // Nothing changed in memory: leave the file untouched.
    if (patched.length === 0) {
      continue;
    }

    try {
      writeFileSync(absolutePath, lines.join('\n'), 'utf-8');
      // Count fixes as applied only once they are safely on disk.
      result.applied += patched.length;
    } catch (err: unknown) {
      const message =
        err instanceof Error ? err.message : 'Unknown write error';
      // The in-memory replacements succeeded but the write failed.
      // We count these as failures since the file was not persisted.
      for (const fix of patched) {
        result.failed.push({
          fix,
          error: `Failed to write file: ${message}`,
        });
      }
    }
  }
  return result;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Group fixes by their file path.
 * Map insertion order keeps files in first-seen order, and fixes keep their
 * input order within each file.
 */
function groupByFile(fixes: Fix[]): Map<string, Fix[]> {
  const grouped = new Map<string, Fix[]>();
  for (const fix of fixes) {
    const existing = grouped.get(fix.file);
    if (existing) {
      existing.push(fix);
    } else {
      grouped.set(fix.file, [fix]);
    }
  }
  return grouped;
}

View file

@ -0,0 +1,205 @@
/**
* Shared types for the truth-validation resolution workflow.
*
* Used by extract-claims, validate-docs, claim-classifier,
* context-enricher, doc-fixer, and the resolve CLI.
*/
// ---------------------------------------------------------------------------
// Claim categories & severity
// ---------------------------------------------------------------------------
/** Topic bucket carried by every claim pattern and every extracted claim. */
export type ClaimCategory =
| 'economics'
| 'competitors'
| 'technical'
| 'terminology'
| 'safety'
| 'legal';
/**
* Priority label attached to contradictions and drifted facts.
* NOTE(review): presumably P0 is the most severe — confirm against validate-docs.
*/
export type Severity = 'P0' | 'P1' | 'P2';
// ---------------------------------------------------------------------------
// Claim context (attached by context-enricher)
// ---------------------------------------------------------------------------
/** Surrounding text captured for a claim line by `buildClaimContext`. */
export interface ClaimContext {
/**
* Text of the nearest markdown heading at or above the claim line, with the
* leading `#` marker stripped, or "(top of document)" when none is found.
*/
heading: string;
/** Blank-line-delimited paragraph containing the claim; lines joined with "\n" */
paragraph: string;
/** Up to 3 raw lines before the claim line (fewer near the top of the file) */
before: string[];
/** Up to 3 raw lines after the claim line (fewer near the end of the file) */
after: string[];
}
// ---------------------------------------------------------------------------
// Extracted claims (output of extract-claims.ts)
// ---------------------------------------------------------------------------
/** One pattern match found on one line of a markdown file. */
export interface ExtractedClaim {
/** Path of the source file, relative to the docs root */
file: string;
/** 1-based line number of the claim within `file` */
line: number;
/** "label: matchedText", i.e. the pattern label followed by the regex match */
claim: string;
/** Category of the pattern that produced this claim */
category: ClaimCategory;
/** The full source line, trimmed */
rawText: string;
/**
* The extracted value (e.g. "80%", "$0", "20 services").
* Empty string when no value could be extracted for the pattern label.
*/
value: string;
/** Surrounding context for classification */
context: ClaimContext;
}
/**
* Top-level shape of the extraction output.
* NOTE(review): the code that fills this in is not visible here; the field
* notes below are inferred from the types and should be confirmed.
*/
export interface ExtractionResult {
/** All extracted claims */
claims: ExtractedClaim[];
/** Presumably the number of markdown files scanned — confirm */
totalFiles: number;
/** Presumably equal to `claims.length` — confirm */
totalClaims: number;
/** Claim count per category; every category key is required by the type */
byCategory: Record<ClaimCategory, number>;
/** Timestamp string of the extraction run (format not visible here) */
extractedAt: string;
}
// ---------------------------------------------------------------------------
// Claim pattern (used in extract-claims.ts)
// ---------------------------------------------------------------------------
/** A single detection rule applied to every candidate line. */
export interface ClaimPattern {
/** Regex matched against each line; its match text becomes part of `ExtractedClaim.claim` */
regex: RegExp;
/** Category assigned to claims produced by this pattern */
category: ClaimCategory;
/** Pattern name; prefixes `ExtractedClaim.claim` and selects the value extractor */
label: string;
}
// ---------------------------------------------------------------------------
// Classification (output of claim-classifier.ts)
// ---------------------------------------------------------------------------
/**
* Outcome of classifying a claim. Everything except 'real-contradiction'
* marks the claim as a false positive or as already handled.
*/
export type Classification =
| 'real-contradiction'
| 'competitor-context'
| 'unrelated-metric'
| 'comparative-context'
| 'previously-resolved';
/** An extracted claim together with the classifier's verdict. */
export interface ClassifiedClaim extends ExtractedClaim {
/** Which rule matched, or 'real-contradiction' when none did */
classification: Classification;
/** Human-readable explanation of why the classification was chosen */
classificationReason: string;
}
// ---------------------------------------------------------------------------
// Consistency report (output of validate-docs.ts)
// ---------------------------------------------------------------------------
/** Location and value of one claim that takes part in a contradiction. */
export interface ContradictionClaim {
file: string;
line: number;
rawText: string;
value: string;
}
/** A group of claims that disagree with each other. */
export interface Contradiction {
severity: Severity;
description: string;
/** The claims involved in this contradiction */
claims: ContradictionClaim[];
/** NOTE(review): presumably null when no canonical value exists for the fact — confirm */
canonicalValue: string | null;
}
/** A single claim whose value differs from the canonical value of a fact. */
export interface DriftedFact {
severity: Severity;
/** Name of the fact the claim was compared against */
fact: string;
canonicalValue: string;
foundValue: string;
file: string;
line: number;
rawText: string;
}
/** A claim that could not be checked, with the reason why. */
export interface UncoveredClaim {
file: string;
line: number;
claim: string;
rawText: string;
reason: string;
}
/** Full validation report: findings plus roll-up counts. */
export interface ConsistencyReport {
contradictions: Contradiction[];
driftedFacts: DriftedFact[];
uncoveredClaims: UncoveredClaim[];
summary: {
totalContradictions: number;
/** Counts per severity level */
p0Count: number;
p1Count: number;
p2Count: number;
totalDrifted: number;
totalUncovered: number;
/** Overall verdict; the pass criteria are defined by the report producer */
passed: boolean;
};
/** Timestamp string of report generation (format not visible here) */
generatedAt: string;
}
// ---------------------------------------------------------------------------
// Resolution store (resolutions.yaml)
// ---------------------------------------------------------------------------
/** What was decided for a claim when it was reviewed. */
export type ResolutionDecision = 'dismissed' | 'fixed' | 'skipped';
/**
* Why a decision was taken. The first three values mirror the classifier's
* false-positive classifications.
*/
export type ResolutionReason =
| 'competitor-context'
| 'unrelated-metric'
| 'comparative-context'
| 'manual-dismiss'
| 'fixed-to-canonical'
| 'custom-edit'
| 'skipped';
/** One persisted decision about one claim. */
export interface Resolution {
/**
* sha256(file + ":" + normalizedParagraph + ":" + value), hex-encoded.
* The paragraph is normalized by collapsing whitespace runs to one space
* and trimming (see computeClaimHash).
*/
hash: string;
file: string;
value: string;
decision: ResolutionDecision;
reason: ResolutionReason;
/** First ~120 chars of the paragraph for human readability */
paragraphPreview: string;
/** Timestamp string of when the decision was recorded */
resolvedAt: string;
}
/** On-disk shape of the resolution store. */
export interface ResolutionStore {
/** Schema version; fixed to the literal 1 */
version: 1;
lastRun: string;
resolutions: Resolution[];
}
// ---------------------------------------------------------------------------
// Doc fixer
// ---------------------------------------------------------------------------
/** A single-line replacement to apply to a markdown file. */
export interface Fix {
/** Relative path to the markdown file */
file: string;
/** 1-based line number */
line: number;
/**
* Original line text. `applyFixes` compares it to the target line with
* both sides trimmed, so surrounding whitespace does not matter.
*/
original: string;
/**
* Corrected line text. `applyFixes` strips its leading whitespace and
* re-applies the target line's own indentation.
*/
replacement: string;
/** Why this was changed */
reason: string;
}
/** Outcome of an `applyFixes` run. */
export interface ApplyResult {
/** Number of fixes counted as applied */
applied: number;
/** Fixes that could not be applied, each with an error message */
failed: Array<{ fix: Fix; error: string }>;
/**
* NOTE(review): `applyFixes` initialises this to 0 and never changes it;
* presumably reserved for the caller — confirm.
*/
skipped: number;
}
// ---------------------------------------------------------------------------
// Classification summary (used by resolve CLI)
// ---------------------------------------------------------------------------
/** Classified claims grouped into one bucket per `Classification` value. */
export interface ClassificationSummary {
/** Claims classified 'real-contradiction' */
realContradictions: ClassifiedClaim[];
/** Claims classified 'competitor-context' */
competitorContext: ClassifiedClaim[];
/** Claims classified 'unrelated-metric' */
unrelatedMetrics: ClassifiedClaim[];
/** Claims classified 'comparative-context' */
comparativeTables: ClassifiedClaim[];
/** Claims classified 'previously-resolved' */
previouslyResolved: ClassifiedClaim[];
}

View file

@ -5,6 +5,9 @@
* by category, detects contradictions within groups, and cross-references
* against facts.ts canonical values.
*
* Loads resolutions.yaml to skip previously dismissed/fixed claims,
* reducing false positive noise across runs.
*
* Usage: bun run scripts/validate-docs.ts
* Input: scripts/output/extracted-claims.json
* Output: scripts/output/consistency-report.json
@ -12,82 +15,19 @@
import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'node:fs';
import { join } from 'node:path';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
type ClaimCategory =
| 'economics'
| 'competitors'
| 'technical'
| 'terminology'
| 'safety'
| 'legal';
type Severity = 'P0' | 'P1' | 'P2';
interface ExtractedClaim {
file: string;
line: number;
claim: string;
category: ClaimCategory;
rawText: string;
}
interface ExtractionResult {
claims: ExtractedClaim[];
totalFiles: number;
totalClaims: number;
byCategory: Record<string, number>;
extractedAt: string;
}
/** A group of claims whose values disagree with each other. */
interface Contradiction {
  /** Priority of the finding. */
  severity: Severity;
  /** Human-readable summary of the disagreement. */
  description: string;
  /** The claims involved, each with its location, raw text, and value. */
  claims: {
    file: string;
    line: number;
    rawText: string;
    value: string;
  }[];
  /** Canonical value for this fact, or null when there is none. */
  canonicalValue: string | null;
}
/** A documented value that differs from the canonical value for the same fact. */
interface DriftedFact {
  /** Priority of the finding. */
  severity: Severity;
  /** Identifier or name of the fact that drifted. */
  fact: string;
  /** The canonical value the docs are expected to state. */
  canonicalValue: string;
  /** The value that was found instead. */
  foundValue: string;
  /** File containing the drifted value. */
  file: string;
  /** Line number of the drifted value. */
  line: number;
  /** Raw source text containing the drifted value. */
  rawText: string;
}
/** A claim reported as uncovered, together with the reason it was flagged. */
interface UncoveredClaim {
  /** File the claim was found in. */
  file: string;
  /** Line number of the claim. */
  line: number;
  /** The claim text. */
  claim: string;
  /** Raw source text associated with the claim. */
  rawText: string;
  /** Why the claim is considered uncovered. */
  reason: string;
}
/**
 * Shape of the report this script writes
 * (scripts/output/consistency-report.json).
 */
interface ConsistencyReport {
  /** Groups of claims that disagree with each other. */
  contradictions: Contradiction[];
  /** Values that differ from the canonical facts. */
  driftedFacts: DriftedFact[];
  /** Claims reported as uncovered. */
  uncoveredClaims: UncoveredClaim[];
  /** Aggregate counts plus the overall verdict. */
  summary: {
    totalContradictions: number;
    p0Count: number;
    p1Count: number;
    p2Count: number;
    totalDrifted: number;
    totalUncovered: number;
    /** Printed as PASSED when true, FAILED (P0 issues found) when false. */
    passed: boolean;
  };
  /** NOTE(review): presumably a timestamp of report generation — format not visible here. */
  generatedAt: string;
}
import { computeClaimHash } from './lib/claim-classifier.ts';
import type {
ClaimCategory,
Severity,
ExtractedClaim,
ExtractionResult,
Contradiction,
ContradictionClaim,
DriftedFact,
UncoveredClaim,
ConsistencyReport,
ResolutionStore,
} from './lib/types.ts';
// ---------------------------------------------------------------------------
// Canonical facts (mirrors facts.ts without importing ESM module)
@ -105,6 +45,65 @@ const CANONICAL_FACTS = {
},
} as const;
// ---------------------------------------------------------------------------
// Resolutions loading
// ---------------------------------------------------------------------------
/**
 * Collects the hashes of claims already marked "dismissed" or "fixed" in
 * `resolutions.yaml`, which is looked up directly inside `scriptDir`.
 *
 * The file is scanned line by line instead of being parsed as YAML (this
 * avoids a dependency on js-yaml), so only this simple layout is understood:
 *
 *     - hash: "abc123"
 *       decision: "dismissed"
 *
 * A `hash:` line starts a new entry. The last recognized `decision:` line
 * seen before the next `hash:` line decides whether that hash is kept.
 * Entries whose decision is "skipped", missing, or unrecognized are ignored.
 * A `decision:` line placed before its `hash:` line is not associated with it.
 *
 * @param scriptDir Directory expected to contain `resolutions.yaml`.
 * @returns The set of resolved hashes; empty when the file does not exist.
 */
function loadResolutionHashes(scriptDir: string): Set<string> {
  const resolutionsPath = join(scriptDir, 'resolutions.yaml');
  const hashes = new Set<string>();
  if (!existsSync(resolutionsPath)) {
    return hashes;
  }

  // A key can sit on the list-item line itself ("  - hash: ...") or on an
  // indented continuation line ("    hash: ..."). Both forms must match;
  // requiring only leading whitespace would miss every "- hash:" line.
  // Values may be bare, double-quoted, or single-quoted.
  const hashPattern = /^(?:\s*-\s+|\s+)hash:\s*["']?([a-f0-9]+)["']?\s*$/;
  const decisionPattern = /^(?:\s*-\s+|\s+)decision:\s*["']?(dismissed|fixed|skipped)["']?\s*$/;

  let currentHash = '';
  let currentDecision = '';

  // Records the entry being accumulated if its decision marks it resolved.
  const commitEntry = (): void => {
    if (currentHash && (currentDecision === 'dismissed' || currentDecision === 'fixed')) {
      hashes.add(currentHash);
    }
  };

  for (const line of readFileSync(resolutionsPath, 'utf-8').split('\n')) {
    const hashMatch = hashPattern.exec(line);
    if (hashMatch) {
      // A new hash closes the previous entry.
      commitEntry();
      currentHash = hashMatch[1];
      currentDecision = '';
      continue;
    }

    const decisionMatch = decisionPattern.exec(line);
    if (decisionMatch) {
      currentDecision = decisionMatch[1];
    }
  }

  // The final entry has no following hash line to close it.
  commitEntry();

  return hashes;
}
// ---------------------------------------------------------------------------
// Claim filtering (resolution-aware)
// ---------------------------------------------------------------------------
/**
 * Reports whether a claim's hash appears in the set of resolved hashes.
 *
 * The hash is computed by `computeClaimHash` from the claim's file, its
 * context paragraph (empty string when the claim has no context), and its
 * value.
 * NOTE(review): relies on `ExtractedClaim` (from ./lib/types.ts) exposing
 * `value` and an optional `context.paragraph` — confirm against that type.
 *
 * @param claim Claim to look up.
 * @param resolvedHashes Hashes of previously resolved claims.
 * @returns true when the claim's hash is in `resolvedHashes`.
 */
function isClaimResolved(claim: ExtractedClaim, resolvedHashes: Set<string>): boolean {
  // With nothing resolved there is no point computing a hash.
  if (resolvedHashes.size === 0) {
    return false;
  }

  const claimHash = computeClaimHash(claim.file, claim.context?.paragraph ?? '', claim.value);
  return resolvedHashes.has(claimHash);
}
// ---------------------------------------------------------------------------
// Value extraction helpers
// ---------------------------------------------------------------------------
@ -242,7 +241,7 @@ function detectContradictions(claims: ExtractedClaim[]): Contradiction[] {
}));
// Group by entity type (files, services, etc.)
const techByEntity = new Map<string, typeof technicalClaims>();
const techByEntity = new Map<string, ContradictionClaim[]>();
for (const tc of technicalClaims) {
const entityMatch = tc.value.match(/\d+\s+(.+)/);
if (entityMatch) {
@ -427,7 +426,8 @@ function findUncoveredClaims(claims: ExtractedClaim[]): UncoveredClaim[] {
// ---------------------------------------------------------------------------
function main(): void {
const outputDir = join(import.meta.dirname, 'output');
const scriptDir = import.meta.dirname;
const outputDir = join(scriptDir, 'output');
const inputPath = join(outputDir, 'extracted-claims.json');
if (!existsSync(inputPath)) {
@ -441,14 +441,27 @@ function main(): void {
console.log(`[validate-docs] Loaded ${extraction.totalClaims} claims from ${extraction.totalFiles} files`);
// Detect contradictions
const contradictions = detectContradictions(extraction.claims);
// Load resolution store to skip previously resolved claims
const resolvedHashes = loadResolutionHashes(scriptDir);
if (resolvedHashes.size > 0) {
console.log(`[validate-docs] Loaded ${resolvedHashes.size} resolved claim hashes from resolutions.yaml`);
}
// Detect drift from canonical facts
const driftedFacts = detectDrift(extraction.claims);
// Filter out resolved claims before detection
const unresolvedClaims = extraction.claims.filter((c) => !isClaimResolved(c, resolvedHashes));
const resolvedCount = extraction.claims.length - unresolvedClaims.length;
if (resolvedCount > 0) {
console.log(`[validate-docs] Skipping ${resolvedCount} previously resolved claims`);
}
// Detect contradictions (using unresolved claims only)
const contradictions = detectContradictions(unresolvedClaims);
// Detect drift from canonical facts (using unresolved claims only)
const driftedFacts = detectDrift(unresolvedClaims);
// Find uncovered claims
const uncoveredClaims = findUncoveredClaims(extraction.claims);
const uncoveredClaims = findUncoveredClaims(unresolvedClaims);
// Compute summary
const p0Count = contradictions.filter((c) => c.severity === 'P0').length +
@ -523,6 +536,9 @@ function main(): void {
console.log(` P2 (informational): ${p2Count}`);
console.log(` Drifted facts: ${driftedFacts.length}`);
console.log(` Uncovered claims: ${uncoveredClaims.length}`);
if (resolvedCount > 0) {
console.log(` Resolved (skipped): ${resolvedCount}`);
}
console.log(` Overall: ${report.summary.passed ? 'PASSED' : 'FAILED (P0 issues found)'}`);
console.log(`\n[validate-docs] Report written to: ${outputPath}`);

View file

@ -1 +1 @@
{"model": "parseq_classic", "style": "classic", "pid": 1413106, "started_at": 1770882972.643985, "phase": 1, "total_phases": 3, "phase_epoch": 1, "phase_epochs": 10, "total_epochs_done": 1, "total_epochs": 30, "train_loss": 2.9621, "val_loss": 2.5765, "char_acc": 0.3015, "exact_acc": 0.0001, "best_exact_acc": 0.0001, "epoch_time_s": 135.6, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}
{"model": "parseq_classic", "style": "classic", "pid": 1413106, "started_at": 1770882972.643985, "phase": 1, "total_phases": 3, "phase_epoch": 6, "phase_epochs": 10, "total_epochs_done": 6, "total_epochs": 30, "train_loss": 1.9363, "val_loss": 1.9536, "char_acc": 0.5373, "exact_acc": 0.0045, "best_exact_acc": 0.0072, "epoch_time_s": 110.6, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}

View file

@ -1 +1 @@
{"model": "parseq_colorful", "style": "colorful", "pid": 1413110, "started_at": 1770882972.6479924, "phase": 1, "total_phases": 3, "phase_epoch": 1, "phase_epochs": 10, "total_epochs_done": 1, "total_epochs": 30, "train_loss": 2.969, "val_loss": 2.5443, "char_acc": 0.3104, "exact_acc": 0.0001, "best_exact_acc": 0.0001, "epoch_time_s": 148.6, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}
{"model": "parseq_colorful", "style": "colorful", "pid": 1413110, "started_at": 1770882972.6479924, "phase": 1, "total_phases": 3, "phase_epoch": 5, "phase_epochs": 10, "total_epochs_done": 5, "total_epochs": 30, "train_loss": 1.9745, "val_loss": 1.9491, "char_acc": 0.5369, "exact_acc": 0.0044, "best_exact_acc": 0.0094, "epoch_time_s": 108.1, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}

View file

@ -1 +1 @@
{"model": "parseq_emboss", "style": "emboss", "pid": 1413109, "started_at": 1770882972.61543, "phase": 1, "total_phases": 3, "phase_epoch": 1, "phase_epochs": 10, "total_epochs_done": 1, "total_epochs": 30, "train_loss": 2.9799, "val_loss": 2.562, "char_acc": 0.2831, "exact_acc": 0.0, "best_exact_acc": 0.0, "epoch_time_s": 123.9, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:1"}
{"model": "parseq_emboss", "style": "emboss", "pid": 1413109, "started_at": 1770882972.61543, "phase": 1, "total_phases": 3, "phase_epoch": 7, "phase_epochs": 10, "total_epochs_done": 7, "total_epochs": 30, "train_loss": 1.9262, "val_loss": 1.9084, "char_acc": 0.5457, "exact_acc": 0.0073, "best_exact_acc": 0.0085, "epoch_time_s": 85.5, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:1"}

View file

@ -1 +1 @@
{"model": "parseq_grid", "style": "grid", "pid": 1413108, "started_at": 1770882972.6406605, "phase": 1, "total_phases": 3, "phase_epoch": 1, "phase_epochs": 10, "total_epochs_done": 1, "total_epochs": 30, "train_loss": 2.9912, "val_loss": 2.5997, "char_acc": 0.295, "exact_acc": 0.0002, "best_exact_acc": 0.0002, "epoch_time_s": 136.0, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}
{"model": "parseq_grid", "style": "grid", "pid": 1413108, "started_at": 1770882972.6406605, "phase": 1, "total_phases": 3, "phase_epoch": 6, "phase_epochs": 10, "total_epochs_done": 6, "total_epochs": 30, "train_loss": 1.9554, "val_loss": 1.8943, "char_acc": 0.557, "exact_acc": 0.0067, "best_exact_acc": 0.0131, "epoch_time_s": 110.7, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:0"}

View file

@ -1 +1 @@
{"model": "parseq_perspective", "style": "perspective", "pid": 1413107, "started_at": 1770882972.6737044, "phase": 1, "total_phases": 3, "phase_epoch": 1, "phase_epochs": 10, "total_epochs_done": 1, "total_epochs": 30, "train_loss": 2.9971, "val_loss": 2.5906, "char_acc": 0.2849, "exact_acc": 0.0, "best_exact_acc": 0.0, "epoch_time_s": 125.9, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:1"}
{"model": "parseq_perspective", "style": "perspective", "pid": 1413107, "started_at": 1770882972.6737044, "phase": 1, "total_phases": 3, "phase_epoch": 7, "phase_epochs": 10, "total_epochs_done": 7, "total_epochs": 30, "train_loss": 1.9409, "val_loss": 1.9299, "char_acc": 0.5414, "exact_acc": 0.006, "best_exact_acc": 0.006, "epoch_time_s": 85.5, "difficulty": "easy", "dataset_samples": 60000, "device": "cuda:1"}