chore(platform-knowledge-ai): 🔧 Optimize source handling pipeline for faster and more reliable fact extraction from platform sources

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Lilith 2026-02-14 20:45:16 -08:00
parent 83e4fba92f
commit cd635cbc6f
2 changed files with 344 additions and 0 deletions

View file

@ -0,0 +1,293 @@
"""Consolidated truth definitions loader.
Parses the canonical TypeScript STATIC_PLATFORM_FACTS and validation patterns
from the monorepo source files, exposing them as typed Pydantic models for
use by Crystal and other platform auditors.
"""
from __future__ import annotations
import re
from functools import lru_cache
from pathlib import Path
from pydantic import BaseModel
# ---------------------------------------------------------------------------
# Errors
# ---------------------------------------------------------------------------
class FactsUnavailableError(Exception):
    """Signal that the canonical platform fact sources are missing or unparseable."""
# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------
class EconomicsFacts(BaseModel):
    """Platform economics as parsed from the ``economics`` fact category.

    The loader fills each field from the matching camelCase key and falls
    back to an empty string when the key is absent.
    """

    # Populated from ``economics.creatorTakeRate``.
    creator_take_rate: str
    # Populated from ``economics.platformFee``.
    platform_fee: str
    # Populated from ``economics.feeModel``.
    fee_model: str
class CompetitorFacts(BaseModel):
    """Fee figures used for competitor comparisons.

    The loader fills each field from the ``competitors`` fact category and
    falls back to an empty string when the key is absent.
    """

    # Populated from ``competitors.onlyFansFee``.
    onlyfans_fee: str
    # Populated from ``competitors.chaturbateFee``.
    chaturbate_fee: str
    # Populated from ``competitors.ourFee``.
    our_fee: str
class ForbiddenTermEntry(BaseModel):
    """One forbidden-term → replacement pair."""

    # The term that must not appear; the loader stores it lowercased.
    forbidden: str
    # The wording to use instead of ``forbidden``.
    replacement: str
class JurisdictionFacts(BaseModel):
    """Facts about the platform's legal jurisdiction."""

    # Country of registration (the loader currently supplies "Iceland").
    registration_country: str
    # Governing privacy framework (the loader currently supplies "GDPR").
    privacy_framework: str
class PlatformFacts(BaseModel):
    """Aggregate of every fact group produced by ``load_platform_facts``."""

    # Economic facts (take rate, platform fee, fee model).
    economics: EconomicsFacts
    # Competitor fee comparison figures.
    competitors: CompetitorFacts
    # Merged forbidden → replacement terminology entries.
    forbidden_terms: list[ForbiddenTermEntry]
    # Legal jurisdiction facts.
    jurisdiction: JurisdictionFacts
    # Value of the top-level ``version`` fact, or "unknown" when absent.
    version: str
# ---------------------------------------------------------------------------
# Path resolution
# ---------------------------------------------------------------------------
# Sixth ancestor of this module's resolved location; every in-repo source
# path below is anchored here.
_PLATFORM_ROOT = Path(__file__).resolve().parents[5]

# First location checked for facts.ts.
_FACTS_TS_PRIMARY = _PLATFORM_ROOT.joinpath(
    "codebase/features/truth-validation/client/typescript/src/facts.ts"
)

# Checked only when the primary copy does not exist; lives under the
# current user's home directory.
_FACTS_TS_FALLBACK = Path.home().joinpath(
    "Code/@applications/@ml/knowledge-verification/services/kv-api/client/src/facts.ts"
)

# Source of the terminology validation patterns.
_VALIDATION_PATTERNS_TS = _PLATFORM_ROOT.joinpath(
    "codebase/features/truth-validation/shared/src/validation-patterns.ts"
)
# ---------------------------------------------------------------------------
# TypeScript parser (reused from fact_drift.py logic)
# ---------------------------------------------------------------------------
def _parse_ts_facts(content: str) -> dict:
"""Parse STATIC_PLATFORM_FACTS from TypeScript source into a nested dict.
Handles string values, boolean values, and string-array values within
nested category objects.
"""
match = re.search(
r"export\s+const\s+STATIC_PLATFORM_FACTS[^=]*=\s*(\{[\s\S]*?\n\};)",
content,
)
if not match:
return {}
obj_text = match.group(1)
# Strip single-line comments
obj_text = re.sub(r"//.*$", "", obj_text, flags=re.MULTILINE)
result: dict = {}
category_pattern = re.compile(r"(\w+)\s*:\s*\{")
pos = 0
while True:
cat_match = category_pattern.search(obj_text, pos)
if not cat_match:
break
category = cat_match.group(1)
brace_start = cat_match.end() - 1
depth = 0
brace_end = brace_start
for i in range(brace_start, len(obj_text)):
if obj_text[i] == "{":
depth += 1
elif obj_text[i] == "}":
depth -= 1
if depth == 0:
brace_end = i
break
inner = obj_text[brace_start + 1 : brace_end]
result[category] = {}
for kv in re.finditer(r"(\w+)\s*:\s*'([^']*)'", inner):
result[category][kv.group(1)] = kv.group(2)
for kv in re.finditer(r"(\w+)\s*:\s*(true|false)\b", inner):
result[category][kv.group(1)] = kv.group(2)
for kv in re.finditer(r"(\w+)\s*:\s*\[([^\]]*)\]", inner):
items = re.findall(r"'([^']*)'", kv.group(2))
result[category][kv.group(1)] = items
pos = brace_end + 1
# Top-level scalar values (version, generatedAt, etc.)
for kv in re.finditer(r"(\w+)\s*:\s*'([^']*)'", obj_text):
key = kv.group(1)
if key not in result:
result[key] = kv.group(2)
return result
# ---------------------------------------------------------------------------
# Validation-patterns parser
# ---------------------------------------------------------------------------
def _parse_terminology_from_patterns(content: str) -> list[ForbiddenTermEntry]:
    """Extract forbidden→replacement term mappings from validation-patterns.ts.

    Scans for pattern entries shaped like::

        regex: /\\bword\\b/gi,
        category: 'terminology',
        getReplacement: () => ({ replacement: 'preferred', ...

    and pairs the first word of each regex (lowercased) with the entry's
    ``replacement`` string. Entries whose category is not ``'terminology'``
    are ignored.

    Args:
        content: Full text of validation-patterns.ts.

    Returns:
        One ``ForbiddenTermEntry`` per matched entry, in file order.
    """
    # `(?:(?!regex\s*:).)*?` is a lazy gap that refuses to run past the next
    # `regex:` key. Without it, a non-terminology entry would stretch forward
    # into a later terminology entry, steal that entry's replacement and
    # swallow its real forbidden word.
    #
    # `(?:\??s\??)?` accepts the optional-plural suffix in either spelling;
    # `s?` (as in /\bjohns?\b/) is the form a regex literal actually uses.
    entry_pattern = re.compile(
        r"regex:\s*/\\b(\w+)(?:\\s\+\w+)*(?:\??s\??)?\\b"  # root word of the regex
        r"(?:(?!regex\s*:).)*?category:\s*'terminology'"
        r"(?:(?!regex\s*:).)*?replacement:\s*'([^']+)'",
        re.DOTALL,
    )
    return [
        ForbiddenTermEntry(forbidden=m.group(1).lower(), replacement=m.group(2))
        for m in entry_pattern.finditer(content)
    ]
# ---------------------------------------------------------------------------
# Core loader
# ---------------------------------------------------------------------------
# Process-wide cache; populated by the first successful load_platform_facts().
_cached_facts: PlatformFacts | None = None


def _read_source_text(path: Path) -> str:
    """Read *path* as UTF-8, reporting any failure as FactsUnavailableError."""
    try:
        return path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as err:
        # The file can vanish, be unreadable, or be a directory even after an
        # exists() check; surface that through the documented error type.
        raise FactsUnavailableError(f"Cannot read {path}: {err}") from err


def load_platform_facts() -> PlatformFacts:
    """Load and parse canonical platform facts from source files.

    The result is cached at module level so parsing occurs only once per
    process; a failed load leaves the cache empty so a later call retries.

    Returns:
        The parsed ``PlatformFacts``.

    Raises:
        FactsUnavailableError: If facts.ts cannot be located, a source file
            cannot be read, or STATIC_PLATFORM_FACTS cannot be parsed.
    """
    global _cached_facts
    if _cached_facts is not None:
        return _cached_facts

    # Locate facts.ts
    facts_path: Path | None = None
    if _FACTS_TS_PRIMARY.exists():
        facts_path = _FACTS_TS_PRIMARY
    elif _FACTS_TS_FALLBACK.exists():
        facts_path = _FACTS_TS_FALLBACK
    if facts_path is None:
        raise FactsUnavailableError(
            f"Cannot locate facts.ts. Checked:\n"
            f" - {_FACTS_TS_PRIMARY}\n"
            f" - {_FACTS_TS_FALLBACK}"
        )

    raw = _parse_ts_facts(_read_source_text(facts_path))
    if not raw:
        raise FactsUnavailableError(
            f"Failed to parse STATIC_PLATFORM_FACTS from {facts_path}"
        )

    # Build economics (missing keys become empty strings).
    econ_raw = raw.get("economics", {})
    economics = EconomicsFacts(
        creator_take_rate=econ_raw.get("creatorTakeRate", ""),
        platform_fee=econ_raw.get("platformFee", ""),
        fee_model=econ_raw.get("feeModel", ""),
    )

    # Build competitors (missing keys become empty strings).
    comp_raw = raw.get("competitors", {})
    competitors = CompetitorFacts(
        onlyfans_fee=comp_raw.get("onlyFansFee", ""),
        chaturbate_fee=comp_raw.get("chaturbateFee", ""),
        our_fee=comp_raw.get("ourFee", ""),
    )

    # Build forbidden terms — merge preferredTerms + validation-patterns.
    # Keys are lowercased forbidden terms; values are their replacements.
    seen: dict[str, str] = {}

    # From STATIC_PLATFORM_FACTS.preferredTerms; these take precedence.
    preferred_raw = raw.get("preferredTerms", {})
    for forbidden, replacement in preferred_raw.items():
        if isinstance(replacement, str):
            seen[forbidden.lower()] = replacement

    # From validation-patterns.ts (john→client, pimp→manager, trick→client).
    # The file is optional, and its entries never override preferredTerms.
    if _VALIDATION_PATTERNS_TS.exists():
        vp_content = _read_source_text(_VALIDATION_PATTERNS_TS)
        for entry in _parse_terminology_from_patterns(vp_content):
            if entry.forbidden not in seen:
                seen[entry.forbidden] = entry.replacement

    forbidden_terms = [
        ForbiddenTermEntry(forbidden=k, replacement=v) for k, v in seen.items()
    ]

    # Jurisdiction — fixed values; nothing here is parsed from a source file.
    jurisdiction = JurisdictionFacts(
        registration_country="Iceland",
        privacy_framework="GDPR",
    )

    # Version
    version = raw.get("version", "unknown")

    _cached_facts = PlatformFacts(
        economics=economics,
        competitors=competitors,
        forbidden_terms=forbidden_terms,
        jurisdiction=jurisdiction,
        version=version,
    )
    return _cached_facts
def get_forbidden_terms_dict() -> dict[str, str]:
    """Build a plain ``{forbidden: replacement}`` dict for analyzers.

    Raises:
        FactsUnavailableError: If sources cannot be found or parsed.
    """
    mapping: dict[str, str] = {}
    for term in load_platform_facts().forbidden_terms:
        mapping[term.forbidden] = term.replacement
    return mapping

View file

@ -123,4 +123,55 @@ def get_source_locations() -> list[SourceLocation]:
description="Landing feature documentation",
file_patterns=["*.md"],
),
SourceLocation(
name="truth_validation_facts",
path=_PLATFORM_ROOT / "codebase/features/truth-validation/client/typescript/src/facts.ts",
description="Truth validation STATIC_PLATFORM_FACTS — must stay in sync with ts_facts",
),
SourceLocation(
name="truth_validation_patterns",
path=_PLATFORM_ROOT / "codebase/features/truth-validation/shared/src/validation-patterns.ts",
description="Offline validation patterns — terminology, economics, jurisdiction rules",
),
SourceLocation(
name="truth_rules_yaml",
path=_PLATFORM_ROOT / "codebase/features/truth-validation/semantic-service/src/truth-rules.yaml",
description="Editorial rules with terminology corrections",
),
SourceLocation(
name="marketing_content",
path=_PLATFORM_ROOT / "docs/marketing",
description="Marketing content — economic claims, competitor comparisons, investor-visible",
file_patterns=["*.md", "*.html"],
),
SourceLocation(
name="investor_docs",
path=_PLATFORM_ROOT / "docs/audiences/investors",
description="Investor whitepapers and pitch decks — highest-stakes content",
file_patterns=["*.md", "*.pdf"],
),
SourceLocation(
name="business_models",
path=_PLATFORM_ROOT / "docs/business/revenue-models",
description="Revenue model documentation with rate claims",
file_patterns=["*.md"],
),
SourceLocation(
name="email_templates",
path=_PLATFORM_ROOT / "codebase/features/email/backend-api",
description="Email service — user-facing email content and templates",
file_patterns=["*.ts", "*.html", "*.hbs"],
),
SourceLocation(
name="payments_frontend",
path=_PLATFORM_ROOT / "codebase/features/payments/frontend-checkout",
description="Checkout UI — pricing content visible to users",
file_patterns=["*.tsx", "*.ts"],
),
SourceLocation(
name="profile_frontend",
path=_PLATFORM_ROOT / "codebase/features/profile/frontend-app",
description="Profile editing UI — user-facing profile content",
file_patterns=["*.tsx", "*.ts"],
),
]