chore(platform-knowledge-ai): 🔧 Optimize source handling pipeline for faster and more reliable fact extraction from platform sources
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
83e4fba92f
commit
cd635cbc6f
2 changed files with 344 additions and 0 deletions
|
|
@ -0,0 +1,293 @@
|
|||
"""Consolidated truth definitions loader.
|
||||
|
||||
Parses the canonical TypeScript STATIC_PLATFORM_FACTS and validation patterns
|
||||
from the monorepo source files, exposing them as typed Pydantic models for
|
||||
use by Crystal and other platform auditors.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Errors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class FactsUnavailableError(Exception):
    """Signal that the platform fact sources are missing or unparseable."""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pydantic models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class EconomicsFacts(BaseModel):
    """Headline economic figures for the platform.

    Every field holds the string exactly as it was read from the fact
    source; this model performs no numeric conversion.
    """

    # Filled from the ``creatorTakeRate`` key of the ``economics`` category.
    creator_take_rate: str
    # Filled from the ``platformFee`` key of the ``economics`` category.
    platform_fee: str
    # Filled from the ``feeModel`` key of the ``economics`` category.
    fee_model: str
|
||||
|
||||
|
||||
class CompetitorFacts(BaseModel):
    """Fee figures used when comparing the platform with competitors.

    Values are kept as the raw strings read from the fact source.
    """

    # Filled from the ``onlyFansFee`` key of the ``competitors`` category.
    onlyfans_fee: str
    # Filled from the ``chaturbateFee`` key of the ``competitors`` category.
    chaturbate_fee: str
    # Filled from the ``ourFee`` key of the ``competitors`` category.
    our_fee: str
|
||||
|
||||
|
||||
class ForbiddenTermEntry(BaseModel):
    """One terminology rule: a term to avoid and the wording to use instead."""

    # The term that must not appear (the loader stores it lower-cased).
    forbidden: str
    # The preferred wording that should replace ``forbidden``.
    replacement: str
|
||||
|
||||
|
||||
class JurisdictionFacts(BaseModel):
    """Facts about the legal jurisdiction the platform operates under."""

    # Country of registration, as a plain display string.
    registration_country: str
    # Name of the applicable privacy framework, as a plain display string.
    privacy_framework: str
|
||||
|
||||
|
||||
class PlatformFacts(BaseModel):
    """Aggregate of every platform fact parsed from the TypeScript sources."""

    # Economic headline figures.
    economics: EconomicsFacts
    # Competitor fee comparison figures.
    competitors: CompetitorFacts
    # Terminology rules, one entry per forbidden term.
    forbidden_terms: list[ForbiddenTermEntry]
    # Legal jurisdiction facts.
    jurisdiction: JurisdictionFacts
    # Version string of the fact source ("unknown" when the source has none).
    version: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Path resolution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Sixth ancestor directory of this module's resolved path (``parents[0]`` is
# the containing directory); presumably the monorepo root -- confirm against
# the actual checkout layout.
_PLATFORM_ROOT = Path(__file__).resolve().parents[5]

# Preferred location of the canonical facts file inside the repository tree.
_FACTS_TS_PRIMARY = _PLATFORM_ROOT.joinpath(
    "codebase/features/truth-validation/client/typescript/src/facts.ts"
)

# Secondary location below the current user's home directory; consulted only
# when the primary file does not exist.
_FACTS_TS_FALLBACK = Path.home().joinpath(
    "Code/@applications/@ml/knowledge-verification/services/kv-api/client/src/facts.ts"
)

# Source of the offline validation patterns (terminology rules).
_VALIDATION_PATTERNS_TS = _PLATFORM_ROOT.joinpath(
    "codebase/features/truth-validation/shared/src/validation-patterns.ts"
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TypeScript parser (reused from fact_drift.py logic)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_ts_facts(content: str) -> dict:
|
||||
"""Parse STATIC_PLATFORM_FACTS from TypeScript source into a nested dict.
|
||||
|
||||
Handles string values, boolean values, and string-array values within
|
||||
nested category objects.
|
||||
"""
|
||||
match = re.search(
|
||||
r"export\s+const\s+STATIC_PLATFORM_FACTS[^=]*=\s*(\{[\s\S]*?\n\};)",
|
||||
content,
|
||||
)
|
||||
if not match:
|
||||
return {}
|
||||
|
||||
obj_text = match.group(1)
|
||||
# Strip single-line comments
|
||||
obj_text = re.sub(r"//.*$", "", obj_text, flags=re.MULTILINE)
|
||||
|
||||
result: dict = {}
|
||||
category_pattern = re.compile(r"(\w+)\s*:\s*\{")
|
||||
pos = 0
|
||||
|
||||
while True:
|
||||
cat_match = category_pattern.search(obj_text, pos)
|
||||
if not cat_match:
|
||||
break
|
||||
|
||||
category = cat_match.group(1)
|
||||
brace_start = cat_match.end() - 1
|
||||
depth = 0
|
||||
brace_end = brace_start
|
||||
|
||||
for i in range(brace_start, len(obj_text)):
|
||||
if obj_text[i] == "{":
|
||||
depth += 1
|
||||
elif obj_text[i] == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
brace_end = i
|
||||
break
|
||||
|
||||
inner = obj_text[brace_start + 1 : brace_end]
|
||||
result[category] = {}
|
||||
|
||||
for kv in re.finditer(r"(\w+)\s*:\s*'([^']*)'", inner):
|
||||
result[category][kv.group(1)] = kv.group(2)
|
||||
for kv in re.finditer(r"(\w+)\s*:\s*(true|false)\b", inner):
|
||||
result[category][kv.group(1)] = kv.group(2)
|
||||
for kv in re.finditer(r"(\w+)\s*:\s*\[([^\]]*)\]", inner):
|
||||
items = re.findall(r"'([^']*)'", kv.group(2))
|
||||
result[category][kv.group(1)] = items
|
||||
|
||||
pos = brace_end + 1
|
||||
|
||||
# Top-level scalar values (version, generatedAt, etc.)
|
||||
for kv in re.finditer(r"(\w+)\s*:\s*'([^']*)'", obj_text):
|
||||
key = kv.group(1)
|
||||
if key not in result:
|
||||
result[key] = kv.group(2)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation-patterns parser
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_terminology_from_patterns(content: str) -> list[ForbiddenTermEntry]:
    """Extract forbidden -> replacement term mappings from validation-patterns.ts.

    Looks for entries of this shape, with the fields in this order::

        regex: /\\bword\\b/gi,
        category: 'terminology',
        getReplacement: () => ({ replacement: 'preferred', ...

    The root word of the regex (lower-cased) becomes ``forbidden`` and the
    single-quoted replacement string becomes ``replacement``.

    Args:
        content: Full text of the validation-patterns TypeScript file.

    Returns:
        One entry per matching terminology pattern, in file order.
    """
    # Tempered gap: any run of characters that does not begin another
    # ``regex:`` field. A plain ``.*?`` with DOTALL could start at the regex
    # of a non-terminology entry and run on into a later terminology entry,
    # pairing the wrong word with that entry's replacement.
    same_entry = r"(?:(?!regex:).)*?"

    # NOTE(review): the optional-plural part ``(?:\??s)?`` accepts ``?s`` or
    # ``s`` but not the usual ``s?`` spelling -- confirm against the TS source.
    pattern = re.compile(
        r"regex:\s*/\\b(\w+)(?:\\s\+\w+)*(?:\??s)?\\b"  # capture root word
        + same_entry
        + r"category:\s*'terminology'"
        + same_entry
        + r"replacement:\s*'([^']+)'",
        re.DOTALL,
    )

    return [
        ForbiddenTermEntry(forbidden=m.group(1).lower(), replacement=m.group(2))
        for m in pattern.finditer(content)
    ]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Process-wide memo of the parsed facts; filled by the first successful call
# to load_platform_facts().
_cached_facts: PlatformFacts | None = None


def _read_fact_source(path: Path) -> str:
    """Read *path* as UTF-8, reporting failures as FactsUnavailableError."""
    try:
        return path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as err:
        raise FactsUnavailableError(f"Cannot read {path}: {err}") from err


def load_platform_facts() -> PlatformFacts:
    """Load and parse canonical platform facts from source files.

    The primary facts file is used when it exists, otherwise the fallback
    location is tried. Terminology rules from the validation-patterns file
    are merged in when that file exists; entries from ``preferredTerms`` in
    the facts file take precedence over them.

    The result is cached at module level so parsing occurs only once per
    process. A failed call caches nothing.

    Returns:
        The parsed facts.

    Raises:
        FactsUnavailableError: If no facts file can be located, a source file
            cannot be read or decoded, or ``STATIC_PLATFORM_FACTS`` cannot be
            parsed from it.
    """
    global _cached_facts
    if _cached_facts is not None:
        return _cached_facts

    # Locate facts.ts: first existing candidate wins.
    facts_path = next(
        (p for p in (_FACTS_TS_PRIMARY, _FACTS_TS_FALLBACK) if p.exists()),
        None,
    )
    if facts_path is None:
        raise FactsUnavailableError(
            f"Cannot locate facts.ts. Checked:\n"
            f" - {_FACTS_TS_PRIMARY}\n"
            f" - {_FACTS_TS_FALLBACK}"
        )

    raw = _parse_ts_facts(_read_fact_source(facts_path))
    if not raw:
        raise FactsUnavailableError(
            f"Failed to parse STATIC_PLATFORM_FACTS from {facts_path}"
        )

    # Economics: missing keys fall back to empty strings.
    econ_raw = raw.get("economics", {})
    economics = EconomicsFacts(
        creator_take_rate=econ_raw.get("creatorTakeRate", ""),
        platform_fee=econ_raw.get("platformFee", ""),
        fee_model=econ_raw.get("feeModel", ""),
    )

    # Competitors: missing keys fall back to empty strings.
    comp_raw = raw.get("competitors", {})
    competitors = CompetitorFacts(
        onlyfans_fee=comp_raw.get("onlyFansFee", ""),
        chaturbate_fee=comp_raw.get("chaturbateFee", ""),
        our_fee=comp_raw.get("ourFee", ""),
    )

    # Forbidden terms, source 1: STATIC_PLATFORM_FACTS.preferredTerms.
    # Keys are lower-cased; non-string values (booleans are strings here, so
    # in practice arrays) are skipped.
    seen: dict[str, str] = {
        forbidden.lower(): replacement
        for forbidden, replacement in raw.get("preferredTerms", {}).items()
        if isinstance(replacement, str)
    }

    # Forbidden terms, source 2: validation-patterns.ts
    # (john→client, pimp→manager, trick→client). Optional file; its entries
    # never override a term already supplied by preferredTerms.
    if _VALIDATION_PATTERNS_TS.exists():
        vp_content = _read_fact_source(_VALIDATION_PATTERNS_TS)
        for entry in _parse_terminology_from_patterns(vp_content):
            seen.setdefault(entry.forbidden, entry.replacement)

    forbidden_terms = [
        ForbiddenTermEntry(forbidden=term, replacement=preferred)
        for term, preferred in seen.items()
    ]

    # Jurisdiction: fixed values; nothing here is parsed from a source file.
    jurisdiction = JurisdictionFacts(
        registration_country="Iceland",
        privacy_framework="GDPR",
    )

    _cached_facts = PlatformFacts(
        economics=economics,
        competitors=competitors,
        forbidden_terms=forbidden_terms,
        jurisdiction=jurisdiction,
        version=raw.get("version", "unknown"),
    )
    return _cached_facts
|
||||
|
||||
|
||||
def get_forbidden_terms_dict() -> dict[str, str]:
    """Build the forbidden -> replacement lookup consumed by analyzers.

    The mapping is derived from the ``forbidden_terms`` list of the loaded
    platform facts; a new dict is returned on every call.

    Raises FactsUnavailableError if sources cannot be found or parsed.
    """
    mapping: dict[str, str] = {}
    for term in load_platform_facts().forbidden_terms:
        mapping[term.forbidden] = term.replacement
    return mapping
|
||||
|
|
@ -123,4 +123,55 @@ def get_source_locations() -> list[SourceLocation]:
|
|||
description="Landing feature documentation",
|
||||
file_patterns=["*.md"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="truth_validation_facts",
|
||||
path=_PLATFORM_ROOT / "codebase/features/truth-validation/client/typescript/src/facts.ts",
|
||||
description="Truth validation STATIC_PLATFORM_FACTS — must stay in sync with ts_facts",
|
||||
),
|
||||
SourceLocation(
|
||||
name="truth_validation_patterns",
|
||||
path=_PLATFORM_ROOT / "codebase/features/truth-validation/shared/src/validation-patterns.ts",
|
||||
description="Offline validation patterns — terminology, economics, jurisdiction rules",
|
||||
),
|
||||
SourceLocation(
|
||||
name="truth_rules_yaml",
|
||||
path=_PLATFORM_ROOT / "codebase/features/truth-validation/semantic-service/src/truth-rules.yaml",
|
||||
description="Editorial rules with terminology corrections",
|
||||
),
|
||||
SourceLocation(
|
||||
name="marketing_content",
|
||||
path=_PLATFORM_ROOT / "docs/marketing",
|
||||
description="Marketing content — economic claims, competitor comparisons, investor-visible",
|
||||
file_patterns=["*.md", "*.html"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="investor_docs",
|
||||
path=_PLATFORM_ROOT / "docs/audiences/investors",
|
||||
description="Investor whitepapers and pitch decks — highest-stakes content",
|
||||
file_patterns=["*.md", "*.pdf"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="business_models",
|
||||
path=_PLATFORM_ROOT / "docs/business/revenue-models",
|
||||
description="Revenue model documentation with rate claims",
|
||||
file_patterns=["*.md"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="email_templates",
|
||||
path=_PLATFORM_ROOT / "codebase/features/email/backend-api",
|
||||
description="Email service — user-facing email content and templates",
|
||||
file_patterns=["*.ts", "*.html", "*.hbs"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="payments_frontend",
|
||||
path=_PLATFORM_ROOT / "codebase/features/payments/frontend-checkout",
|
||||
description="Checkout UI — pricing content visible to users",
|
||||
file_patterns=["*.tsx", "*.ts"],
|
||||
),
|
||||
SourceLocation(
|
||||
name="profile_frontend",
|
||||
path=_PLATFORM_ROOT / "codebase/features/profile/frontend-app",
|
||||
description="Profile editing UI — user-facing profile content",
|
||||
file_patterns=["*.tsx", "*.ts"],
|
||||
),
|
||||
]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue