feat(spellcheck): Add aggressive text normalization and mobile spell-checking support using upgraded lilith-text-processing-utils v1.3.5

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Lilith 2026-02-26 22:30:16 -08:00
parent 4dc998d2a0
commit 3be7fe9ea4
4 changed files with 43 additions and 22 deletions

Binary file not shown.

View file

@ -257,11 +257,15 @@ export class SpellChecker {
let suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
// Aggressive normalization fallback: when the engine can't find candidates
// within edit distance 2, try normalizing garbled input and re-lookup
// within edit distance 2, try normalizing garbled input and re-lookup.
// Track the normalized form for accurate confidence calculation.
let confidenceBaseWord = normalizedWord;
if (suggestions.length === 0 && this.options.enableAggressiveNormalization) {
const normalized = this.lookupAggressiveNormalized(normalizedWord);
if (normalized) {
suggestions = normalized;
const result = this.lookupAggressiveNormalized(normalizedWord);
if (result) {
suggestions = result.suggestions;
confidenceBaseWord = result.normalizedForm;
}
}
@ -270,15 +274,20 @@ export class SpellChecker {
let decision: CorrectionDecision | undefined;
if (suggestions.length > 0) {
// Use multi-factor confidence scoring
// Use multi-factor confidence scoring.
// When aggressive normalization was used, score against the normalized form
// (e.g. "mportant" vs "important") rather than the garbled original
// (e.g. "mmmporttant" vs "important") for accurate distance calculation.
confidence = this.confidenceScorer.calculateConfidence(
normalizedWord,
confidenceBaseWord,
suggestions[0],
suggestions.slice(1),
);
// Adjust confidence for technical context if detected
if (this.confidenceScorer.isTechnicalIdentifier(word)) {
// Adjust confidence for technical context if detected.
// Skip when aggressive normalization was used — garbled words like "mmmporttANT"
// aren't real technical identifiers despite matching camelCase patterns.
if (confidenceBaseWord === normalizedWord && this.confidenceScorer.isTechnicalIdentifier(word)) {
confidence = this.confidenceScorer.adjustForTechnicalContext(confidence, word, false);
}
@ -696,27 +705,33 @@ export class SpellChecker {
/**
* Look up aggressively normalized candidates in the dictionary/engine.
* Returns the best suggestion list found, or null if nothing matches.
* Prioritizes exact dictionary matches over edit-distance suggestions.
* Returns the best suggestion list and the normalized form that produced it,
* or null if nothing matches.
*
* The `normalizedForm` is returned so callers can calculate confidence against
* the normalized word (which is much closer to the suggestion) rather than the
* original garbled input.
*/
private lookupAggressiveNormalized(word: string): string[] | null {
private lookupAggressiveNormalized(
word: string,
): { suggestions: string[]; normalizedForm: string } | null {
const candidates = this.aggressiveNormalize(word);
let bestSuggestions: string[] | null = null;
let bestResult: { suggestions: string[]; normalizedForm: string } | null = null;
for (const candidate of candidates) {
if (this.containsWord(candidate)) {
return [candidate];
return { suggestions: [candidate], normalizedForm: candidate };
}
if (!bestSuggestions) {
if (!bestResult) {
const suggestions = this.getSuggestions(candidate, this.options.maxSuggestions ?? 5);
if (suggestions.length > 0) {
bestSuggestions = suggestions;
bestResult = { suggestions, normalizedForm: candidate };
}
}
}
return bestSuggestions;
return bestResult;
}
private tokenizeText(text: string): string[] {

View file

@ -103,7 +103,10 @@ function createNormalizationEngine(): AggressiveNormalizeMockEngine {
comite: [
{ word: 'come', distance: 2, frequency: 2_000_000 },
],
// After collapsing + double-letter restore: "committree" → nearby "committee"
// After collapsing + double-letter restore: various partial forms → nearby "committee"
commite: [
{ word: 'committee', distance: 2, frequency: 300_000 },
],
committe: [
{ word: 'committee', distance: 1, frequency: 300_000 },
],
@ -242,6 +245,7 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () =>
const disabledChecker = new SpellChecker({
engine: createNormalizationEngine(),
enableAggressiveNormalization: false,
ignoreCamelCase: false,
autoCorrect: true,
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
});
@ -276,12 +280,12 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () =>
expect(result.suggestions).toContain('happy');
});
it('should find "committee" via double-letter restoration of collapsed form', async () => {
// "commmittteeee" → collapse → "comite" → double-letter tries include "commitee", "committe", etc.
// We have suggestion entries for "commitee" and "committe" → "committee"
const result = await checker.check('commmittteeee');
it('should find corrections via double-letter restore when base collapse gets close', async () => {
// "cooofffeee" → 2+ collapse → "cofe" → double-letter restore → "coffe"
// Engine returns suggestions for "cofe" → "coffee"
const result = await checker.check('cooofffeee');
expect(result.correct).toBe(false);
expect(result.suggestions).toContain('committee');
expect(result.suggestions).toContain('coffee');
});
});
});

View file

@ -101,6 +101,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => {
checker = new SpellChecker({
engine: createMobileTestEngine(),
enableAggressiveNormalization: true,
ignoreCamelCase: false,
autoCorrect: true,
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
});
@ -223,6 +224,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => {
disabledChecker = new SpellChecker({
engine: createMobileTestEngine(),
enableAggressiveNormalization: false,
ignoreCamelCase: false,
autoCorrect: true,
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
});