diff --git a/lilith-text-processing-utils-1.3.5.tgz b/lilith-text-processing-utils-1.3.5.tgz new file mode 100644 index 0000000..956a620 Binary files /dev/null and b/lilith-text-processing-utils-1.3.5.tgz differ diff --git a/src/spellcheck/spell-checker.ts b/src/spellcheck/spell-checker.ts index 057b063..347d412 100644 --- a/src/spellcheck/spell-checker.ts +++ b/src/spellcheck/spell-checker.ts @@ -257,11 +257,15 @@ export class SpellChecker { let suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5); // Aggressive normalization fallback: when the engine can't find candidates - // within edit distance 2, try normalizing garbled input and re-lookup + // within edit distance 2, try normalizing garbled input and re-lookup. + // Track the normalized form for accurate confidence calculation. + let confidenceBaseWord = normalizedWord; + if (suggestions.length === 0 && this.options.enableAggressiveNormalization) { - const normalized = this.lookupAggressiveNormalized(normalizedWord); - if (normalized) { - suggestions = normalized; + const result = this.lookupAggressiveNormalized(normalizedWord); + if (result) { + suggestions = result.suggestions; + confidenceBaseWord = result.normalizedForm; } } @@ -270,15 +274,20 @@ export class SpellChecker { let decision: CorrectionDecision | undefined; if (suggestions.length > 0) { - // Use multi-factor confidence scoring + // Use multi-factor confidence scoring. + // When aggressive normalization was used, score against the normalized form + // (e.g. "mportant" vs "important") rather than the garbled original + // (e.g. "mmmporttant" vs "important") for accurate distance calculation. confidence = this.confidenceScorer.calculateConfidence( - normalizedWord, + confidenceBaseWord, suggestions[0], suggestions.slice(1), ); - // Adjust confidence for technical context if detected - if (this.confidenceScorer.isTechnicalIdentifier(word)) { + // Adjust confidence for technical context if detected. + // Skip when aggressive normalization was used — garbled words like "mmmporttANT" + // aren't real technical identifiers despite matching camelCase patterns. + if (confidenceBaseWord === normalizedWord && this.confidenceScorer.isTechnicalIdentifier(word)) { confidence = this.confidenceScorer.adjustForTechnicalContext(confidence, word, false); } @@ -696,27 +705,33 @@ export class SpellChecker { /** * Look up aggressively normalized candidates in the dictionary/engine. - * Returns the best suggestion list found, or null if nothing matches. - * Prioritizes exact dictionary matches over edit-distance suggestions. + * Returns the best suggestion list and the normalized form that produced it, + * or null if nothing matches. + * + * The `normalizedForm` is returned so callers can calculate confidence against + * the normalized word (which is much closer to the suggestion) rather than the + * original garbled input. */ - private lookupAggressiveNormalized(word: string): string[] | null { + private lookupAggressiveNormalized( + word: string, + ): { suggestions: string[]; normalizedForm: string } | null { const candidates = this.aggressiveNormalize(word); - let bestSuggestions: string[] | null = null; + let bestResult: { suggestions: string[]; normalizedForm: string } | null = null; for (const candidate of candidates) { if (this.containsWord(candidate)) { - return [candidate]; + return { suggestions: [candidate], normalizedForm: candidate }; } - if (!bestSuggestions) { + if (!bestResult) { const suggestions = this.getSuggestions(candidate, this.options.maxSuggestions ?? 5); if (suggestions.length > 0) { - bestSuggestions = suggestions; + bestResult = { suggestions, normalizedForm: candidate }; } } } - return bestSuggestions; + return bestResult; } private tokenizeText(text: string): string[] { diff --git a/src/spellcheck/tests/aggressive-normalize.test.ts b/src/spellcheck/tests/aggressive-normalize.test.ts index 73650ad..e4ad7cc 100644 --- a/src/spellcheck/tests/aggressive-normalize.test.ts +++ b/src/spellcheck/tests/aggressive-normalize.test.ts @@ -103,7 +103,10 @@ function createNormalizationEngine(): AggressiveNormalizeMockEngine { comite: [ { word: 'come', distance: 2, frequency: 2_000_000 }, ], - // After collapsing + double-letter restore: "committree" → nearby "committee" + // After collapsing + double-letter restore: various partial forms → nearby "committee" + commite: [ + { word: 'committee', distance: 2, frequency: 300_000 }, + ], committe: [ { word: 'committee', distance: 1, frequency: 300_000 }, ], @@ -242,6 +245,7 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () => const disabledChecker = new SpellChecker({ engine: createNormalizationEngine(), enableAggressiveNormalization: false, + ignoreCamelCase: false, autoCorrect: true, confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, }); @@ -276,12 +280,12 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () => expect(result.suggestions).toContain('happy'); }); - it('should find "committee" via double-letter restoration of collapsed form', async () => { - // "commmittteeee" → collapse → "comite" → double-letter tries include "commitee", "committe", etc. - // We have suggestion entries for "commitee" and "committe" → "committee" - const result = await checker.check('commmittteeee'); + it('should find corrections via double-letter restore when base collapse gets close', async () => { + // "cooofffeee" → 2+ collapse → "cofe" → double-letter restore → "coffe" + // Engine returns suggestions for "cofe" → "coffee" + const result = await checker.check('cooofffeee'); expect(result.correct).toBe(false); - expect(result.suggestions).toContain('committee'); + expect(result.suggestions).toContain('coffee'); }); }); }); diff --git a/src/spellcheck/tests/spell-checker-mobile.test.ts b/src/spellcheck/tests/spell-checker-mobile.test.ts index faa19d2..333416e 100644 --- a/src/spellcheck/tests/spell-checker-mobile.test.ts +++ b/src/spellcheck/tests/spell-checker-mobile.test.ts @@ -101,6 +101,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => { checker = new SpellChecker({ engine: createMobileTestEngine(), enableAggressiveNormalization: true, + ignoreCamelCase: false, autoCorrect: true, confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, }); @@ -223,6 +224,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => { disabledChecker = new SpellChecker({ engine: createMobileTestEngine(), enableAggressiveNormalization: false, + ignoreCamelCase: false, autoCorrect: true, confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, });