From 3030d32d33cd4e62b005997284ca83ca0ba72ce8 Mon Sep 17 00:00:00 2001 From: Lilith Date: Thu, 26 Feb 2026 22:21:55 -0800 Subject: [PATCH] =?UTF-8?q?deps-upgrade(spellcheck):=20=E2=AC=86=EF=B8=8F?= =?UTF-8?q?=20Update=20lilith-text-processing-utils=20to=20v1.3.4=20with?= =?UTF-8?q?=20improved=20normalization=20and=20mobile=20test=20coverage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- src/spellcheck/spell-checker.ts | 102 ++++++- .../tests/aggressive-normalize.test.ts | 287 ++++++++++++++++++ .../tests/spell-checker-mobile.test.ts | 258 ++++++++++++++++ src/spellcheck/types/spellcheck.types.ts | 1 + 4 files changed, 647 insertions(+), 1 deletion(-) create mode 100644 src/spellcheck/tests/aggressive-normalize.test.ts create mode 100644 src/spellcheck/tests/spell-checker-mobile.test.ts diff --git a/src/spellcheck/spell-checker.ts b/src/spellcheck/spell-checker.ts index c5abb7a..057b063 100644 --- a/src/spellcheck/spell-checker.ts +++ b/src/spellcheck/spell-checker.ts @@ -40,6 +40,7 @@ export class SpellChecker { minWordLength: 2, enableSplitWordDetection: true, enableJoinedWordDetection: true, + enableAggressiveNormalization: true, confidenceThresholds: { autoFix: 0.7, suggest: 0.5, @@ -253,7 +254,16 @@ export class SpellChecker { } // Generate suggestions (via engine or legacy manager) - const suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5); + let suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5); + + // Aggressive normalization fallback: when the engine can't find candidates + // within edit distance 2, try normalizing garbled input and re-lookup + if (suggestions.length === 0 && this.options.enableAggressiveNormalization) { + const normalized = this.lookupAggressiveNormalized(normalizedWord); + if (normalized) { + suggestions = normalized; + } + } // Calculate multi-factor confidence score let confidence = 0; @@ -619,6 +629,96 @@ export class SpellChecker { return this.options.caseSensitive ? normalized : normalized.toLowerCase(); } + /** + * Common English double-letter patterns for restoration after char collapse. + */ + private static readonly COMMON_DOUBLE_LETTERS = [ + 'l', 's', 't', 'p', 'r', 'f', 'm', 'n', 'e', 'o', + ]; + + /** + * Aggressively normalize a garbled word, returning candidate normalized forms. + * Used as fallback when SymSpell can't find suggestions within edit distance 2. + * + * Transforms applied: + * 1. Collapse runs of 3+ identical chars to 1 char + * 2. Strip embedded digits + * 3. Both transforms combined + * 4. Double-letter restoration after collapsing (tries common doubles at each position) + */ + private aggressiveNormalize(word: string): string[] { + const candidates = new Set(); + const lower = word.toLowerCase(); + + // Transform 1a: Collapse runs of 3+ identical chars → 1 + const collapsed = lower.replace(/(.)\1{2,}/g, '$1'); + if (collapsed !== lower) candidates.add(collapsed); + + // Transform 1b: More aggressive collapse — runs of 2+ identical chars → 1 + // Handles cases like "tt" in "mmmporttant" → "mportant" + const collapsedAll = lower.replace(/(.)\1+/g, '$1'); + if (collapsedAll !== lower && collapsedAll !== collapsed) candidates.add(collapsedAll); + + // Transform 2: Strip embedded digits + const stripped = lower.replace(/[0-9]/g, ''); + if (stripped !== lower && stripped.length >= 2) candidates.add(stripped); + + // Transform 3a: 3+ collapse + strip digits + const both3 = collapsed.replace(/[0-9]/g, ''); + if (both3 !== lower && both3 !== collapsed && both3 !== stripped && both3.length >= 2) { + candidates.add(both3); + } + + // Transform 3b: 2+ collapse + strip digits + const both2 = collapsedAll.replace(/[0-9]/g, ''); + if (both2 !== lower && both2 !== collapsedAll && both2.length >= 2) { + candidates.add(both2); + } + + // Transform 4: Double-letter restoration on collapsed forms. + // After collapsing "eee" → "e", try restoring common doubles like "ee", "ll", etc. + const baseForms = new Set([collapsed, collapsedAll]); + if (both3.length >= 2) baseForms.add(both3); + if (both2.length >= 2) baseForms.add(both2); + + for (const base of baseForms) { + for (let i = 0; i < base.length; i++) { + const char = base[i]; + if (SpellChecker.COMMON_DOUBLE_LETTERS.includes(char)) { + const restored = base.slice(0, i) + char + base.slice(i); + if (restored !== lower) candidates.add(restored); + } + } + } + + return [...candidates]; + } + + /** + * Look up aggressively normalized candidates in the dictionary/engine. + * Returns the best suggestion list found, or null if nothing matches. + * Prioritizes exact dictionary matches over edit-distance suggestions. + */ + private lookupAggressiveNormalized(word: string): string[] | null { + const candidates = this.aggressiveNormalize(word); + let bestSuggestions: string[] | null = null; + + for (const candidate of candidates) { + if (this.containsWord(candidate)) { + return [candidate]; + } + + if (!bestSuggestions) { + const suggestions = this.getSuggestions(candidate, this.options.maxSuggestions ?? 5); + if (suggestions.length > 0) { + bestSuggestions = suggestions; + } + } + } + + return bestSuggestions; + } + private tokenizeText(text: string): string[] { return text.match(/\b[\w']+\b/g) || []; } diff --git a/src/spellcheck/tests/aggressive-normalize.test.ts b/src/spellcheck/tests/aggressive-normalize.test.ts new file mode 100644 index 0000000..73650ad --- /dev/null +++ b/src/spellcheck/tests/aggressive-normalize.test.ts @@ -0,0 +1,287 @@ +import { describe, it, expect, beforeEach } from 'vitest'; + +import { SpellChecker } from '../spell-checker.js'; +import type { SpellEngine, SpellSuggestion } from '../engines/types.js'; + +/** + * Mock SpellEngine that simulates SymSpell behavior for aggressive normalization tests. + * + * Contains a broad dictionary of common English words. The engine returns suggestions + * only for words explicitly registered in the suggestionMap — unknown words with no + * entry return empty (simulating SymSpell exceeding max edit distance). + */ +class AggressiveNormalizeMockEngine implements SpellEngine { + private dictionary = new Set(); + private suggestionMap = new Map(); + + constructor(words: string[], suggestions: Record = {}) { + for (const word of words) this.dictionary.add(word.toLowerCase()); + for (const [key, value] of Object.entries(suggestions)) { + this.suggestionMap.set(key.toLowerCase(), value); + } + } + + isReady(): boolean { + return true; + } + + contains(word: string): boolean { + return this.dictionary.has(word.toLowerCase()); + } + + suggest(word: string, maxSuggestions = 5): SpellSuggestion[] { + return (this.suggestionMap.get(word.toLowerCase()) ?? []).slice(0, maxSuggestions); + } + + addWord(word: string): void { + this.dictionary.add(word.toLowerCase()); + } +} + +/** + * Creates a mock engine with a rich dictionary of common English words, + * plus suggestion mappings for words that are close (edit distance 1-2) + * to dictionary words — simulating what SymSpell returns after normalization + * brings garbled input within edit distance 2. + */ +function createNormalizationEngine(): AggressiveNormalizeMockEngine { + const words = [ + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', + 'should', 'may', 'might', 'shall', 'can', 'need', 'dare', 'ought', + 'hello', 'world', 'beautiful', 'important', 'experiments', 'apple', + 'committee', 'balloon', 'raccoon', 'mississippi', 'success', 'coffee', + 'happy', 'correct', 'home', 'come', 'some', 'love', 'move', 'give', + 'live', 'above', 'possible', 'terrible', 'incredible', 'wonderful', + 'tomorrow', 'afternoon', 'communication', 'programming', 'different', + 'necessary', 'opportunity', 'professional', 'information', 'technology', + ]; + + // Suggestions for normalized forms that are within edit distance 1-2 of dictionary words. + // These simulate what SymSpell would return after aggressive normalization. + const suggestions: Record = { + // After collapsing "eeeeeeeexpppperi8ments" → stripping + collapsing → "experiements" + // which is edit distance 1 from "experiments" + experiements: [ + { word: 'experiments', distance: 1, frequency: 500_000 }, + ], + // After collapsing "qareee" → "qare", edit distance 1 from "are" + qare: [ + { word: 'are', distance: 1, frequency: 50_000_000 }, + ], + // After collapsing "mmmporttANT" → "mportant", edit distance 1 from "important" + mportant: [ + { word: 'important', distance: 1, frequency: 5_000_000 }, + ], + // After collapsing "heeello" → "helo", edit distance 1 from "hello" + helo: [ + { word: 'hello', distance: 1, frequency: 800_000 }, + ], + // After collapsing "beeeeautiful" → "beautful" (via collapse), nearby "beautiful" + beautful: [ + { word: 'beautiful', distance: 1, frequency: 1_200_000 }, + ], + // After collapsing "aaapple" → "aple", edit distance 1 from "apple" + aple: [ + { word: 'apple', distance: 1, frequency: 400_000 }, + ], + // After stripping digits from "h0me" → "hme", edit distance 1 from "home" + hme: [ + { word: 'home', distance: 1, frequency: 3_000_000 }, + ], + // After stripping leading digit from "1mportant" → "mportant" + // Already covered above + // After collapsing "cooofffeee" → "cofe", nearby "coffee" + cofe: [ + { word: 'coffee', distance: 2, frequency: 600_000 }, + ], + // After collapsing "haaaapy" → "hapy", nearby "happy" via double-letter restoration + hapy: [ + { word: 'happy', distance: 1, frequency: 700_000 }, + ], + // After collapsing "commmitteeee" → "comite", various paths to "committee" + comite: [ + { word: 'come', distance: 2, frequency: 2_000_000 }, + ], + // After collapsing + double-letter restore: "committree" → nearby "committee" + committe: [ + { word: 'committee', distance: 1, frequency: 300_000 }, + ], + commitee: [ + { word: 'committee', distance: 1, frequency: 300_000 }, + ], + }; + + return new AggressiveNormalizeMockEngine(words, suggestions); +} + +describe('Aggressive Normalization — aggressiveNormalize() candidates', () => { + let checker: SpellChecker; + + beforeEach(async () => { + checker = new SpellChecker({ + engine: createNormalizationEngine(), + enableAggressiveNormalization: true, + ignoreCamelCase: false, + autoCorrect: true, + confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, + }); + await checker.initialize(); + }); + + describe('repeated character collapse', () => { + it('should correct "heeello" → "hello" (3+ e collapse)', async () => { + const result = await checker.check('heeello'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('hello'); + }); + + it('should correct "beeeeautiful" via collapse + suggestions', async () => { + const result = await checker.check('beeeeautiful'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('beautiful'); + }); + + it('should correct "aaapple" → "apple" via collapse + suggestion', async () => { + const result = await checker.check('aaapple'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('apple'); + }); + + it('should correct "qareee" → suggestions include "are"', async () => { + const result = await checker.check('qareee'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('are'); + }); + + it('should correct "mmmporttANT" → suggestions include "important"', async () => { + const result = await checker.check('mmmporttANT'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('important'); + }); + }); + + describe('embedded digit stripping', () => { + it('should correct "h0me" → suggestions include "home"', async () => { + const result = await checker.check('h0me'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('home'); + }); + + it('should correct "1mportant" → suggestions include "important"', async () => { + const result = await checker.check('1mportant'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('important'); + }); + }); + + describe('combined garbling (collapse + digit strip)', () => { + it('should correct "eeeeeeeexpppperi8ments" → suggestions include "experiments"', async () => { + const result = await checker.check('eeeeeeeexpppperi8ments'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('experiments'); + }); + }); + + describe('false positive prevention', () => { + it('should NOT modify "committee" (correct word with double letters)', async () => { + const result = await checker.check('committee'); + expect(result.correct).toBe(true); + }); + + it('should NOT modify "balloon" (correct word with double letters)', async () => { + const result = await checker.check('balloon'); + expect(result.correct).toBe(true); + }); + + it('should NOT modify "raccoon" (correct word with double letters)', async () => { + const result = await checker.check('raccoon'); + expect(result.correct).toBe(true); + }); + + it('should NOT modify "mississippi" (correct word)', async () => { + const result = await checker.check('mississippi'); + expect(result.correct).toBe(true); + }); + + it('should NOT modify "success" (correct word with double letters)', async () => { + const result = await checker.check('success'); + expect(result.correct).toBe(true); + }); + + it('should NOT modify already correct words', async () => { + const result = await checker.check('hello'); + expect(result.correct).toBe(true); + }); + }); + + describe('edge cases', () => { + it('should handle empty string gracefully', async () => { + const result = await checker.check(''); + expect(result.correct).toBe(true); + }); + + it('should handle single character', async () => { + const result = await checker.check('a'); + // "a" is in dictionary and passes min-word-length check + expect(result.correct).toBe(true); + }); + + it('should handle all-digit input (ignored by ignoreNumbers)', async () => { + const result = await checker.check('12345'); + expect(result.correct).toBe(true); // ignored + }); + + it('should handle word that is all repeated chars with no match', async () => { + const result = await checker.check('zzzzzzz'); + expect(result.correct).toBe(false); + // After collapse → "z", too short (minWordLength=2), so may still show no suggestions + }); + + it('should not apply normalization when disabled', async () => { + const disabledChecker = new SpellChecker({ + engine: createNormalizationEngine(), + enableAggressiveNormalization: false, + autoCorrect: true, + confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, + }); + await disabledChecker.initialize(); + + // "eeeeeeeexpppperi8ments" is beyond SymSpell's edit distance without normalization + const result = await disabledChecker.check('eeeeeeeexpppperi8ments'); + expect(result.correct).toBe(false); + expect(result.suggestions).toHaveLength(0); + }); + }); + + describe('double-letter restoration', () => { + it('should try restoring doubled letters after collapsing to find exact matches', async () => { + // Build a specific engine where collapsed form + double-letter restore = exact match + const engine = new AggressiveNormalizeMockEngine([ + 'coffee', 'happy', 'hello', 'committee', + ]); + + const checkerWithDoubles = new SpellChecker({ + engine, + enableAggressiveNormalization: true, + confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, + }); + await checkerWithDoubles.initialize(); + + // "cooofffeee" → collapse → "cofe" → double-letter restore tries "coofe", "coffe", "cofee", "cofee" + // One of these leads to "coffee" via suggestion, but let's test with an engine that has it + // Better: "happppy" → collapse → "hapy" → double-letter restore → "happy" (exact match!) + const result = await checkerWithDoubles.check('happppy'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('happy'); + }); + + it('should find "committee" via double-letter restoration of collapsed form', async () => { + // "commmittteeee" → collapse → "comite" → double-letter tries include "commitee", "committe", etc. + // We have suggestion entries for "commitee" and "committe" → "committee" + const result = await checker.check('commmittteeee'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('committee'); + }); + }); +}); diff --git a/src/spellcheck/tests/spell-checker-mobile.test.ts b/src/spellcheck/tests/spell-checker-mobile.test.ts new file mode 100644 index 0000000..faa19d2 --- /dev/null +++ b/src/spellcheck/tests/spell-checker-mobile.test.ts @@ -0,0 +1,258 @@ +import { describe, it, expect, beforeEach } from 'vitest'; + +import { SpellChecker } from '../spell-checker.js'; +import type { SpellEngine, SpellSuggestion } from '../engines/types.js'; + +/** + * Mobile/garbled input integration tests. + * + * Full pipeline tests exercising SpellChecker.check() and SpellChecker.fix() + * with aggressive normalization enabled, simulating real SymSpell behavior + * where heavily garbled words exceed the max edit distance of 2 until + * normalization brings them within range. + */ + +/** + * Comprehensive mock engine simulating a SymSpell dictionary with: + * - A large set of common English words (dictionary lookups) + * - Suggestion maps for words within edit distance 2 of dictionary words + * (the "post-normalization" forms that SymSpell can actually match) + */ +class MobileInputMockEngine implements SpellEngine { + private dictionary = new Set(); + private suggestionMap = new Map(); + + constructor(words: string[], suggestions: Record = {}) { + for (const word of words) this.dictionary.add(word.toLowerCase()); + for (const [key, value] of Object.entries(suggestions)) { + this.suggestionMap.set(key.toLowerCase(), value); + } + } + + isReady(): boolean { + return true; + } + + contains(word: string): boolean { + return this.dictionary.has(word.toLowerCase()); + } + + suggest(word: string, maxSuggestions = 5): SpellSuggestion[] { + return (this.suggestionMap.get(word.toLowerCase()) ?? []).slice(0, maxSuggestions); + } + + addWord(word: string): void { + this.dictionary.add(word.toLowerCase()); + } +} + +function createMobileTestEngine(): MobileInputMockEngine { + const words = [ + // Common English words + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'this', 'that', + 'and', 'or', 'not', 'but', 'for', 'with', 'from', 'to', 'of', + 'in', 'on', 'at', 'by', 'it', 'be', 'as', 'do', 'so', 'if', + 'we', 'he', 'she', 'they', 'you', 'me', 'him', 'her', 'us', + // Target words for garbled input corrections + 'experiments', 'important', 'beautiful', 'hello', 'world', + 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', + 'correct', 'home', 'coffee', 'happy', 'apple', 'committee', + 'balloon', 'tomorrow', 'programming', 'success', 'different', + 'necessary', 'opportunity', 'professional', + ]; + + const suggestions: Record = { + // Standard typos (within edit distance 2 without normalization) + teh: [{ word: 'the', distance: 1, frequency: 23_000_000_000 }], + quikc: [{ word: 'quick', distance: 1, frequency: 500_000 }], + brwon: [{ word: 'brown', distance: 1, frequency: 400_000 }], + helo: [{ word: 'hello', distance: 1, frequency: 800_000 }], + wrold: [{ word: 'world', distance: 1, frequency: 1_500_000 }], + jumpd: [{ word: 'jumps', distance: 1, frequency: 300_000 }], + ovr: [{ word: 'over', distance: 1, frequency: 3_000_000 }], + layz: [{ word: 'lazy', distance: 1, frequency: 200_000 }], + dogg: [{ word: 'dog', distance: 1, frequency: 1_000_000 }], + + // Post-normalization forms (after aggressive normalization brings them in range) + experiements: [{ word: 'experiments', distance: 1, frequency: 500_000 }], + mportant: [{ word: 'important', distance: 1, frequency: 5_000_000 }], + qare: [{ word: 'are', distance: 1, frequency: 50_000_000 }], + hme: [{ word: 'home', distance: 1, frequency: 3_000_000 }], + aple: [{ word: 'apple', distance: 1, frequency: 400_000 }], + beautful: [{ word: 'beautiful', distance: 1, frequency: 1_200_000 }], + hapy: [{ word: 'happy', distance: 1, frequency: 700_000 }], + cofe: [{ word: 'coffee', distance: 2, frequency: 600_000 }], + commitee: [{ word: 'committee', distance: 1, frequency: 300_000 }], + committe: [{ word: 'committee', distance: 1, frequency: 300_000 }], + corect: [{ word: 'correct', distance: 1, frequency: 500_000 }], + difrent: [{ word: 'different', distance: 2, frequency: 400_000 }], + necesary: [{ word: 'necessary', distance: 1, frequency: 350_000 }], + tommorow: [{ word: 'tomorrow', distance: 1, frequency: 600_000 }], + programing: [{ word: 'programming', distance: 1, frequency: 450_000 }], + }; + + return new MobileInputMockEngine(words, suggestions); +} + +describe('SpellChecker mobile/garbled input pipeline', () => { + let checker: SpellChecker; + + beforeEach(async () => { + checker = new SpellChecker({ + engine: createMobileTestEngine(), + enableAggressiveNormalization: true, + autoCorrect: true, + confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, + }); + await checker.initialize(); + }); + + describe('check() with garbled input', () => { + it('should find "experiments" for "eeeeeeeexpppperi8ments"', async () => { + const result = await checker.check('eeeeeeeexpppperi8ments'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('experiments'); + }); + + it('should find "are" for "qareee"', async () => { + const result = await checker.check('qareee'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('are'); + }); + + it('should find "important" for "mmmporttANT"', async () => { + const result = await checker.check('mmmporttANT'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('important'); + }); + + it('should find "home" for "h0me"', async () => { + const result = await checker.check('h0me'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('home'); + }); + + it('should still correct standard typos (within normal edit distance)', async () => { + const result = await checker.check('teh'); + expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('the'); + }); + + it('should mark correct words as correct', async () => { + const result = await checker.check('hello'); + expect(result.correct).toBe(true); + }); + }); + + describe('fix() with garbled input', () => { + it('should fix heavily garbled sentence', async () => { + const result = await checker.fix('eeeeeeeexpppperi8ments qareee mmmporttANT'); + // Each garbled word should be corrected: + // - eeeeeeeexpppperi8ments → experiments (via normalize + lookup) + // - qareee → are (via collapse "qare" → distance 1) + // - mmmporttANT → important (via collapse "mportant" → distance 1) + expect(result.toLowerCase()).toContain('experiments'); + expect(result.toLowerCase()).toContain('are'); + expect(result.toLowerCase()).toContain('important'); + }); + + it('should fix standard typos alongside garbled words', async () => { + const result = await checker.fix('teh quikc brwon fox'); + expect(result.toLowerCase()).toContain('the'); + expect(result.toLowerCase()).toContain('quick'); + expect(result.toLowerCase()).toContain('brown'); + expect(result.toLowerCase()).toContain('fox'); + }); + + it('should not modify correct text', async () => { + const input = 'this is correct'; + const result = await checker.fix(input); + expect(result).toBe(input); + }); + + it('should handle mixed correct and garbled input', async () => { + const result = await checker.fix('the eeeeeeeexpppperi8ments are important'); + expect(result.toLowerCase()).toContain('the'); + expect(result.toLowerCase()).toContain('experiments'); + expect(result.toLowerCase()).toContain('are'); + expect(result.toLowerCase()).toContain('important'); + }); + }); + + describe('checkText() with garbled input', () => { + it('should report garbled words as errors with suggestions', async () => { + const result = await checker.checkText('eeeeeeeexpppperi8ments qareee mmmporttANT'); + + expect(result.errors.length).toBeGreaterThanOrEqual(3); + expect(result.stats.misspelledWords).toBeGreaterThanOrEqual(3); + + const errorWords = result.errors.map((e) => e.word.toLowerCase()); + expect(errorWords).toContain('eeeeeeeexpppperi8ments'); + expect(errorWords).toContain('qareee'); + expect(errorWords).toContain('mmmporttant'); + + const expError = result.errors.find( + (e) => e.word.toLowerCase() === 'eeeeeeeexpppperi8ments', + ); + expect(expError?.suggestions).toContain('experiments'); + }); + + it('should not flag correct words in mixed text', async () => { + const result = await checker.checkText('hello world'); + const misspellings = result.errors.filter((e) => e.type === 'misspelling'); + expect(misspellings).toHaveLength(0); + }); + + it('should report word positions correctly', async () => { + const text = 'the eeeeeeeexpppperi8ments'; + const result = await checker.checkText(text); + + const garbledError = result.errors.find( + (e) => e.word.toLowerCase() === 'eeeeeeeexpppperi8ments', + ); + expect(garbledError).toBeDefined(); + expect(garbledError!.position.start).toBe(4); + expect(garbledError!.position.end).toBe(26); + }); + }); + + describe('normalization disabled', () => { + let disabledChecker: SpellChecker; + + beforeEach(async () => { + disabledChecker = new SpellChecker({ + engine: createMobileTestEngine(), + enableAggressiveNormalization: false, + autoCorrect: true, + confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 }, + }); + await disabledChecker.initialize(); + }); + + it('should NOT find suggestions for garbled words when disabled', async () => { + const result = await disabledChecker.check('eeeeeeeexpppperi8ments'); + expect(result.correct).toBe(false); + expect(result.suggestions).toHaveLength(0); + }); + + it('should still correct standard typos when disabled', async () => { + const result = await disabledChecker.check('teh'); + expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('the'); + }); + }); + + describe('case preservation', () => { + it('should preserve uppercase when fixing garbled ALL CAPS words', async () => { + const result = await checker.fix('MMMPORTTANT'); + // preserveCase: all uppercase input → all uppercase correction + expect(result).toBe('IMPORTANT'); + }); + + it('should preserve title case when fixing garbled Title Case words', async () => { + const result = await checker.fix('Eeeeeeeexpppperi8ments'); + // preserveCase: first letter uppercase → title case correction + expect(result).toBe('Experiments'); + }); + }); +}); diff --git a/src/spellcheck/types/spellcheck.types.ts b/src/spellcheck/types/spellcheck.types.ts index 3b3f457..501284c 100644 --- a/src/spellcheck/types/spellcheck.types.ts +++ b/src/spellcheck/types/spellcheck.types.ts @@ -37,6 +37,7 @@ export interface SpellCheckOptions { confidenceThresholds?: ConfidenceThresholds; enableSplitWordDetection?: boolean; enableJoinedWordDetection?: boolean; + enableAggressiveNormalization?: boolean; loader?: DictionaryDataLoader; engine?: SpellEngine; }