deps-upgrade(spellcheck): ⬆️ Update lilith-text-processing-utils to v1.3.4 with improved normalization and mobile test coverage
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
098b7742ad
commit
3030d32d33
4 changed files with 647 additions and 1 deletions
|
|
@ -40,6 +40,7 @@ export class SpellChecker {
|
|||
minWordLength: 2,
|
||||
enableSplitWordDetection: true,
|
||||
enableJoinedWordDetection: true,
|
||||
enableAggressiveNormalization: true,
|
||||
confidenceThresholds: {
|
||||
autoFix: 0.7,
|
||||
suggest: 0.5,
|
||||
|
|
@ -253,7 +254,16 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
// Generate suggestions (via engine or legacy manager)
|
||||
const suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
|
||||
let suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
|
||||
|
||||
// Aggressive normalization fallback: when the engine can't find candidates
|
||||
// within edit distance 2, try normalizing garbled input and re-lookup
|
||||
if (suggestions.length === 0 && this.options.enableAggressiveNormalization) {
|
||||
const normalized = this.lookupAggressiveNormalized(normalizedWord);
|
||||
if (normalized) {
|
||||
suggestions = normalized;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate multi-factor confidence score
|
||||
let confidence = 0;
|
||||
|
|
@ -619,6 +629,96 @@ export class SpellChecker {
|
|||
return this.options.caseSensitive ? normalized : normalized.toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Letters that are commonly doubled in English spelling ("ll", "ss", "tt", ...).
 *
 * Consulted when re-doubling a letter in a form whose repeated characters
 * were collapsed. Order: l, s, t, p, r, f, m, n, e, o.
 */
private static readonly COMMON_DOUBLE_LETTERS = Array.from('lstprfmneo');
|
||||
|
||||
/**
 * Generate candidate spellings for a heavily garbled word.
 *
 * Candidates are derived from the lower-cased input and returned in this order:
 *   1. runs of 3+ identical characters collapsed to one ("heeello" → "hello")
 *   2. runs of 2+ identical characters collapsed to one ("mporttant" → "mportant")
 *   3. digits removed ("h0me" → "hme")
 *   4. the 3+ collapse with digits removed, then the 2+ collapse with digits removed
 *   5. every form from 1, 2 and 4 with a single common double letter restored
 *      at each eligible position ("hapy" → "happy")
 *
 * A form is only returned when it differs from the lower-cased input and is at
 * least two characters long. The length rule applies to every transform, so a
 * word made of one repeated character ("zzzzzzz" → "z") yields no collapsed
 * candidate. Step 5 also runs on a word that had nothing to collapse, because
 * the collapsed form is then simply the input itself.
 *
 * @param word - Word to normalize; case is ignored.
 * @returns De-duplicated candidates in generation order (possibly empty).
 */
private aggressiveNormalize(word: string): string[] {
  const minCandidateLength = 2;
  const lower = word.toLowerCase();
  const candidates = new Set<string>();

  // Single gatekeeper for every transform: reject no-op and too-short forms.
  // The Set keeps first-insertion order, so repeated offers are harmless.
  const offer = (form: string): void => {
    if (form !== lower && form.length >= minCandidateLength) {
      candidates.add(form);
    }
  };
  const stripDigits = (text: string): string => text.replace(/[0-9]/g, '');

  const collapsed = lower.replace(/(.)\1{2,}/g, '$1');
  const collapsedAll = lower.replace(/(.)\1+/g, '$1');
  const both3 = stripDigits(collapsed);
  const both2 = stripDigits(collapsedAll);

  offer(collapsed);
  offer(collapsedAll);
  offer(stripDigits(lower));
  offer(both3);
  offer(both2);

  // Double-letter restoration: collapsing "pppp" → "p" loses a legitimate "pp",
  // so try re-doubling each common double letter, one position at a time.
  const bases = new Set(
    [collapsed, collapsedAll, both3, both2].filter((form) => form.length >= minCandidateLength),
  );
  for (const base of bases) {
    for (let i = 0; i < base.length; i++) {
      const char = base.charAt(i);
      if (SpellChecker.COMMON_DOUBLE_LETTERS.includes(char)) {
        offer(base.slice(0, i) + char + base.slice(i));
      }
    }
  }

  return [...candidates];
}
|
||||
|
||||
/**
 * Resolve a garbled word through its aggressively normalized candidates.
 *
 * Exact dictionary hits take priority: the first candidate (in the order
 * produced by `aggressiveNormalize`) accepted by `containsWord` is returned as
 * a single-element list. Only when no candidate is an exact hit does the
 * method fall back to `getSuggestions`, returning the list for the first
 * candidate that yields any suggestion.
 *
 * The exact-match scan completes before any suggestion query is issued, so no
 * suggestion lookup is spent on a result that a later exact hit would discard.
 *
 * @param word - Word for which the regular suggestion lookup found nothing.
 * @returns A non-empty suggestion list, or `null` when no candidate resolves.
 */
private lookupAggressiveNormalized(word: string): string[] | null {
  const candidates = this.aggressiveNormalize(word);

  const exactMatch = candidates.find((candidate) => this.containsWord(candidate));
  if (exactMatch !== undefined) {
    return [exactMatch];
  }

  const limit = this.options.maxSuggestions ?? 5;
  for (const candidate of candidates) {
    const suggestions = this.getSuggestions(candidate, limit);
    if (suggestions.length > 0) {
      return suggestions;
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Split text into word tokens.
 *
 * A token is a run of word characters and apostrophes bounded by word
 * boundaries. Returns an empty array when the text contains no token.
 */
private tokenizeText(text: string): string[] {
  const tokens = text.match(/\b[\w']+\b/g);
  return tokens === null ? [] : tokens;
}
|
||||
|
|
|
|||
287
src/spellcheck/tests/aggressive-normalize.test.ts
Normal file
287
src/spellcheck/tests/aggressive-normalize.test.ts
Normal file
|
|
@ -0,0 +1,287 @@
|
|||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
|
||||
import { SpellChecker } from '../spell-checker.js';
|
||||
import type { SpellEngine, SpellSuggestion } from '../engines/types.js';
|
||||
|
||||
/**
 * Test double for SpellEngine used by the aggressive-normalization tests.
 *
 * Holds a fixed word list plus canned suggestion lists keyed by query string.
 * A query without a canned entry yields no suggestions, which is how these
 * tests model input the real engine cannot match. Words and queries are
 * compared case-insensitively.
 */
class AggressiveNormalizeMockEngine implements SpellEngine {
  private readonly knownWords: Set<string>;
  private readonly cannedSuggestions: Map<string, SpellSuggestion[]>;

  constructor(words: string[], suggestions: Record<string, SpellSuggestion[]> = {}) {
    this.knownWords = new Set(words.map((word) => word.toLowerCase()));
    this.cannedSuggestions = new Map(
      Object.entries(suggestions).map(([query, list]) => [query.toLowerCase(), list] as const),
    );
  }

  /** The mock needs no loading step, so it is always ready. */
  isReady(): boolean {
    return true;
  }

  /** Reports whether the word is in the mock dictionary. */
  contains(word: string): boolean {
    return this.knownWords.has(word.toLowerCase());
  }

  /** Returns at most `maxSuggestions` canned suggestions for the query, or none. */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    const canned = this.cannedSuggestions.get(word.toLowerCase());
    if (!canned) {
      return [];
    }
    return canned.slice(0, maxSuggestions);
  }

  /** Adds a word to the mock dictionary. */
  addWord(word: string): void {
    this.knownWords.add(word.toLowerCase());
  }
}
|
||||
|
||||
/**
 * Builds the mock engine shared by the aggressive-normalization tests.
 *
 * The dictionary is a list of common English words. The canned suggestions are
 * keyed by near-miss spellings; any other query returns no suggestions.
 */
function createNormalizationEngine(): AggressiveNormalizeMockEngine {
  const words = [
    'the a an is are was were be been being',
    'have has had do does did will would could',
    'should may might shall can need dare ought',
    'hello world beautiful important experiments apple',
    'committee balloon raccoon mississippi success coffee',
    'happy correct home come some love move give',
    'live above possible terrible incredible wonderful',
    'tomorrow afternoon communication programming different',
    'necessary opportunity professional information technology',
  ]
    .join(' ')
    .split(' ');

  // One canned result per query: the target word with its distance and frequency.
  const near = (word: string, distance: number, frequency: number): SpellSuggestion[] => [
    { word, distance, frequency },
  ];

  const suggestions: Record<string, SpellSuggestion[]> = {
    // Near-miss spelling of "experiments".
    experiements: near('experiments', 1, 500_000),
    // "qareee" with its "eee" run collapsed.
    qare: near('are', 1, 50_000_000),
    // "mmmporttant" with all runs collapsed; also "1mportant" with the digit removed.
    mportant: near('important', 1, 5_000_000),
    // "heeello" with every run (including "ll") collapsed.
    helo: near('hello', 1, 800_000),
    // Near-miss spelling of "beautiful".
    beautful: near('beautiful', 1, 1_200_000),
    // "aaapple" with every run (including "pp") collapsed.
    aple: near('apple', 1, 400_000),
    // "h0me" with the digit removed.
    hme: near('home', 1, 3_000_000),
    // "cooofffeee" with all runs collapsed.
    cofe: near('coffee', 2, 600_000),
    // "haaaapy" with its "aaaa" run collapsed.
    hapy: near('happy', 1, 700_000),
    // "commmittteeee" with all runs collapsed; note this resolves to "come".
    comite: near('come', 2, 2_000_000),
    // Near-miss spellings of "committee".
    committe: near('committee', 1, 300_000),
    commitee: near('committee', 1, 300_000),
  };

  return new AggressiveNormalizeMockEngine(words, suggestions);
}
|
||||
|
||||
describe('Aggressive Normalization — aggressiveNormalize() candidates', () => {
  let checker: SpellChecker;

  // Fresh object per checker so no two instances share a thresholds reference.
  const thresholds = () => ({ autoFix: 0.5, suggest: 0.3, possible: 0.1 });

  /** Asserts that `input` is flagged as misspelled and `expected` is among its suggestions. */
  const expectSuggested = async (
    subject: SpellChecker,
    input: string,
    expected: string,
  ): Promise<void> => {
    const result = await subject.check(input);
    expect(result.correct).toBe(false);
    expect(result.suggestions).toContain(expected);
  };

  /** Asserts that the shared checker reports `input` as correct. */
  const expectAccepted = async (input: string): Promise<void> => {
    const result = await checker.check(input);
    expect(result.correct).toBe(true);
  };

  beforeEach(async () => {
    checker = new SpellChecker({
      engine: createNormalizationEngine(),
      enableAggressiveNormalization: true,
      ignoreCamelCase: false,
      autoCorrect: true,
      confidenceThresholds: thresholds(),
    });
    await checker.initialize();
  });

  describe('repeated character collapse', () => {
    it('should correct "heeello" → "hello" (3+ e collapse)', () =>
      expectSuggested(checker, 'heeello', 'hello'));

    it('should correct "beeeeautiful" via collapse + suggestions', () =>
      expectSuggested(checker, 'beeeeautiful', 'beautiful'));

    it('should correct "aaapple" → "apple" via collapse + suggestion', () =>
      expectSuggested(checker, 'aaapple', 'apple'));

    it('should correct "qareee" → suggestions include "are"', () =>
      expectSuggested(checker, 'qareee', 'are'));

    it('should correct "mmmporttANT" → suggestions include "important"', () =>
      expectSuggested(checker, 'mmmporttANT', 'important'));
  });

  describe('embedded digit stripping', () => {
    it('should correct "h0me" → suggestions include "home"', () =>
      expectSuggested(checker, 'h0me', 'home'));

    it('should correct "1mportant" → suggestions include "important"', () =>
      expectSuggested(checker, '1mportant', 'important'));
  });

  describe('combined garbling (collapse + digit strip)', () => {
    it('should correct "eeeeeeeexpppperi8ments" → suggestions include "experiments"', () =>
      expectSuggested(checker, 'eeeeeeeexpppperi8ments', 'experiments'));
  });

  describe('false positive prevention', () => {
    const doubled = 'correct word with double letters';
    const dictionaryWords: Array<[string, string]> = [
      ['committee', doubled],
      ['balloon', doubled],
      ['raccoon', doubled],
      ['mississippi', 'correct word'],
      ['success', doubled],
    ];

    for (const [word, note] of dictionaryWords) {
      it(`should NOT modify "${word}" (${note})`, () => expectAccepted(word));
    }

    it('should NOT modify already correct words', () => expectAccepted('hello'));
  });

  describe('edge cases', () => {
    it('should handle empty string gracefully', () => expectAccepted(''));

    // "a" is in the mock dictionary.
    it('should handle single character', () => expectAccepted('a'));

    it('should handle all-digit input (ignored by ignoreNumbers)', () => expectAccepted('12345'));

    it('should handle word that is all repeated chars with no match', async () => {
      // Collapsing leaves only "z", which the mock engine knows nothing about.
      // Only the misspelling flag is asserted; the suggestion list is left open.
      const result = await checker.check('zzzzzzz');
      expect(result.correct).toBe(false);
    });

    it('should not apply normalization when disabled', async () => {
      const disabledChecker = new SpellChecker({
        engine: createNormalizationEngine(),
        enableAggressiveNormalization: false,
        autoCorrect: true,
        confidenceThresholds: thresholds(),
      });
      await disabledChecker.initialize();

      // The mock engine has no canned entry for the raw garbled word, so without
      // the normalization fallback there is nothing to suggest.
      const result = await disabledChecker.check('eeeeeeeexpppperi8ments');
      expect(result.correct).toBe(false);
      expect(result.suggestions).toHaveLength(0);
    });
  });

  describe('double-letter restoration', () => {
    it('should try restoring doubled letters after collapsing to find exact matches', async () => {
      // No canned suggestions at all: the only way to reach "happy" is
      // "happppy" → collapse → "hapy" → restore "pp" → exact dictionary hit.
      const engine = new AggressiveNormalizeMockEngine(['coffee', 'happy', 'hello', 'committee']);
      const checkerWithDoubles = new SpellChecker({
        engine,
        enableAggressiveNormalization: true,
        confidenceThresholds: thresholds(),
      });
      await checkerWithDoubles.initialize();

      await expectSuggested(checkerWithDoubles, 'happppy', 'happy');
    });

    it('should find "committee" via double-letter restoration of collapsed form', async () => {
      // "commmittteeee" collapses to "comite". Restoration re-doubles one letter per
      // candidate ("coomite", "commite", "comitte", "comitee"), so the canned entry
      // must be keyed by one of those forms. A dedicated engine is used because the
      // shared one answers "comite" itself with "come".
      const engine = new AggressiveNormalizeMockEngine(['committee'], {
        commite: [{ word: 'committee', distance: 2, frequency: 300_000 }],
      });
      const restoringChecker = new SpellChecker({
        engine,
        enableAggressiveNormalization: true,
        confidenceThresholds: thresholds(),
      });
      await restoringChecker.initialize();

      await expectSuggested(restoringChecker, 'commmittteeee', 'committee');
    });
  });
});
|
||||
258
src/spellcheck/tests/spell-checker-mobile.test.ts
Normal file
258
src/spellcheck/tests/spell-checker-mobile.test.ts
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
|
||||
import { SpellChecker } from '../spell-checker.js';
|
||||
import type { SpellEngine, SpellSuggestion } from '../engines/types.js';
|
||||
|
||||
/**
|
||||
* Mobile/garbled input integration tests.
|
||||
*
|
||||
* Full pipeline tests exercising SpellChecker.check() and SpellChecker.fix()
|
||||
* with aggressive normalization enabled, simulating real SymSpell behavior
|
||||
* where heavily garbled words exceed the max edit distance of 2 until
|
||||
* normalization brings them within range.
|
||||
*/
|
||||
|
||||
/**
 * Test double for SpellEngine used by the mobile/garbled-input pipeline tests.
 *
 * Combines a fixed dictionary with canned suggestion lists keyed by query
 * string. Queries without a canned entry return no suggestions. Dictionary
 * words and queries are matched case-insensitively.
 */
class MobileInputMockEngine implements SpellEngine {
  private readonly vocabulary = new Set<string>();
  private readonly responses = new Map<string, SpellSuggestion[]>();

  constructor(words: string[], suggestions: Record<string, SpellSuggestion[]> = {}) {
    words.forEach((word) => this.vocabulary.add(word.toLowerCase()));
    Object.entries(suggestions).forEach(([query, list]) => {
      this.responses.set(query.toLowerCase(), list);
    });
  }

  /** Always ready: the mock has nothing to load. */
  isReady(): boolean {
    return true;
  }

  /** True when the word is part of the mock dictionary. */
  contains(word: string): boolean {
    const key = word.toLowerCase();
    return this.vocabulary.has(key);
  }

  /** Canned suggestions for the query, truncated to `maxSuggestions`; empty when none exist. */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    const key = word.toLowerCase();
    const canned = this.responses.get(key) ?? [];
    return canned.slice(0, maxSuggestions);
  }

  /** Registers an additional dictionary word. */
  addWord(word: string): void {
    const key = word.toLowerCase();
    this.vocabulary.add(key);
  }
}
|
||||
|
||||
/**
 * Builds the mock engine used by the mobile/garbled-input pipeline tests.
 *
 * The dictionary holds common English words plus the correction targets. The
 * canned suggestions cover ordinary typos and near-miss spellings; any other
 * query returns no suggestions.
 */
function createMobileTestEngine(): MobileInputMockEngine {
  const commonWords = [
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'this', 'that',
    'and', 'or', 'not', 'but', 'for', 'with', 'from', 'to', 'of',
    'in', 'on', 'at', 'by', 'it', 'be', 'as', 'do', 'so', 'if',
    'we', 'he', 'she', 'they', 'you', 'me', 'him', 'her', 'us',
  ];
  const correctionTargets = [
    'experiments', 'important', 'beautiful', 'hello', 'world',
    'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog',
    'correct', 'home', 'coffee', 'happy', 'apple', 'committee',
    'balloon', 'tomorrow', 'programming', 'success', 'different',
    'necessary', 'opportunity', 'professional',
  ];
  const words = [...commonWords, ...correctionTargets];

  // Each row: [query, suggested word, distance, frequency].
  const cannedRows: Array<[string, string, number, number]> = [
    // Ordinary typos.
    ['teh', 'the', 1, 23_000_000_000],
    ['quikc', 'quick', 1, 500_000],
    ['brwon', 'brown', 1, 400_000],
    ['helo', 'hello', 1, 800_000],
    ['wrold', 'world', 1, 1_500_000],
    ['jumpd', 'jumps', 1, 300_000],
    ['ovr', 'over', 1, 3_000_000],
    ['layz', 'lazy', 1, 200_000],
    ['dogg', 'dog', 1, 1_000_000],
    // Near-miss spellings, including forms left over after garbled input is normalized.
    ['experiements', 'experiments', 1, 500_000],
    ['mportant', 'important', 1, 5_000_000],
    ['qare', 'are', 1, 50_000_000],
    ['hme', 'home', 1, 3_000_000],
    ['aple', 'apple', 1, 400_000],
    ['beautful', 'beautiful', 1, 1_200_000],
    ['hapy', 'happy', 1, 700_000],
    ['cofe', 'coffee', 2, 600_000],
    ['commitee', 'committee', 1, 300_000],
    ['committe', 'committee', 1, 300_000],
    ['corect', 'correct', 1, 500_000],
    ['difrent', 'different', 2, 400_000],
    ['necesary', 'necessary', 1, 350_000],
    ['tommorow', 'tomorrow', 1, 600_000],
    ['programing', 'programming', 1, 450_000],
  ];

  const suggestions: Record<string, SpellSuggestion[]> = {};
  for (const [query, word, distance, frequency] of cannedRows) {
    suggestions[query] = [{ word, distance, frequency }];
  }

  return new MobileInputMockEngine(words, suggestions);
}
|
||||
|
||||
describe('SpellChecker mobile/garbled input pipeline', () => {
  const garbledExperiments = 'eeeeeeeexpppperi8ments';
  const garbledSentence = 'eeeeeeeexpppperi8ments qareee mmmporttANT';

  let checker: SpellChecker;

  /** Builds and initializes a checker over a fresh mock engine. */
  const createChecker = async (enableAggressiveNormalization: boolean): Promise<SpellChecker> => {
    const instance = new SpellChecker({
      engine: createMobileTestEngine(),
      enableAggressiveNormalization,
      autoCorrect: true,
      confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
    });
    await instance.initialize();
    return instance;
  };

  /** Asserts that the lower-cased text contains every listed word. */
  const expectContainsAll = (text: string, expectedWords: string[]): void => {
    for (const word of expectedWords) {
      expect(text.toLowerCase()).toContain(word);
    }
  };

  beforeEach(async () => {
    checker = await createChecker(true);
  });

  describe('check() with garbled input', () => {
    const garbledCases: Array<[string, string]> = [
      [garbledExperiments, 'experiments'],
      ['qareee', 'are'],
      ['mmmporttANT', 'important'],
      ['h0me', 'home'],
    ];

    for (const [input, expected] of garbledCases) {
      it(`should find "${expected}" for "${input}"`, async () => {
        const result = await checker.check(input);
        expect(result.correct).toBe(false);
        expect(result.suggestions).toContain(expected);
      });
    }

    it('should still correct standard typos (within normal edit distance)', async () => {
      const result = await checker.check('teh');
      expect(result.correct).toBe(false);
      expect(result.suggestions[0]).toBe('the');
    });

    it('should mark correct words as correct', async () => {
      const result = await checker.check('hello');
      expect(result.correct).toBe(true);
    });
  });

  describe('fix() with garbled input', () => {
    it('should fix heavily garbled sentence', async () => {
      // Every garbled word is expected to come back as its intended word.
      const result = await checker.fix(garbledSentence);
      expectContainsAll(result, ['experiments', 'are', 'important']);
    });

    it('should fix standard typos alongside garbled words', async () => {
      const result = await checker.fix('teh quikc brwon fox');
      expectContainsAll(result, ['the', 'quick', 'brown', 'fox']);
    });

    it('should not modify correct text', async () => {
      const input = 'this is correct';
      expect(await checker.fix(input)).toBe(input);
    });

    it('should handle mixed correct and garbled input', async () => {
      const result = await checker.fix('the eeeeeeeexpppperi8ments are important');
      expectContainsAll(result, ['the', 'experiments', 'are', 'important']);
    });
  });

  describe('checkText() with garbled input', () => {
    it('should report garbled words as errors with suggestions', async () => {
      const result = await checker.checkText(garbledSentence);

      expect(result.errors.length).toBeGreaterThanOrEqual(3);
      expect(result.stats.misspelledWords).toBeGreaterThanOrEqual(3);

      const errorWords = result.errors.map((e) => e.word.toLowerCase());
      for (const garbled of [garbledExperiments, 'qareee', 'mmmporttant']) {
        expect(errorWords).toContain(garbled);
      }

      const expError = result.errors.find((e) => e.word.toLowerCase() === garbledExperiments);
      expect(expError?.suggestions).toContain('experiments');
    });

    it('should not flag correct words in mixed text', async () => {
      const result = await checker.checkText('hello world');
      const misspellings = result.errors.filter((e) => e.type === 'misspelling');
      expect(misspellings).toHaveLength(0);
    });

    it('should report word positions correctly', async () => {
      const text = 'the eeeeeeeexpppperi8ments';
      const result = await checker.checkText(text);

      const garbledError = result.errors.find((e) => e.word.toLowerCase() === garbledExperiments);
      expect(garbledError).toBeDefined();
      // The garbled word follows "the " and runs to the end of the text.
      expect(garbledError?.position.start).toBe(4);
      expect(garbledError?.position.end).toBe(26);
    });
  });

  describe('normalization disabled', () => {
    let disabledChecker: SpellChecker;

    beforeEach(async () => {
      disabledChecker = await createChecker(false);
    });

    it('should NOT find suggestions for garbled words when disabled', async () => {
      const result = await disabledChecker.check(garbledExperiments);
      expect(result.correct).toBe(false);
      expect(result.suggestions).toHaveLength(0);
    });

    it('should still correct standard typos when disabled', async () => {
      const result = await disabledChecker.check('teh');
      expect(result.correct).toBe(false);
      expect(result.suggestions[0]).toBe('the');
    });
  });

  describe('case preservation', () => {
    it('should preserve uppercase when fixing garbled ALL CAPS words', async () => {
      // All-uppercase input is expected to yield an all-uppercase correction.
      expect(await checker.fix('MMMPORTTANT')).toBe('IMPORTANT');
    });

    it('should preserve title case when fixing garbled Title Case words', async () => {
      // A leading capital is expected to carry over to the correction.
      expect(await checker.fix('Eeeeeeeexpppperi8ments')).toBe('Experiments');
    });
  });
});
|
||||
|
|
@ -37,6 +37,7 @@ export interface SpellCheckOptions {
|
|||
confidenceThresholds?: ConfidenceThresholds;
|
||||
enableSplitWordDetection?: boolean;
|
||||
enableJoinedWordDetection?: boolean;
|
||||
enableAggressiveNormalization?: boolean;
|
||||
loader?: DictionaryDataLoader;
|
||||
engine?: SpellEngine;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue