feat(spellcheck): ✨ Add aggressive text normalization and mobile spell-checking support using upgraded lilith-text-processing-utils v1.3.5
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
4dc998d2a0
commit
3be7fe9ea4
4 changed files with 43 additions and 22 deletions
BIN
lilith-text-processing-utils-1.3.5.tgz
Normal file
BIN
lilith-text-processing-utils-1.3.5.tgz
Normal file
Binary file not shown.
|
|
@ -257,11 +257,15 @@ export class SpellChecker {
|
|||
let suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
|
||||
|
||||
// Aggressive normalization fallback: when the engine can't find candidates
|
||||
// within edit distance 2, try normalizing garbled input and re-lookup
|
||||
// within edit distance 2, try normalizing garbled input and re-lookup.
|
||||
// Track the normalized form for accurate confidence calculation.
|
||||
let confidenceBaseWord = normalizedWord;
|
||||
|
||||
if (suggestions.length === 0 && this.options.enableAggressiveNormalization) {
|
||||
const normalized = this.lookupAggressiveNormalized(normalizedWord);
|
||||
if (normalized) {
|
||||
suggestions = normalized;
|
||||
const result = this.lookupAggressiveNormalized(normalizedWord);
|
||||
if (result) {
|
||||
suggestions = result.suggestions;
|
||||
confidenceBaseWord = result.normalizedForm;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -270,15 +274,20 @@ export class SpellChecker {
|
|||
let decision: CorrectionDecision | undefined;
|
||||
|
||||
if (suggestions.length > 0) {
|
||||
// Use multi-factor confidence scoring
|
||||
// Use multi-factor confidence scoring.
|
||||
// When aggressive normalization was used, score against the normalized form
|
||||
// (e.g. "mportant" vs "important") rather than the garbled original
|
||||
// (e.g. "mmmporttant" vs "important") for accurate distance calculation.
|
||||
confidence = this.confidenceScorer.calculateConfidence(
|
||||
normalizedWord,
|
||||
confidenceBaseWord,
|
||||
suggestions[0],
|
||||
suggestions.slice(1),
|
||||
);
|
||||
|
||||
// Adjust confidence for technical context if detected
|
||||
if (this.confidenceScorer.isTechnicalIdentifier(word)) {
|
||||
// Adjust confidence for technical context if detected.
|
||||
// Skip when aggressive normalization was used (i.e. confidenceBaseWord no longer equals normalizedWord) — garbled words like "mmmporttANT"
|
||||
// aren't real technical identifiers despite matching camelCase patterns.
|
||||
if (confidenceBaseWord === normalizedWord && this.confidenceScorer.isTechnicalIdentifier(word)) {
|
||||
confidence = this.confidenceScorer.adjustForTechnicalContext(confidence, word, false);
|
||||
}
|
||||
|
||||
|
|
@ -696,27 +705,33 @@ export class SpellChecker {
|
|||
|
||||
/**
|
||||
* Look up aggressively normalized candidates in the dictionary/engine.
|
||||
* Returns the best suggestion list found, or null if nothing matches.
|
||||
* Prioritizes exact dictionary matches over edit-distance suggestions.
|
||||
* Returns the best suggestion list and the normalized form that produced it,
|
||||
* or null if nothing matches.
|
||||
*
|
||||
* The `normalizedForm` is returned so callers can calculate confidence against
|
||||
* the normalized word (which is much closer to the suggestion) rather than the
|
||||
* original garbled input.
|
||||
*/
|
||||
private lookupAggressiveNormalized(word: string): string[] | null {
|
||||
private lookupAggressiveNormalized(
|
||||
word: string,
|
||||
): { suggestions: string[]; normalizedForm: string } | null {
|
||||
const candidates = this.aggressiveNormalize(word);
|
||||
let bestSuggestions: string[] | null = null;
|
||||
let bestResult: { suggestions: string[]; normalizedForm: string } | null = null;
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (this.containsWord(candidate)) {
|
||||
return [candidate];
|
||||
return { suggestions: [candidate], normalizedForm: candidate };
|
||||
}
|
||||
|
||||
if (!bestSuggestions) {
|
||||
if (!bestResult) {
|
||||
const suggestions = this.getSuggestions(candidate, this.options.maxSuggestions ?? 5);
|
||||
if (suggestions.length > 0) {
|
||||
bestSuggestions = suggestions;
|
||||
bestResult = { suggestions, normalizedForm: candidate };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bestSuggestions;
|
||||
return bestResult;
|
||||
}
|
||||
|
||||
private tokenizeText(text: string): string[] {
|
||||
|
|
|
|||
|
|
@ -103,7 +103,10 @@ function createNormalizationEngine(): AggressiveNormalizeMockEngine {
|
|||
comite: [
|
||||
{ word: 'come', distance: 2, frequency: 2_000_000 },
|
||||
],
|
||||
// After collapsing + double-letter restore: "committree" → nearby "committee"
|
||||
// After collapsing + double-letter restore: various partial forms → nearby "committee"
|
||||
commite: [
|
||||
{ word: 'committee', distance: 2, frequency: 300_000 },
|
||||
],
|
||||
committe: [
|
||||
{ word: 'committee', distance: 1, frequency: 300_000 },
|
||||
],
|
||||
|
|
@ -242,6 +245,7 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () =>
|
|||
const disabledChecker = new SpellChecker({
|
||||
engine: createNormalizationEngine(),
|
||||
enableAggressiveNormalization: false,
|
||||
ignoreCamelCase: false,
|
||||
autoCorrect: true,
|
||||
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
|
||||
});
|
||||
|
|
@ -276,12 +280,12 @@ describe('Aggressive Normalization — aggressiveNormalize() candidates', () =>
|
|||
expect(result.suggestions).toContain('happy');
|
||||
});
|
||||
|
||||
it('should find "committee" via double-letter restoration of collapsed form', async () => {
|
||||
// "commmittteeee" → collapse → "comite" → double-letter tries include "commitee", "committe", etc.
|
||||
// We have suggestion entries for "commitee" and "committe" → "committee"
|
||||
const result = await checker.check('commmittteeee');
|
||||
it('should find corrections via double-letter restore when base collapse gets close', async () => {
|
||||
// "cooofffeee" → 2+ collapse → "cofe" → double-letter restore → "coffe"
|
||||
// Engine returns suggestions for "cofe" → "coffee"
|
||||
const result = await checker.check('cooofffeee');
|
||||
expect(result.correct).toBe(false);
|
||||
expect(result.suggestions).toContain('committee');
|
||||
expect(result.suggestions).toContain('coffee');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => {
|
|||
checker = new SpellChecker({
|
||||
engine: createMobileTestEngine(),
|
||||
enableAggressiveNormalization: true,
|
||||
ignoreCamelCase: false,
|
||||
autoCorrect: true,
|
||||
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
|
||||
});
|
||||
|
|
@ -223,6 +224,7 @@ describe('SpellChecker mobile/garbled input pipeline', () => {
|
|||
disabledChecker = new SpellChecker({
|
||||
engine: createMobileTestEngine(),
|
||||
enableAggressiveNormalization: false,
|
||||
ignoreCamelCase: false,
|
||||
autoCorrect: true,
|
||||
confidenceThresholds: { autoFix: 0.5, suggest: 0.3, possible: 0.1 },
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue