spellchecker-wasm/CreateDictionary.cs
Mike Maietta 2db472fa15
Dictionaries/en_US (#18)
* Adding frequency dictionary for en_US and adding c# file for generating new dictionaries

* Adding json configuration to allow testing for multiple language regions.
* Changed assert to expect.js.
* Tests use decompressed dictionaries in lib/
2020-02-16 17:59:27 -08:00

157 lines
No EOL
4.1 KiB
C#

// CreateDictionary.cs
// Credits to https://github.com/wolfgarbe/SymSpell/issues/15#issuecomment-350243639
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using System.Text;
// Console entry point: wires the hard-coded en_US input/output file names
// into DictionaryFactory and runs a single dictionary build.
class FrequencyDictionary {
    static void Main() {
        Console.WriteLine("Creating aspell frequency dictionary");

        // Spelling word list (aspell / SCOWL export):
        //   http://app.aspell.net/create?defaults=en_US
        //   http://wordlist.aspell.net/
        const string scowlWordList = @"scowl-60size-0var-en_US.txt";

        // Google Books n-gram shards; the factory appends "<digit>.csv" to this prefix:
        //   http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
        const string ngramShardPrefix = @"googlebooks-eng-1M-1gram-20090715-";

        // Where the resulting "<term> <count>" list is written.
        const string outputPath = @"frequency_dictionary_en_US_60size_1M_1gram_20090715.txt";

        DictionaryFactory factory = new DictionaryFactory();
        factory.CreateWordFrequencyDictionary(scowlWordList, ngramShardPrefix, outputPath);
    }
}
// Builds a word frequency dictionary by intersecting a SCOWL/aspell word list
// with per-term counts summed from Google Books 1-gram shard files.
class DictionaryFactory {
    // Shard files are named <prefix>0.csv .. <prefix>9.csv.
    const int NgramShardCount = 10;

    // Hard cap on the number of terms written to the output file.
    const int MaxTerms = 500000;

    // Fixed count assigned to the hand-added contractions below.
    const Int64 ContractionCount = 300000;

    // Short tokens that pass the word-list check but are explicitly rejected.
    static readonly HashSet<string> WordFilter = new HashSet<string>(StringComparer.Ordinal) {
        "ha",
        "te",
        "sp",
        "th",
        "ca",
        "yu",
        "ms",
        "ins",
        "ith",
        "spp",
        "hou",
        "ewith",
        "fori"
    };

    // Contractions that are always emitted, overriding any n-gram derived count.
    static readonly string[] Contractions = {
        "can't",
        "won't",
        "don't",
        "couldn't",
        "shouldn't",
        "wouldn't",
        "needn't",
        "mustn't",
        "she'll",
        "we'll",
        "he'll",
        "they'll",
        "i'll",
        "i'm",
        "wasn't"
    };

    // Accumulated count per lower-cased term for the build currently in progress.
    readonly Dictionary<string, Int64> termlist = new Dictionary<string, Int64>();

    /// <summary>
    /// Creates a word frequency dictionary and writes it to <paramref name="outputFilename"/>
    /// as UTF-8 lines of the form "term count", highest count first (ties ordered by
    /// ordinal term comparison so the output is reproducible), capped at 500000 terms.
    /// </summary>
    /// <param name="scowlFilename">Path of the word list, one word per line.</param>
    /// <param name="googleBooksPrefix">
    /// Path prefix of the n-gram shards; "0.csv" through "9.csv" are appended to it.
    /// </param>
    /// <param name="outputFilename">Path of the file to create or overwrite.</param>
    public void CreateWordFrequencyDictionary(string scowlFilename, string googleBooksPrefix, string outputFilename) {
        // Start from a clean slate so reusing one factory instance does not
        // leak counts from a previous build into this one.
        termlist.Clear();

        HashSet<string> allowedWords = LoadScowlWords(scowlFilename);

        // Dictionaries can have crazy UX effects. You can provide some bad words to censor for a chatbot. (Note: you'll never win)
        // string[] badWords = File.ReadAllLines (@"bad-words.txt");
        for (int shard = 0; shard < NgramShardCount; shard++) {
            string shardPath = googleBooksPrefix
                + shard.ToString(System.Globalization.CultureInfo.InvariantCulture)
                + ".csv";
            AddNgramCounts(shardPath, allowedWords);
        }

        // Add some additional terms.
        foreach (string contraction in Contractions) {
            termlist[contraction] = ContractionCount;
        }

        // Sort by frequency (descending); break ties by term because List.Sort is unstable.
        List<KeyValuePair<string, Int64>> ranked = termlist.ToList();
        ranked.Sort((x, y) => {
            int byCount = y.Value.CompareTo(x.Value);
            return byCount != 0 ? byCount : string.CompareOrdinal(x.Key, y.Key);
        });

        // Limit size.
        if (ranked.Count > MaxTerms) {
            ranked.RemoveRange(MaxTerms, ranked.Count - MaxTerms);
        }

        // Write the new dictionary to file.
        using (StreamWriter writer = new StreamWriter(outputFilename, false, Encoding.UTF8)) {
            foreach (KeyValuePair<string, Int64> entry in ranked) {
                writer.WriteLine(entry.Key + " "
                    + entry.Value.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
        }

        // Report the number of terms actually written (after the size cap).
        Console.WriteLine("Ready: " + ranked.Count.ToString("N0") + " terms");
    }

    // Reads the word list and returns the lower-cased set of acceptable words.
    static HashSet<string> LoadScowlWords(string scowlFilename) {
        HashSet<string> words = new HashSet<string>();
        using (StreamReader reader = new StreamReader(scowlFilename)) {
            string line;
            // Process a single line at a time for memory efficiency.
            while ((line = reader.ReadLine()) != null) {
                if (line.Length < 1) {
                    continue;
                }
                // Do not allow abbreviations (words ending in an upper-case letter).
                if (Char.IsUpper(line[line.Length - 1])) {
                    continue;
                }
                // Skip one/two character entries that start upper-case.
                if ((line.Length <= 2) && Char.IsUpper(line[0])) {
                    continue;
                }
                // Invariant casing: culture-sensitive lower-casing (e.g. Turkish 'I')
                // would stop words from matching the n-gram keys.
                words.Add(line.ToLowerInvariant());
            }
        }
        return words;
    }

    // Adds the counts of every acceptable term in one tab-separated shard file to termlist.
    void AddNgramCounts(string shardPath, HashSet<string> allowedWords) {
        using (StreamReader reader = new StreamReader(shardPath)) {
            string line;
            // Process a single line at a time for memory efficiency.
            while ((line = reader.ReadLine()) != null) {
                string[] lineParts = line.Split('\t');
                // Column 0 is the term and column 2 is parsed as its count
                // (presumably the match count; verify against the dataset docs).
                if (lineParts.Length < 3) {
                    continue;
                }

                string key = lineParts[0].ToLowerInvariant();
                if (!IsAcceptedTerm(key, allowedWords)) {
                    continue;
                }
                // UNCOMMENT to remove bad words
                // if (badWords.Contains(key)) continue;

                Int64 count;
                if (!Int64.TryParse(lineParts[2],
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out count)) {
                    continue;
                }

                // TryGetValue leaves runningTotal at 0 for a new term, so one lookup
                // covers both the "first sighting" and "accumulate" cases.
                Int64 runningTotal;
                termlist.TryGetValue(key, out runningTotal);
                termlist[key] = runningTotal + count;
            }
        }
    }

    // Returns true when a lower-cased n-gram term passes every filter.
    static bool IsAcceptedTerm(string key, HashSet<string> allowedWords) {
        // Allow only terms from the Google n-grams which are also in the SCOWL list.
        if (key.Length == 0 || !allowedWords.Contains(key)) {
            return false;
        }
        // Allow only terms which start with a letter.
        if (!Char.IsLetter(key[0])) {
            return false;
        }
        // Only a & i are genuine single letter English words.
        if ((key.Length == 1) && (key != "a") && (key != "i")) {
            return false;
        }
        // Additional filters (plain char comparisons, independent of the current culture).
        if (key[key.Length - 1] == '.') {
            return false;
        }
        if ((key.Length == 2) && ((key[0] == '\'') || (key[1] == '\''))) {
            return false;
        }
        return !WordFilter.Contains(key);
    }
}