* Adding frequency dictionary for en_US and adding c# file for generating new dictionaries * Adding json configuration to allow testing for multiple language regions. * Changed assert to expect.js. * Tests use decompressed dictionaries in lib/
157 lines
No EOL
4.1 KiB
C#
157 lines
No EOL
4.1 KiB
C#
// CreateDictionary.cs
|
|
// Credits to https://github.com/wolfgarbe/SymSpell/issues/15#issuecomment-350243639
|
|
|
|
using System;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Collections.Generic;
|
|
using System.Text;
|
|
|
|
/// <summary>
/// Console entry point: builds the en_US frequency dictionary from a SCOWL
/// word list and the Google Books 1-gram files expected in the working directory.
/// </summary>
class FrequencyDictionary {
    static void Main() {
        // Spelling word list, generated at
        //   http://app.aspell.net/create?defaults=en_US
        //   http://wordlist.aspell.net/
        const string scowlWordList = @"scowl-60size-0var-en_US.txt";

        // Common file-name prefix of the Google n-gram files, downloaded from
        //   http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
        const string googleNgramsPrefix = @"googlebooks-eng-1M-1gram-20090715-";

        // File that receives the generated "term count" lines.
        const string outputPath = @"frequency_dictionary_en_US_60size_1M_1gram_20090715.txt";

        Console.WriteLine("Creating aspell frequency dictionary");

        new DictionaryFactory().CreateWordFrequencyDictionary(scowlWordList, googleNgramsPrefix, outputPath);
    }
}
|
|
|
|
/// <summary>
/// Builds a word-frequency dictionary by intersecting a SCOWL word list with
/// the term counts found in a set of Google Books 1-gram files.
/// </summary>
class DictionaryFactory {
    // Number of n-gram files read: <prefix>0.csv through <prefix>9.csv.
    const int NgramFileCount = 10;

    // Upper bound on the number of terms written to the output file.
    const int MaxTerms = 500000;

    // Count assigned to every hand-added term in ExtraTerms.
    const Int64 ExtraTermCount = 300000;

    // Terms that are rejected even though they appear in the word list.
    static readonly HashSet<string> WordFilter = new HashSet<string> {
        "ha", "te", "sp", "th", "ca", "yu", "ms",
        "ins", "ith", "spp", "hou",
        "ewith", "fori"
    };

    // Terms that are always written, with a fixed count of ExtraTermCount.
    static readonly string[] ExtraTerms = {
        "can't", "won't", "don't", "couldn't", "shouldn't", "wouldn't",
        "needn't", "mustn't", "she'll", "we'll", "he'll", "they'll",
        "i'll", "i'm", "wasn't"
    };

    // Accumulated term -> count map for the run in progress.
    Dictionary<string, Int64> termlist = new Dictionary<string, Int64>();

    /// <summary>
    /// Creates a word frequency dictionary file.
    /// </summary>
    /// <param name="scowlFilename">Path of the word list, one word per line.</param>
    /// <param name="googleBooksPrefix">
    /// Prefix of the n-gram files; the files <c>prefix + "0.csv"</c> through
    /// <c>prefix + "9.csv"</c> are read and must all exist.
    /// </param>
    /// <param name="outputFilename">
    /// Path of the file to (over)write. Each line is <c>term count</c>, ordered by
    /// descending count; terms with equal counts are ordered by ordinal key so the
    /// output is reproducible.
    /// </param>
    /// <remarks>
    /// Exceptions thrown by the underlying file readers/writer (for example when an
    /// input file is missing) are not caught here.
    /// </remarks>
    public void CreateWordFrequencyDictionary(string scowlFilename, string googleBooksPrefix, string outputFilename) {
        // Start from a clean slate so a reused instance does not leak terms
        // and counts from an earlier run into this output.
        termlist.Clear();

        HashSet<string> allowedWords = LoadScowlWords(scowlFilename);

        // Dictionaries can have crazy UX effects. A list of bad words to censor
        // (e.g. for a chatbot) could be loaded here and checked in IsAcceptedTerm.
        // (Note: you'll never win)

        for (int i = 0; i < NgramFileCount; i++) {
            string ngramFilename = googleBooksPrefix
                + i.ToString(System.Globalization.CultureInfo.InvariantCulture)
                + ".csv";
            AddNgramCounts(ngramFilename, allowedWords);
        }

        // Add some additional terms; these override any count gathered above.
        foreach (string term in ExtraTerms) {
            termlist[term] = ExtraTermCount;
        }

        // Sort by frequency, highest first. List.Sort is unstable, so break ties
        // on the key to keep the generated file deterministic.
        List<KeyValuePair<string, Int64>> sortedTerms = termlist.ToList();
        sortedTerms.Sort((x, y) => {
            int byCount = y.Value.CompareTo(x.Value);
            return byCount != 0 ? byCount : String.CompareOrdinal(x.Key, y.Key);
        });

        // Limit size.
        if (sortedTerms.Count > MaxTerms) {
            sortedTerms.RemoveRange(MaxTerms, sortedTerms.Count - MaxTerms);
        }

        WriteTerms(outputFilename, sortedTerms);

        // Report what was actually written (after the size limit was applied).
        Console.WriteLine("Ready: " + sortedTerms.Count.ToString("N0") + " terms");
    }

    // Reads the word list and returns the lower-cased words that are not
    // treated as abbreviations.
    static HashSet<string> LoadScowlWords(string scowlFilename) {
        HashSet<string> words = new HashSet<string>();
        using (StreamReader reader = new StreamReader(scowlFilename)) {
            string line;
            // Process a single line at a time only, for memory efficiency.
            while ((line = reader.ReadLine()) != null) {
                if (line.Length < 1) {
                    continue;
                }
                // Do not allow abbreviations (words ending in an upper-case letter).
                if (Char.IsUpper(line[line.Length - 1])) {
                    continue;
                }
                // Skip one- and two-character entries that start upper-case.
                if ((line.Length <= 2) && Char.IsUpper(line[0])) {
                    continue;
                }
                // Invariant casing: culture-sensitive ToLower() maps "I" to a
                // dotless i under Turkish/Azeri cultures and would break lookups.
                words.Add(line.ToLowerInvariant());
            }
        }
        return words;
    }

    // Adds the counts of every accepted term in one tab-separated n-gram file
    // to termlist.
    void AddNgramCounts(string ngramFilename, HashSet<string> allowedWords) {
        using (StreamReader reader = new StreamReader(ngramFilename)) {
            string line;
            // Process a single line at a time only, for memory efficiency.
            while ((line = reader.ReadLine()) != null) {
                string[] fields = line.Split('\t');
                if (fields.Length < 3) {
                    continue;
                }

                string key = fields[0].ToLowerInvariant();
                if (!IsAcceptedTerm(key, allowedWords)) {
                    continue;
                }

                // The third field carries the count; lines whose count does not
                // parse are ignored. Parse culture-independently: this is data,
                // not user input.
                Int64 count;
                if (!Int64.TryParse(fields[2],
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out count)) {
                    continue;
                }

                // TryGetValue leaves 'total' at 0 for a term not seen before.
                Int64 total;
                termlist.TryGetValue(key, out total);
                termlist[key] = total + count;
            }
        }
    }

    // Decides whether a lower-cased n-gram term belongs in the dictionary.
    static bool IsAcceptedTerm(string key, HashSet<string> allowedWords) {
        if (key.Length == 0) {
            return false;
        }
        // Allow only terms from the Google n-grams which are also in the SCOWL list.
        if (!allowedWords.Contains(key)) {
            return false;
        }
        // Allow only terms which start with a letter.
        if (!Char.IsLetter(key[0])) {
            return false;
        }
        // Only a & i are genuine single-letter English words.
        if ((key.Length == 1) && (key != "a") && (key != "i")) {
            return false;
        }
        // Additional filters.
        if (key.EndsWith(".", StringComparison.Ordinal)) {
            return false;
        }
        if ((key.Length == 2)
                && (key.StartsWith("'", StringComparison.Ordinal) || key.EndsWith("'", StringComparison.Ordinal))) {
            return false;
        }
        if (WordFilter.Contains(key)) {
            return false;
        }
        return true;
    }

    // Writes one "term count" line per entry, replacing any existing file.
    static void WriteTerms(string outputFilename, List<KeyValuePair<string, Int64>> terms) {
        // Note: Encoding.UTF8 makes StreamWriter emit a byte-order mark at the
        // start of the file; consumers have to tolerate it.
        using (StreamWriter writer = new StreamWriter(outputFilename, false, Encoding.UTF8)) {
            foreach (KeyValuePair<string, Int64> term in terms) {
                writer.WriteLine(term.Key + " "
                    + term.Value.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
        }
    }
}