Csv dictionary support (#145)
* the dictionary loader now supports word frequencies * word frequency validation upon building * added default word frequencies to all dictionaries * updated documentation
This commit is contained in:
parent
b5cd92f1f7
commit
2510aba58a
30 changed files with 1175323 additions and 1175101 deletions
100
scripts/inject-dictionary-frequencies.js
Normal file
100
scripts/inject-dictionary-frequencies.js
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
const { basename } = require('path');
|
||||
const { createReadStream, existsSync } = require('fs');
|
||||
|
||||
|
||||
const DELIMITER = ' ';
|
||||
|
||||
|
||||
function printHelp() {
|
||||
console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt LOCALE`);
|
||||
console.log('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
|
||||
console.log('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
|
||||
}
|
||||
|
||||
|
||||
function validateInput() {
|
||||
if (process.argv.length < 5) {
|
||||
printHelp();
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
if (!existsSync(process.argv[3])) {
|
||||
console.error(`Failure! Could not find the WORDS-WITH-FREQUENCIES file "${process.argv[3]}."`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
|
||||
if (!existsSync(process.argv[2])) {
|
||||
console.error(`Failure! Could not find dictionary file "${process.argv[2]}."`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
return { wordsWithFrequenciesFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[4] };
|
||||
}
|
||||
|
||||
|
||||
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
|
||||
// read the frequencies
|
||||
let lineReader = require('readline').createInterface({
|
||||
input: createReadStream(wordsWithFrequenciesFileName)
|
||||
});
|
||||
|
||||
|
||||
const frequencies = {};
|
||||
for await (const line of lineReader) {
|
||||
if (!line.includes(DELIMITER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const parts = line.split(DELIMITER);
|
||||
const word = parts[0].toLocaleLowerCase(locale);
|
||||
let frequency = parts.length > 1 ? Number.parseInt(parts[1]) : 0;
|
||||
if (Number.isNaN(frequency) || frequency < 0) {
|
||||
frequency = 0;
|
||||
}
|
||||
|
||||
frequencies[word] = frequency;
|
||||
}
|
||||
|
||||
// read the dictionary words
|
||||
lineReader = require('readline').createInterface({
|
||||
input: createReadStream(dictionaryFileName)
|
||||
});
|
||||
|
||||
|
||||
const outputWords = [];
|
||||
for await (const word of lineReader) {
|
||||
const lowercaseWord = word.toLocaleLowerCase(locale);
|
||||
|
||||
outputWords.push({
|
||||
w: `${word}`,
|
||||
f: frequencies[lowercaseWord] || 0
|
||||
});
|
||||
}
|
||||
|
||||
return outputWords;
|
||||
}
|
||||
|
||||
|
||||
function printWords(wordList) {
|
||||
if (!Array.isArray(wordList)) {
|
||||
return;
|
||||
}
|
||||
|
||||
wordList.forEach(w => {
|
||||
let out = w.w;
|
||||
if (w.f) {
|
||||
out += `${DELIMITER}${w.f}`;
|
||||
}
|
||||
|
||||
console.log(out);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** main **/
|
||||
inject(validateInput())
|
||||
.then(words => printWords(words))
|
||||
.catch(e => console.error(e));
|
||||
Loading…
Add table
Add a link
Reference in a new issue