dictionary frequency normalization script
This commit is contained in:
parent
671df5288c
commit
1e35e35d77
3 changed files with 77 additions and 15 deletions
|
|
@ -3,5 +3,11 @@ exports.print = function(str) {
|
||||||
};
|
};
|
||||||
|
|
||||||
exports.printError = function(str) {
|
exports.printError = function(str) {
|
||||||
process.stderr.write(`${str}\n`);
|
process.stderr.write(`${str instanceof Error ? str.stack : str}\n`);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
exports.printWordsWithFrequencies = function(words) {
|
||||||
|
if (Array.isArray(words)) {
|
||||||
|
words.forEach(w => exports.print(`${w.word}${w.frequency ? '\t' + w.frequency : ''}`));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
63
scripts/normalize-frequencies.js
Normal file
63
scripts/normalize-frequencies.js
Normal file
|
|
@ -0,0 +1,63 @@
|
||||||
|
const { basename } = require('path');
|
||||||
|
const { createReadStream, existsSync } = require('fs');
|
||||||
|
const { createInterface } = require('readline');
|
||||||
|
const { print, printError, printWordsWithFrequencies } = require('./_printers.js');
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Prints the command-line usage text for this script.
 *
 * The script name is taken from the base name of process.argv[1].
 */
function printHelp() {
	const scriptName = basename(process.argv[1]);
	const helpLines = [
		`Usage ${scriptName} WORD-LIST.txt`,
		'Normalizes dictionary frequencies up to 255.'
	];

	helpLines.forEach(helpLine => print(helpLine));
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Validates the command-line arguments and builds the normalization settings.
 *
 * Side effects:
 *  - prints the usage text and exits with status 1 when no word list
 *    argument was given;
 *  - prints an error and exits with status 2 when the given word list file
 *    does not exist.
 *
 * @returns {{fileName: string, maxAllowedFrequency: number}} the word list
 *   path (process.argv[2]) and the upper bound for normalized frequencies
 */
function validateInput() {
	if (process.argv.length < 3) {
		printHelp();
		process.exit(1);
	}

	const fileName = process.argv[2];

	if (!existsSync(fileName)) {
		// Report the path that was actually checked. The message used to read
		// process.argv[3], which is normally undefined.
		printError(`Failure! Could not find word list file "${fileName}".`);
		process.exit(2);
	}

	return {
		fileName,
		maxAllowedFrequency: 255
	};
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Reads a tab-separated word list and rescales its frequencies so that the
 * highest frequency in the file maps to `maxAllowedFrequency`.
 *
 * Each input line is expected to be `WORD` or `WORD<TAB>FREQUENCY`. A missing
 * or non-numeric frequency is treated as 0. Scaled values are rounded up, so
 * any word with a positive frequency keeps a frequency of at least 1.
 *
 * @param {Object} options
 * @param {string} options.fileName - path of the word list; when falsy an
 *   empty list is returned without touching the file system
 * @param {number} options.maxAllowedFrequency - value the highest frequency
 *   is scaled to
 * @returns {Promise<Array<{word: string, frequency: number}>>} the words in
 *   file order with normalized frequencies
 */
async function normalize({ fileName, maxAllowedFrequency }) {
	const words = [];

	if (!fileName) {
		return words;
	}

	let maxWordFrequency = 0;

	for await (const line of createInterface({ input: createReadStream(fileName) })) {
		const [word, rawFrequency] = line.split('\t');

		// Parse once, always as decimal.
		const parsedFrequency = Number.parseInt(rawFrequency, 10);
		const frequency = Number.isNaN(parsedFrequency) ? 0 : parsedFrequency;

		maxWordFrequency = Math.max(maxWordFrequency, frequency);
		words.push({ word, frequency });
	}

	// Nothing to scale when no line carries a positive frequency. Dividing by
	// zero here would turn every frequency into NaN.
	if (maxWordFrequency === 0) {
		return words;
	}

	for (const entry of words) {
		// Multiply before dividing: the integer product is exact, so the most
		// frequent word maps to exactly maxAllowedFrequency and rounding error
		// cannot push Math.ceil() one step above the limit.
		entry.frequency = Math.ceil(entry.frequency * maxAllowedFrequency / maxWordFrequency);
	}

	return words;
}
|
||||||
|
|
||||||
|
|
||||||
|
/** main **/
|
||||||
|
// Validate the arguments, normalize the word list and print the result.
normalize(validateInput())
	.then(words => printWordsWithFrequencies(words))
	.catch(e => {
		printError(e);
		// Signal the failure to the calling shell. Without this the script
		// reports the error but still exits with status 0.
		process.exitCode = 1;
	});
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
const { basename } = require('path');
|
const { basename } = require('path');
|
||||||
const { createReadStream, existsSync } = require('fs');
|
const { createReadStream, existsSync } = require('fs');
|
||||||
const { createInterface } = require('readline');
|
const { createInterface } = require('readline');
|
||||||
const { print, printError } = require('./_printers.js');
|
const { print, printError, printWordsWithFrequencies } = require('./_printers.js');
|
||||||
|
|
||||||
|
|
||||||
function printHelp() {
|
function printHelp() {
|
||||||
|
|
@ -35,13 +35,6 @@ function validateInput() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function printWords(wordList) {
|
|
||||||
if (Array.isArray(wordList)) {
|
|
||||||
wordList.forEach(w => print(`${w.word}${w.frequency ? '\t' + w.frequency : ''}`));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
async function readWords(fileName) {
|
async function readWords(fileName) {
|
||||||
const words = [];
|
const words = [];
|
||||||
|
|
||||||
|
|
@ -51,9 +44,9 @@ async function readWords(fileName) {
|
||||||
|
|
||||||
for await (const line of createInterface({ input: createReadStream(fileName) })) {
|
for await (const line of createInterface({ input: createReadStream(fileName) })) {
|
||||||
const [word, frequency] = line.split("\t");
|
const [word, frequency] = line.split("\t");
|
||||||
words.push({
|
words.push({
|
||||||
word,
|
word,
|
||||||
frequency: Number.isNaN(Number.parseInt(frequency)) ? 0 : Number.parseInt(frequency)
|
frequency: Number.isNaN(Number.parseInt(frequency)) ? 0 : Number.parseInt(frequency)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -122,5 +115,5 @@ async function work({ definitionFile, wordsFile, locale }) {
|
||||||
|
|
||||||
/** main **/
|
/** main **/
|
||||||
work(validateInput())
|
work(validateInput())
|
||||||
.then(words => printWords(words))
|
.then(words => printWordsWithFrequencies(words))
|
||||||
.catch(e => printError(e));
|
.catch(e => printError(e));
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue