1
0
Fork 0

dictionary frequency normalization script

This commit is contained in:
sspanak 2024-04-08 19:14:29 +03:00 committed by Dimo Karaivanov
parent 671df5288c
commit 1e35e35d77
3 changed files with 77 additions and 15 deletions

View file

@ -3,5 +3,11 @@ exports.print = function(str) {
}; };
exports.printError = function(str) { exports.printError = function(str) {
process.stderr.write(`${str}\n`); process.stderr.write(`${str instanceof Error ? str.stack : str}\n`);
}; };
exports.printWordsWithFrequencies = function(words) {
if (Array.isArray(words)) {
words.forEach(w => exports.print(`${w.word}${w.frequency ? '\t' + w.frequency : ''}`));
}
}

View file

@ -0,0 +1,63 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError, printWordsWithFrequencies } = require('./_printers.js');
/**
 * Prints the command-line usage text: a usage line built from the running
 * script's base name (taken from `process.argv[1]`), followed by a one-line
 * description of what the script does.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);
  const helpLines = [
    `Usage ${scriptName} WORD-LIST.txt`,
    'Normalizes dictionary frequencies up to 255.',
  ];

  for (const helpLine of helpLines) {
    print(helpLine);
  }
}
/**
 * Validates the command-line arguments and builds the options for `normalize`.
 *
 * Terminates the process when the input is unusable:
 *  - exit code 1 (after printing the help text) when no word list argument
 *    was given;
 *  - exit code 2 (after printing an error) when the given word list file
 *    does not exist.
 *
 * @returns {{fileName: string, maxAllowedFrequency: number}} The word list
 *   path taken from `process.argv[2]` and the frequency ceiling (255).
 */
function validateInput() {
  if (process.argv.length < 3) {
    printHelp();
    process.exit(1);
  }

  const fileName = process.argv[2];
  if (!existsSync(fileName)) {
    // Report the path that was actually checked; it lives at argv[2], not argv[3].
    printError(`Failure! Could not find word list file "${fileName}".`);
    process.exit(2);
  }

  return {
    fileName,
    maxAllowedFrequency: 255,
  };
}
/**
 * Reads a tab-separated word list ("word<TAB>frequency" per line) and rescales
 * the frequencies so that the highest one becomes `maxAllowedFrequency`.
 *
 * A missing or non-numeric frequency column is treated as 0. Frequencies are
 * parsed as base-10 integers. Scaled values are rounded up with `Math.ceil`.
 * When no word has a positive frequency there is nothing to scale, so the
 * list is returned with its parsed frequencies left as they are.
 *
 * @param {object} options
 * @param {string} options.fileName - Path of the word list to read. When
 *   falsy, an empty list is returned without touching the file system.
 * @param {number} options.maxAllowedFrequency - Value the highest frequency is
 *   scaled to.
 * @returns {Promise<Array<{word: string, frequency: number}>>} The words in
 *   file order with their normalized frequencies.
 */
async function normalize({ fileName, maxAllowedFrequency }) {
  const words = [];
  if (!fileName) {
    return words;
  }

  let maxWordFrequency = 0;
  const lines = createInterface({ input: createReadStream(fileName) });
  for await (const line of lines) {
    const [word, rawFrequency] = line.split('\t');
    const parsedFrequency = Number.parseInt(rawFrequency, 10);
    const frequency = Number.isNaN(parsedFrequency) ? 0 : parsedFrequency;

    maxWordFrequency = Math.max(maxWordFrequency, frequency);
    words.push({ word, frequency });
  }

  // Without a positive maximum the ratio would be Infinity and every
  // frequency would turn into NaN (0 * Infinity), so skip the scaling.
  if (maxWordFrequency === 0) {
    return words;
  }

  const normalizationRatio = maxAllowedFrequency / maxWordFrequency;
  for (const entry of words) {
    entry.frequency = Math.ceil(entry.frequency * normalizationRatio);
  }

  return words;
}
/** main: validate the arguments, normalize the word list, then print the result or the error. **/
normalize(validateInput())
  .then(printWordsWithFrequencies)
  .catch(printError);

View file

@ -1,7 +1,7 @@
const { basename } = require('path'); const { basename } = require('path');
const { createReadStream, existsSync } = require('fs'); const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline'); const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js'); const { print, printError, printWordsWithFrequencies } = require('./_printers.js');
function printHelp() { function printHelp() {
@ -35,13 +35,6 @@ function validateInput() {
} }
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(`${w.word}${w.frequency ? '\t' + w.frequency : ''}`));
}
}
async function readWords(fileName) { async function readWords(fileName) {
const words = []; const words = [];
@ -51,9 +44,9 @@ async function readWords(fileName) {
for await (const line of createInterface({ input: createReadStream(fileName) })) { for await (const line of createInterface({ input: createReadStream(fileName) })) {
const [word, frequency] = line.split("\t"); const [word, frequency] = line.split("\t");
words.push({ words.push({
word, word,
frequency: Number.isNaN(Number.parseInt(frequency)) ? 0 : Number.parseInt(frequency) frequency: Number.isNaN(Number.parseInt(frequency)) ? 0 : Number.parseInt(frequency)
}); });
} }
@ -122,5 +115,5 @@ async function work({ definitionFile, wordsFile, locale }) {
/** main **/ /** main **/
work(validateInput()) work(validateInput())
.then(words => printWords(words)) .then(words => printWordsWithFrequencies(words))
.catch(e => printError(e)); .catch(e => printError(e));