1
0
Fork 0
tt9/scripts/sort-dictionary.js
sspanak 8a971ea15e JavaScript code cleanup and optimizations
* removed all console statements

  * sorting script: no longer creating functions within the loop

  * aosp2tt9: fixed incorrect usage of a global variable

  * injest-words: fixed a non-strict comparison

  * remove-repeating-words: commented out the case-sensitive search function, as it may be going out of use
2024-03-28 09:59:27 +02:00

126 lines
No EOL
2.8 KiB
JavaScript

const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');
/**
 * Prints the command-line usage of this script followed by a one-line
 * description of its purpose.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);
  const usage = `Usage ${scriptName} LOCALE WORD-LIST.txt LANGUAGE-DEFINITION.yml`;
  const description = 'Sorts a dictionary for optimum search speed.';

  print(usage);
  print(description);
}
/**
 * Validates the command-line arguments and returns them as named options.
 *
 * Expected arguments: LOCALE WORD-LIST.txt LANGUAGE-DEFINITION.yml
 *
 * Terminates the process with exit code 1 (after printing the usage) when an
 * argument is missing, or with exit code 2 when either file does not exist.
 *
 * @returns {{definitionFile: string, wordsFile: string, locale: string}}
 *   The language definition path, the word list path and the locale.
 */
function validateInput() {
  // argv[0] is the node binary and argv[1] is this script, so the three
  // mandatory arguments are all present only when argv has at least 5 entries.
  if (process.argv.length < 5) {
    printHelp();
    process.exit(1);
  }

  const [, , locale, wordsFile, definitionFile] = process.argv;

  if (!existsSync(wordsFile)) {
    printError(`Failure! Could not find word list file "${wordsFile}".`);
    process.exit(2);
  }

  if (!existsSync(definitionFile)) {
    printError(`Failure! Could not find language definition file "${definitionFile}".`);
    process.exit(2);
  }

  return {
    definitionFile,
    wordsFile,
    locale
  };
}
/**
 * Prints every entry of the word list on its own line. The frequency is
 * appended after a tab, unless it is falsy (e.g. 0), in which case only the
 * word is printed. Does nothing when the argument is not an array.
 *
 * @param {Array<{word: string, frequency: number}>} wordList
 */
function printWords(wordList) {
  if (!Array.isArray(wordList)) {
    return;
  }

  wordList.forEach(({ word, frequency }) => {
    const frequencySuffix = frequency ? `\t${frequency}` : '';
    print(word + frequencySuffix);
  });
}
/**
 * Reads a word list file, one entry per line, in the format
 * `WORD` or `WORD<TAB>FREQUENCY`.
 *
 * @param {string} fileName Path to the word list. A falsy value yields an empty list.
 * @returns {Promise<Array<{word: string, frequency: number}>>}
 *   The entries in file order. The frequency is 0 when it is missing or is
 *   not a decimal integer.
 */
async function readWords(fileName) {
  const words = [];
  if (!fileName) {
    return words;
  }

  const lines = createInterface({ input: createReadStream(fileName) });
  for await (const line of lines) {
    const [word, rawFrequency] = line.split('\t');
    // Always parse as base 10, so that values like "0x1F" are not silently
    // interpreted as hexadecimal numbers.
    const frequency = Number.parseInt(rawFrequency, 10);

    words.push({
      word,
      frequency: Number.isNaN(frequency) ? 0 : frequency
    });
  }

  return words;
}
/**
 * Builds the character-to-weight table from a language definition file.
 *
 * The table starts with the apostrophe, dash, double quote and dot, all with
 * weight 1. Every line that looks like an indented `- [x, y, z]` list then
 * assigns the next weight (starting from 2) to each of its items. Lines
 * mentioning SPECIAL or PUNCTUATION are skipped and do not consume a weight.
 *
 * @param {string} fileName Path to the definition file. A falsy value yields an empty Map.
 * @returns {Promise<Map<string, number>>} The weight of each character.
 */
async function readDefinition(fileName) {
  if (!fileName) {
    return new Map();
  }

  const weights = new Map([["'", 1], ['-', 1], ['"', 1], ['.', 1]]);
  const bracketListRegex = /^\s+-\s*\[([^\]]+)/;
  let nextWeight = 2;

  const lines = createInterface({ input: createReadStream(fileName) });
  for await (const line of lines) {
    const isNonLetterEntry = line.includes('SPECIAL') || line.includes('PUNCTUATION');
    const found = isNonLetterEntry ? null : bracketListRegex.exec(line);
    if (found === null || !found[1]) {
      continue;
    }

    const characters = found[1].replace(/\s/g, '').split(',');
    for (const character of characters) {
      weights.set(character, nextWeight);
    }
    nextWeight += 1;
  }

  return weights;
}
/**
 * Comparator for dictionary entries: shorter words come first; words of the
 * same length are ordered by the weights of their lowercased characters,
 * compared position by position.
 *
 * Characters that are missing from `letterWeights` are given weight 0, so
 * the comparator always returns a real number (never NaN) and defines a
 * consistent ordering.
 *
 * @param {{word: string}} a
 * @param {{word: string}} b
 * @param {Map<string, number>} letterWeights Weight of each lowercase character.
 * @param {string} locale Locale used for lowercasing the words.
 * @returns {number} Negative when `a` goes first, positive when `b` goes first, 0 when equivalent.
 */
function dictionarySort(a, b, letterWeights, locale) {
  if (a.word.length !== b.word.length) {
    return a.word.length - b.word.length;
  }

  // Lowercase each word once, instead of on every loop iteration.
  const lowerA = a.word.toLocaleLowerCase(locale);
  const lowerB = b.word.toLocaleLowerCase(locale);
  const unknownWeight = 0;

  for (let i = 0, end = a.word.length; i < end; i++) {
    const charA = lowerA.charAt(i);
    const charB = lowerB.charAt(i);
    const weightA = letterWeights.has(charA) ? letterWeights.get(charA) : unknownWeight;
    const weightB = letterWeights.has(charB) ? letterWeights.get(charB) : unknownWeight;

    if (weightA !== weightB) {
      return weightA - weightB;
    }
  }

  return 0;
}
/**
 * Loads the word list and the language definition in parallel, then sorts
 * the words (in place) using the weights from the definition.
 *
 * @param {{definitionFile: string, wordsFile: string, locale: string}} options
 * @returns {Promise<Array<{word: string, frequency: number}>>} The sorted word list.
 */
async function work({ definitionFile, wordsFile, locale }) {
  const pendingWords = readWords(wordsFile);
  const pendingWeights = readDefinition(definitionFile);
  const [words, letterWeights] = await Promise.all([pendingWords, pendingWeights]);

  const compareEntries = (a, b) => dictionarySort(a, b, letterWeights, locale);
  return words.sort(compareEntries);
}
/** main **/
// Validate the arguments, sort the dictionary and print the result.
// When anything fails, report the error and make the process finish with a
// non-zero status, so that calling scripts can detect the failure.
work(validateInput())
  .then(words => printWords(words))
  .catch(e => {
    printError(e);
    process.exitCode = 1;
  });