1
0
Fork 0

a script for removing compound words consisting of existing simple words

This commit is contained in:
sspanak 2024-02-22 17:16:22 +02:00 committed by Dimo Karaivanov
parent 3c7249fd3a
commit a25cdf8bf1

View file

@ -0,0 +1,112 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
/**
 * Prints the command-line usage of this script to stdout.
 *
 * The script name shown in the usage line is the base name of
 * process.argv[1], so it matches the file that was actually invoked.
 */
function printHelp() {
  console.log(`Usage ${basename(process.argv[1])} LOCALE word-list.txt`);
  console.log('Searches for compound words whose parts also exist as separate words and removes the compound variants.');
  console.log('For example, "fly-by" will be removed, if the word list contains both "fly" and "by".');
  console.log('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
/**
 * Validates the command-line arguments and builds the job description.
 *
 * Expected invocation: `node <script> LOCALE word-list.txt`, i.e. the locale
 * is process.argv[2] and the word list path is process.argv[3].
 *
 * Terminates the process with exit code 1 (after printing the help) when
 * fewer than four argv entries are present, and with exit code 2 when the
 * word list file does not exist.
 *
 * @returns {{fileName: string, locale: string, separator: string}} the word
 *   list path, the locale and the compound-word separator (always '-').
 */
function validateInput() {
  const [, , locale, fileName] = process.argv;

  if (process.argv.length < 4) {
    printHelp();
    process.exit(1);
  }

  if (!existsSync(fileName)) {
    // The period goes after the closing quote so the reported path is exact.
    console.error(`Failure! Could not find word list file "${fileName}".`);
    process.exit(2);
  }

  return {
    fileName,
    locale,
    separator: '-',
  };
}
/**
 * Writes every word of the given set to stdout, one word per line, in the
 * set's iteration order. Any argument that is not a Set is ignored.
 *
 * @param {Set<string>} wordList - the words to print
 */
function printWords(wordList) {
  if (!(wordList instanceof Set)) {
    return;
  }

  for (const word of wordList) {
    console.log(word);
  }
}
/**
 * Reads a word list file into a Set, one entry per line.
 *
 * Lines are stored exactly as read (no trimming or case changes); repeated
 * lines collapse into a single entry because the result is a Set.
 *
 * @param {string} fileName - path of the word list; a falsy value yields an
 *   empty Set without touching the file system
 * @returns {Promise<Set<string>>} the lines of the file, in file order
 */
async function readWords(fileName) {
  const words = new Set();
  if (!fileName) {
    return words;
  }

  const lines = createInterface({
    input: createReadStream(fileName),
    // Always treat "\r\n" as one line break, however the stream is chunked.
    crlfDelay: Infinity,
  });

  for await (const line of lines) {
    words.add(line);
  }

  return words;
}
/**
 * Drops compound words whose every part already exists as a separate word.
 *
 * A word is "compound" when it contains `separator`. It is split on the
 * separator, empty parts (from leading, trailing or doubled separators) are
 * ignored, and each remaining part is lower-cased with `locale` and looked up
 * in `lowerCaseWords`. The compound is dropped only when all parts are found.
 * Words without the separator are always kept.
 *
 * A word that consists of separators only (e.g. "-") has no parts to look up;
 * it is kept, because it is not a combination of existing words.
 *
 * @param {string} locale - locale passed to toLocaleLowerCase() for each part
 * @param {Set<string>} words - the word list to filter; any non-Set value
 *   yields an empty Set
 * @param {Set<string>} lowerCaseWords - lower-cased words used for the lookup
 * @param {string} separator - the string that joins the parts of a compound
 * @returns {Set<string>} the surviving words, in their original order
 */
function removeCompoundWords(locale, words, lowerCaseWords, separator) {
  const uniqueWords = new Set();
  if (!(words instanceof Set)) {
    return uniqueWords;
  }

  for (const word of words) {
    // simple words
    if (!word.includes(separator)) {
      uniqueWords.add(word);
      continue;
    }

    // compound words
    const parts = word.split(separator).filter((part) => part.length > 0);
    // Require at least one real part, otherwise every() is vacuously true and
    // a separators-only word would be discarded.
    const isRedundant = parts.length > 0
      && parts.every((part) => lowerCaseWords.has(part.toLocaleLowerCase(locale)));

    if (!isRedundant) {
      uniqueWords.add(word);
    }
  }

  return uniqueWords;
}
/**
 * Builds a lower-cased copy of a word set.
 *
 * @param {string} locale - locale passed to toLocaleLowerCase() for each word
 * @param {Set<string>} words - the words to convert; any non-Set value yields
 *   an empty Set
 * @returns {Set<string>} a new Set holding the lower-cased words
 */
function wordsToLowerCase(locale, words) {
  if (!(words instanceof Set)) {
    return new Set();
  }

  return new Set(Array.from(words, (word) => word.toLocaleLowerCase(locale)));
}
/**
 * Runs the whole job: loads the word list, builds its lower-cased lookup set
 * and filters out the redundant compound words.
 *
 * @param {{fileName: string, locale: string, separator: string}} options -
 *   the word list path, the locale used for lower-casing and the
 *   compound-word separator
 * @returns {Promise<Set<string>>} the result of removeCompoundWords()
 */
async function work({ fileName, locale, separator }) {
  const originalWords = await readWords(fileName);
  const lookup = wordsToLowerCase(locale, originalWords);

  return removeCompoundWords(locale, originalWords, lookup, separator);
}
/** main **/
// Validate the arguments, filter the word list and print what is left.
work(validateInput())
  .then((words) => printWords(words))
  .catch((e) => {
    console.error(e);
    // Report the failure to the caller; logging alone would still exit with 0.
    process.exitCode = 1;
  });