diff --git a/scripts/remove-extra-dashes.js b/scripts/remove-extra-dashes.js new file mode 100644 index 00000000..952d458f --- /dev/null +++ b/scripts/remove-extra-dashes.js @@ -0,0 +1,112 @@ +const { basename } = require('path'); +const { createReadStream, existsSync } = require('fs'); +const { createInterface } = require('readline'); + + +function printHelp() { + console.log(`Usage ${basename(process.argv[1])} LOCALE word-list.txt`); + console.log('Searches for compound words with that also exsit as separate words and removes the compound variants.'); + console.log('For example, "fly-by" will be removed, if the word list contains both "fly" and "by".') + console.log('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...') +} + + +function validateInput() { + if (process.argv.length < 4) { + printHelp(); + process.exit(1); + } + + if (!existsSync(process.argv[3])) { + console.error(`Failure! Could not find word list file "${process.argv[3]}."`); + process.exit(2); + } + + return { + fileName: process.argv[3], + locale: process.argv[2], + separator: '-' + }; +} + + +function printWords(wordList) { + if (wordList instanceof Set) { + wordList.forEach(w => console.log(w)); + } +} + + +async function readWords(fileName) { + const words = new Set(); + + if (!fileName) { + return words; + } + + for await (const line of createInterface({ input: createReadStream(fileName) })) { + words.add(line); + } + + return words; +} + + +function removeCompoundWords(locale, words, lowerCaseWords, separator) { + if (!(words instanceof Set)) { + return new Set(); + } + + const uniqueWords = new Set(); + words.forEach(w => { + // simple words + if (!w.includes(separator)) { + uniqueWords.add(w); + return; + } + + // compound words + let partMissing = false; + const parts = w.split(separator); + if (parts.length > 1) { + for (const splw of parts) { + if (splw.length === 0) { + continue; + } + + if (!lowerCaseWords.has(splw.toLocaleLowerCase(locale))) { + partMissing = true; + break; + } + } + } + + if (partMissing) { + uniqueWords.add(w); + } + }); + + return uniqueWords; +} + + +function wordsToLowerCase(locale, words) { + const lowerWords = new Set(); + if (words instanceof Set) { + words.forEach(w => lowerWords.add(w.toLocaleLowerCase(locale))) + } + return lowerWords; +} + + +async function work({ fileName, locale, separator }) { + const words = await readWords(fileName); + return removeCompoundWords(locale, words, wordsToLowerCase(locale, words), separator); +} + + + +/** main **/ +work(validateInput()) + .then(words => printWords(words)) + .catch(e => console.error(e)); \ No newline at end of file