From 18aef27996dfcc6c1c3d5b513e48a4540fddcea6 Mon Sep 17 00:00:00 2001 From: sspanak Date: Tue, 12 Mar 2024 16:28:37 +0200 Subject: [PATCH] improved dictionary processing scripts * remove-repeating-words script now supports lowercase/uppercase option * remove-foreign-words script now supports foreign words file of more than 2^24 words --- scripts/remove-dictionary-repeating-words.js | 24 ++++++++++---- scripts/remove-foreign-words.js | 35 +++++++++++--------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/scripts/remove-dictionary-repeating-words.js b/scripts/remove-dictionary-repeating-words.js index 50e3d8f7..643ac745 100644 --- a/scripts/remove-dictionary-repeating-words.js +++ b/scripts/remove-dictionary-repeating-words.js @@ -5,10 +5,16 @@ const { createInterface } = require('readline'); const GEO_NAME = /[A-Z]\w+\-[^\n]+/; +function print(str) { + process.stdout.write(`${str}\n`); +} + + function printHelp() { - console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `); - console.log('Removes repeating words from a word list'); - console.log('\nLocale could be any valid JS locale, for exmaple: en, en-US, etc...'); + print(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt [--prefer-lowercase]`); + print('Removes repeating words from a word list'); + print("If --prefer-lowercase is set, the lowercase variants will be preserved, otherwise capitalized or uppercase variants will remain.") + print('\nLocale could be any valid JS locale, for example: en, en-US, etc...'); } @@ -25,7 +31,11 @@ function validateInput() { process.exit(2); } - return { fileName: process.argv[3], locale: process.argv[2] }; + return { + fileName: process.argv[3], + locale: process.argv[2], + preferLowercase: !!process.argv[4] + }; } @@ -50,7 +60,7 @@ function getLowercaseWordKey(locale, word) { -async function removeRepeatingWords({ fileName, locale }) { +async function removeRepeatingWords({ fileName, locale, preferLowercase }) { const wordMap = new Map(); let lineReader = createInterface({ input: createReadStream(fileName) }); @@ -64,7 +74,7 @@ async function removeRepeatingWords({ fileName, locale }) { wordMap.set(lowercaseKey, line); } - if (wordMap.has(lowercaseKey) && !wordMap.has(line)) { + if (!preferLowercase && wordMap.has(lowercaseKey) && !wordMap.has(line)) { wordMap.set(lowercaseKey, line); } } @@ -79,7 +89,7 @@ function printWords(wordList) { return; } - wordList.forEach(w => console.log(w)); + wordList.forEach(w => print(w)); } diff --git a/scripts/remove-foreign-words.js b/scripts/remove-foreign-words.js index 3e8e3823..b6dbfb8a 100644 --- a/scripts/remove-foreign-words.js +++ b/scripts/remove-foreign-words.js @@ -3,9 +3,14 @@ const { createReadStream, existsSync } = require('fs'); const { createInterface } = require('readline'); +function print(str) { + process.stdout.write(`${str}\n`); +} + + function printHelp() { - console.log(`Usage ${basename(process.argv[1])} --blacklist|whitelist DICTIONARY_LOCALE DICTIONARY.TXT FOREIGN_WORDS_LOCALE FOREIGN-WORD-DICTIONARY.txt`); - console.log('Removes foreign words from a dictionary. "blacklist" and "whitelist" determine how the FOREIGN-WORD-DICTIONARY.txt is used.'); + print(`Usage ${basename(process.argv[1])} --blacklist|whitelist DICTIONARY_LOCALE DICTIONARY.TXT FOREIGN_WORDS_LOCALE FOREIGN-WORD-DICTIONARY.txt`); + print('Removes foreign words from a dictionary. "blacklist" and "whitelist" determine how the FOREIGN-WORD-DICTIONARY.txt is used.'); } @@ -42,16 +47,16 @@ function validateInput() { async function work({ isBlacklist, locale, fileName, foreignWordsLocale, foreignWordsFileName }) { - const foreignWords = new Set(); + const originalWords = new Map(); - let lineReader = createInterface({ input: createReadStream(foreignWordsFileName) }); + let lineReader = createInterface({ input: createReadStream(fileName) }); for await (const line of lineReader) { - foreignWords.add(line.toLocaleLowerCase(foreignWordsLocale)); + originalWords.set(line.toLocaleLowerCase(foreignWordsLocale), line); } - const goodWords = new Set(); - lineReader = createInterface({ input: createReadStream(fileName) }); + + lineReader = createInterface({ input: createReadStream(foreignWordsFileName) }); for await (const line of lineReader) { if (typeof line !== 'string' || line.length === 0) { continue; @@ -59,23 +64,23 @@ async function work({ isBlacklist, locale, fileName, foreignWordsLocale, foreign const wordKey = line.toLocaleLowerCase(locale); - if ( - (!isBlacklist && foreignWords.has(wordKey)) - || (isBlacklist && !foreignWords.has(wordKey)) - ) { - goodWords.add(line); + if (isBlacklist && originalWords.has(wordKey)) { + originalWords.delete(wordKey); } + if (!isBlacklist && originalWords.has(wordKey)) { + goodWords.add(line); + } } - return goodWords; + return Array.from(isBlacklist ? originalWords.values() : goodWords); } function printWords(wordList) { - if (wordList instanceof Set) { - wordList.forEach(w => console.log(w)); + if (Array.isArray(wordList)) { + wordList.forEach(w => print(w)); } }