From ae85de128efccb3a2650f14e76fe4998d65f15ad Mon Sep 17 00:00:00 2001 From: sspanak Date: Fri, 23 Feb 2024 14:30:23 +0200 Subject: [PATCH] small improvements in the word processing scripts --- scripts/add-new-words.sh | 2 +- scripts/clean-n-sort.sh | 5 +-- scripts/inject-dictionary-frequencies.js | 45 ++++++++++-------------- 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/scripts/add-new-words.sh b/scripts/add-new-words.sh index fa8482f4..1298f87d 100755 --- a/scripts/add-new-words.sh +++ b/scripts/add-new-words.sh @@ -38,7 +38,7 @@ sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/_TT9_base.txt \ && echo >> $WORK_DIR/_TT9_combined.txt \ && cat $WORK_DIR/_TT9_2.txt >> $WORK_DIR/_TT9_combined.txt \ && node scripts/remove-dictionary-repeating-words.js $LOCALE $WORK_DIR/_TT9_combined.txt > $WORK_DIR/_TT9_clean.txt \ - && node scripts/inject-dictionary-frequencies.js $WORK_DIR/_TT9_clean.txt $FREQUENCY_FILE $LOCALE > $WORK_DIR/_TT9_output.txt \ + && node scripts/inject-dictionary-frequencies.js $LOCALE $WORK_DIR/_TT9_clean.txt $FREQUENCY_FILE > $WORK_DIR/_TT9_output.txt \ && cat $WORK_DIR/_TT9_output.txt rm -rf $WORK_DIR diff --git a/scripts/clean-n-sort.sh b/scripts/clean-n-sort.sh index 8ef71a01..78d143c9 100755 --- a/scripts/clean-n-sort.sh +++ b/scripts/clean-n-sort.sh @@ -29,8 +29,9 @@ FREQUENCY_FILE=$4 WORK_DIR="/tmp/TT9_$(uuidgen)" mkdir -p $WORK_DIR \ - && node scripts/remove-dictionary-repeating-words.js $LOCALE $DICTIONARY_FILE > $WORK_DIR/clean.txt \ - && node scripts/inject-dictionary-frequencies.js $WORK_DIR/clean.txt $FREQUENCY_FILE $LOCALE > $WORK_DIR/freqz.txt \ + && sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/nofreq.txt \ + && node scripts/remove-dictionary-repeating-words.js $LOCALE $WORK_DIR/nofreq.txt > $WORK_DIR/clean.txt \ + && node scripts/inject-dictionary-frequencies.js $LOCALE $WORK_DIR/clean.txt $FREQUENCY_FILE > $WORK_DIR/freqz.txt \ && node scripts/sort-dictionary.js $LOCALE $WORK_DIR/freqz.txt $DEFINITION_FILE rm -rf $WORK_DIR diff --git a/scripts/inject-dictionary-frequencies.js b/scripts/inject-dictionary-frequencies.js index 5d5ff18e..4f9d11ea 100644 --- a/scripts/inject-dictionary-frequencies.js +++ b/scripts/inject-dictionary-frequencies.js @@ -6,7 +6,7 @@ const DELIMITER = ' '; function printHelp() { - console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt LOCALE`); + console.log(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`); console.log('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.'); console.log('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...'); } @@ -19,18 +19,22 @@ function validateInput() { } + if (!existsSync(process.argv[4])) { + console.error(`Failure! Could not find the WORDS-WITH-FREQUENCIES file "${process.argv[4]}."`); + process.exit(2); + } + + if (!existsSync(process.argv[3])) { - console.error(`Failure! Could not find the WORDS-WITH-FREQUENCIES file "${process.argv[3]}."`); + console.error(`Failure! Could not find dictionary file "${process.argv[3]}."`); process.exit(2); } - - if (!existsSync(process.argv[2])) { - console.error(`Failure! Could not find dictionary file "${process.argv[2]}."`); - process.exit(2); - } - - return { wordsWithFrequenciesFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[4] }; + return { + locale: process.argv[2], + dictionaryFileName: process.argv[3], + wordsWithFrequenciesFileName: process.argv[4] + }; } @@ -41,7 +45,7 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }); - const frequencies = {}; + const frequencies = new Map(); for await (const line of lineReader) { if (!line.includes(DELIMITER)) { continue; @@ -54,7 +58,7 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale frequency = 0; } - frequencies[word] = frequency; + frequencies.set(word, frequency) } // read the dictionary words @@ -66,11 +70,7 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale const outputWords = []; for await (const word of lineReader) { const lowercaseWord = word.toLocaleLowerCase(locale); - - outputWords.push({ - w: `${word}`, - f: frequencies[lowercaseWord] || 0 - }); + outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`); } return outputWords; @@ -78,18 +78,9 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale function printWords(wordList) { - if (!Array.isArray(wordList)) { - return; + if (Array.isArray(wordList)) { + wordList.forEach(w => console.log(w)); } - - wordList.forEach(w => { - let out = w.w; - if (w.f) { - out += `${DELIMITER}${w.f}`; - } - - console.log(out); - }); }