From ef2f92c0ac26b0baeaadb2e7dd79b4ffae029978 Mon Sep 17 00:00:00 2001 From: sspanak Date: Mon, 5 Feb 2024 21:33:39 +0200 Subject: [PATCH] language processing scripts improvements * remove-foreign-words script now supports blacklist and whitelist mode * fixed .domain sorting * clean'n'sort script --- scripts/add-new-words.sh | 2 +- scripts/clean-n-sort.sh | 36 +++++++++++++++ scripts/remove-foreign-words.js | 77 ++++++++++++++------------------- scripts/sort-dictionary.js | 2 +- 4 files changed, 71 insertions(+), 46 deletions(-) create mode 100755 scripts/clean-n-sort.sh diff --git a/scripts/add-new-words.sh b/scripts/add-new-words.sh index 171217a2..fa8482f4 100755 --- a/scripts/add-new-words.sh +++ b/scripts/add-new-words.sh @@ -33,7 +33,7 @@ WORK_DIR="/tmp/TT9_$(uuidgen)" mkdir -p $WORK_DIR && \ sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/_TT9_base.txt \ && node scripts/injest-words.js $NEW_WORDS_FILE $IGNORE_SPLIT_LIST_FILE > $WORK_DIR/_TT9_1.txt \ - && node scripts/remove-foreign-words.js $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \ + && node scripts/remove-foreign-words.js --blacklist $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \ && cp $WORK_DIR/_TT9_base.txt $WORK_DIR/_TT9_combined.txt \ && echo >> $WORK_DIR/_TT9_combined.txt \ && cat $WORK_DIR/_TT9_2.txt >> $WORK_DIR/_TT9_combined.txt \ diff --git a/scripts/clean-n-sort.sh b/scripts/clean-n-sort.sh new file mode 100755 index 00000000..8ef71a01 --- /dev/null +++ b/scripts/clean-n-sort.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# -lt 4 ]; then + echo "Usage: $0 LOCALE base-dictionary-file.csv definition-file.txt frequency-file.csv" + echo 'Removes the repeating words injects the frequencies and sorts a dictionary file. Useful, when adding new words directly to the dictionary .csv.' + echo 'LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...' + exit 1 +fi + +if ! [[ -f $2 ]]; then + echo "base-dictionary-file: '$2' does not exist" + exit 2 +fi + +if ! [[ -f $3 ]]; then + echo "definition-file: '$3' does not exist" + exit 2 +fi + +if ! [[ -f $4 ]]; then + echo "frequency-file: '$4' does not exist" + exit 2 +fi + +LOCALE=$1 +DICTIONARY_FILE=$2 +DEFINITION_FILE=$3 +FREQUENCY_FILE=$4 +WORK_DIR="/tmp/TT9_$(uuidgen)" + +mkdir -p $WORK_DIR \ + && node scripts/remove-dictionary-repeating-words.js $LOCALE $DICTIONARY_FILE > $WORK_DIR/clean.txt \ + && node scripts/inject-dictionary-frequencies.js $WORK_DIR/clean.txt $FREQUENCY_FILE $LOCALE > $WORK_DIR/freqz.txt \ + && node scripts/sort-dictionary.js $LOCALE $WORK_DIR/freqz.txt $DEFINITION_FILE + +rm -rf $WORK_DIR diff --git a/scripts/remove-foreign-words.js b/scripts/remove-foreign-words.js index 2bf48c11..3e8e3823 100644 --- a/scripts/remove-foreign-words.js +++ b/scripts/remove-foreign-words.js @@ -4,95 +4,84 @@ const { createInterface } = require('readline'); function printHelp() { - console.log(`Usage ${basename(process.argv[1])} DICTIONARY_LOCALE DICTIONARY.TXT FOREIGN_WORDS_LOCALE FOREIGN-WORD-DICTIONARY.txt`); - console.log('Removes foreign words from a dictionary'); + console.log(`Usage ${basename(process.argv[1])} --blacklist|whitelist DICTIONARY_LOCALE DICTIONARY.TXT FOREIGN_WORDS_LOCALE FOREIGN-WORD-DICTIONARY.txt`); + console.log('Removes foreign words from a dictionary. "blacklist" and "whitelist" determine how the FOREIGN-WORD-DICTIONARY.txt is used.'); } function validateInput() { - if (process.argv.length < 6) { + if (process.argv.length < 7) { printHelp(); process.exit(1); } - if (!existsSync(process.argv[3])) { - console.error(`Failure! Could not find words file "${process.argv[3]}."`); + if (process.argv[2] !== '--blacklist' && process.argv[2] !== '--whitelist') { + console.error(`Failure! You must specify whether to use the foreign words file as a blacklist or a whitelist."`); + process.exit(3); + } + + if (!existsSync(process.argv[4])) { + console.error(`Failure! Could not find words file "${process.argv[4]}."`); process.exit(2); } - if (!existsSync(process.argv[5])) { - console.error(`Failure! Could not find foreign words file "${process.argv[4]}."`); + if (!existsSync(process.argv[6])) { + console.error(`Failure! Could not find foreign words file "${process.argv[6]}."`); process.exit(2); } return { - locale: process.argv[2], - fileName: process.argv[3], - foreignWordsLocale: process.argv[4], - foreignWordsFileName: process.argv[5] + isBlacklist: process.argv[2] === '--blacklist', + locale: process.argv[3], + fileName: process.argv[4], + foreignWordsLocale: process.argv[5], + foreignWordsFileName: process.argv[6] }; } - -function getLowercaseWordKey(locale, word) { - return getWordkey(word).toLocaleLowerCase(locale); -} - - - -function getWordkey(word) { - if (typeof word !== 'string' || word.length === 0) { - return ''; - } - - return word; -} - - - -async function removeForeignWords({ locale, foreignWordsLocale, fileName, foreignWordsFileName }) { - const foreignWords = {}; +async function work({ isBlacklist, locale, fileName, foreignWordsLocale, foreignWordsFileName }) { + const foreignWords = new Set(); let lineReader = createInterface({ input: createReadStream(foreignWordsFileName) }); for await (const line of lineReader) { - foreignWords[getLowercaseWordKey(foreignWordsLocale, line)] = true; + foreignWords.add(line.toLocaleLowerCase(foreignWordsLocale)); } - const wordMap = {}; + const goodWords = new Set(); lineReader = createInterface({ input: createReadStream(fileName) }); for await (const line of lineReader) { - const word = getWordkey(line); - const lowercaseWord = getLowercaseWordKey(locale, line); - - if (word === '') { + if (typeof line !== 'string' || line.length === 0) { continue; } + const wordKey = line.toLocaleLowerCase(locale); - if (!foreignWords[lowercaseWord]) { - wordMap[word] = true; + if ( + (!isBlacklist && foreignWords.has(wordKey)) + || (isBlacklist && !foreignWords.has(wordKey)) + ) { + goodWords.add(line); } + } - return Object.keys(wordMap); + return goodWords; } function printWords(wordList) { - if (!Array.isArray(wordList)) { - return; + if (wordList instanceof Set) { + wordList.forEach(w => console.log(w)); } - - wordList.forEach(w => console.log(w)); } /** main **/ -removeForeignWords(validateInput()) +work(validateInput()) .then(words => printWords(words)) .catch(e => console.error(e)); diff --git a/scripts/sort-dictionary.js b/scripts/sort-dictionary.js index 297b17c3..94299bd7 100644 --- a/scripts/sort-dictionary.js +++ b/scripts/sort-dictionary.js @@ -66,7 +66,7 @@ async function readDefinition(fileName) { } let lettersPattern = /^\s+-\s*\[([^\]]+)/; - let letterWeights = new Map([["'", 1], ['-', 1], ['"', 1]]); + let letterWeights = new Map([["'", 1], ['-', 1], ['"', 1], ['.', 1]]); let key = 2; for await (const line of createInterface({ input: createReadStream(fileName) })) {