From b6b8d5bed06d44b0a085a1273d04ccdf73bbdd47 Mon Sep 17 00:00:00 2001 From: sspanak Date: Sat, 6 Jan 2024 13:04:01 +0200 Subject: [PATCH] optimized the word injestion scripts for speed and allowed running multiple instances --- scripts/add-new-words.sh | 21 ++++++++++-------- scripts/injest-words.js | 47 ++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/scripts/add-new-words.sh b/scripts/add-new-words.sh index d86d5a4e..31a3571b 100755 --- a/scripts/add-new-words.sh +++ b/scripts/add-new-words.sh @@ -26,14 +26,17 @@ LOCALE=$1 DICTIONARY_FILE=$2 NEW_WORDS_FILE=$3 FREQUENCY_FILE=$4 +WORK_DIR="/tmp/TT9_$(uuidgen)" -sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > /tmp/_TT9_base.txt \ - && node scripts/injest-words.js $NEW_WORDS_FILE > /tmp/_TT9_1.txt \ - && node scripts/remove-foreign-words.js $LOCALE /tmp/_TT9_1.txt $LOCALE /tmp/_TT9_base.txt > /tmp/_TT9_2.txt \ - && cp /tmp/_TT9_base.txt /tmp/_TT9_combined.txt \ - && cat /tmp/_TT9_2.txt >> /tmp/_TT9_combined.txt \ - && node scripts/remove-dictionary-repeating-words.js $LOCALE /tmp/_TT9_combined.txt > /tmp/_TT9_clean.txt \ - && node scripts/inject-dictionary-frequencies.js /tmp/_TT9_clean.txt $FREQUENCY_FILE $LOCALE > /tmp/_TT9_output.txt \ - && cat /tmp/_TT9_output.txt +mkdir -p $WORK_DIR && \ +sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/_TT9_base.txt \ + && node scripts/injest-words.js $NEW_WORDS_FILE > $WORK_DIR/_TT9_1.txt \ + && node scripts/remove-foreign-words.js $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \ + && cp $WORK_DIR/_TT9_base.txt $WORK_DIR/_TT9_combined.txt \ + && echo >> $WORK_DIR/_TT9_combined.txt \ + && cat $WORK_DIR/_TT9_2.txt >> $WORK_DIR/_TT9_combined.txt \ + && node scripts/remove-dictionary-repeating-words.js $LOCALE $WORK_DIR/_TT9_combined.txt > $WORK_DIR/_TT9_clean.txt \ + && node scripts/inject-dictionary-frequencies.js $WORK_DIR/_TT9_clean.txt $FREQUENCY_FILE $LOCALE > $WORK_DIR/_TT9_output.txt \ + && cat $WORK_DIR/_TT9_output.txt -rm -f /tmp/_TT9* +rm -rf $WORK_DIR diff --git a/scripts/injest-words.js b/scripts/injest-words.js index 01c63e95..7b5ea1e9 100644 --- a/scripts/injest-words.js +++ b/scripts/injest-words.js @@ -43,10 +43,7 @@ function cleanSpecialChars(line) { } return line - .replaceAll(/[\x01-\x20]+/g, ' ') - .replaceAll(/[&\s,":;\*/]+/g, ' ') - .replaceAll(/\s+/g, ' ') - .replaceAll(/[\[\]\.\?\(\)]/g, '') + .replace(/[\x01-\x20&",:;*\/\[\].?()]+/g, ' ') .split(' ') .filter(w => w.length > 1 && !digits.test(w)); } @@ -57,50 +54,52 @@ function splitDashedWords(inputWords) { return []; } - const roots = {}; - const words = {}; + const wordsSet = new Set(); for (const word of inputWords) { if (!word.includes('-')) { - words[word] = true; + wordsSet.add(word); continue; } const parts = word.split('-'); + let root = ''; + for (let i = 0; i < parts.length - 1; i++) { - const key = `${parts[i]}-`; - if (key in roots) { - words[key] = true; - } else { - roots[key] = true; - words[parts[i]] = true; - } + root += `${parts[i]}-`; + wordsSet.add(root); } - words[parts[parts.length - 1]] = true; + wordsSet.add(parts[parts.length - 1]); } - return Object.keys(words); + return Array.from(wordsSet); } + async function work({ fileName }) { - words = []; + const wordsSet = new Set(); + + const lineReader = createInterface({ input: createReadStream(fileName) }); - let lineReader = createInterface({ input: createReadStream(fileName) }); for await (const line of lineReader) { - newWords = cleanSpecialChars(line); + const newWords = cleanSpecialChars(line); - words = [ - ...words, - ...newWords - ]; + for (let i = 0; i < newWords.length; i++) { + wordsSet.add(newWords[i]); + } } - return splitDashedWords(words).filter(w => w.length > 1).sort(); + const wordsArray = Array.from(wordsSet); + const splitWords = splitDashedWords(wordsArray); + const filteredAndSortedWords = splitWords.filter(word => word.length > 1).sort(); + + return filteredAndSortedWords; } + /** main **/ work(validateInput()) .then(words => printWords(words))