1
0
Fork 0

optimized the word ingestion scripts for speed and allowed running multiple instances

This commit is contained in:
sspanak 2024-01-06 13:04:01 +02:00 committed by Dimo Karaivanov
parent 93e3e0aec4
commit b6b8d5bed0
2 changed files with 35 additions and 33 deletions

View file

@ -26,14 +26,17 @@ LOCALE=$1
DICTIONARY_FILE=$2   # existing dictionary; read by sed in the pipeline below
NEW_WORDS_FILE=$3    # passed to scripts/injest-words.js
FREQUENCY_FILE=$4    # passed to scripts/inject-dictionary-frequencies.js

# Private scratch directory for this run. mktemp -d creates the directory
# atomically under a unique name, so concurrent runs never share intermediate
# files, and no external uuidgen binary is required (without uuidgen the old
# name collapsed to the shared, predictable "/tmp/TT9_").
WORK_DIR=$(mktemp -d /tmp/TT9_XXXXXX) || {
  echo "Failed to create a temporary work directory" >&2
  exit 1
}
# Dictionary build pipeline that keeps its intermediates at fixed paths
# directly under /tmp (/tmp/_TT9_*.txt). Every step runs only if the previous
# one succeeded; the final result is printed to stdout.
# NOTE(review): because the file names are fixed, two simultaneous runs would
# overwrite each other's intermediates. This looks like the pre-change variant
# of the $WORK_DIR pipeline that follows -- confirm whether it should remain.
# All expansions are quoted so that paths containing spaces or glob characters
# are passed through as single arguments.
sed -E 's/[\t0-9]+//g' "$DICTIONARY_FILE" > /tmp/_TT9_base.txt \
  && node scripts/injest-words.js "$NEW_WORDS_FILE" > /tmp/_TT9_1.txt \
  && node scripts/remove-foreign-words.js "$LOCALE" /tmp/_TT9_1.txt "$LOCALE" /tmp/_TT9_base.txt > /tmp/_TT9_2.txt \
  && cp /tmp/_TT9_base.txt /tmp/_TT9_combined.txt \
  && cat /tmp/_TT9_2.txt >> /tmp/_TT9_combined.txt \
  && node scripts/remove-dictionary-repeating-words.js "$LOCALE" /tmp/_TT9_combined.txt > /tmp/_TT9_clean.txt \
  && node scripts/inject-dictionary-frequencies.js /tmp/_TT9_clean.txt "$FREQUENCY_FILE" "$LOCALE" > /tmp/_TT9_output.txt \
  && cat /tmp/_TT9_output.txt
# Dictionary build pipeline that keeps all intermediates inside $WORK_DIR.
# Every step runs only if the previous one succeeded:
#   1. sed removes the characters matched by [\t0-9]+ from the dictionary
#      -> _TT9_base.txt
#   2. injest-words.js processes the new words file -> _TT9_1.txt
#   3. remove-foreign-words.js is given the locale, the new words and the
#      base list -> _TT9_2.txt
#   4. the base list, one newline, and the filtered words are concatenated
#      -> _TT9_combined.txt (the newline presumably keeps the first appended
#      word off the last base line when that line has no trailing newline)
#   5. remove-dictionary-repeating-words.js -> _TT9_clean.txt
#   6. inject-dictionary-frequencies.js -> _TT9_output.txt, printed to stdout
# All expansions are quoted: an unquoted redirect target containing spaces is
# an "ambiguous redirect" error in bash, and unquoted arguments would be
# word-split and glob-expanded.
mkdir -p "$WORK_DIR" \
  && sed -E 's/[\t0-9]+//g' "$DICTIONARY_FILE" > "$WORK_DIR/_TT9_base.txt" \
  && node scripts/injest-words.js "$NEW_WORDS_FILE" > "$WORK_DIR/_TT9_1.txt" \
  && node scripts/remove-foreign-words.js "$LOCALE" "$WORK_DIR/_TT9_1.txt" "$LOCALE" "$WORK_DIR/_TT9_base.txt" > "$WORK_DIR/_TT9_2.txt" \
  && cp "$WORK_DIR/_TT9_base.txt" "$WORK_DIR/_TT9_combined.txt" \
  && echo >> "$WORK_DIR/_TT9_combined.txt" \
  && cat "$WORK_DIR/_TT9_2.txt" >> "$WORK_DIR/_TT9_combined.txt" \
  && node scripts/remove-dictionary-repeating-words.js "$LOCALE" "$WORK_DIR/_TT9_combined.txt" > "$WORK_DIR/_TT9_clean.txt" \
  && node scripts/inject-dictionary-frequencies.js "$WORK_DIR/_TT9_clean.txt" "$FREQUENCY_FILE" "$LOCALE" > "$WORK_DIR/_TT9_output.txt" \
  && cat "$WORK_DIR/_TT9_output.txt"
# Cleanup. Runs unconditionally, whether or not the pipeline succeeded.
# Remove intermediates written at fixed paths directly under /tmp; the glob is
# intentionally unquoted so the shell expands it.
rm -f -- /tmp/_TT9*
# Remove this run's private work directory. The expansion is quoted and
# preceded by "--" so the value is always treated as exactly one path operand,
# and rm is skipped entirely if the variable is empty.
[ -z "$WORK_DIR" ] || rm -rf -- "$WORK_DIR"