1
0
Fork 0

language processing scripts improvements

* remove-foreign-words script now supports blacklist and whitelist mode

  * fixed .domain sorting

  * clean'n'sort script
This commit is contained in:
sspanak 2024-02-05 21:33:39 +02:00 committed by Dimo Karaivanov
parent 071a4c29c2
commit ef2f92c0ac
4 changed files with 71 additions and 46 deletions

View file

@@ -33,7 +33,7 @@ WORK_DIR="/tmp/TT9_$(uuidgen)"
mkdir -p $WORK_DIR && \
sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/_TT9_base.txt \
&& node scripts/injest-words.js $NEW_WORDS_FILE $IGNORE_SPLIT_LIST_FILE > $WORK_DIR/_TT9_1.txt \
&& node scripts/remove-foreign-words.js $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \
&& node scripts/remove-foreign-words.js --blacklist $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \
&& cp $WORK_DIR/_TT9_base.txt $WORK_DIR/_TT9_combined.txt \
&& echo >> $WORK_DIR/_TT9_combined.txt \
&& cat $WORK_DIR/_TT9_2.txt >> $WORK_DIR/_TT9_combined.txt \