New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists. * Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
This commit is contained in:
parent
a34baef0f3
commit
62e8a08576
12 changed files with 533 additions and 439 deletions
79
scripts/clean-raw-dictionary.sh
Executable file
79
scripts/clean-raw-dictionary.sh
Executable file
|
|
@ -0,0 +1,79 @@
|
|||
#!/bin/bash

# Cleans a raw word list into a dictionary word list.
#
# The script filters the input by the allowed lowercase/uppercase characters,
# strips frequencies and duplicates, drops mixed-case, single-character and
# repeated-letter words, optionally generates extra words from a suffixes file
# (validated with Hunspell), fixes the text case with fix-text-case.py and
# finally writes the result to the output file.
#
# All seven positional arguments are required; pass /dev/null (or any path that
# is not a regular file) as the suffixes file to skip the word generation step.

if (( $# < 7 )); then
  # Usage is a diagnostic for a failed invocation, so it is written to stderr.
  {
    echo "Usage: $0 <raw-word-list.txt> <output.txt> <suffixes.txt> <hunspell.aff> <hunspell.dic> <allowed-lowercase-char-list> <allowed-uppercase-char-list>"
    echo
    echo "Example (Slovak, no need of whitelist filter):"
    echo "	$0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
    echo
    echo "Example (Slovak, with whitelist filter):"
    echo "	$0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
  } >&2

  exit 1
fi

# --- Positional arguments -----------------------------------------------------
# $1 raw word list (must exist)        $2 output file
# $3 suffixes file (optional: when it is not a regular file, generate_words
#    skips the extra word generation)
# $4 Hunspell .aff file (must exist)   $5 Hunspell .dic file (must exist)
# $6 allowed lowercase characters      $7 allowed uppercase characters
# Validation errors are reported on stderr and terminate with exit code 2.

DICTIONARY_FILE=$1
if ! [[ -f "$DICTIONARY_FILE" ]]; then
  echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist" >&2
  exit 2
fi

# Not validated here; its absence is handled later, when words are generated.
SUFFIXES_FILE=$3

AFF_FILE=$4
if ! [[ -f "$AFF_FILE" ]]; then
  echo ".aff file: '$AFF_FILE' does not exist" >&2
  exit 2
fi

DIC_FILE=$5
if ! [[ -f "$DIC_FILE" ]]; then
  echo ".dic file: '$DIC_FILE' does not exist" >&2
  exit 2
fi

OUTPUT_FILE=$2
# Both character lists are interpolated into grep bracket expressions later on.
ALLOWED_LOWERCASE_CHARS=$6
ALLOWED_UPPERCASE_CHARS=$7

# Private scratch directory for the intermediate files. mktemp creates it
# atomically with a unique name and does not depend on uuidgen being installed
# (a missing uuidgen previously degraded the path to the shared "/tmp/TT9_").
WORK_DIR="$(mktemp -d /tmp/TT9_XXXXXXXX)" || {
  echo "Failed to create a temporary working directory" >&2
  exit 3
}

# Directory of this script; the Python helpers, requirements.txt and the venv
# are all looked up relative to it.
SCRIPT_DIR="$(dirname -- "$0")"

# First run only: create the virtual environment and install the requirements.
# NOTE(review): the venv is activated here only when it has just been created;
# on later runs it is activated further down, right before fix-text-case.py.
# Confirm that generate-words-from-suffixes.py does not need the venv packages.
if ! [[ -d "$SCRIPT_DIR/venv" ]]; then
  python -m venv "$SCRIPT_DIR/venv" \
    && source "$SCRIPT_DIR/venv/bin/activate" \
    && pip install -r "$SCRIPT_DIR/requirements.txt"
fi

#######################################
# Extends a cleaned word list with words generated from a suffixes file,
# keeping only the generated words that Hunspell accepts.
# When SUFFIXES_FILE is not a regular file, the input is copied to the output
# unchanged and no generation takes place.
# Globals:
#   AFF_FILE      (read) Hunspell .aff path; its extension is dropped to build
#                        the base name passed to "hunspell -d"
#   SUFFIXES_FILE (read) suffixes list handed to generate-words-from-suffixes.py
#   WORK_DIR      (read) existing directory for the intermediate files
#   SCRIPT_DIR    (read) directory containing generate-words-from-suffixes.py
# Arguments:
#   $1 - cleaned input word list
#   $2 - output file (sorted, de-duplicated union of input and valid generated words)
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 on success, otherwise the status of the first failing step.
#######################################
generate_words() {
  local clean_words=$1
  local output=$2
  # "hunspell -d" takes the dictionary base name, so drop the file extension.
  # ${var%.*} also works on bash versions that reject negative substring lengths.
  local dictionary=${AFF_FILE%.*}

  if ! [[ -f "$SUFFIXES_FILE" ]]; then
    echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation."
    cp -- "$clean_words" "$output"
    return
  fi

  # Each stage runs only if the previous one succeeded.
  printf "Extracting valid words for generating new ones... " \
    && hunspell -i UTF-8 -G -d "$dictionary" "$clean_words" | sort -u > "$WORK_DIR/generation-stems.txt" \
    && echo "OK" \
    && python "$SCRIPT_DIR/generate-words-from-suffixes.py" "$WORK_DIR/generation-stems.txt" "$SUFFIXES_FILE" "$WORK_DIR/generated-raw.txt" \
    && printf "Validating generated words with Hunspell... " \
    && hunspell -i UTF-8 -G -d "$dictionary" "$WORK_DIR/generated-raw.txt" > "$WORK_DIR/generated-valid.txt" \
    && echo "OK" \
    && printf "Merging generated and input words... " \
    && sort -u -- "$clean_words" "$WORK_DIR/generated-valid.txt" > "$output" \
    && echo "OK"
}

# TODO: remove Roman numerals (not implemented by any step below). Candidate pattern:
#   ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$

# --- Main pipeline ------------------------------------------------------------
# Every stage reads the previous stage's file from WORK_DIR and runs only when
# the previous stage succeeded. The overall status is captured so that a failed
# run is reported through the exit code instead of being masked by the cleanup.
#
# NOTE(review): the anchored "foreign letters" match also rejects any line that
# carries a tab/frequency column unless those characters are part of the allowed
# lists, which would leave nothing for the frequency-stripping stage to do.
# Confirm the expected input format.

date
mkdir -p "$WORK_DIR" \
  && printf "Removing foreign letters... " \
  && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" "$DICTIONARY_FILE" > "$WORK_DIR/noforeign.txt" \
  && echo "OK" \
  && printf "Removing frequencies and duplicates... " \
  && sed -E 's/[\t0-9]+//g' "$WORK_DIR/noforeign.txt" | sort -u > "$WORK_DIR/nofreq_norepeat.txt" \
  && echo "OK" \
  && printf "Removing lowerUPPER... " \
  && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" "$WORK_DIR/nofreq_norepeat.txt" > "$WORK_DIR/no_low_up.txt" \
  && echo "OK" \
  && printf "Removing UPPERlower... " \
  && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" "$WORK_DIR/no_low_up.txt" > "$WORK_DIR/no_up_low.txt" \
  && echo "OK" \
  && printf "Removing single chars... " \
  && grep -vE "^.$" "$WORK_DIR/no_up_low.txt" > "$WORK_DIR/no_single.txt" \
  && echo "OK" \
  && printf "Removing words with repeeeeaaaated letters... " \
  && grep -vE "(.)\1{2,}" "$WORK_DIR/no_single.txt" | grep -vE "^(.)\1$" | sort -u > "$WORK_DIR/no_multi.txt" \
  && echo "OK" \
  && generate_words "$WORK_DIR/no_multi.txt" "$WORK_DIR/generated.txt" \
  && echo "Preparing to fix the text case." \
  && source "$SCRIPT_DIR/venv/bin/activate" \
  && python "$SCRIPT_DIR/fix-text-case.py" "$WORK_DIR/generated.txt" "$WORK_DIR/text_case.txt" --aff "$AFF_FILE" --dic "$DIC_FILE" \
  && INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") \
  && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") \
  && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \
  && mv -- "$WORK_DIR/text_case.txt" "$OUTPUT_FILE"
PIPELINE_STATUS=$?

# Always clean up the scratch directory; ":?" aborts instead of running
# "rm -rf" on an empty path should WORK_DIR ever be unset.
rm -rf -- "${WORK_DIR:?}"
date

exit "$PIPELINE_STATUS"
Loading…
Add table
Add a link
Reference in a new issue