1
0
Fork 0
tt9/scripts/remove-random-words.sh
Dimo Karaivanov 62e8a08576
New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the Javascript version was running out of memory for larger languages like Japanese
2025-06-27 17:01:42 +03:00

46 lines
1.6 KiB
Bash
Executable file

#!/bin/bash
#
# Produces a reduced word list from a raw dictionary file.
#
# Positional arguments (all seven are required):
#   1. locale                - handed over to remove-foreign-words.js
#   2. all-words.txt         - the raw word list to be filtered, one word per line
#   3. corpus-words.txt      - corpus words followed by a tab and a frequency;
#                              entries whose frequency is exactly 1 or 2 are ignored
#   4. bad-combinations.txt  - extended regular expressions, one per line; words
#                              matching any of them are dropped
#   5. output-file.txt       - where the resulting list is written
#   6. vowels-list           - the vowel characters of the language; words with
#                              three or more of them in a row are dropped
#   7. unpopular-max-length  - longest length kept for the words in the
#                              "unpopular" list
#
# Exit status: 1 when the number of arguments is wrong.

# Usage errors are diagnostics, so they go to stderr and leave stdout clean.
if (( $# != 7 )); then
  echo "Usage: $0 <locale> <all-words.txt> <corpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>" >&2
  echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13" >&2
  exit 1
fi
# Name the positional arguments.
LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"

#######################################
# Aborts the script unless the given path is an existing regular file.
# Arguments: $1 - human-readable label used in the error message
#            $2 - path to check
# Outputs:   the error message on stderr
# Returns:   does not return on failure; exits with status 2
#######################################
require_file() {
  if ! [[ -f "$2" ]]; then
    echo "$1: '$2' does not exist" >&2
    exit 2
  fi
}

require_file "All words file" "$ORIGINAL_WORDS"
require_file "Corpus words file" "$CORPUS_WORDS"
require_file "Bad letter combinations file" "$BAD_COMB_FILE"

# The length limit is later compared numerically in awk. A non-numeric value
# would silently turn that into a string comparison, so reject it up front.
if ! [[ "$UNPOPULAR_MAX_LENGTH" =~ ^[0-9]+$ ]]; then
  echo "Unpopular max length: '$UNPOPULAR_MAX_LENGTH' is not a non-negative integer" >&2
  exit 1
fi
# ---------------------------------------------------------------------------
# Filtering pipeline. Reads ORIGINAL_WORDS, CORPUS_WORDS and BAD_COMB_FILE and
# writes the sorted, de-duplicated result to OUTPUT_FILE.
# Exit status: 3 when the scratch directory cannot be created.
# ---------------------------------------------------------------------------

# Prefer the helper that sits next to this script; fall back to the location
# that used to be hard-coded, so existing setups keep working.
FOREIGN_WORDS_JS="$(dirname -- "${BASH_SOURCE[0]}")/remove-foreign-words.js"
if ! [[ -f "$FOREIGN_WORDS_JS" ]]; then
  FOREIGN_WORDS_JS=~/src/tt9/scripts/remove-foreign-words.js
fi

# Private scratch directory instead of fixed "__tmp__*.txt" names: concurrent
# runs no longer collide, unrelated files matching the old glob are never
# deleted, and the trap cleans up on every exit path. It is created in the
# current directory (as the intermediate files always were) unless TMPDIR is set.
TMP_DIR=$(mktemp -d "${TMPDIR:-.}/__tmp__remove-random-words.XXXXXX") || {
  echo "Failed to create a temporary directory" >&2
  exit 3
}
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT

# Join the bad combinations into one alternation. Empty lines are skipped:
# an empty alternative matches every word, so the inverted grep below would
# throw away the entire dictionary.
BAD_LETTER_COMBINATIONS=$(grep -v '^$' "$BAD_COMB_FILE" | paste -sd'|' -)

# With nothing to filter on, the stage degrades to a pass-through `cat`
# instead of running grep with a match-everything or invalid pattern.
bad_comb_filter=(cat)
if [[ -n "$BAD_LETTER_COMBINATIONS" ]]; then
  bad_comb_filter=(grep -Ev "($BAD_LETTER_COMBINATIONS)")
fi
vowel_filter=(cat)
if [[ -n "$VOWELS" ]]; then
  vowel_filter=(grep -Ev "[$VOWELS]{3,}")
fi

# The two preparation steps are independent, so they run in parallel.
# Popular words: blank out corpus lines of the form "<word><TAB>1" or
# "<word><TAB>2", drop the empty lines, then strip the trailing tab/digit run.
sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > "$TMP_DIR/popular.txt" &
# Reduced words: drop words with a character repeated 3+ times in a row, with
# a bad letter combination, or with 3+ consecutive vowels. `sort -u` already
# removes duplicates, so no extra `uniq` is needed.
grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" \
  | "${bad_comb_filter[@]}" \
  | "${vowel_filter[@]}" \
  | sort -u > "$TMP_DIR/reduced.txt" &
wait

# NOTE(review): presumably "--blacklist" makes the helper remove the words of
# the last file from the first one; confirm against remove-foreign-words.js.
# Under that assumption this yields the reduced words that are NOT popular.
node "$FOREIGN_WORDS_JS" --blacklist "$LOCALE" "$TMP_DIR/reduced.txt" "$LOCALE" "$TMP_DIR/popular.txt" \
  | grep -Ev "(.)\1{2,}" > "$TMP_DIR/unpopular.txt"

# Both steps only read unpopular.txt, so they run in parallel as well.
# Same helper call with the unpopular list as the last file (presumably
# leaving the reduced words that ARE popular).
node "$FOREIGN_WORDS_JS" --blacklist "$LOCALE" "$TMP_DIR/reduced.txt" "$LOCALE" "$TMP_DIR/unpopular.txt" > "$TMP_DIR/popular_reduced.txt" &
# Keep only the unpopular words that are at most UNPOPULAR_MAX_LENGTH long.
awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' "$TMP_DIR/unpopular.txt" > "$TMP_DIR/unpopular_reduced.txt" &
wait

# Merge both lists. The output path is quoted so that names containing spaces
# or glob characters work.
sort -u "$TMP_DIR/popular_reduced.txt" "$TMP_DIR/unpopular_reduced.txt" > "$OUTPUT_FILE"