* new dictionary tools for generating an app dictionary from raw word lists * replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
46 lines
1.6 KiB
Bash
Executable file
46 lines
1.6 KiB
Bash
Executable file
#!/bin/bash
#
# Reduces a raw word list to a smaller list suitable for an app dictionary.
#
# Usage:
#   <script> <locale> <all-words.txt> <corpus-words-with-frequencies.txt> \
#            <bad-combinations.txt> <output-file.txt> <vowels-list> \
#            <unpopular-max-length>
#
# Pipeline:
#   1. "popular" list: the corpus file, minus every line that is exactly
#      <word><TAB>1 or <word><TAB>2, with trailing tabs/digits stripped.
#   2. "reduced" list: the all-words file, minus words that contain the same
#      character 3+ times in a row, words matching any line of the
#      bad-combinations file (each line is used as an extended-regex
#      alternative), and words with 3+ consecutive characters taken from the
#      vowels list; sorted and de-duplicated.
#   3. remove-foreign-words.js is run in --blacklist mode on (reduced, popular)
#      to get the "unpopular" list, then on (reduced, unpopular) to get the
#      "popular reduced" list.
#      NOTE(review): presumably --blacklist prints the words of the first list
#      that are absent from the second; confirm against the Node script.
#   4. Unpopular words longer than <unpopular-max-length> are dropped, both
#      lists are merged, sorted, de-duplicated and written to <output-file.txt>.
#
# Environment:
#   REMOVE_FOREIGN_WORDS_JS  Path to remove-foreign-words.js
#                            (default: ~/src/tt9/scripts/remove-foreign-words.js)
#
# Exit codes:
#   1  wrong number of arguments or invalid <unpopular-max-length>
#   2  a required input file or tool is missing
#   3  the temporary working directory could not be created

# Prints a message to stderr and exits with the given status.
#   $1 - exit status, $2.. - message
die() {
  local status="$1"
  shift
  printf '%s\n' "$*" >&2
  exit "$status"
}

if [[ $# -ne 7 ]]; then
  echo "Usage: $0 <locale> <all-words.txt> <corpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>" >&2
  echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13" >&2
  exit 1
fi

LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"

[[ -f "$ORIGINAL_WORDS" ]] || die 2 "All words file: '$ORIGINAL_WORDS' does not exist"
[[ -f "$CORPUS_WORDS" ]] || die 2 "Corpus words file: '$CORPUS_WORDS' does not exist"
[[ -f "$BAD_COMB_FILE" ]] || die 2 "Bad letter combinations file: '$BAD_COMB_FILE' does not exist"

# A non-numeric limit would make awk fall back to string comparison and
# silently keep or drop the wrong words.
[[ "$UNPOPULAR_MAX_LENGTH" =~ ^[0-9]+$ ]] \
  || die 1 "Unpopular max length: '$UNPOPULAR_MAX_LENGTH' is not a non-negative integer"

# Same default location as before, but overridable and verified up front:
# without the helper (or node) every later step would just produce empty files.
REMOVE_FOREIGN_WORDS_JS="${REMOVE_FOREIGN_WORDS_JS:-$HOME/src/tt9/scripts/remove-foreign-words.js}"
[[ -f "$REMOVE_FOREIGN_WORDS_JS" ]] \
  || die 2 "Helper script: '$REMOVE_FOREIGN_WORDS_JS' does not exist"
command -v node > /dev/null || die 2 "node is required but was not found in PATH"

# Join the bad combinations into one alternation. Blank lines are removed
# first: an empty alternative matches every word and would empty the result.
BAD_LETTER_COMBINATIONS=$(sed '/^$/d' "$BAD_COMB_FILE" | paste -sd'|' -)

# Private scratch directory instead of fixed __tmp__*.txt names in the current
# directory; removed on every exit path.
TMP_DIR=$(mktemp -d) || die 3 "Could not create a temporary directory"
trap 'rm -rf -- "$TMP_DIR"' EXIT

POPULAR="$TMP_DIR/popular.txt"
REDUCED="$TMP_DIR/reduced.txt"
UNPOPULAR="$TMP_DIR/unpopular.txt"
POPULAR_REDUCED="$TMP_DIR/popular_reduced.txt"
UNPOPULAR_REDUCED="$TMP_DIR/unpopular_reduced.txt"

# stdin -> stdout filter: drops words matching any bad combination.
# Passes everything through when there are no combinations.
drop_bad_combinations() {
  if [[ -n "$BAD_LETTER_COMBINATIONS" ]]; then
    grep -Ev "($BAD_LETTER_COMBINATIONS)"
  else
    cat
  fi
}

# stdin -> stdout filter: drops words with 3+ consecutive vowels.
# Passes everything through when the vowels list is empty ("[]{3,}" is not a
# valid regex).
drop_vowel_runs() {
  if [[ -n "$VOWELS" ]]; then
    grep -Ev "[$VOWELS]{3,}"
  else
    cat
  fi
}

# Steps 1 and 2 are independent, so they run in parallel.
sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" \
  | grep . \
  | sed -E 's/[\t0-9]+$//g' > "$POPULAR" &

grep -Ev '(.)\1{2,}' "$ORIGINAL_WORDS" \
  | drop_bad_combinations \
  | drop_vowel_runs \
  | sort -u > "$REDUCED" &

wait

# Step 3a must finish before 3b and the length filter, which both read its
# output.
node "$REMOVE_FOREIGN_WORDS_JS" --blacklist "$LOCALE" "$REDUCED" "$LOCALE" "$POPULAR" \
  | grep -Ev '(.)\1{2,}' > "$UNPOPULAR"

node "$REMOVE_FOREIGN_WORDS_JS" --blacklist "$LOCALE" "$REDUCED" "$LOCALE" "$UNPOPULAR" \
  > "$POPULAR_REDUCED" &

awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' "$UNPOPULAR" \
  > "$UNPOPULAR_REDUCED" &

wait

# Step 4: merge both lists; sort -u already de-duplicates.
sort -u -- "$POPULAR_REDUCED" "$UNPOPULAR_REDUCED" > "$OUTPUT_FILE"