New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists.
* Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
This commit is contained in:
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions
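
For orientation, a typical run of the new tools might look roughly like the sketch below. The clean-raw-dictionary.sh arguments are taken from the script's own Slovak usage example; the normalize-transcribed.py redirection is only illustrative (it prints the normalized list to stdout).

    # Clean a raw word list, generate suffix variants and fix the letter case:
    ./scripts/clean-raw-dictionary.sh sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic \
        aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ

    # Normalize the frequencies in a dictionary with transcriptions:
    python scripts/normalize-transcribed.py WORD-LIST.txt > normalized.txt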

@@ -1,94 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
    print('Capitalizes a word list using capitalized words in another list.');
    print('\nMIN-WORD-LENGTH must be a positive number.');
    print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
}


function validateInput() {
    if (process.argv.length < 6) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[3])) {
        printError(`Failure! Could not find list-of-capitals file "${process.argv[3]}".`);
        process.exit(2);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find dictionary file "${process.argv[2]}".`);
        process.exit(2);
    }

    const minWordLength = Number.parseInt(process.argv[4]);
    if (Number.isNaN(minWordLength) || minWordLength < 0) {
        printError(`Failure! The minimum word length must be a positive number.`);
        process.exit(2);
    }

    return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
}


async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
    // read the dictionary
    let lineReader = require('readline').createInterface({
        input: createReadStream(dictionaryFileName)
    });

    const words = {};
    for await (const line of lineReader) {
        words[line] = true;
    }

    // convert the dictionary words using the second file
    lineReader = require('readline').createInterface({
        input: createReadStream(capitalsFileName)
    });

    for await (const capitalizedWord of lineReader) {
        if (capitalizedWord.length < minWordLength) {
            continue;
        }

        const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
        if (words[lowercaseWord]) {
            delete words[lowercaseWord];
            words[capitalizedWord] = true;
        }

        const possessiveLowercaseWord = `${lowercaseWord}'s`;
        if (words[possessiveLowercaseWord]) {
            delete words[possessiveLowercaseWord];
            words[`${capitalizedWord}'s`] = true;
        }
    }

    return Object.keys(words);
}


function printWords(wordList) {
    if (!Array.isArray(wordList)) {
        return;
    }

    wordList.forEach(w => print(w));
}


/** main **/
capitalize(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

79 scripts/clean-raw-dictionary.sh Executable file
@@ -0,0 +1,79 @@
#!/bin/bash

if [ $# -lt 7 ]; then
    echo "Usage: $0 <raw-word-list.txt> <output.txt> <suffixes.txt> <hunspell.aff> <hunspell.dic> <allowed-lowercase-char-list> <allowed-uppercase-char-list>"
    echo
    echo "Example (Slovak, no whitelist filter needed):"
    echo " $0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
    echo
    echo "Example (Slovak, with whitelist filter):"
    echo " $0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"

    exit 1
fi

DICTIONARY_FILE=$1
if ! [[ -f $DICTIONARY_FILE ]]; then
    echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist"
    exit 2
fi

SUFFIXES_FILE=$3

AFF_FILE=$4
if ! [[ -f "$AFF_FILE" ]]; then
    echo ".aff file: '$AFF_FILE' does not exist"
    exit 2
fi

DIC_FILE=$5
if ! [[ -f "$DIC_FILE" ]]; then
    echo ".dic file: '$DIC_FILE' does not exist"
    exit 2
fi

OUTPUT_FILE=$2
ALLOWED_LOWERCASE_CHARS=$6
ALLOWED_UPPERCASE_CHARS=$7
WORK_DIR="/tmp/TT9_$(uuidgen)"
SCRIPT_DIR="$(dirname "$0")"

if ! [[ -d $SCRIPT_DIR/venv ]]; then
    python -m venv $SCRIPT_DIR/venv && source $SCRIPT_DIR/venv/bin/activate && pip install -r $SCRIPT_DIR/requirements.txt
fi


generate_words() {
    CLEAN_WORDS=$1
    OUTPUT=$2
    DICTIONARY=${AFF_FILE::-4}

    if ! [[ -f "$SUFFIXES_FILE" ]]; then
        echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation."
        cp $CLEAN_WORDS $OUTPUT
        return
    fi

    printf "Extracting valid words for generating new ones... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $CLEAN_WORDS | sort -u | uniq > $WORK_DIR/generation-stems.txt && echo "OK" \
    && python $SCRIPT_DIR/generate-words-from-suffixes.py $WORK_DIR/generation-stems.txt $SUFFIXES_FILE $WORK_DIR/generated-raw.txt \
    && printf "Validating generated words with Hunspell... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $WORK_DIR/generated-raw.txt > $WORK_DIR/generated-valid.txt && echo "OK" \
    && printf "Merging generated and input words... " && cat $CLEAN_WORDS $WORK_DIR/generated-valid.txt | sort -u | uniq > $OUTPUT && echo "OK"
}

# remove Roman numerals: ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$

date
mkdir -p $WORK_DIR \
&& printf "Removing foreign letters... " && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" $DICTIONARY_FILE > $WORK_DIR/noforeign.txt && echo "OK" \
&& printf "Removing frequencies and duplicates... " && sed -E 's/[\t0-9]+//g' $WORK_DIR/noforeign.txt | sort | uniq > $WORK_DIR/nofreq_norepeat.txt && echo "OK" \
&& printf "Removing lowerUPPER... " && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" $WORK_DIR/nofreq_norepeat.txt > $WORK_DIR/no_low_up.txt && echo "OK" \
&& printf "Removing UPPERlower... " && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" $WORK_DIR/no_low_up.txt > $WORK_DIR/no_up_low.txt && echo "OK" \
&& printf "Removing single chars... " && grep -vE "^.$" $WORK_DIR/no_up_low.txt > $WORK_DIR/no_single.txt && echo "OK" \
&& printf "Removing words with repeeeeaaaated letters... " && grep -vE "(.)\1{2,}" $WORK_DIR/no_single.txt | grep -vE "^(.)\1$" | sort | uniq > $WORK_DIR/no_multi.txt && echo "OK" \
&& generate_words $WORK_DIR/no_multi.txt $WORK_DIR/generated.txt \
&& echo "Preparing to fix the text case." && source $SCRIPT_DIR/venv/bin/activate && python $SCRIPT_DIR/fix-text-case.py $WORK_DIR/generated.txt $WORK_DIR/text_case.txt --aff "$AFF_FILE" --dic "$DIC_FILE" \
&& INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \
&& mv $WORK_DIR/text_case.txt "$OUTPUT_FILE"

rm -rf $WORK_DIR
date

@@ -1,149 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`);
    print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
    print('Example UnicodeRange: U+900-U+97FU+200CU+200D');
    print('Example ExcludeRange: U+950-U+957U+966-U+97F');
    print('Example EraseRange: U+964U+965U+970U+971');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    if (!validateUnicodeRange(process.argv[3])) {
        printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
        process.exit(2);
    }

    if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
        process.exit(2);
    }

    if (process.argv[5] && !validateUnicodeRange(process.argv[5])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[5]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2],
        searchRegexString: process.argv[3],
        excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '',
        eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : ''
    };
}


function validateUnicodeRange(inputRange) {
    return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}


function URangeToXRange(range) {
    if (range.length === 0) {
        return null;
    }

    return range
        .toUpperCase()
        .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(`${w.w}\t${w.f}`));
    }
}


function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
    const spacesOnly = /^\s+$/;

    if (!line || !line.length || spacesOnly.test(line)) {
        return [];
    }

    const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null;

    return line
        .replace(eraseRegex, ' ')
        .split(' ')
        .filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w)));
}


async function readWords(fileName, eraseRegex, excludeRegexString) {
    const words = new Map();

    if (!fileName) {
        return words;
    }

    for await (const line of createInterface({ input: createReadStream(fileName) })) {
        const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString);
        parts.forEach(w => {
            words.set(w, words.has(w) ? words.get(w) + 1 : 1);
        });
    }

    return words;
}


function sortWords(wordsMap) {
    const words = [];
    for (let [w, f] of wordsMap) {
        words.push({ w, f });
    }

    return words.sort((a, b) => {
        if (a.f > b.f) {
            return -1;
        }

        if (a.f < b.f) {
            return 1;
        }

        if (a.w < b.w) {
            return -1;
        }

        if (a.w > b.w) {
            return 1;
        }

        return 0;
    });
}


async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
    const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu");
    return sortWords(
        await readWords(fileName, eraseRegex, excludeRegexString)
    );
}


/** main **/
work(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

47 scripts/extract-words-from-text.py Normal file
@@ -0,0 +1,47 @@
import sys
import re
from collections import Counter
from os.path import basename

def usage():
    print(f"Usage: {basename(__file__)} [--freq|-f] <allowed_letters> <file1> [file2 ...]")
    sys.exit(1)

# Check and parse arguments
args = sys.argv[1:]
if not args or len(args) < 2:
    usage()

show_freq = False
if args[0] in ("--freq", "-f"):
    show_freq = True
    args = args[1:]

if len(args) < 2:
    usage()

allowed_letters = set(args[0])
file_paths = args[1:]

# Unicode word pattern
word_pattern = re.compile(r'\b\w+\b', re.UNICODE)
word_counts = Counter()

# Process files
for path in file_paths:
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for word in word_pattern.findall(line):
                    if all(char in allowed_letters for char in word):
                        word_counts[word] += 1
    except Exception as e:
        print(f"Error reading {path}: {e}", file=sys.stderr)

# Output
if show_freq:
    for word, count in sorted(word_counts.items()):
        print(f"{word}\t{count}")
else:
    for word in sorted(word_counts):
        print(word)

108 scripts/fix-text-case.py Normal file
@@ -0,0 +1,108 @@
import argparse
import os
import time
from multiprocessing import Pool, cpu_count, Manager
from collections import defaultdict
import hunspell

def load_unique_words(full_list_path):
    words = dict()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue

            word_lower = word.lower()
            if word_lower not in words or words[word_lower] == word_lower:
                words[word_lower] = word

    return words.values()

def init_hunspell_worker(aff_path, dic_path):
    global hobj, hunspell_stems
    hobj = hunspell.HunSpell(dic_path, aff_path)
    with open(dic_path, "r") as f:
        hunspell_stems = set({
            line.split('/')[0].strip()
            for line in f
            if not line.startswith('#')
        })

def fix_word_text_case(word):
    word_lower = word.lower()

    # check for direct matches to avoid expensive calls to HunSpell.suggest()
    if word_lower != word and word_lower in hunspell_stems:
        return word_lower

    if word in hunspell_stems:
        return word

    # name -> Name
    hunspell_variants = hobj.suggest(word_lower)
    for variant in hunspell_variants:
        if word_lower != variant and word_lower == variant.lower():
            return variant

    # if it can be either lowercase or uppercase, then we want to keep the lowercase
    if word_lower in hunspell_variants:
        return word_lower

    # if it is an unknown word, keep it as-is
    return word

def print_progress(current, total, start_time, interval):
    if current % interval == 0 or current == total:
        avg_time = (time.time() - start_time) / current
        remaining_time = (total - current) * avg_time
        HH, rem = divmod(int(remaining_time), 3600)
        MM, SS = divmod(rem, 60)
        print(f"\rFixing text case using hunspell... {current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ")

def run_hunspell_batch(words, aff_path, dic_path, num_workers):
    total = len(words)
    start_time = time.time()

    with Pool(
        processes=num_workers,
        initializer=init_hunspell_worker,
        initargs=(aff_path, dic_path)
    ) as pool:
        for i, correct_word in enumerate(pool.imap_unordered(fix_word_text_case, words), 1):
            print_progress(i, total, start_time, 300)
            yield correct_word

def main():
    parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
    parser.add_argument("word_list", help="Path to the full list of words.")
    parser.add_argument("output", help="Path to save the corrected words.")
    parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
    parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.aff):
        print(f"Hunspell .aff file not found: {args.aff}")
        return
    if not os.path.exists(args.dic):
        print(f"Hunspell .dic file not found: {args.dic}")
        return

    all_words = load_unique_words(args.word_list)
    print(f"Loaded {len(all_words)} candidate words.")

    corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(corrected_words):
            f.write(word + '\n')

    print(" ")  # clear the '\r'

if __name__ == "__main__":
    main()

64 scripts/generate-words-from-suffixes.py Normal file
@@ -0,0 +1,64 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def load_unique_words(word_list_path):
    words = set()
    with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            words.add(word.lower())
    return words

def load_known_suffixes(suffix_file_path):
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes

def generate_from_args(args):
    word, suffixes = args
    return {word + suffix for suffix in suffixes}

def generate_words(words, suffixes, num_workers):
    new_words = set()
    with Pool(processes=num_workers) as pool:
        for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
            new_words.update(result)
    return new_words

def main():
    parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that you will have to clean up the invalid words afterwards.")
    parser.add_argument("word_list", help="Path to the full list of words to filter")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the filtered output")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    print("Generating new words...", end=' ')

    all_words = load_unique_words(args.word_list)
    known_suffixes = load_known_suffixes(args.suffix_file)

    print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
    generated = generate_words(all_words, known_suffixes, cpu_count())
    print(f"OK ({len(generated) - len(all_words)} new words)")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in generated:
            f.write(word + '\n')

if __name__ == "__main__":
    main()

@@ -1,111 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`);
    print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
    print('Example UnicodeRange: U+900-U+97F');
    print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    if (!validateUnicodeRange(process.argv[3])) {
        printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
        process.exit(2);
    }

    if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2],
        searchRegexString: process.argv[3],
        excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : ''
    };
}


function validateUnicodeRange(inputRange) {
    return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}


function URangeToXRange(range) {
    if (range.length === 0) {
        return null;
    }

    return range
        .toUpperCase()
        .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(w));
    }
}


function cleanInvalidChars(line, searchRegex, excludeRegex) {
    const spacesOnly = /^\s+$/;

    if (!line || !line.length || spacesOnly.test(line)) {
        return [];
    }

    const cleanLine = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;
    return cleanLine
        .replace(searchRegex, ' ')
        .split(' ')
        .filter(w => w.length > 1);
}


async function readWords(fileName, searchRegex, excludeRegex) {
    const words = new Set();

    if (!fileName) {
        return words;
    }

    for await (const line of createInterface({ input: createReadStream(fileName) })) {
        cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w));
    }

    return words;
}


async function work({ fileName, searchRegexString, excludeRegexString }) {
    const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu");
    const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null;

    const words = Array.from(await readWords(fileName, searchRegex, excludeRegex));
    return words.filter(word => word.length > 1).sort();
}


/** main **/
work(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

@@ -1,85 +0,0 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');;
const { print, printError } = require('./_printers.js')


function printHelp() {
    print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
    print('Normalizes the frequencies in a dictionary with transcriptions.');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2]
    };
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(
            w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`
        ));
    }
}


const { fileName } = validateInput();

const data = readFileSync(fileName, 'utf8');
const lines = data.trim().split('\n');

// Parse the data into an array of objects
let entries = lines.map(line => {
    const parts = line.split('\t');
    return {
        original: line,
        chinese: parts[0],
        latin: parts[1],
        number: parts[2] ? parseInt(parts[2], 10) : null
    };
});

// Group entries by the Latin character sequence
const groups = {};
entries.forEach(entry => {
    if (!groups[entry.latin]) {
        groups[entry.latin] = [];
    }
    groups[entry.latin].push(entry);
});

// Process each group: sort by number (descending) and reassign ordinal numbers
let sortedEntries = [];
for (const key in groups) {
    let group = groups[key];

    // Separate entries with and without numbers
    let withNumbers = group.filter(e => e.number !== null);
    let withoutNumbers = group.filter(e => e.number === null);

    // Sort by number in descending order
    withNumbers.sort((a, b) => b.number - a.number);

    // Assign ordinal rankings
    for (let i = 0; i < withNumbers.length; i++) {
        withNumbers[i].number = (withNumbers.length - i).toString();
    }

    // Preserve original order for entries without numbers
    sortedEntries.push(...withNumbers, ...withoutNumbers);
}

printWords(sortedEntries);

86 scripts/normalize-transcribed.py Normal file
@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict

def print_error(message):
    print(message, file=sys.stderr)

def parse_args():
    parser = argparse.ArgumentParser(
        description="Normalizes the frequencies in a dictionary with transcriptions."
    )
    parser.add_argument(
        "word_list",
        help="Path to the word list file (e.g., WORD-LIST.txt)"
    )
    return parser.parse_args()

def validate_file(file_path):
    if not os.path.isfile(file_path):
        print_error(f'Failure! Could not find word list file "{file_path}".')
        sys.exit(2)

def load_entries(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    entries = []
    for line_num, line in enumerate(lines, start=1):
        parts = line.split('\t')
        if len(parts) < 2:
            print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
            sys.exit(3)

        chinese, latin = parts[:2]
        number = None
        if len(parts) > 2:
            try:
                number = int(parts[2])
            except ValueError:
                print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
                sys.exit(3)

        entries.append({'chinese': chinese, 'latin': latin, 'number': number})

    return entries

def group_entries(entries):
    groups = defaultdict(list)
    for entry in entries:
        groups[entry['latin']].append(entry)
    return groups

def normalize_frequencies(groups):
    sorted_entries = []
    for group in groups.values():
        with_numbers = [e for e in group if e['number'] is not None]
        without_numbers = [e for e in group if e['number'] is None]

        with_numbers.sort(key=lambda e: e['number'], reverse=True)

        for rank, entry in enumerate(with_numbers, start=1):
            entry['number'] = str(len(with_numbers) - rank + 1)

        sorted_entries.extend(with_numbers)
        sorted_entries.extend(without_numbers)

    return sorted_entries

def print_entries(entries):
    for e in entries:
        parts = [e['chinese'], e['latin']]
        if e['number'] is not None:
            parts.append(e['number'])
        print('\t'.join(parts))

def main():
    args = parse_args()
    validate_file(args.word_list)
    entries = load_entries(args.word_list)
    groups = group_entries(entries)
    sorted_entries = normalize_frequencies(groups)
    print_entries(sorted_entries)

if __name__ == "__main__":
    main()

46 scripts/remove-random-words.sh Executable file
@@ -0,0 +1,46 @@
#!/bin/bash

if [ $# -ne 7 ]; then
    echo "Usage: $0 <locale> <all-words.txt> <corpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>"
    echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13"
    exit 1
fi

LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"

if ! [[ -f "$ORIGINAL_WORDS" ]]; then
    echo "All words file: '$ORIGINAL_WORDS' does not exist"
    exit 2
fi

if ! [[ -f "$CORPUS_WORDS" ]]; then
    echo "Corpus words file: '$CORPUS_WORDS' does not exist"
    exit 2
fi

if ! [[ -f "$BAD_COMB_FILE" ]]; then
    echo "Bad letter combinations file: '$BAD_COMB_FILE' does not exist"
    exit 2
fi

BAD_LETTER_COMBINATIONS=$(paste -sd'|' "$BAD_COMB_FILE")


sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > __tmp__popular.txt &
grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" | grep -Ev "($BAD_LETTER_COMBINATIONS)" | grep -Ev "[$VOWELS]{3,}" | sort -u | uniq > __tmp__reduced.txt &
wait

node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__popular.txt | grep -Ev "(.)\1{2,}" > __tmp__unpopular.txt
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__unpopular.txt > __tmp__popular_reduced.txt &
awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' __tmp__unpopular.txt > __tmp__unpopular_reduced.txt &
wait

cat __tmp__popular_reduced.txt __tmp__unpopular_reduced.txt | sort -u | uniq > $OUTPUT_FILE

rm -f __tmp__*.txt

1 scripts/requirements.txt Normal file
@@ -0,0 +1 @@
hunspell==0.5.5

102 scripts/whitelist-filter.py Normal file
@@ -0,0 +1,102 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def load_stem_buckets(whitelist_path):
    buckets = defaultdict(set)
    with open(whitelist_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            word_lc = word.lower()
            first_char = word_lc[0]
            buckets[first_char].add(word_lc)
    return dict(buckets)

def load_unique_words(full_list_path):
    words = set()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            words.add(word)
    return words

def load_known_suffixes(suffix_file_path):
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes

def match_word(word, buckets, known_suffixes):
    """Return all valid combinations: base word and word+suffix if found in stems."""
    word_lc = word.lower()
    first_char = word_lc[0]
    possible_stems = buckets.get(first_char, set())

    matches = []

    if word_lc in possible_stems:
        matches.append(word)

    for suffix in known_suffixes:
        compound_word = word_lc + suffix
        if compound_word in possible_stems:
            matches.append(compound_word)

    return matches

def filter_words_parallel(all_words, stem_buckets, known_suffixes, num_workers):
    args = [(word, stem_buckets, known_suffixes) for word in all_words]
    with Pool(processes=num_workers) as pool:
        results = pool.starmap(match_word, args)

    matched_words = set()
    for match_list in results:
        matched_words.update(match_list)
    return matched_words

def main():
    parser = argparse.ArgumentParser(description="Filter given words by a stem whitelist. The list of suffixes is used to generate more variants of the valid words.")
    parser.add_argument("whitelist", help="Path to the whitelist file (with valid words)")
    parser.add_argument("full_list", help="Path to the full list of words to filter")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the filtered output")
    args = parser.parse_args()

    if not os.path.exists(args.whitelist):
        print(f"Whitelist file not found: {args.whitelist}")
        return
    if not os.path.exists(args.full_list):
        print(f"Full word list not found: {args.full_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    stem_buckets = load_stem_buckets(args.whitelist)
    print(f"Loaded {sum(len(s) for s in stem_buckets.values())} valid stems across {len(stem_buckets)} buckets.")

    all_words = load_unique_words(args.full_list)
    print(f"Loaded {len(all_words)} candidate words.")

    known_suffixes = load_known_suffixes(args.suffix_file)
    print(f"Loaded {len(known_suffixes)} known suffixes.")

    workers = cpu_count()
    print(f"Filtering using {workers} threads...", end=' ')
    filtered = filter_words_parallel(all_words, stem_buckets, known_suffixes, workers)
    print(f"OK. Matched {len(filtered)} words.")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(filtered):
            f.write(word + '\n')

if __name__ == "__main__":
    main()