
New dictionary tools (#830)

* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
Dimo Karaivanov 2025-06-27 17:01:42 +03:00 committed by GitHub
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions


@@ -1,94 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
print('Capitalizes a word list using capitalized words in another list.');
print('\nMIN-WORD-LENGTH must be a positive number.');
print('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
function validateInput() {
if (process.argv.length < 6) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[3])) {
printError(`Failure! Could not find list-of-capitals file "${process.argv[3]}".`);
process.exit(2);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find dictionary file "${process.argv[2]}".`);
process.exit(2);
}
const minWordLength = Number.parseInt(process.argv[4]);
if (Number.isNaN(minWordLength) || minWordLength < 0) {
printError(`Failure! The minimum word length must be a positive number.`);
process.exit(2);
}
return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
}
async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
// read the dictionary
let lineReader = require('readline').createInterface({
input: createReadStream(dictionaryFileName)
});
const words = {};
for await (const line of lineReader) {
words[line] = true;
}
// convert the dictionary words using the second file
lineReader = require('readline').createInterface({
input: createReadStream(capitalsFileName)
});
for await (const capitalizedWord of lineReader) {
if (capitalizedWord.length < minWordLength) {
continue;
}
const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
if (words[lowercaseWord]) {
delete words[lowercaseWord];
words[capitalizedWord] = true;
}
const possessiveLowercaseWord = `${lowercaseWord}'s`;
if (words[possessiveLowercaseWord]) {
delete words[possessiveLowercaseWord];
words[`${capitalizedWord}'s`] = true;
}
}
return Object.keys(words);
}
function printWords(wordList) {
if (!Array.isArray(wordList)) {
return;
}
wordList.forEach(w => print(w));
}
/** main **/
capitalize(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));

scripts/clean-raw-dictionary.sh Executable file

@@ -0,0 +1,79 @@
#!/bin/bash
if [ $# -lt 7 ]; then
echo "Usage: $0 <raw-word-list.txt> <output.txt> <suffixes.txt> <hunspell.aff> <hunspell.dic> <allowed-lowercase-char-list> <allowed-uppercase-char-list>"
echo
echo "Example (Slovak, no need of whitelist filter):"
echo " $0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
echo
echo "Example (Slovak, with whitelist filter):"
echo " $0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
exit 1
fi
DICTIONARY_FILE=$1
if ! [[ -f $DICTIONARY_FILE ]]; then
echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist"
exit 2
fi
SUFFIXES_FILE=$3
AFF_FILE=$4
if ! [[ -f "$AFF_FILE" ]]; then
echo ".aff file: '$AFF_FILE' does not exist"
exit 2
fi
DIC_FILE=$5
if ! [[ -f "$DIC_FILE" ]]; then
echo ".dic file: '$DIC_FILE' does not exist"
exit 2
fi
OUTPUT_FILE=$2
ALLOWED_LOWERCASE_CHARS=$6
ALLOWED_UPPERCASE_CHARS=$7
WORK_DIR="/tmp/TT9_$(uuidgen)"
SCRIPT_DIR="$(dirname "$0")"
if ! [[ -d $SCRIPT_DIR/venv ]]; then
python -m venv $SCRIPT_DIR/venv && source $SCRIPT_DIR/venv/bin/activate && pip install -r $SCRIPT_DIR/requirements.txt
fi
generate_words() {
CLEAN_WORDS=$1
OUTPUT=$2
DICTIONARY=${AFF_FILE::-4}
if ! [[ -f "$SUFFIXES_FILE" ]]; then
echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation."
cp $CLEAN_WORDS $OUTPUT
return
fi
printf "Extracting valid words for generating new ones... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $CLEAN_WORDS | sort -u | uniq > $WORK_DIR/generation-stems.txt && echo "OK" \
&& python $SCRIPT_DIR/generate-words-from-suffixes.py $WORK_DIR/generation-stems.txt $SUFFIXES_FILE $WORK_DIR/generated-raw.txt \
&& printf "Validating generated words with Hunspell... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $WORK_DIR/generated-raw.txt > $WORK_DIR/generated-valid.txt && echo "OK" \
&& printf "Merging generated and input words... " && cat $CLEAN_WORDS $WORK_DIR/generated-valid.txt | sort -u | uniq > $OUTPUT && echo "OK"
}
# remove Roman numerals: ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$
date
mkdir -p $WORK_DIR \
&& printf "Removing foreign letters... " && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" $DICTIONARY_FILE > $WORK_DIR/noforeign.txt && echo "OK" \
&& printf "Removing frequencies and duplicates... " && sed -E 's/[\t0-9]+//g' $WORK_DIR/noforeign.txt | sort | uniq > $WORK_DIR/nofreq_norepeat.txt && echo "OK" \
&& printf "Removing lowerUPPER... " && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" $WORK_DIR/nofreq_norepeat.txt > $WORK_DIR/no_low_up.txt && echo "OK" \
&& printf "Removing UPPERlower... " && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" $WORK_DIR/no_low_up.txt > $WORK_DIR/no_up_low.txt && echo "OK" \
&& printf "Removing single chars... " && grep -vE "^.$" $WORK_DIR/no_up_low.txt > $WORK_DIR/no_single.txt && echo "OK" \
&& printf "Removing words with repeeeeaaaated letters... " && grep -vE "(.)\1{2,}" $WORK_DIR/no_single.txt | grep -vE "^(.)\1$" | sort | uniq > $WORK_DIR/no_multi.txt && echo "OK" \
&& generate_words $WORK_DIR/no_multi.txt $WORK_DIR/generated.txt \
&& echo "Preparing to fix the text case." && source $SCRIPT_DIR/venv/bin/activate && python $SCRIPT_DIR/fix-text-case.py $WORK_DIR/generated.txt $WORK_DIR/text_case.txt --aff "$AFF_FILE" --dic "$DIC_FILE" \
&& INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \
&& mv $WORK_DIR/text_case.txt "$OUTPUT_FILE"
rm -rf $WORK_DIR
date


@@ -1,149 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`);
print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
print('Example UnicodeRange: U+900-U+97FU+200CU+200D');
print('Example ExcludeRange: U+950-U+957U+966-U+97F');
print('Example EraseRange: U+964U+965U+970U+971');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
if (!validateUnicodeRange(process.argv[3])) {
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
process.exit(2);
}
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
process.exit(2);
}
if (process.argv[5] && !validateUnicodeRange(process.argv[5])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[5]}".`);
process.exit(2);
}
return {
fileName: process.argv[2],
searchRegexString: process.argv[3],
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '',
eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : ''
};
}
function validateUnicodeRange(inputRange) {
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}
function URangeToXRange(range) {
if (range.length === 0) {
return null;
}
return range
.toUpperCase()
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(`${w.w}\t${w.f}`));
}
}
function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
const spacesOnly = /^\s+$/;
if (!line || !line.length || spacesOnly.test(line)) {
return [];
}
const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null;
return line
.replace(eraseRegex, ' ')
.split(' ')
.filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w)));
}
async function readWords(fileName, eraseRegex, excludeRegexString) {
const words = new Map();
if (!fileName) {
return words;
}
for await (const line of createInterface({ input: createReadStream(fileName) })) {
const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString);
parts.forEach(w => {
words.set(w, words.has(w) ? words.get(w) + 1 : 1);
});
}
return words;
}
function sortWords(wordsMap) {
const words = [];
for (let [w, f] of wordsMap) {
words.push({ w, f });
}
return words.sort((a, b) => {
if (a.f > b.f) {
return -1;
}
if (a.f < b.f) {
return 1;
}
if (a.w < b.w) {
return -1;
}
if (a.w > b.w) {
return 1;
}
return 0;
});
}
async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu");
return sortWords(
await readWords(fileName, eraseRegex, excludeRegexString)
);
}
/** main **/
work(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));


@@ -0,0 +1,47 @@
import sys
import re
from collections import Counter
from os.path import basename
def usage():
print(f"Usage: e{basename(__file__)} [--freq|-f] <allowed_letters> <file1> [file2 ...]")
sys.exit(1)
# Check and parse arguments
args = sys.argv[1:]
if not args or len(args) < 2:
usage()
show_freq = False
if args[0] in ("--freq", "-f"):
show_freq = True
args = args[1:]
if len(args) < 2:
usage()
allowed_letters = set(args[0])
file_paths = args[1:]
# Unicode word pattern
word_pattern = re.compile(r'\b\w+\b', re.UNICODE)
word_counts = Counter()
# Process files
for path in file_paths:
try:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
for word in word_pattern.findall(line):
if all(char in allowed_letters for char in word):
word_counts[word] += 1
except Exception as e:
print(f"Error reading {path}: {e}", file=sys.stderr)
# Output
if show_freq:
for word, count in sorted(word_counts.items()):
print(f"{word}\t{count}")
else:
for word in sorted(word_counts):
print(word)
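The diff header for the script above (and hence its file name) is not shown in this view. Assuming it were saved as, say, scripts/count-words.py (a hypothetical name), a typical invocation might look like this sketch:

python scripts/count-words.py --freq abcdefghijklmnopqrstuvwxyz corpus1.txt corpus2.txt > corpus-frequencies.txt

Only words consisting entirely of the allowed letters are counted; with --freq each output line is word<TAB>count, otherwise only the sorted unique words are printed.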

scripts/fix-text-case.py Normal file

@@ -0,0 +1,108 @@
import argparse
import os
import time
from multiprocessing import Pool, cpu_count, Manager
from collections import defaultdict
import hunspell
def load_unique_words(full_list_path):
words = dict()
with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
word_lower = word.lower()
if word_lower not in words or words[word_lower] == word_lower:
words[word_lower] = word
return words.values()
def init_hunspell_worker(aff_path, dic_path):
global hobj, hunspell_stems
hobj = hunspell.HunSpell(dic_path, aff_path)
with open(dic_path, "r") as f:
hunspell_stems = set({
line.split('/')[0].strip()
for line in f
if not line.startswith('#')
})
def fix_word_text_case(word):
word_lower = word.lower()
# check for direct matches to avoid expensive calls to HunSpell.suggest()
if word_lower != word and word_lower in hunspell_stems:
return word_lower
if word in hunspell_stems:
return word
# name -> Name
hunspell_variants = hobj.suggest(word_lower)
for variant in hunspell_variants:
if word_lower != variant and word_lower == variant.lower():
return variant
# if it can be either lowercase or uppercase, then we want to keep the lowercase
if word_lower in hunspell_variants:
return word_lower
# if it is an unknown word, keep it as-is
return word
def print_progress(current, total, start_time, interval):
if current % interval == 0 or current == total:
avg_time = (time.time() - start_time) / current
remaining_time = (total - current) * avg_time
HH, rem = divmod(int(remaining_time), 3600)
MM, SS = divmod(rem, 60)
print(f"\rFixing text case using hunspell... {current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ")
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
total = len(words)
start_time = time.time()
with Pool(
processes=num_workers,
initializer=init_hunspell_worker,
initargs=(aff_path, dic_path)
) as pool:
for i, correct_word in enumerate (pool.imap_unordered(fix_word_text_case, words), 1):
print_progress(i, total, start_time, 300)
yield correct_word
def main():
parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
parser.add_argument("word_list", help="Path to the full list of words.")
parser.add_argument("output", help="Path to save the corrected words.")
parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
args = parser.parse_args()
if not os.path.exists(args.word_list):
print(f"Full word list not found: {args.word_list}")
return
if not os.path.exists(args.aff):
print(f"Hunspell .aff file not found: {args.aff}")
return
if not os.path.exists(args.dic):
print(f"Hunspell .dic file not found: {args.dic}")
return
all_words = load_unique_words(args.word_list)
print(f"Loaded {len(all_words)} candidate words.")
corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())
with open(args.output, 'w', encoding='utf-8') as f:
for word in sorted(corrected_words):
f.write(word + '\n')
print(" ") # clear the '\r'
if __name__ == "__main__":
main()
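For reference, a minimal standalone run of scripts/fix-text-case.py might look like the sketch below. The file names are placeholders, the Slovak .aff/.dic pair is borrowed from the clean-raw-dictionary.sh example above, and the hunspell Python module from scripts/requirements.txt must be installed first (clean-raw-dictionary.sh does this inside the venv it creates):

pip install -r scripts/requirements.txt
python scripts/fix-text-case.py sk-words.txt sk-words-cased.txt --aff sk_SK.aff --dic sk_SK.dic

clean-raw-dictionary.sh makes the same call as the final processing step of its pipeline.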


@@ -0,0 +1,64 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict
def load_unique_words(word_list_path):
words = set()
with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
words.add(word.lower())
return words
def load_known_suffixes(suffix_file_path):
suffixes = set()
with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
suffix = line.strip()
if suffix:
suffixes.add(suffix)
return suffixes
def generate_from_args(args):
word, suffixes = args
return {word + suffix for suffix in suffixes}
def generate_words(words, suffixes, num_workers):
new_words = set()
with Pool(processes=num_workers) as pool:
for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
new_words.update(result)
return new_words
def main():
parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that, you will have to clean up the invalid words after that.")
parser.add_argument("word_list", help="Path to the full list of words to filter")
parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
parser.add_argument("output", help="Path to save the filtered output")
args = parser.parse_args()
if not os.path.exists(args.word_list):
print(f"Full word list not found: {args.word_list}")
return
if not os.path.exists(args.suffix_file):
print(f"Suffix file not found: {args.suffix_file}")
return
print("Generating new words...", end=' ')
all_words = load_unique_words(args.word_list)
known_suffixes = load_known_suffixes(args.suffix_file)
print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
generated = generate_words(all_words, known_suffixes, cpu_count())
print(f"OK ({len(generated) - len(all_words)} new words)")
with open(args.output, 'w', encoding='utf-8') as f:
for word in generated:
f.write(word + '\n')
if __name__ == "__main__":
main()
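This appears to be the scripts/generate-words-from-suffixes.py helper that clean-raw-dictionary.sh invokes above (the diff header with the file name is not shown in this view). A standalone call, mirroring the wrapper script and using placeholder file names, might look like:

python scripts/generate-words-from-suffixes.py stems.txt suffixes.txt generated-raw.txt

As the help text notes, the output is generated naively (every stem combined with every suffix), so it still has to be validated afterwards, e.g. with hunspell -G as clean-raw-dictionary.sh does.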


@@ -1,111 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`);
print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
print('Example UnicodeRange: U+900-U+97F');
print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
if (!validateUnicodeRange(process.argv[3])) {
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
process.exit(2);
}
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
process.exit(2);
}
return {
fileName: process.argv[2],
searchRegexString: process.argv[3],
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : ''
};
}
function validateUnicodeRange(inputRange) {
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}
function URangeToXRange(range) {
if (range.length === 0) {
return null;
}
return range
.toUpperCase()
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(w));
}
}
function cleanInvalidChars(line, searchRegex, excludeRegex) {
const spacesOnly = /^\s+$/;
if (!line || !line.length || spacesOnly.test(line)) {
return [];
}
const cleanLine = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;
return cleanLine
.replace(searchRegex, ' ')
.split(' ')
.filter(w => w.length > 1);
}
async function readWords(fileName, searchRegex, excludeRegex) {
const words = new Set();
if (!fileName) {
return words;
}
for await (const line of createInterface({ input: createReadStream(fileName) })) {
cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w));
}
return words;
}
async function work({ fileName, searchRegexString, excludeRegexString }) {
const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu");
const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null;
const words = Array.from(await readWords(fileName, searchRegex, excludeRegex));
return words.filter(word => word.length > 1).sort();
}
/** main **/
work(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));


@@ -1,85 +0,0 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');;
const { print, printError } = require('./_printers.js')
function printHelp() {
print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
print('Normalizes the frequencies in a dictionary with transcriptions.');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
return {
fileName: process.argv[2]
};
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(
w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`
));
}
}
const { fileName } = validateInput();
const data = readFileSync(fileName, 'utf8');
const lines = data.trim().split('\n');
// Parse the data into an array of objects
let entries = lines.map(line => {
const parts = line.split('\t');
return {
original: line,
chinese: parts[0],
latin: parts[1],
number: parts[2] ? parseInt(parts[2], 10) : null
};
});
// Group entries by the Latin character sequence
const groups = {};
entries.forEach(entry => {
if (!groups[entry.latin]) {
groups[entry.latin] = [];
}
groups[entry.latin].push(entry);
});
// Process each group: sort by number (descending) and reassign ordinal numbers
let sortedEntries = [];
for (const key in groups) {
let group = groups[key];
// Separate entries with and without numbers
let withNumbers = group.filter(e => e.number !== null);
let withoutNumbers = group.filter(e => e.number === null);
// Sort by number in descending order
withNumbers.sort((a, b) => b.number - a.number);
// Assign ordinal rankings
for (let i = 0; i < withNumbers.length; i++) {
withNumbers[i].number = (withNumbers.length - i).toString();
}
// Preserve original order for entries without numbers
sortedEntries.push(...withNumbers, ...withoutNumbers);
}
printWords(sortedEntries);


@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict
def print_error(message):
print(message, file=sys.stderr)
def parse_args():
parser = argparse.ArgumentParser(
description="Normalizes the frequencies in a dictionary with transcriptions."
)
parser.add_argument(
"word_list",
help="Path to the word list file (e.g., WORD-LIST.txt)"
)
return parser.parse_args()
def validate_file(file_path):
if not os.path.isfile(file_path):
print_error(f'Failure! Could not find word list file "{file_path}".')
sys.exit(2)
def load_entries(file_path):
with open(file_path, encoding='utf-8') as f:
lines = [line.strip() for line in f if line.strip()]
entries = []
for line_num, line in enumerate(lines, start=1):
parts = line.split('\t')
if len(parts) < 2:
print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
sys.exit(3)
chinese, latin = parts[:2]
number = None
if len(parts) > 2:
try:
number = int(parts[2])
except ValueError:
print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
sys.exit(3)
entries.append({'chinese': chinese, 'latin': latin, 'number': number})
return entries
def group_entries(entries):
groups = defaultdict(list)
for entry in entries:
groups[entry['latin']].append(entry)
return groups
def normalize_frequencies(groups):
sorted_entries = []
for group in groups.values():
with_numbers = [e for e in group if e['number'] is not None]
without_numbers = [e for e in group if e['number'] is None]
with_numbers.sort(key=lambda e: e['number'], reverse=True)
for rank, entry in enumerate(with_numbers, start=1):
entry['number'] = str(len(with_numbers) - rank + 1)
sorted_entries.extend(with_numbers)
sorted_entries.extend(without_numbers)
return sorted_entries
def print_entries(entries):
for e in entries:
parts = [e['chinese'], e['latin']]
if e['number'] is not None:
parts.append(e['number'])
print('\t'.join(parts))
def main():
args = parse_args()
validate_file(args.word_list)
entries = load_entries(args.word_list)
groups = group_entries(entries)
sorted_entries = normalize_frequencies(groups)
print_entries(sorted_entries)
if __name__ == "__main__":
main()
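This appears to be the Python replacement for the normalize-transcribed script mentioned in the commit message (the diff header with the file name is not shown in this view). A small worked example may clarify the re-ranking; the entries and raw frequencies below are made up. Given a tab-separated input:

你	ni	873
尼	ni	41
妮	ni	12
呢	ne	99

the entries sharing the transcription "ni" are sorted by raw frequency and re-numbered so that the most frequent one gets the highest ordinal, while a single-entry group collapses to 1:

你	ni	3
尼	ni	2
妮	ni	1
呢	ne	1

Entries without a third field keep no number and are printed after the numbered ones in their group.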

scripts/remove-random-words.sh Executable file

@@ -0,0 +1,46 @@
#!/bin/bash
if [ $# -ne 7 ]; then
echo "Usage: $0 <locale> <all-words.txt> <courpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>"
echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13"
exit 1
fi
LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"
if ! [[ -f "$ORIGINAL_WORDS" ]]; then
echo "All words file: '$ORIGINAL_WORDS' does not exist"
exit 2
fi
if ! [[ -f "$CORPUS_WORDS" ]]; then
echo "Corpus words file: '$CORPUS_WORDS' does not exist"
exit 2
fi
if ! [[ -f "$BAD_COMB_FILE" ]]; then
echo "Bad letter combinations file: '$BAD_COMB_FILE' does not exist"
exit 2
fi
BAD_LETTER_COMBINATIONS=$(paste -sd'|' "$BAD_COMB_FILE")
sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > __tmp__popular.txt &
grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" | grep -Ev "($BAD_LETTER_COMBINATIONS)" | grep -Ev "[$VOWELS]{3,}" | sort -u | uniq > __tmp__reduced.txt &
wait
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__popular.txt | grep -Ev "(.)\1{2,}" > __tmp__unpopular.txt
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__unpopular.txt > __tmp__popular_reduced.txt &
awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' __tmp__unpopular.txt > __tmp__unpopular_reduced.txt &
wait
cat __tmp__popular_reduced.txt __tmp__unpopular_reduced.txt | sort -u | uniq > $OUTPUT_FILE
rm -f __tmp__*.txt

scripts/requirements.txt Normal file

@@ -0,0 +1 @@
hunspell==0.5.5

scripts/whitelist-filter.py Normal file

@@ -0,0 +1,102 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict
def load_stem_buckets(whitelist_path):
buckets = defaultdict(set)
with open(whitelist_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
word_lc = word.lower()
first_char = word_lc[0]
buckets[first_char].add(word_lc)
return dict(buckets)
def load_unique_words(full_list_path):
words = set()
with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
words.add(word)
return words
def load_known_suffixes(suffix_file_path):
suffixes = set()
with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
suffix = line.strip()
if suffix:
suffixes.add(suffix)
return suffixes
def match_word(word, buckets, known_suffixes):
"""Return all valid combinations: base word and word+suffix if found in stems."""
word_lc = word.lower()
first_char = word_lc[0]
possible_stems = buckets.get(first_char, set())
matches = []
if word_lc in possible_stems:
matches.append(word)
for suffix in known_suffixes:
compound_word = word_lc + suffix
if compound_word in possible_stems:
matches.append(compound_word)
return matches
def filter_words_parallel(all_words, stem_buckets, known_suffixes, num_workers):
args = [(word, stem_buckets, known_suffixes) for word in all_words]
with Pool(processes=num_workers) as pool:
results = pool.starmap(match_word, args)
matched_words = set()
for match_list in results:
matched_words.update(match_list)
return matched_words
def main():
parser = argparse.ArgumentParser(description="Filter given words by a stem whitelist. The list of suffixes is used to generate more variants of the valid words.")
parser.add_argument("whitelist", help="Path to the whitelist file (with valid words)")
parser.add_argument("full_list", help="Path to the full list of words to filter")
parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
parser.add_argument("output", help="Path to save the filtered output")
args = parser.parse_args()
if not os.path.exists(args.whitelist):
print(f"Whitelist file not found: {args.whitelist}")
return
if not os.path.exists(args.full_list):
print(f"Full word list not found: {args.full_list}")
return
if not os.path.exists(args.suffix_file):
print(f"Suffix file not found: {args.suffix_file}")
return
stem_buckets = load_stem_buckets(args.whitelist)
print(f"Loaded {sum(len(s) for s in stem_buckets.values())} valid stems across {len(stem_buckets)} buckets.")
all_words = load_unique_words(args.full_list)
print(f"Loaded {len(all_words)} candidate words.")
known_suffixes = load_known_suffixes(args.suffix_file)
print(f"Loaded {len(known_suffixes)} known suffixes.")
workers = cpu_count()
print(f"Filtering using {workers} threads...", end=' ')
filtered = filter_words_parallel(all_words, stem_buckets, known_suffixes, workers)
print(f"OK. Matched {len(filtered)} words.")
with open(args.output, 'w', encoding='utf-8') as f:
for word in sorted(filtered):
f.write(word + '\n')
if __name__ == "__main__":
main()
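A standalone invocation of scripts/whitelist-filter.py, with placeholder file names, might look like this sketch:

python scripts/whitelist-filter.py sk-whitelist.txt sk-raw-words.txt sk-suffix.txt sk-whitelisted.txt

A candidate word is kept when its lowercase form appears in the whitelist; additionally, any lowercase word+suffix combination found in the whitelist is emitted as well. The whitelist is loaded into per-letter buckets keyed by the word's first character before the parallel matching starts.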