From 62e8a08576033f6db9c066e89d7a429d246de416 Mon Sep 17 00:00:00 2001 From: Dimo Karaivanov Date: Fri, 27 Jun 2025 17:01:42 +0300 Subject: [PATCH] New dictionary tools (#830) * new dictionary tools for generating an app dictionary from raw word lists * replaced the normalize-transcribed script with a Python version, since the Javascript version was running out of memory for larger languages like Japanese --- scripts/capitalize-dictionary-words.js | 94 -------------- scripts/clean-raw-dictionary.sh | 79 ++++++++++++ scripts/extract-frequencies-from-text.js | 149 ----------------------- scripts/extract-words-from-text.py | 47 +++++++ scripts/fix-text-case.py | 108 ++++++++++++++++ scripts/generate-words-from-suffixes.py | 64 ++++++++++ scripts/injest-script-words.js | 111 ----------------- scripts/normalize-transcribed.js | 85 ------------- scripts/normalize-transcribed.py | 86 +++++++++++++ scripts/remove-random-words.sh | 46 +++++++ scripts/requirements.txt | 1 + scripts/whitelist-filter.py | 102 ++++++++++++++++ 12 files changed, 533 insertions(+), 439 deletions(-) delete mode 100644 scripts/capitalize-dictionary-words.js create mode 100755 scripts/clean-raw-dictionary.sh delete mode 100644 scripts/extract-frequencies-from-text.js create mode 100644 scripts/extract-words-from-text.py create mode 100644 scripts/fix-text-case.py create mode 100644 scripts/generate-words-from-suffixes.py delete mode 100644 scripts/injest-script-words.js delete mode 100644 scripts/normalize-transcribed.js create mode 100644 scripts/normalize-transcribed.py create mode 100755 scripts/remove-random-words.sh create mode 100644 scripts/requirements.txt create mode 100644 scripts/whitelist-filter.py diff --git a/scripts/capitalize-dictionary-words.js b/scripts/capitalize-dictionary-words.js deleted file mode 100644 index e00c418e..00000000 --- a/scripts/capitalize-dictionary-words.js +++ /dev/null @@ -1,94 +0,0 @@ -const { basename } = require('path'); -const { createReadStream, existsSync } = require('fs'); -const { print, printError } = require('./_printers.js'); - - -function printHelp() { - print(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`); - print('Capitalizes a word list using capitalized words in another list.'); - print('\nMIN-WORD-LENGTH must be a positive number.'); - print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...'); -} - - - -function validateInput() { - if (process.argv.length < 6) { - printHelp(); - process.exit(1); - } - - if (!existsSync(process.argv[3])) { - printError(`Failure! Could not find list-of-capitals file "${process.argv[3]}".`); - process.exit(2); - } - - if (!existsSync(process.argv[2])) { - printError(`Failure! Could not find dictionary file "${process.argv[2]}".`); - process.exit(2); - } - - const minWordLength = Number.parseInt(process.argv[4]); - if (Number.isNaN(minWordLength) || minWordLength < 0) { - printError(`Failure! 
The minimum word length must be a positive number.`); - process.exit(2); - } - - return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength }; -} - - -async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) { - // read the dictionary - let lineReader = require('readline').createInterface({ - input: createReadStream(dictionaryFileName) - }); - - const words = {}; - for await (const line of lineReader) { - words[line] = true; - } - - - // convert the dictionary words using the second file - lineReader = require('readline').createInterface({ - input: createReadStream(capitalsFileName) - }); - - for await (const capitalizedWord of lineReader) { - if (capitalizedWord.length < minWordLength) { - continue; - } - - const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale); - if (words[lowercaseWord]) { - delete words[lowercaseWord]; - words[capitalizedWord] = true; - } - - const possessiveLowercaseWord = `${lowercaseWord}'s`; - if (words[possessiveLowercaseWord]) { - delete words[possessiveLowercaseWord]; - words[`${capitalizedWord}'s`] = true; - } - } - - return Object.keys(words); -} - - - -function printWords(wordList) { - if (!Array.isArray(wordList)) { - return; - } - - wordList.forEach(w => print(w)); -} - - - -/** main **/ -capitalize(validateInput()) - .then(words => printWords(words)) - .catch(e => printError(e)); diff --git a/scripts/clean-raw-dictionary.sh b/scripts/clean-raw-dictionary.sh new file mode 100755 index 00000000..65296a2e --- /dev/null +++ b/scripts/clean-raw-dictionary.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +if [ $# -lt 7 ]; then + echo "Usage: $0 <base-dictionary-file> <output-file> <suffixes-file> <aff-file> <dic-file> <allowed-lowercase-chars> <allowed-uppercase-chars>" + echo + echo "Example (Slovak, no need of whitelist filter):" + echo " $0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ" + echo + echo "Example (Slovak, with whitelist filter):" + echo " $0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ" + + exit 1 +fi + +DICTIONARY_FILE=$1 +if ! [[ -f $DICTIONARY_FILE ]]; then + echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist" + exit 2 +fi + +SUFFIXES_FILE=$3 + +AFF_FILE=$4 +if ! [[ -f "$AFF_FILE" ]]; then + echo ".aff file: '$AFF_FILE' does not exist" + exit 2 +fi + +DIC_FILE=$5 +if ! [[ -f "$DIC_FILE" ]]; then + echo ".dic file: '$DIC_FILE' does not exist" + exit 2 +fi + +OUTPUT_FILE=$2 +ALLOWED_LOWERCASE_CHARS=$6 +ALLOWED_UPPERCASE_CHARS=$7 +WORK_DIR="/tmp/TT9_$(uuidgen)" +SCRIPT_DIR="$(dirname "$0")" + +if ! [[ -d $SCRIPT_DIR/venv ]]; then + python -m venv $SCRIPT_DIR/venv && source $SCRIPT_DIR/venv/bin/activate && pip install -r $SCRIPT_DIR/requirements.txt +fi + + +generate_words() { + CLEAN_WORDS=$1 + OUTPUT=$2 + DICTIONARY=${AFF_FILE::-4} + + if ! [[ -f "$SUFFIXES_FILE" ]]; then + echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation." + cp $CLEAN_WORDS $OUTPUT + return + fi + + printf "Extracting valid words for generating new ones... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $CLEAN_WORDS | sort -u | uniq > $WORK_DIR/generation-stems.txt && echo "OK" \ && python $SCRIPT_DIR/generate-words-from-suffixes.py $WORK_DIR/generation-stems.txt $SUFFIXES_FILE $WORK_DIR/generated-raw.txt \ && printf "Validating generated words with Hunspell... 
" && hunspell -i UTF-8 -G -d "$DICTIONARY" $WORK_DIR/generated-raw.txt > $WORK_DIR/generated-valid.txt && echo "OK" \ + && printf "Merging generated and input words... " && cat $CLEAN_WORDS $WORK_DIR/generated-valid.txt | sort -u | uniq > $OUTPUT && echo "OK" +} + +# remove Roman numerals: ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$ + +date +mkdir -p $WORK_DIR \ + && printf "Removing foreign letters... " && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" $DICTIONARY_FILE > $WORK_DIR/noforeign.txt && echo "OK" \ + && printf "Removing frequencies and duplicates... " && sed -E 's/[\t0-9]+//g' $WORK_DIR/noforeign.txt | sort | uniq > $WORK_DIR/nofreq_norepeat.txt && echo "OK" \ + && printf "Removing lowerUPPER... " && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" $WORK_DIR/nofreq_norepeat.txt > $WORK_DIR/no_low_up.txt && echo "OK" \ + && printf "Removing UPPERlower... " && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" $WORK_DIR/no_low_up.txt > $WORK_DIR/no_up_low.txt && echo "OK" \ + && printf "Removing single chars... " && grep -vE "^.$" $WORK_DIR/no_up_low.txt > $WORK_DIR/no_single.txt && echo "OK" \ + && printf "Removing words with repeeeeaaaated letters... " && grep -vE "(.)\1{2,}" $WORK_DIR/no_single.txt | grep -vE "^(.)\1$" | sort | uniq > $WORK_DIR/no_multi.txt && echo "OK" \ + && generate_words $WORK_DIR/no_multi.txt $WORK_DIR/generated.txt \ + && echo "Preparing to fix the text case." && source $SCRIPT_DIR/venv/bin/activate && python $SCRIPT_DIR/fix-text-case.py $WORK_DIR/generated.txt $WORK_DIR/text_case.txt --aff "$AFF_FILE" --dic "$DIC_FILE" \ + && INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \ + && mv $WORK_DIR/text_case.txt "$OUTPUT_FILE" + +rm -rf $WORK_DIR +date \ No newline at end of file diff --git a/scripts/extract-frequencies-from-text.js b/scripts/extract-frequencies-from-text.js deleted file mode 100644 index 1da4f7d0..00000000 --- a/scripts/extract-frequencies-from-text.js +++ /dev/null @@ -1,149 +0,0 @@ -const { basename } = require('path'); -const { createReadStream, existsSync } = require('fs'); -const { createInterface } = require('readline'); -const { print, printError } = require('./_printers.js'); - - -function printHelp() { - print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`); - print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.'); - print('Example UnicodeRange: U+900-U+97FU+200CU+200D'); - print('Example ExcludeRange: U+950-U+957U+966-U+97F'); - print('Example EraseRange: U+964U+965U+970U+971'); -} - - - -function validateInput() { - if (process.argv.length < 3) { - printHelp(); - process.exit(1); - } - - if (!existsSync(process.argv[2])) { - printError(`Failure! Could not find word list file "${process.argv[2]}".`); - process.exit(2); - } - - if (!validateUnicodeRange(process.argv[3])) { - printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`); - process.exit(2); - } - - if (process.argv[4] && !validateUnicodeRange(process.argv[4])) { - printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`); - process.exit(2); - } - - if (process.argv[5] && !validateUnicodeRange(process.argv[5])) { - printError(`Failure! 
Invalid erase range(s): "${process.argv[5]}".`); - process.exit(2); - } - - return { - fileName: process.argv[2], - searchRegexString: process.argv[3], - excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '', - eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : '' - }; -} - - -function validateUnicodeRange(inputRange) { - return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange); -} - - -function URangeToXRange(range) { - if (range.length === 0) { - return null; - } - - return range - .toUpperCase() - .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}"); -} - - -function printWords(wordList) { - if (Array.isArray(wordList)) { - wordList.forEach(w => print(`${w.w}\t${w.f}`)); - } -} - - -function cleanInvalidChars(line, eraseRegex, excludeRegexString) { - const spacesOnly = /^\s+$/; - - if (!line || !line.length || spacesOnly.test(line)) { - return []; - } - - const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null; - - return line - .replace(eraseRegex, ' ') - .split(' ') - .filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w))); -} - - -async function readWords(fileName, eraseRegex, excludeRegexString) { - const words = new Map(); - - if (!fileName) { - return words; - } - - for await (const line of createInterface({ input: createReadStream(fileName) })) { - const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString); - parts.forEach(w => { - words.set(w, words.has(w) ? words.get(w) + 1 : 1); - }); - } - - return words; -} - - -function sortWords(wordsMap) { - const words = []; - for (let [w, f] of wordsMap) { - words.push({ w, f }); - } - - return words.sort((a, b) => { - if (a.f > b.f) { - return -1; - } - - if (a.f < b.f) { - return 1; - } - - if (a.w < b.w) { - return -1; - } - - if (a.w > b.w) { - return 1; - } - - return 0; - }); -} - - -async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) { - const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu"); - return sortWords( - await readWords(fileName, eraseRegex, excludeRegexString) - ); -} - - - -/** main **/ -work(validateInput()) - .then(words => printWords(words)) - .catch(e => printError(e)); diff --git a/scripts/extract-words-from-text.py b/scripts/extract-words-from-text.py new file mode 100644 index 00000000..51a847ec --- /dev/null +++ b/scripts/extract-words-from-text.py @@ -0,0 +1,47 @@ +import sys +import re +from collections import Counter +from os.path import basename + +def usage(): + print(f"Usage: {basename(__file__)} [--freq|-f] <allowed-letters> <file1> [file2 ...]") + sys.exit(1) + +# Check and parse arguments +args = sys.argv[1:] +if not args or len(args) < 2: + usage() + +show_freq = False +if args[0] in ("--freq", "-f"): + show_freq = True + args = args[1:] + +if len(args) < 2: + usage() + +allowed_letters = set(args[0]) +file_paths = args[1:] + +# Unicode word pattern +word_pattern = re.compile(r'\b\w+\b', re.UNICODE) +word_counts = Counter() + +# Process files +for path in file_paths: + try: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + for word in word_pattern.findall(line): + if all(char in allowed_letters for char in word): + word_counts[word] += 1 + except Exception as e: + print(f"Error reading {path}: {e}", file=sys.stderr) + +# Output +if show_freq: + for word, count in sorted(word_counts.items()): + print(f"{word}\t{count}") +else: + for 
word in sorted(word_counts): + print(word) diff --git a/scripts/fix-text-case.py b/scripts/fix-text-case.py new file mode 100644 index 00000000..ec2bbd3a --- /dev/null +++ b/scripts/fix-text-case.py @@ -0,0 +1,108 @@ +import argparse +import os +import time +from multiprocessing import Pool, cpu_count, Manager +from collections import defaultdict +import hunspell + +def load_unique_words(full_list_path): + words = dict() + with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + word = line.strip() + if '�' in word: + continue + + word_lower = word.lower() + if word_lower not in words or words[word_lower] == word_lower: + words[word_lower] = word + + return words.values() + +def init_hunspell_worker(aff_path, dic_path): + global hobj, hunspell_stems + hobj = hunspell.HunSpell(dic_path, aff_path) + with open(dic_path, "r") as f: + hunspell_stems = set({ + line.split('/')[0].strip() + for line in f + if not line.startswith('#') + }) + +def fix_word_text_case(word): + word_lower = word.lower() + + # check for direct matches to avoid expensive calls to HunSpell.suggest() + if word_lower != word and word_lower in hunspell_stems: + return word_lower + + if word in hunspell_stems: + return word + + # name -> Name + hunspell_variants = hobj.suggest(word_lower) + for variant in hunspell_variants: + if word_lower != variant and word_lower == variant.lower(): + return variant + + # if it can be either lowercase or uppercase, then we want to keep the lowercase + if word_lower in hunspell_variants: + return word_lower + + # if it is an unknown word, keep it as-is + return word + +def print_progress(current, total, start_time, interval): + if current % interval == 0 or current == total: + avg_time = (time.time() - start_time) / current + remaining_time = (total - current) * avg_time + HH, rem = divmod(int(remaining_time), 3600) + MM, SS = divmod(rem, 60) + print(f"\rFixing text case using hunspell... 
{current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ") + + +def run_hunspell_batch(words, aff_path, dic_path, num_workers): + total = len(words) + start_time = time.time() + + with Pool( + processes=num_workers, + initializer=init_hunspell_worker, + initargs=(aff_path, dic_path) + ) as pool: + for i, correct_word in enumerate (pool.imap_unordered(fix_word_text_case, words), 1): + print_progress(i, total, start_time, 300) + yield correct_word + + +def main(): + parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.") + parser.add_argument("word_list", help="Path to the full list of words.") + parser.add_argument("output", help="Path to save the corrected words.") + parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.") + parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.") + args = parser.parse_args() + + if not os.path.exists(args.word_list): + print(f"Full word list not found: {args.word_list}") + return + if not os.path.exists(args.aff): + print(f"Hunspell .aff file not found: {args.aff}") + return + if not os.path.exists(args.dic): + print(f"Hunspell .dic file not found: {args.dic}") + return + + all_words = load_unique_words(args.word_list) + print(f"Loaded {len(all_words)} candidate words.") + + corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count()) + + with open(args.output, 'w', encoding='utf-8') as f: + for word in sorted(corrected_words): + f.write(word + '\n') + + print(" ") # clear the '\r' + +if __name__ == "__main__": + main() diff --git a/scripts/generate-words-from-suffixes.py b/scripts/generate-words-from-suffixes.py new file mode 100644 index 00000000..222a9b95 --- /dev/null +++ b/scripts/generate-words-from-suffixes.py @@ -0,0 +1,64 @@ +import argparse +import os +from multiprocessing import Pool, cpu_count +from collections import defaultdict + +def load_unique_words(word_list_path): + words = set() + with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + word = line.strip() + if '�' in word: + continue + words.add(word.lower()) + return words + +def load_known_suffixes(suffix_file_path): + suffixes = set() + with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + suffix = line.strip() + if suffix: + suffixes.add(suffix) + return suffixes + +def generate_from_args(args): + word, suffixes = args + return {word + suffix for suffix in suffixes} + +def generate_words(words, suffixes, num_workers): + new_words = set() + with Pool(processes=num_workers) as pool: + for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)): + new_words.update(result) + return new_words + +def main(): + parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. 
Note that, you will have to clean up the invalid words after that.") + parser.add_argument("word_list", help="Path to the full list of words to filter") + parser.add_argument("suffix_file", help="Path to the file containing known suffixes") + parser.add_argument("output", help="Path to save the filtered output") + args = parser.parse_args() + + if not os.path.exists(args.word_list): + print(f"Full word list not found: {args.word_list}") + return + if not os.path.exists(args.suffix_file): + print(f"Suffix file not found: {args.suffix_file}") + return + + print("Generating new words...", end=' ') + + all_words = load_unique_words(args.word_list) + known_suffixes = load_known_suffixes(args.suffix_file) + + print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ') + generated = generate_words(all_words, known_suffixes, cpu_count()) + print(f"OK ({len(generated) - len(all_words)} new words)") + + with open(args.output, 'w', encoding='utf-8') as f: + for word in generated: + f.write(word + '\n') + +if __name__ == "__main__": + main() diff --git a/scripts/injest-script-words.js b/scripts/injest-script-words.js deleted file mode 100644 index ba57fbae..00000000 --- a/scripts/injest-script-words.js +++ /dev/null @@ -1,111 +0,0 @@ -const { basename } = require('path'); -const { createReadStream, existsSync } = require('fs'); -const { createInterface } = require('readline'); -const { print, printError } = require('./_printers.js'); - - -function printHelp() { - print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`); - print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.'); - print('Example UnicodeRange: U+900-U+97F'); - print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D'); -} - - - -function validateInput() { - if (process.argv.length < 3) { - printHelp(); - process.exit(1); - } - - if (!existsSync(process.argv[2])) { - printError(`Failure! Could not find word list file "${process.argv[2]}".`); - process.exit(2); - } - - if (!validateUnicodeRange(process.argv[3])) { - printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`); - process.exit(2); - } - - if (process.argv[4] && !validateUnicodeRange(process.argv[4])) { - printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`); - process.exit(2); - } - - return { - fileName: process.argv[2], - searchRegexString: process.argv[3], - excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '' - }; -} - - -function validateUnicodeRange(inputRange) { - return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange); -} - - -function URangeToXRange(range) { - if (range.length === 0) { - return null; - } - - return range - .toUpperCase() - .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}"); -} - - -function printWords(wordList) { - if (Array.isArray(wordList)) { - wordList.forEach(w => print(w)); - } -} - - -function cleanInvalidChars(line, searchRegex, excludeRegex) { - const spacesOnly = /^\s+$/; - - if (!line || !line.length || spacesOnly.test(line)) { - return []; - } - - const cleanLine = excludeRegex !== null ? 
line.replace(excludeRegex, ' ') : line; - return cleanLine - .replace(searchRegex, ' ') - .split(' ') - .filter(w => w.length > 1); -} - - -async function readWords(fileName, searchRegex, excludeRegex) { - const words = new Set(); - - if (!fileName) { - return words; - } - - for await (const line of createInterface({ input: createReadStream(fileName) })) { - cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w)); - } - - return words; -} - - -async function work({ fileName, searchRegexString, excludeRegexString }) { - const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu"); - const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null; - - const words = Array.from(await readWords(fileName, searchRegex, excludeRegex)); - return words.filter(word => word.length > 1).sort(); -} - - - -/** main **/ -work(validateInput()) - .then(words => printWords(words)) - .catch(e => printError(e)); diff --git a/scripts/normalize-transcribed.js b/scripts/normalize-transcribed.js deleted file mode 100644 index 1d7f8608..00000000 --- a/scripts/normalize-transcribed.js +++ /dev/null @@ -1,85 +0,0 @@ -const { basename } = require('path'); -const { existsSync, readFileSync } = require('fs');; -const { print, printError } = require('./_printers.js') - - -function printHelp() { - print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`); - print('Normalizes the frequencies in a dictionary with transcriptions.'); -} - - - -function validateInput() { - if (process.argv.length < 3) { - printHelp(); - process.exit(1); - } - - if (!existsSync(process.argv[2])) { - printError(`Failure! Could not find word list file "${process.argv[2]}".`); - process.exit(2); - } - - return { - fileName: process.argv[2] - }; -} - - -function printWords(wordList) { - if (Array.isArray(wordList)) { - wordList.forEach(w => print( - w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}` - )); - } -} - - -const { fileName } = validateInput(); - -const data = readFileSync(fileName, 'utf8'); -const lines = data.trim().split('\n'); - -// Parse the data into an array of objects -let entries = lines.map(line => { - const parts = line.split('\t'); - return { - original: line, - chinese: parts[0], - latin: parts[1], - number: parts[2] ? 
parseInt(parts[2], 10) : null - }; -}); - -// Group entries by the Latin character sequence -const groups = {}; -entries.forEach(entry => { - if (!groups[entry.latin]) { - groups[entry.latin] = []; - } - groups[entry.latin].push(entry); -}); - -// Process each group: sort by number (descending) and reassign ordinal numbers -let sortedEntries = []; -for (const key in groups) { - let group = groups[key]; - - // Separate entries with and without numbers - let withNumbers = group.filter(e => e.number !== null); - let withoutNumbers = group.filter(e => e.number === null); - - // Sort by number in descending order - withNumbers.sort((a, b) => b.number - a.number); - - // Assign ordinal rankings - for (let i = 0; i < withNumbers.length; i++) { - withNumbers[i].number = (withNumbers.length - i).toString(); - } - - // Preserve original order for entries without numbers - sortedEntries.push(...withNumbers, ...withoutNumbers); -} - -printWords(sortedEntries); diff --git a/scripts/normalize-transcribed.py b/scripts/normalize-transcribed.py new file mode 100644 index 00000000..086fdb8d --- /dev/null +++ b/scripts/normalize-transcribed.py @@ -0,0 +1,86 @@ +import sys +import os +import argparse +from collections import defaultdict + +def print_error(message): + print(message, file=sys.stderr) + +def parse_args(): + parser = argparse.ArgumentParser( + description="Normalizes the frequencies in a dictionary with transcriptions." + ) + parser.add_argument( + "word_list", + help="Path to the word list file (e.g., WORD-LIST.txt)" + ) + return parser.parse_args() + +def validate_file(file_path): + if not os.path.isfile(file_path): + print_error(f'Failure! Could not find word list file "{file_path}".') + sys.exit(2) + +def load_entries(file_path): + with open(file_path, encoding='utf-8') as f: + lines = [line.strip() for line in f if line.strip()] + + entries = [] + for line_num, line in enumerate(lines, start=1): + parts = line.split('\t') + if len(parts) < 2: + print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)") + sys.exit(3) + + chinese, latin = parts[:2] + number = None + if len(parts) > 2: + try: + number = int(parts[2]) + except ValueError: + print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)") + sys.exit(3) + + entries.append({'chinese': chinese, 'latin': latin, 'number': number}) + + return entries + +def group_entries(entries): + groups = defaultdict(list) + for entry in entries: + groups[entry['latin']].append(entry) + return groups + +def normalize_frequencies(groups): + sorted_entries = [] + for group in groups.values(): + with_numbers = [e for e in group if e['number'] is not None] + without_numbers = [e for e in group if e['number'] is None] + + with_numbers.sort(key=lambda e: e['number'], reverse=True) + + for rank, entry in enumerate(with_numbers, start=1): + entry['number'] = str(len(with_numbers) - rank + 1) + + sorted_entries.extend(with_numbers) + sorted_entries.extend(without_numbers) + + return sorted_entries + +def print_entries(entries): + for e in entries: + parts = [e['chinese'], e['latin']] + if e['number'] is not None: + parts.append(e['number']) + print('\t'.join(parts)) + +def main(): + args = parse_args() + validate_file(args.word_list) + entries = load_entries(args.word_list) + groups = group_entries(entries) + sorted_entries = normalize_frequencies(groups) + print_entries(sorted_entries) + +if __name__ == "__main__": + main() diff --git a/scripts/remove-random-words.sh 
b/scripts/remove-random-words.sh new file mode 100755 index 00000000..eb7a67f0 --- /dev/null +++ b/scripts/remove-random-words.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [ $# -ne 7 ]; then + echo "Usage: $0 <locale> <original-words-file> <corpus-words-file> <bad-letter-combinations-file> <output-file> <vowels> <unpopular-max-length>" + echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13" + exit 1 +fi + +LOCALE="$1" +ORIGINAL_WORDS="$2" +CORPUS_WORDS="$3" +BAD_COMB_FILE="$4" +OUTPUT_FILE="$5" +VOWELS="$6" +UNPOPULAR_MAX_LENGTH="$7" + +if ! [[ -f "$ORIGINAL_WORDS" ]]; then + echo "All words file: '$ORIGINAL_WORDS' does not exist" + exit 2 +fi + +if ! [[ -f "$CORPUS_WORDS" ]]; then + echo "Corpus words file: '$CORPUS_WORDS' does not exist" + exit 2 +fi + +if ! [[ -f "$BAD_COMB_FILE" ]]; then + echo "Bad letter combinations file: '$BAD_COMB_FILE' does not exist" + exit 2 +fi + +BAD_LETTER_COMBINATIONS=$(paste -sd'|' "$BAD_COMB_FILE") + + +sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > __tmp__popular.txt & +grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" | grep -Ev "($BAD_LETTER_COMBINATIONS)" | grep -Ev "[$VOWELS]{3,}" | sort -u | uniq > __tmp__reduced.txt & +wait + +node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__popular.txt | grep -Ev "(.)\1{2,}" > __tmp__unpopular.txt +node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__unpopular.txt > __tmp__popular_reduced.txt & +awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' __tmp__unpopular.txt > __tmp__unpopular_reduced.txt & +wait + +cat __tmp__popular_reduced.txt __tmp__unpopular_reduced.txt | sort -u | uniq > $OUTPUT_FILE + +rm -f __tmp__*.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..7dc80967 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +hunspell==0.5.5 diff --git a/scripts/whitelist-filter.py b/scripts/whitelist-filter.py new file mode 100644 index 00000000..e6fc92ee --- /dev/null +++ b/scripts/whitelist-filter.py @@ -0,0 +1,102 @@ +import argparse +import os +from multiprocessing import Pool, cpu_count +from collections import defaultdict + +def load_stem_buckets(whitelist_path): + buckets = defaultdict(set) + with open(whitelist_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + word = line.strip() + if '�' in word: + continue + word_lc = word.lower() + first_char = word_lc[0] + buckets[first_char].add(word_lc) + return dict(buckets) + +def load_unique_words(full_list_path): + words = set() + with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + word = line.strip() + if '�' in word: + continue + words.add(word) + return words + +def load_known_suffixes(suffix_file_path): + suffixes = set() + with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + suffix = line.strip() + if suffix: + suffixes.add(suffix) + return suffixes + +def match_word(word, buckets, known_suffixes): + """Return all valid combinations: base word and word+suffix if found in stems.""" + word_lc = word.lower() + first_char = word_lc[0] + possible_stems = buckets.get(first_char, set()) + + matches = [] + + if word_lc in possible_stems: + matches.append(word) + + for suffix in known_suffixes: + compound_word = word_lc + suffix + if compound_word in possible_stems: + matches.append(compound_word) + + return matches + +def filter_words_parallel(all_words, stem_buckets, known_suffixes, num_workers): + args = [(word, stem_buckets, 
known_suffixes) for word in all_words] + with Pool(processes=num_workers) as pool: + results = pool.starmap(match_word, args) + + matched_words = set() + for match_list in results: + matched_words.update(match_list) + return matched_words + +def main(): + parser = argparse.ArgumentParser(description="Filter given words by a stem whitelist. The list of suffixes is used to generate more variants of the valid words.") + parser.add_argument("whitelist", help="Path to the whitelist file (with valid words)") + parser.add_argument("full_list", help="Path to the full list of words to filter") + parser.add_argument("suffix_file", help="Path to the file containing known suffixes") + parser.add_argument("output", help="Path to save the filtered output") + args = parser.parse_args() + + if not os.path.exists(args.whitelist): + print(f"Whitelist file not found: {args.whitelist}") + return + if not os.path.exists(args.full_list): + print(f"Full word list not found: {args.full_list}") + return + if not os.path.exists(args.suffix_file): + print(f"Suffix file not found: {args.suffix_file}") + return + + stem_buckets = load_stem_buckets(args.whitelist) + print(f"Loaded {sum(len(s) for s in stem_buckets.values())} valid stems across {len(stem_buckets)} buckets.") + + all_words = load_unique_words(args.full_list) + print(f"Loaded {len(all_words)} candidate words.") + + known_suffixes = load_known_suffixes(args.suffix_file) + print(f"Loaded {len(known_suffixes)} known suffixes.") + + workers = cpu_count() + print(f"Filtering using {workers} threads...", end=' ') + filtered = filter_words_parallel(all_words, stem_buckets, known_suffixes, workers) + print(f"OK. Matched {len(filtered)} words.") + + with open(args.output, 'w', encoding='utf-8') as f: + for word in sorted(filtered): + f.write(word + '\n') + +if __name__ == "__main__": + main()
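
A minimal end-to-end sketch of how the new tools are intended to chain together, based on the Slovak example from clean-raw-dictionary.sh above. The corpus file name sk-corpus.txt is illustrative and not part of this patch; the alphabet arguments and the sk_SK.aff/sk_SK.dic files are taken from the script's own help text:

# 1. Extract candidate words from a raw text corpus, keeping only words spelled entirely
#    with Slovak letters (both cases, so capitalized forms survive until fix-text-case.py).
python scripts/extract-words-from-text.py "aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzžAÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ" sk-corpus.txt > sk-raw.txt

# 2. Clean the raw list (no suffix generation here, hence /dev/null) and fix the letter case
#    against the Hunspell dictionary; the script sets up its own Python venv and work dir.
scripts/clean-raw-dictionary.sh sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ

For languages that need them, whitelist-filter.py, remove-random-words.sh and normalize-transcribed.py can be applied to the filtered list afterwards, as described in their respective usage texts.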