New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists. * Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
This commit is contained in:
parent
a34baef0f3
commit
62e8a08576
12 changed files with 533 additions and 439 deletions
108
scripts/fix-text-case.py
Normal file
108
scripts/fix-text-case.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import argparse
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Pool, cpu_count, Manager
|
||||
from collections import defaultdict
|
||||
import hunspell
|
||||
|
||||
def load_unique_words(full_list_path):
    """Load a word list, de-duplicated case-insensitively.

    Keeps one entry per lower-cased key: the first form seen, except that a
    stored all-lowercase form is replaced by a later cased variant (the real
    casing is fixed later by Hunspell anyway).

    Args:
        full_list_path: Path to a text file with one word per line.

    Returns:
        A view of the surviving word forms (supports len() and iteration).
    """
    words = dict()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # errors='replace' substitutes undecodable bytes with U+FFFD;
            # drop any word that was corrupted that way. (The original
            # literal was the mis-encoded byte form of this character.)
            if '\ufffd' in word:
                continue

            word_lower = word.lower()
            if word_lower not in words or words[word_lower] == word_lower:
                words[word_lower] = word

    return words.values()
|
||||
|
||||
def init_hunspell_worker(aff_path, dic_path):
    """Pool-worker initializer: build per-process Hunspell state.

    Sets two process-global names consumed by fix_word_text_case():
      hobj           -- the HunSpell checker instance
      hunspell_stems -- set of bare stems from the .dic file, with any
                        affix flags after '/' stripped, for fast exact
                        membership checks

    Args:
        aff_path: Path to the Hunspell .aff file.
        dic_path: Path to the Hunspell .dic file.
    """
    global hobj, hunspell_stems
    hobj = hunspell.HunSpell(dic_path, aff_path)
    # Read with an explicit encoding, consistent with every other open()
    # in this script; the locale default can fail on non-ASCII
    # dictionaries (e.g. Japanese). NOTE(review): assumes the .dic file is
    # UTF-8 — confirm against the .aff SET declaration.
    with open(dic_path, "r", encoding="utf-8") as f:
        # A set comprehension is already a set; the original wrapped it in
        # a redundant set(...) call. The first line of a .dic file is the
        # entry count, which lands in the set as a numeric string —
        # harmless for these lookups.
        hunspell_stems = {
            line.split('/')[0].strip()
            for line in f
            if not line.startswith('#')
        }
|
||||
|
||||
def fix_word_text_case(word):
    """Return *word* with its letter case corrected via Hunspell.

    Relies on the process globals set by init_hunspell_worker():
    ``hunspell_stems`` for cheap exact lookups and ``hobj`` for suggestions.
    Unknown words are returned unchanged.
    """
    lowered = word.lower()

    # Cheap path first: a direct stem hit avoids the expensive
    # HunSpell.suggest() call entirely.
    if lowered != word and lowered in hunspell_stems:
        return lowered
    if word in hunspell_stems:
        return word

    # name -> Name: accept a suggestion that differs from the lowered
    # form only by letter case.
    suggestions = hobj.suggest(lowered)
    for candidate in suggestions:
        if candidate != lowered and candidate.lower() == lowered:
            return candidate

    # When both casings are acceptable, prefer the lowercase form.
    if lowered in suggestions:
        return lowered

    # Unknown word: keep it exactly as given.
    return word
|
||||
|
||||
def print_progress(current, total, start_time, interval):
    """Print an in-place (carriage-return) progress line with an ETA.

    Emits only every *interval* items, or when *current* == *total*;
    otherwise returns without printing.
    """
    if current % interval != 0 and current != total:
        return
    per_item = (time.time() - start_time) / current
    eta = int((total - current) * per_item)
    hours, leftover = divmod(eta, 3600)
    minutes, seconds = divmod(leftover, 60)
    line = (
        f"\rFixing text case using hunspell... {current}/{total},"
        f" Remaining: {hours:02}:{minutes:02}:{seconds:02}"
    )
    print(line, end=" ")
|
||||
|
||||
|
||||
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
    """Yield case-corrected words, fanning the work over a process pool.

    Each worker process builds its own Hunspell state via
    init_hunspell_worker(); results arrive in completion order
    (imap_unordered). Progress is reported every 300 items.
    """
    word_count = len(words)
    started = time.time()

    with Pool(
        processes=num_workers,
        initializer=init_hunspell_worker,
        initargs=(aff_path, dic_path),
    ) as pool:
        done = 0
        for fixed in pool.imap_unordered(fix_word_text_case, words):
            done += 1
            print_progress(done, word_count, started, 300)
            yield fixed
|
||||
|
||||
|
||||
def main():
    """CLI entry point: case-correct a word list and write the result.

    Usage: fix-text-case.py WORD_LIST OUTPUT --aff AFF --dic DIC
    Exits with status 1 when any input file is missing.
    """
    parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
    parser.add_argument("word_list", help="Path to the full list of words.")
    parser.add_argument("output", help="Path to save the corrected words.")
    parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
    parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
    args = parser.parse_args()

    # Validate inputs up front and exit non-zero so shell pipelines and
    # build scripts can detect the failure; the original bare `return`
    # exited with status 0 even when nothing was processed.
    for path, label in (
        (args.word_list, "Full word list"),
        (args.aff, "Hunspell .aff file"),
        (args.dic, "Hunspell .dic file"),
    ):
        if not os.path.exists(path):
            print(f"{label} not found: {path}")
            raise SystemExit(1)

    all_words = load_unique_words(args.word_list)
    print(f"Loaded {len(all_words)} candidate words.")

    # run_hunspell_batch() yields lazily; sorted() drains the generator,
    # so every correction completes before the output file is written.
    corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(corrected_words):
            f.write(word + '\n')

    print(" ")  # clear the '\r' left behind by the progress line
|
||||
|
||||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue