1
0
Fork 0
tt9/scripts/fix-text-case.py
Dimo Karaivanov 62e8a08576
New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the Javascript version was running out of memory for larger languages like Japanese
2025-06-27 17:01:42 +03:00

108 lines
3.6 KiB
Python
Raw Permalink Blame History

import argparse
import os
import time
from multiprocessing import Pool, cpu_count, Manager
from collections import defaultdict
import hunspell
def load_unique_words(full_list_path):
words = dict()
with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
word_lower = word.lower()
if word_lower not in words or words[word_lower] == word_lower:
words[word_lower] = word
return words.values()
def init_hunspell_worker(aff_path, dic_path):
global hobj, hunspell_stems
hobj = hunspell.HunSpell(dic_path, aff_path)
with open(dic_path, "r") as f:
hunspell_stems = set({
line.split('/')[0].strip()
for line in f
if not line.startswith('#')
})
def fix_word_text_case(word):
word_lower = word.lower()
# check for direct matches to avoid expensive calls to HunSpell.suggest()
if word_lower != word and word_lower in hunspell_stems:
return word_lower
if word in hunspell_stems:
return word
# name -> Name
hunspell_variants = hobj.suggest(word_lower)
for variant in hunspell_variants:
if word_lower != variant and word_lower == variant.lower():
return variant
# if it can be either lowercase or uppercase, then we want to keep the lowercase
if word_lower in hunspell_variants:
return word_lower
# if it is an unknown word, keep it as-is
return word
def print_progress(current, total, start_time, interval):
if current % interval == 0 or current == total:
avg_time = (time.time() - start_time) / current
remaining_time = (total - current) * avg_time
HH, rem = divmod(int(remaining_time), 3600)
MM, SS = divmod(rem, 60)
print(f"\rFixing text case using hunspell... {current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ")
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
total = len(words)
start_time = time.time()
with Pool(
processes=num_workers,
initializer=init_hunspell_worker,
initargs=(aff_path, dic_path)
) as pool:
for i, correct_word in enumerate (pool.imap_unordered(fix_word_text_case, words), 1):
print_progress(i, total, start_time, 300)
yield correct_word
def main():
parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
parser.add_argument("word_list", help="Path to the full list of words.")
parser.add_argument("output", help="Path to save the corrected words.")
parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
args = parser.parse_args()
if not os.path.exists(args.word_list):
print(f"Full word list not found: {args.word_list}")
return
if not os.path.exists(args.aff):
print(f"Hunspell .aff file not found: {args.aff}")
return
if not os.path.exists(args.dic):
print(f"Hunspell .dic file not found: {args.dic}")
return
all_words = load_unique_words(args.word_list)
print(f"Loaded {len(all_words)} candidate words.")
corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())
with open(args.output, 'w', encoding='utf-8') as f:
for word in sorted(corrected_words):
f.write(word + '\n')
print(" ") # clear the '\r'
if __name__ == "__main__":
main()