* new dictionary tools for generating an app dictionary from raw word lists * replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
108 lines
3.6 KiB
Python
108 lines
3.6 KiB
Python
import argparse
|
||
import os
|
||
import time
|
||
from multiprocessing import Pool, cpu_count, Manager
|
||
from collections import defaultdict
|
||
import hunspell
|
||
|
||
def load_unique_words(full_list_path):
    """Load a word list, keeping one spelling per case-insensitive word.

    The file is read as UTF-8, one word per line, with surrounding whitespace
    stripped. Blank lines and lines that contain undecodable bytes are
    skipped. When a word occurs in several casings, the first spelling that
    differs from its lowercase form wins; the all-lowercase spelling is kept
    only if no other casing appears.

    Args:
        full_list_path: Path to the raw word list.

    Returns:
        A dict view of the selected spellings, ordered by the first
        appearance of each (case-insensitive) word.
    """
    # errors='replace' turns every undecodable byte into U+FFFD, so that is
    # the character that marks a corrupted line. The literal text "<EFBFBD>"
    # (the escaped UTF-8 bytes of U+FFFD) is rejected as well, so anything
    # that was filtered out before is still filtered out.
    invalid_markers = ('\ufffd', '<EFBFBD>')

    words = {}
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # A blank line is not a word; do not emit an empty entry.
            if not word:
                continue
            if any(marker in word for marker in invalid_markers):
                continue

            word_lower = word.lower()
            # Store the word when it is new, or when only the plain lowercase
            # spelling has been seen so far (a cased spelling is preferred).
            if words.get(word_lower, word_lower) == word_lower:
                words[word_lower] = word

    return words.values()
|
||
|
||
def init_hunspell_worker(aff_path, dic_path):
    """Create the per-process Hunspell state used by fix_word_text_case().

    Sets two module-level globals:
      * ``hobj`` - a ``hunspell.HunSpell`` instance built from the given
        dictionary and affix files;
      * ``hunspell_stems`` - the set of entries from the .dic file (the text
        before the first '/' on every line that does not start with '#'),
        used for cheap exact-match lookups.

    Args:
        aff_path: Path to the Hunspell .aff file.
        dic_path: Path to the Hunspell .dic file.
    """
    global hobj, hunspell_stems
    hobj = hunspell.HunSpell(dic_path, aff_path)

    # Decode explicitly instead of relying on the platform default, so the
    # stems compare equal to the words, which are read as UTF-8 too.
    # NOTE(review): this assumes the dictionary is UTF-8 encoded. With another
    # encoding, non-ASCII stems will simply not match and those words fall
    # back to the HunSpell.suggest() path - confirm for the dictionaries used.
    with open(dic_path, 'r', encoding='utf-8', errors='replace') as f:
        hunspell_stems = {
            line.split('/')[0].strip()
            for line in f
            if not line.startswith('#')
        }
|
||
|
||
def fix_word_text_case(word):
    """Return `word` in the letter case that the Hunspell dictionary uses.

    Relies on the module-level ``hunspell_stems`` set and ``hobj`` spell
    checker.

    Resolution order:
      1. the lowercase form, if it is a dictionary stem;
      2. the word itself, if it is a dictionary stem;
      3. the first Hunspell suggestion that differs from the lowercase form
         only by letter case (e.g. name -> Name);
      4. the lowercase form, if Hunspell suggests it verbatim;
      5. otherwise the word unchanged (unknown word).

    Args:
        word: The word to normalise.

    Returns:
        The word with corrected letter case.
    """
    lowered = word.lower()

    # Exact stem matches are resolved first, which avoids the expensive
    # HunSpell.suggest() call. The lowercase form takes priority.
    for candidate in (lowered, word):
        if candidate in hunspell_stems:
            return candidate

    suggestions = hobj.suggest(lowered)

    # name -> Name: take the first suggestion that is a pure re-casing.
    recased = next(
        (s for s in suggestions if s != lowered and s.lower() == lowered),
        None,
    )
    if recased is not None:
        return recased

    # Hunspell knows the lowercase spelling: use it. Otherwise the word is
    # unknown and is returned as-is.
    return lowered if lowered in suggestions else word
|
||
|
||
def print_progress(current, total, start_time, interval):
    """Print an in-place progress line with an estimated remaining time.

    A report is written when `current` is a multiple of `interval` or when
    `current` equals `total`. The remaining time is extrapolated from the
    average time per processed item since `start_time`.

    Args:
        current: Number of items processed so far.
        total: Total number of items.
        start_time: time.time() value taken when processing started.
        interval: Report every `interval` items; values <= 0 disable the
            periodic reports (the final report is still printed).
    """
    if current <= 0:
        # Nothing processed yet, so there is no average to extrapolate from
        # (and dividing by `current` would fail).
        return

    at_interval = interval > 0 and current % interval == 0
    if not at_interval and current != total:
        return

    avg_time = (time.time() - start_time) / current
    remaining_time = max(0.0, (total - current) * avg_time)
    hours, rem = divmod(int(remaining_time), 3600)
    minutes, seconds = divmod(rem, 60)

    # The line starts with a carriage return and has no newline, so it
    # overwrites the previous report. Flush explicitly: without a newline a
    # buffered stdout may otherwise not show the line at all.
    print(
        f"\rFixing text case using hunspell... {current}/{total}, "
        f"Remaining: {hours:02}:{minutes:02}:{seconds:02}",
        end=" ",
        flush=True,
    )
|
||
|
||
|
||
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
    """Yield the case-corrected form of every word, using a process pool.

    This is a generator: the pool is created when iteration starts and is
    shut down when the generator is exhausted or closed. Each worker is
    initialised with init_hunspell_worker() and processes words with
    fix_word_text_case(). Results are yielded in completion order, not in
    input order, and a progress line is printed along the way.

    Args:
        words: Sized iterable of words to correct.
        aff_path: Path to the Hunspell .aff file.
        dic_path: Path to the Hunspell .dic file.
        num_workers: Number of worker processes (None lets Pool decide).

    Yields:
        The corrected spelling of each input word.
    """
    total = len(words)
    start_time = time.time()

    # Hand the words to the workers in chunks rather than one at a time:
    # with the default chunksize of 1 every word costs a separate IPC round
    # trip, which adds up on very long word lists. The cap keeps the chunks
    # small enough for the work to stay evenly spread across the workers.
    workers = num_workers or cpu_count()
    chunksize = max(1, min(100, total // (workers * 4)))

    with Pool(
        processes=num_workers,
        initializer=init_hunspell_worker,
        initargs=(aff_path, dic_path),
    ) as pool:
        results = pool.imap_unordered(fix_word_text_case, words, chunksize)
        for i, correct_word in enumerate(results, 1):
            print_progress(i, total, start_time, 300)
            yield correct_word
|
||
|
||
|
||
def main():
    """Command-line entry point: fix the text case of a word list.

    Parses the arguments, loads the unique words from the input list,
    corrects their letter case with Hunspell in parallel and writes the
    sorted result to the output file, one word per line.

    Raises:
        SystemExit: If the word list, the .aff file or the .dic file does
            not exist. The message is printed to stderr and the process
            exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
    parser.add_argument("word_list", help="Path to the full list of words.")
    parser.add_argument("output", help="Path to save the corrected words.")
    parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
    parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
    args = parser.parse_args()

    required_files = (
        ("Full word list", args.word_list),
        ("Hunspell .aff file", args.aff),
        ("Hunspell .dic file", args.dic),
    )
    for label, path in required_files:
        if not os.path.exists(path):
            # Exit with a failure status so calling scripts can detect the
            # problem; a plain return would report success.
            raise SystemExit(f"{label} not found: {path}")

    all_words = load_unique_words(args.word_list)
    print(f"Loaded {len(all_words)} candidate words.")

    # Finish all processing before touching the output file, so a failure
    # half-way through does not leave a truncated file behind.
    corrected_words = sorted(
        run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())
    )

    with open(args.output, 'w', encoding='utf-8') as f:
        f.writelines(f"{word}\n" for word in corrected_words)

    print(" ")  # clear the '\r'
|
||
|
||
# Run only when executed as a script. The guard also matters because this
# module uses multiprocessing: a worker process that imports the module
# must not start main() again.
if __name__ == "__main__":
    main()
|