New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists. * Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
This commit is contained in:
parent
a34baef0f3
commit
62e8a08576
12 changed files with 533 additions and 439 deletions
108
scripts/fix-text-case.py
Normal file
108
scripts/fix-text-case.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import argparse
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Pool, cpu_count, Manager
|
||||
from collections import defaultdict
|
||||
import hunspell
|
||||
|
||||
def load_unique_words(full_list_path):
    """Load a word list, de-duplicated case-insensitively.

    Keeps one entry per lower-cased key: the first form seen, except that a
    stored all-lowercase form is replaced by a later cased variant (the real
    casing is fixed later by Hunspell anyway).

    Args:
        full_list_path: Path to a text file with one word per line.

    Returns:
        A view of the surviving word forms (supports len() and iteration).
    """
    words = dict()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # errors='replace' substitutes undecodable bytes with U+FFFD;
            # drop any word that was corrupted that way. (The original
            # literal was the mis-encoded byte form of this character.)
            if '\ufffd' in word:
                continue

            word_lower = word.lower()
            if word_lower not in words or words[word_lower] == word_lower:
                words[word_lower] = word

    return words.values()
|
||||
|
||||
def init_hunspell_worker(aff_path, dic_path):
    """Pool-worker initializer: build per-process Hunspell state.

    Sets two process-global names consumed by fix_word_text_case():
      hobj           -- the HunSpell checker instance
      hunspell_stems -- set of bare stems from the .dic file, with any
                        affix flags after '/' stripped, for fast exact
                        membership checks

    Args:
        aff_path: Path to the Hunspell .aff file.
        dic_path: Path to the Hunspell .dic file.
    """
    global hobj, hunspell_stems
    hobj = hunspell.HunSpell(dic_path, aff_path)
    # Read with an explicit encoding, consistent with every other open()
    # in this script; the locale default can fail on non-ASCII
    # dictionaries (e.g. Japanese). NOTE(review): assumes the .dic file is
    # UTF-8 — confirm against the .aff SET declaration.
    with open(dic_path, "r", encoding="utf-8") as f:
        # A set comprehension is already a set; the original wrapped it in
        # a redundant set(...) call. The first line of a .dic file is the
        # entry count, which lands in the set as a numeric string —
        # harmless for these lookups.
        hunspell_stems = {
            line.split('/')[0].strip()
            for line in f
            if not line.startswith('#')
        }
|
||||
|
||||
def fix_word_text_case(word):
    """Return *word* with its letter case corrected via Hunspell.

    Relies on the process globals set by init_hunspell_worker():
    ``hunspell_stems`` for cheap exact lookups and ``hobj`` for suggestions.
    Unknown words are returned unchanged.
    """
    lowered = word.lower()

    # Cheap path first: a direct stem hit avoids the expensive
    # HunSpell.suggest() call entirely.
    if lowered != word and lowered in hunspell_stems:
        return lowered
    if word in hunspell_stems:
        return word

    # name -> Name: accept a suggestion that differs from the lowered
    # form only by letter case.
    suggestions = hobj.suggest(lowered)
    for candidate in suggestions:
        if candidate != lowered and candidate.lower() == lowered:
            return candidate

    # When both casings are acceptable, prefer the lowercase form.
    if lowered in suggestions:
        return lowered

    # Unknown word: keep it exactly as given.
    return word
|
||||
|
||||
def print_progress(current, total, start_time, interval):
    """Print an in-place (carriage-return) progress line with an ETA.

    Emits only every *interval* items, or when *current* == *total*;
    otherwise returns without printing.
    """
    if current % interval != 0 and current != total:
        return
    per_item = (time.time() - start_time) / current
    eta = int((total - current) * per_item)
    hours, leftover = divmod(eta, 3600)
    minutes, seconds = divmod(leftover, 60)
    line = (
        f"\rFixing text case using hunspell... {current}/{total},"
        f" Remaining: {hours:02}:{minutes:02}:{seconds:02}"
    )
    print(line, end=" ")
|
||||
|
||||
|
||||
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
    """Yield case-corrected words, fanning the work over a process pool.

    Each worker process builds its own Hunspell state via
    init_hunspell_worker(); results arrive in completion order
    (imap_unordered). Progress is reported every 300 items.
    """
    word_count = len(words)
    started = time.time()

    with Pool(
        processes=num_workers,
        initializer=init_hunspell_worker,
        initargs=(aff_path, dic_path),
    ) as pool:
        done = 0
        for fixed in pool.imap_unordered(fix_word_text_case, words):
            done += 1
            print_progress(done, word_count, started, 300)
            yield fixed
|
||||
|
||||
|
||||
def main():
    """CLI entry point: case-correct a word list and write the result.

    Usage: fix-text-case.py WORD_LIST OUTPUT --aff AFF --dic DIC
    Exits with status 1 when any input file is missing.
    """
    parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
    parser.add_argument("word_list", help="Path to the full list of words.")
    parser.add_argument("output", help="Path to save the corrected words.")
    parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
    parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
    args = parser.parse_args()

    # Validate inputs up front and exit non-zero so shell pipelines and
    # build scripts can detect the failure; the original bare `return`
    # exited with status 0 even when nothing was processed.
    for path, label in (
        (args.word_list, "Full word list"),
        (args.aff, "Hunspell .aff file"),
        (args.dic, "Hunspell .dic file"),
    ):
        if not os.path.exists(path):
            print(f"{label} not found: {path}")
            raise SystemExit(1)

    all_words = load_unique_words(args.word_list)
    print(f"Loaded {len(all_words)} candidate words.")

    # run_hunspell_batch() yields lazily; sorted() drains the generator,
    # so every correction completes before the output file is written.
    corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(corrected_words):
            f.write(word + '\n')

    print(" ")  # clear the '\r' left behind by the progress line
|
||||
|
||||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue