* new dictionary tools for generating an app dictionary from raw word lists
* replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
64 lines · 2.3 KiB · Python
import argparse
import os
from multiprocessing import Pool, cpu_count


def load_unique_words(word_list_path):
    """Load a word list into a set, lowercasing and skipping lines that failed to decode."""
    words = set()
    with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # errors='replace' turns undecodable bytes into U+FFFD; drop those lines.
            if '\ufffd' in word:
                continue
            words.add(word.lower())
    return words


def load_known_suffixes(suffix_file_path):
    """Load the suffix list into a set, skipping blank lines."""
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes


def generate_from_args(args):
    # Unpack a (word, suffixes) pair and return every word+suffix combination.
    word, suffixes = args
    return {word + suffix for suffix in suffixes}


def generate_words(words, suffixes, num_workers):
    # Seed the result with the stems themselves, so the output file contains
    # them and the "new words" count reported in main() comes out right.
    new_words = set(words)
    with Pool(processes=num_workers) as pool:
        for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
            new_words.update(result)
    return new_words


def main():
    parser = argparse.ArgumentParser(description="Naively generate new words from a list of stems and a list of suffixes. Note that you will have to clean up the invalid words afterwards.")
    parser.add_argument("word_list", help="Path to the full list of word stems")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the generated output")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Word list not found: {args.word_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    print("Generating new words...", end=' ')

    all_words = load_unique_words(args.word_list)
    known_suffixes = load_known_suffixes(args.suffix_file)

    print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
    generated = generate_words(all_words, known_suffixes, cpu_count())
    print(f"OK ({len(generated) - len(all_words)} new words)")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in generated:
            f.write(word + '\n')


if __name__ == "__main__":
    main()
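For reference, a typical invocation could look like the following (the script name generate_words.py and the file names are placeholders; the three positional arguments are the stem list, the suffix list, and the output path):

    python generate_words.py stems.txt suffixes.txt generated.txt

Given a stem list containing "walk" and a suffix list containing "ed" and "ing", the output would contain "walk", "walked", and "walking". As the argparse description notes, the generation is naive, so invalid combinations still have to be cleaned up afterwards.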