New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists.
* Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages such as Japanese.
This commit is contained in:
parent a34baef0f3
commit 62e8a08576

12 changed files with 533 additions and 439 deletions
scripts/generate-words-from-suffixes.py (new file, 64 lines)
@@ -0,0 +1,64 @@
import argparse
import os
from multiprocessing import Pool, cpu_count


def load_unique_words(word_list_path):
    words = set()
    with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # Skip words containing the U+FFFD replacement character that
            # errors='replace' inserts for undecodable bytes.
            if '\ufffd' in word:
                continue
            words.add(word.lower())
    return words


def load_known_suffixes(suffix_file_path):
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes


def generate_from_args(args):
    # Worker: append every known suffix to a single stem.
    word, suffixes = args
    return {word + suffix for suffix in suffixes}


def generate_words(words, suffixes, num_workers):
    # Start from the original stems so they are kept in the output and the
    # reported delta against len(words) counts only the newly generated words.
    new_words = set(words)
    with Pool(processes=num_workers) as pool:
        for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
            new_words.update(result)
    return new_words


def main():
    parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that you will have to clean up the invalid words afterwards.")
    parser.add_argument("word_list", help="Path to the full list of word stems")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the generated word list")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    print("Generating new words...", end=' ')

    all_words = load_unique_words(args.word_list)
    known_suffixes = load_known_suffixes(args.suffix_file)

    print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
    generated = generate_words(all_words, known_suffixes, cpu_count())
    print(f"OK ({len(generated) - len(all_words)} new words)")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in generated:
            f.write(word + '\n')


if __name__ == "__main__":
    main()
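A minimal usage sketch, assuming hypothetical input files stems.txt and suffixes.txt (these names are placeholders, not files from this commit):

    python scripts/generate-words-from-suffixes.py stems.txt suffixes.txt generated.txt

With stems.txt containing "walk" and suffixes.txt containing "ed" and "ing", generated.txt would hold walk, walked and walking, in arbitrary order since the words are collected in a set. One design note: each worker task pickles the full suffix set alongside its stem, which is cheap for modest suffix lists; for very large ones, passing the suffixes once through a Pool initializer would avoid the repeated serialization.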