1
0
Fork 0
tt9/scripts/generate-words-from-suffixes.py
Dimo Karaivanov 62e8a08576
New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the Javascript version was running out of memory for larger languages like Japanese
2025-06-27 17:01:42 +03:00

64 lines
2.3 KiB
Python
Raw Blame History

import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict
def load_unique_words(word_list_path):
    """Load a word-list file into a set of unique, lowercased words.

    The file is decoded as UTF-8 with errors='replace', so any undecodable
    bytes become the Unicode replacement character U+FFFD; words containing
    it are corrupt input and are skipped, as are blank lines.

    Args:
        word_list_path: path to a text file with one word per line.

    Returns:
        set of lowercased words.
    """
    words = set()
    with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            # '\ufffd' is what errors='replace' substitutes for bad bytes.
            # (The scraped original showed its raw UTF-8 bytes, EF BF BD.)
            # Also skip blank lines so '' never enters the stem set.
            if not word or '\ufffd' in word:
                continue
            words.add(word.lower())
    return words
def load_known_suffixes(suffix_file_path):
    """Read the suffix file and return the set of its non-empty suffixes.

    Args:
        suffix_file_path: path to a text file with one suffix per line;
            decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        set of stripped, non-empty suffix strings.
    """
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as fh:
        stripped = (raw_line.strip() for raw_line in fh)
        return {suffix for suffix in stripped if suffix}
def generate_from_args(args):
    """Pool worker: expand one (word, suffixes) pair.

    Args:
        args: tuple of (stem string, iterable of suffix strings).

    Returns:
        set containing stem + suffix for every suffix.
    """
    stem, endings = args
    combined = set()
    for ending in endings:
        combined.add(stem + ending)
    return combined
def generate_words(words, suffixes, num_workers):
    """Fan stem+suffix generation out across a process pool.

    Args:
        words: iterable of stem strings.
        suffixes: collection of suffix strings (sent to every worker task).
        num_workers: number of worker processes for the Pool.

    Returns:
        set of all stem + suffix concatenations.
    """
    new_words = set()
    with Pool(processes=num_workers) as pool:
        tasks = ((word, suffixes) for word in words)
        # chunksize batches task dispatch: the default of 1 performs one
        # pickle + IPC round trip per word, and within each chunk pickle's
        # memo serializes the shared suffixes set only once — a large win
        # for big word lists.
        for result in pool.imap_unordered(generate_from_args, tasks, chunksize=256):
            new_words.update(result)
    return new_words
def main():
    """CLI entry point: combine a stem list with a suffix list into new words.

    Prints a message and returns early if either input file is missing;
    otherwise writes every generated stem+suffix word to the output path,
    one per line.
    """
    parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that, you will have to clean up the invalid words after that.")
    parser.add_argument("word_list", help="Path to the full list of words to filter")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the filtered output")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    print("Generating new words...", end=' ')
    all_words = load_unique_words(args.word_list)
    known_suffixes = load_known_suffixes(args.suffix_file)
    # \r rewrites the progress line in place once the counts are known.
    print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
    generated = generate_words(all_words, known_suffixes, cpu_count())
    # `generated` holds only stem+suffix combinations — the input stems are
    # NOT in it — so `len(generated) - len(all_words)` undercounts and can
    # go negative. The set difference counts genuinely new words.
    print(f"OK ({len(generated - all_words)} new words)")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in generated:
            f.write(word + '\n')
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()