New dictionary tools (#830)
* New dictionary tools for generating an app dictionary from raw word lists.
* Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
This commit is contained in:
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions
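
For orientation, a typical run of the new tools might look roughly like the sketch below. The clean-raw-dictionary.sh arguments are taken from the script's own Slovak usage example; the normalize-transcribed.py redirection is only illustrative (it prints the normalized list to stdout).

    # Clean a raw word list, generate suffix variants and fix the letter case:
    ./scripts/clean-raw-dictionary.sh sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic \
        aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ

    # Normalize the frequencies in a dictionary with transcriptions:
    python scripts/normalize-transcribed.py WORD-LIST.txt > normalized.txt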

@@ -1,94 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
    print('Capitalizes a word list using capitalized words in another list.');
    print('\nMIN-WORD-LENGTH must be a positive number.');
    print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
}


function validateInput() {
    if (process.argv.length < 6) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[3])) {
        printError(`Failure! Could not find list-of-capitals file "${process.argv[3]}".`);
        process.exit(2);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find dictionary file "${process.argv[2]}".`);
        process.exit(2);
    }

    const minWordLength = Number.parseInt(process.argv[4]);
    if (Number.isNaN(minWordLength) || minWordLength < 0) {
        printError(`Failure! The minimum word length must be a positive number.`);
        process.exit(2);
    }

    return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
}


async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
    // read the dictionary
    let lineReader = require('readline').createInterface({
        input: createReadStream(dictionaryFileName)
    });

    const words = {};
    for await (const line of lineReader) {
        words[line] = true;
    }

    // convert the dictionary words using the second file
    lineReader = require('readline').createInterface({
        input: createReadStream(capitalsFileName)
    });

    for await (const capitalizedWord of lineReader) {
        if (capitalizedWord.length < minWordLength) {
            continue;
        }

        const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
        if (words[lowercaseWord]) {
            delete words[lowercaseWord];
            words[capitalizedWord] = true;
        }

        const possessiveLowercaseWord = `${lowercaseWord}'s`;
        if (words[possessiveLowercaseWord]) {
            delete words[possessiveLowercaseWord];
            words[`${capitalizedWord}'s`] = true;
        }
    }

    return Object.keys(words);
}


function printWords(wordList) {
    if (!Array.isArray(wordList)) {
        return;
    }

    wordList.forEach(w => print(w));
}


/** main **/
capitalize(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

79 scripts/clean-raw-dictionary.sh Executable file
@@ -0,0 +1,79 @@
#!/bin/bash

if [ $# -lt 7 ]; then
    echo "Usage: $0 <raw-word-list.txt> <output.txt> <suffixes.txt> <hunspell.aff> <hunspell.dic> <allowed-lowercase-char-list> <allowed-uppercase-char-list>"
    echo
    echo "Example (Slovak, no whitelist filter needed):"
    echo " $0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
    echo
    echo "Example (Slovak, with whitelist filter):"
    echo " $0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"

    exit 1
fi

DICTIONARY_FILE=$1
if ! [[ -f $DICTIONARY_FILE ]]; then
    echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist"
    exit 2
fi

SUFFIXES_FILE=$3

AFF_FILE=$4
if ! [[ -f "$AFF_FILE" ]]; then
    echo ".aff file: '$AFF_FILE' does not exist"
    exit 2
fi

DIC_FILE=$5
if ! [[ -f "$DIC_FILE" ]]; then
    echo ".dic file: '$DIC_FILE' does not exist"
    exit 2
fi

OUTPUT_FILE=$2
ALLOWED_LOWERCASE_CHARS=$6
ALLOWED_UPPERCASE_CHARS=$7
WORK_DIR="/tmp/TT9_$(uuidgen)"
SCRIPT_DIR="$(dirname "$0")"

if ! [[ -d $SCRIPT_DIR/venv ]]; then
    python -m venv $SCRIPT_DIR/venv && source $SCRIPT_DIR/venv/bin/activate && pip install -r $SCRIPT_DIR/requirements.txt
fi


generate_words() {
    CLEAN_WORDS=$1
    OUTPUT=$2
    DICTIONARY=${AFF_FILE::-4}

    if ! [[ -f "$SUFFIXES_FILE" ]]; then
        echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation."
        cp $CLEAN_WORDS $OUTPUT
        return
    fi

    printf "Extracting valid words for generating new ones... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $CLEAN_WORDS | sort -u | uniq > $WORK_DIR/generation-stems.txt && echo "OK" \
    && python $SCRIPT_DIR/generate-words-from-suffixes.py $WORK_DIR/generation-stems.txt $SUFFIXES_FILE $WORK_DIR/generated-raw.txt \
    && printf "Validating generated words with Hunspell... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $WORK_DIR/generated-raw.txt > $WORK_DIR/generated-valid.txt && echo "OK" \
    && printf "Merging generated and input words... " && cat $CLEAN_WORDS $WORK_DIR/generated-valid.txt | sort -u | uniq > $OUTPUT && echo "OK"
}

# remove Roman numerals: ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$

date
mkdir -p $WORK_DIR \
&& printf "Removing foreign letters... " && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" $DICTIONARY_FILE > $WORK_DIR/noforeign.txt && echo "OK" \
&& printf "Removing frequencies and duplicates... " && sed -E 's/[\t0-9]+//g' $WORK_DIR/noforeign.txt | sort | uniq > $WORK_DIR/nofreq_norepeat.txt && echo "OK" \
&& printf "Removing lowerUPPER... " && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" $WORK_DIR/nofreq_norepeat.txt > $WORK_DIR/no_low_up.txt && echo "OK" \
&& printf "Removing UPPERlower... " && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" $WORK_DIR/no_low_up.txt > $WORK_DIR/no_up_low.txt && echo "OK" \
&& printf "Removing single chars... " && grep -vE "^.$" $WORK_DIR/no_up_low.txt > $WORK_DIR/no_single.txt && echo "OK" \
&& printf "Removing words with repeeeeaaaated letters... " && grep -vE "(.)\1{2,}" $WORK_DIR/no_single.txt | grep -vE "^(.)\1$" | sort | uniq > $WORK_DIR/no_multi.txt && echo "OK" \
&& generate_words $WORK_DIR/no_multi.txt $WORK_DIR/generated.txt \
&& echo "Preparing to fix the text case." && source $SCRIPT_DIR/venv/bin/activate && python $SCRIPT_DIR/fix-text-case.py $WORK_DIR/generated.txt $WORK_DIR/text_case.txt --aff "$AFF_FILE" --dic "$DIC_FILE" \
&& INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \
&& mv $WORK_DIR/text_case.txt "$OUTPUT_FILE"

rm -rf $WORK_DIR
date

@@ -1,149 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`);
    print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
    print('Example UnicodeRange: U+900-U+97FU+200CU+200D');
    print('Example ExcludeRange: U+950-U+957U+966-U+97F');
    print('Example EraseRange: U+964U+965U+970U+971');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    if (!validateUnicodeRange(process.argv[3])) {
        printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
        process.exit(2);
    }

    if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
        process.exit(2);
    }

    if (process.argv[5] && !validateUnicodeRange(process.argv[5])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[5]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2],
        searchRegexString: process.argv[3],
        excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '',
        eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : ''
    };
}


function validateUnicodeRange(inputRange) {
    return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}


function URangeToXRange(range) {
    if (range.length === 0) {
        return null;
    }

    return range
        .toUpperCase()
        .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(`${w.w}\t${w.f}`));
    }
}


function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
    const spacesOnly = /^\s+$/;

    if (!line || !line.length || spacesOnly.test(line)) {
        return [];
    }

    const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null;

    return line
        .replace(eraseRegex, ' ')
        .split(' ')
        .filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w)));
}


async function readWords(fileName, eraseRegex, excludeRegexString) {
    const words = new Map();

    if (!fileName) {
        return words;
    }

    for await (const line of createInterface({ input: createReadStream(fileName) })) {
        const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString);
        parts.forEach(w => {
            words.set(w, words.has(w) ? words.get(w) + 1 : 1);
        });
    }

    return words;
}


function sortWords(wordsMap) {
    const words = [];
    for (let [w, f] of wordsMap) {
        words.push({ w, f });
    }

    return words.sort((a, b) => {
        if (a.f > b.f) {
            return -1;
        }

        if (a.f < b.f) {
            return 1;
        }

        if (a.w < b.w) {
            return -1;
        }

        if (a.w > b.w) {
            return 1;
        }

        return 0;
    });
}


async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
    const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu");
    return sortWords(
        await readWords(fileName, eraseRegex, excludeRegexString)
    );
}


/** main **/
work(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

47 scripts/extract-words-from-text.py Normal file
@@ -0,0 +1,47 @@
import sys
import re
from collections import Counter
from os.path import basename

def usage():
    print(f"Usage: {basename(__file__)} [--freq|-f] <allowed_letters> <file1> [file2 ...]")
    sys.exit(1)

# Check and parse arguments
args = sys.argv[1:]
if not args or len(args) < 2:
    usage()

show_freq = False
if args[0] in ("--freq", "-f"):
    show_freq = True
    args = args[1:]

if len(args) < 2:
    usage()

allowed_letters = set(args[0])
file_paths = args[1:]

# Unicode word pattern
word_pattern = re.compile(r'\b\w+\b', re.UNICODE)
word_counts = Counter()

# Process files
for path in file_paths:
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for word in word_pattern.findall(line):
                    if all(char in allowed_letters for char in word):
                        word_counts[word] += 1
    except Exception as e:
        print(f"Error reading {path}: {e}", file=sys.stderr)

# Output
if show_freq:
    for word, count in sorted(word_counts.items()):
        print(f"{word}\t{count}")
else:
    for word in sorted(word_counts):
        print(word)

108 scripts/fix-text-case.py Normal file
@@ -0,0 +1,108 @@
import argparse
import os
import time
from multiprocessing import Pool, cpu_count, Manager
from collections import defaultdict
import hunspell

def load_unique_words(full_list_path):
    words = dict()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue

            word_lower = word.lower()
            if word_lower not in words or words[word_lower] == word_lower:
                words[word_lower] = word

    return words.values()

def init_hunspell_worker(aff_path, dic_path):
    global hobj, hunspell_stems
    hobj = hunspell.HunSpell(dic_path, aff_path)
    with open(dic_path, "r") as f:
        hunspell_stems = set({
            line.split('/')[0].strip()
            for line in f
            if not line.startswith('#')
        })

def fix_word_text_case(word):
    word_lower = word.lower()

    # check for direct matches to avoid expensive calls to HunSpell.suggest()
    if word_lower != word and word_lower in hunspell_stems:
        return word_lower

    if word in hunspell_stems:
        return word

    # name -> Name
    hunspell_variants = hobj.suggest(word_lower)
    for variant in hunspell_variants:
        if word_lower != variant and word_lower == variant.lower():
            return variant

    # if it can be either lowercase or uppercase, then we want to keep the lowercase
    if word_lower in hunspell_variants:
        return word_lower

    # if it is an unknown word, keep it as-is
    return word

def print_progress(current, total, start_time, interval):
    if current % interval == 0 or current == total:
        avg_time = (time.time() - start_time) / current
        remaining_time = (total - current) * avg_time
        HH, rem = divmod(int(remaining_time), 3600)
        MM, SS = divmod(rem, 60)
        print(f"\rFixing text case using hunspell... {current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ")

def run_hunspell_batch(words, aff_path, dic_path, num_workers):
    total = len(words)
    start_time = time.time()

    with Pool(
        processes=num_workers,
        initializer=init_hunspell_worker,
        initargs=(aff_path, dic_path)
    ) as pool:
        for i, correct_word in enumerate(pool.imap_unordered(fix_word_text_case, words), 1):
            print_progress(i, total, start_time, 300)
            yield correct_word

def main():
    parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
    parser.add_argument("word_list", help="Path to the full list of words.")
    parser.add_argument("output", help="Path to save the corrected words.")
    parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
    parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.aff):
        print(f"Hunspell .aff file not found: {args.aff}")
        return
    if not os.path.exists(args.dic):
        print(f"Hunspell .dic file not found: {args.dic}")
        return

    all_words = load_unique_words(args.word_list)
    print(f"Loaded {len(all_words)} candidate words.")

    corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(corrected_words):
            f.write(word + '\n')

    print(" ")  # clear the '\r'

if __name__ == "__main__":
    main()

64 scripts/generate-words-from-suffixes.py Normal file
@@ -0,0 +1,64 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def load_unique_words(word_list_path):
    words = set()
    with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            words.add(word.lower())
    return words

def load_known_suffixes(suffix_file_path):
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes

def generate_from_args(args):
    word, suffixes = args
    return {word + suffix for suffix in suffixes}

def generate_words(words, suffixes, num_workers):
    new_words = set()
    with Pool(processes=num_workers) as pool:
        for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
            new_words.update(result)
    return new_words

def main():
    parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that you will have to clean up the invalid words afterwards.")
    parser.add_argument("word_list", help="Path to the full list of words to filter")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the filtered output")
    args = parser.parse_args()

    if not os.path.exists(args.word_list):
        print(f"Full word list not found: {args.word_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    print("Generating new words...", end=' ')

    all_words = load_unique_words(args.word_list)
    known_suffixes = load_known_suffixes(args.suffix_file)

    print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
    generated = generate_words(all_words, known_suffixes, cpu_count())
    print(f"OK ({len(generated) - len(all_words)} new words)")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in generated:
            f.write(word + '\n')

if __name__ == "__main__":
    main()

@@ -1,111 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');


function printHelp() {
    print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`);
    print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
    print('Example UnicodeRange: U+900-U+97F');
    print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    if (!validateUnicodeRange(process.argv[3])) {
        printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
        process.exit(2);
    }

    if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
        printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2],
        searchRegexString: process.argv[3],
        excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : ''
    };
}


function validateUnicodeRange(inputRange) {
    return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}


function URangeToXRange(range) {
    if (range.length === 0) {
        return null;
    }

    return range
        .toUpperCase()
        .replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(w));
    }
}


function cleanInvalidChars(line, searchRegex, excludeRegex) {
    const spacesOnly = /^\s+$/;

    if (!line || !line.length || spacesOnly.test(line)) {
        return [];
    }

    const cleanLine = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;
    return cleanLine
        .replace(searchRegex, ' ')
        .split(' ')
        .filter(w => w.length > 1);
}


async function readWords(fileName, searchRegex, excludeRegex) {
    const words = new Set();

    if (!fileName) {
        return words;
    }

    for await (const line of createInterface({ input: createReadStream(fileName) })) {
        cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w));
    }

    return words;
}


async function work({ fileName, searchRegexString, excludeRegexString }) {
    const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu");
    const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null;

    const words = Array.from(await readWords(fileName, searchRegex, excludeRegex));
    return words.filter(word => word.length > 1).sort();
}


/** main **/
work(validateInput())
    .then(words => printWords(words))
    .catch(e => printError(e));

@@ -1,85 +0,0 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');;
const { print, printError } = require('./_printers.js')


function printHelp() {
    print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
    print('Normalizes the frequencies in a dictionary with transcriptions.');
}


function validateInput() {
    if (process.argv.length < 3) {
        printHelp();
        process.exit(1);
    }

    if (!existsSync(process.argv[2])) {
        printError(`Failure! Could not find word list file "${process.argv[2]}".`);
        process.exit(2);
    }

    return {
        fileName: process.argv[2]
    };
}


function printWords(wordList) {
    if (Array.isArray(wordList)) {
        wordList.forEach(w => print(
            w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`
        ));
    }
}


const { fileName } = validateInput();

const data = readFileSync(fileName, 'utf8');
const lines = data.trim().split('\n');

// Parse the data into an array of objects
let entries = lines.map(line => {
    const parts = line.split('\t');
    return {
        original: line,
        chinese: parts[0],
        latin: parts[1],
        number: parts[2] ? parseInt(parts[2], 10) : null
    };
});

// Group entries by the Latin character sequence
const groups = {};
entries.forEach(entry => {
    if (!groups[entry.latin]) {
        groups[entry.latin] = [];
    }
    groups[entry.latin].push(entry);
});

// Process each group: sort by number (descending) and reassign ordinal numbers
let sortedEntries = [];
for (const key in groups) {
    let group = groups[key];

    // Separate entries with and without numbers
    let withNumbers = group.filter(e => e.number !== null);
    let withoutNumbers = group.filter(e => e.number === null);

    // Sort by number in descending order
    withNumbers.sort((a, b) => b.number - a.number);

    // Assign ordinal rankings
    for (let i = 0; i < withNumbers.length; i++) {
        withNumbers[i].number = (withNumbers.length - i).toString();
    }

    // Preserve original order for entries without numbers
    sortedEntries.push(...withNumbers, ...withoutNumbers);
}

printWords(sortedEntries);

86 scripts/normalize-transcribed.py Normal file
@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict

def print_error(message):
    print(message, file=sys.stderr)

def parse_args():
    parser = argparse.ArgumentParser(
        description="Normalizes the frequencies in a dictionary with transcriptions."
    )
    parser.add_argument(
        "word_list",
        help="Path to the word list file (e.g., WORD-LIST.txt)"
    )
    return parser.parse_args()

def validate_file(file_path):
    if not os.path.isfile(file_path):
        print_error(f'Failure! Could not find word list file "{file_path}".')
        sys.exit(2)

def load_entries(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    entries = []
    for line_num, line in enumerate(lines, start=1):
        parts = line.split('\t')
        if len(parts) < 2:
            print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
            sys.exit(3)

        chinese, latin = parts[:2]
        number = None
        if len(parts) > 2:
            try:
                number = int(parts[2])
            except ValueError:
                print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
                sys.exit(3)

        entries.append({'chinese': chinese, 'latin': latin, 'number': number})

    return entries

def group_entries(entries):
    groups = defaultdict(list)
    for entry in entries:
        groups[entry['latin']].append(entry)
    return groups

def normalize_frequencies(groups):
    sorted_entries = []
    for group in groups.values():
        with_numbers = [e for e in group if e['number'] is not None]
        without_numbers = [e for e in group if e['number'] is None]

        with_numbers.sort(key=lambda e: e['number'], reverse=True)

        for rank, entry in enumerate(with_numbers, start=1):
            entry['number'] = str(len(with_numbers) - rank + 1)

        sorted_entries.extend(with_numbers)
        sorted_entries.extend(without_numbers)

    return sorted_entries

def print_entries(entries):
    for e in entries:
        parts = [e['chinese'], e['latin']]
        if e['number'] is not None:
            parts.append(e['number'])
        print('\t'.join(parts))

def main():
    args = parse_args()
    validate_file(args.word_list)
    entries = load_entries(args.word_list)
    groups = group_entries(entries)
    sorted_entries = normalize_frequencies(groups)
    print_entries(sorted_entries)

if __name__ == "__main__":
    main()

46 scripts/remove-random-words.sh Executable file
@@ -0,0 +1,46 @@
#!/bin/bash

if [ $# -ne 7 ]; then
    echo "Usage: $0 <locale> <all-words.txt> <corpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>"
    echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13"
    exit 1
fi

LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"

if ! [[ -f "$ORIGINAL_WORDS" ]]; then
    echo "All words file: '$ORIGINAL_WORDS' does not exist"
    exit 2
fi

if ! [[ -f "$CORPUS_WORDS" ]]; then
    echo "Corpus words file: '$CORPUS_WORDS' does not exist"
    exit 2
fi

if ! [[ -f "$BAD_COMB_FILE" ]]; then
    echo "Bad letter combinations file: '$BAD_COMB_FILE' does not exist"
    exit 2
fi

BAD_LETTER_COMBINATIONS=$(paste -sd'|' "$BAD_COMB_FILE")


sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > __tmp__popular.txt &
grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" | grep -Ev "($BAD_LETTER_COMBINATIONS)" | grep -Ev "[$VOWELS]{3,}" | sort -u | uniq > __tmp__reduced.txt &
wait

node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__popular.txt | grep -Ev "(.)\1{2,}" > __tmp__unpopular.txt
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__unpopular.txt > __tmp__popular_reduced.txt &
awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' __tmp__unpopular.txt > __tmp__unpopular_reduced.txt &
wait

cat __tmp__popular_reduced.txt __tmp__unpopular_reduced.txt | sort -u | uniq > $OUTPUT_FILE

rm -f __tmp__*.txt

1 scripts/requirements.txt Normal file
@@ -0,0 +1 @@
hunspell==0.5.5

102 scripts/whitelist-filter.py Normal file
@@ -0,0 +1,102 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def load_stem_buckets(whitelist_path):
    buckets = defaultdict(set)
    with open(whitelist_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            word_lc = word.lower()
            first_char = word_lc[0]
            buckets[first_char].add(word_lc)
    return dict(buckets)

def load_unique_words(full_list_path):
    words = set()
    with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            word = line.strip()
            if '�' in word:
                continue
            words.add(word)
    return words

def load_known_suffixes(suffix_file_path):
    suffixes = set()
    with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            suffix = line.strip()
            if suffix:
                suffixes.add(suffix)
    return suffixes

def match_word(word, buckets, known_suffixes):
    """Return all valid combinations: base word and word+suffix if found in stems."""
    word_lc = word.lower()
    first_char = word_lc[0]
    possible_stems = buckets.get(first_char, set())

    matches = []

    if word_lc in possible_stems:
        matches.append(word)

    for suffix in known_suffixes:
        compound_word = word_lc + suffix
        if compound_word in possible_stems:
            matches.append(compound_word)

    return matches

def filter_words_parallel(all_words, stem_buckets, known_suffixes, num_workers):
    args = [(word, stem_buckets, known_suffixes) for word in all_words]
    with Pool(processes=num_workers) as pool:
        results = pool.starmap(match_word, args)

    matched_words = set()
    for match_list in results:
        matched_words.update(match_list)
    return matched_words

def main():
    parser = argparse.ArgumentParser(description="Filter given words by a stem whitelist. The list of suffixes is used to generate more variants of the valid words.")
    parser.add_argument("whitelist", help="Path to the whitelist file (with valid words)")
    parser.add_argument("full_list", help="Path to the full list of words to filter")
    parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
    parser.add_argument("output", help="Path to save the filtered output")
    args = parser.parse_args()

    if not os.path.exists(args.whitelist):
        print(f"Whitelist file not found: {args.whitelist}")
        return
    if not os.path.exists(args.full_list):
        print(f"Full word list not found: {args.full_list}")
        return
    if not os.path.exists(args.suffix_file):
        print(f"Suffix file not found: {args.suffix_file}")
        return

    stem_buckets = load_stem_buckets(args.whitelist)
    print(f"Loaded {sum(len(s) for s in stem_buckets.values())} valid stems across {len(stem_buckets)} buckets.")

    all_words = load_unique_words(args.full_list)
    print(f"Loaded {len(all_words)} candidate words.")

    known_suffixes = load_known_suffixes(args.suffix_file)
    print(f"Loaded {len(known_suffixes)} known suffixes.")

    workers = cpu_count()
    print(f"Filtering using {workers} threads...", end=' ')
    filtered = filter_words_parallel(all_words, stem_buckets, known_suffixes, workers)
    print(f"OK. Matched {len(filtered)} words.")

    with open(args.output, 'w', encoding='utf-8') as f:
        for word in sorted(filtered):
            f.write(word + '\n')

if __name__ == "__main__":
    main()