
New dictionary tools (#830)

* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
Dimo Karaivanov 2025-06-27 17:01:42 +03:00 committed by GitHub
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions


@@ -1,94 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
print('Capitalizes a word list using capitalized words in another list.');
print('\nMIN-WORD-LENGTH must be a positive number.');
print('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
function validateInput() {
if (process.argv.length < 6) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[3])) {
printError(`Failure! Could not find list-of-capitals file "${process.argv[3]}".`);
process.exit(2);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find dictionary file "${process.argv[2]}".`);
process.exit(2);
}
const minWordLength = Number.parseInt(process.argv[4]);
if (Number.isNaN(minWordLength) || minWordLength < 0) {
printError(`Failure! The minimum word length must be a positive number.`);
process.exit(2);
}
return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
}
async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
// read the dictionary
let lineReader = require('readline').createInterface({
input: createReadStream(dictionaryFileName)
});
const words = {};
for await (const line of lineReader) {
words[line] = true;
}
// convert the dictionary words using the second file
lineReader = require('readline').createInterface({
input: createReadStream(capitalsFileName)
});
for await (const capitalizedWord of lineReader) {
if (capitalizedWord.length < minWordLength) {
continue;
}
const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
if (words[lowercaseWord]) {
delete words[lowercaseWord];
words[capitalizedWord] = true;
}
const possessiveLowercaseWord = `${lowercaseWord}'s`;
if (words[possessiveLowercaseWord]) {
delete words[possessiveLowercaseWord];
words[`${capitalizedWord}'s`] = true;
}
}
return Object.keys(words);
}
function printWords(wordList) {
if (!Array.isArray(wordList)) {
return;
}
wordList.forEach(w => print(w));
}
/** main **/
capitalize(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));

scripts/clean-raw-dictionary.sh Executable file

@@ -0,0 +1,79 @@
#!/bin/bash
if [ $# -lt 7 ]; then
echo "Usage: $0 <raw-word-list.txt> <output.txt> <suffixes.txt> <hunspell.aff> <hunspell.dic> <allowed-lowercase-char-list> <allowed-uppercase-char-list>"
echo
echo "Example (Slovak, no need of whitelist filter):"
echo " $0 sk-raw.txt sk-filtered.txt /dev/null sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
echo
echo "Example (Slovak, with whitelist filter):"
echo " $0 sk-raw.txt sk-filtered.txt sk-suffix.txt sk_SK.aff sk_SK.dic aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÓÔPQRŔSŠTŤUÚVWXYÝZŽ"
exit 1
fi
DICTIONARY_FILE=$1
if ! [[ -f $DICTIONARY_FILE ]]; then
echo "base-dictionary-file: '$DICTIONARY_FILE' does not exist"
exit 2
fi
SUFFIXES_FILE=$3
AFF_FILE=$4
if ! [[ -f "$AFF_FILE" ]]; then
echo ".aff file: '$AFF_FILE' does not exist"
exit 2
fi
DIC_FILE=$5
if ! [[ -f "$DIC_FILE" ]]; then
echo ".dic file: '$DIC_FILE' does not exist"
exit 2
fi
OUTPUT_FILE=$2
ALLOWED_LOWERCASE_CHARS=$6
ALLOWED_UPPERCASE_CHARS=$7
WORK_DIR="/tmp/TT9_$(uuidgen)"
SCRIPT_DIR="$(dirname "$0")"
if ! [[ -d $SCRIPT_DIR/venv ]]; then
python -m venv $SCRIPT_DIR/venv && source $SCRIPT_DIR/venv/bin/activate && pip install -r $SCRIPT_DIR/requirements.txt
fi
generate_words() {
CLEAN_WORDS=$1
OUTPUT=$2
DICTIONARY=${AFF_FILE::-4}
if ! [[ -f "$SUFFIXES_FILE" ]]; then
echo "Suffixes file: '$SUFFIXES_FILE' does not exist. Skipping extra word generation."
cp $CLEAN_WORDS $OUTPUT
return
fi
printf "Extracting valid words for generating new ones... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $CLEAN_WORDS | sort -u | uniq > $WORK_DIR/generation-stems.txt && echo "OK" \
&& python $SCRIPT_DIR/generate-words-from-suffixes.py $WORK_DIR/generation-stems.txt $SUFFIXES_FILE $WORK_DIR/generated-raw.txt \
&& printf "Validating generated words with Hunspell... " && hunspell -i UTF-8 -G -d "$DICTIONARY" $WORK_DIR/generated-raw.txt > $WORK_DIR/generated-valid.txt && echo "OK" \
&& printf "Merging generated and input words... " && cat $CLEAN_WORDS $WORK_DIR/generated-valid.txt | sort -u | uniq > $OUTPUT && echo "OK"
}
# remove Roman numerals: ^(M{0,3})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$
date
mkdir -p $WORK_DIR \
&& printf "Removing foreign letters... " && grep --text -E "^[$ALLOWED_LOWERCASE_CHARS$ALLOWED_UPPERCASE_CHARS]+$" $DICTIONARY_FILE > $WORK_DIR/noforeign.txt && echo "OK" \
&& printf "Removing frequencies and duplicates... " && sed -E 's/[\t0-9]+//g' $WORK_DIR/noforeign.txt | sort | uniq > $WORK_DIR/nofreq_norepeat.txt && echo "OK" \
&& printf "Removing lowerUPPER... " && grep -vE "[$ALLOWED_LOWERCASE_CHARS][$ALLOWED_UPPERCASE_CHARS]" $WORK_DIR/nofreq_norepeat.txt > $WORK_DIR/no_low_up.txt && echo "OK" \
&& printf "Removing UPPERlower... " && grep -vE "[$ALLOWED_UPPERCASE_CHARS]{2,}[$ALLOWED_LOWERCASE_CHARS]" $WORK_DIR/no_low_up.txt > $WORK_DIR/no_up_low.txt && echo "OK" \
&& printf "Removing single chars... " && grep -vE "^.$" $WORK_DIR/no_up_low.txt > $WORK_DIR/no_single.txt && echo "OK" \
&& printf "Removing words with repeeeeaaaated letters... " && grep -vE "(.)\1{2,}" $WORK_DIR/no_single.txt | grep -vE "^(.)\1$" | sort | uniq > $WORK_DIR/no_multi.txt && echo "OK" \
&& generate_words $WORK_DIR/no_multi.txt $WORK_DIR/generated.txt \
&& echo "Preparing to fix the text case." && source $SCRIPT_DIR/venv/bin/activate && python $SCRIPT_DIR/fix-text-case.py $WORK_DIR/generated.txt $WORK_DIR/text_case.txt --aff "$AFF_FILE" --dic "$DIC_FILE" \
&& INITIAL_COUNT=$(wc -l < "$DICTIONARY_FILE") && FINAL_COUNT=$(wc -l < "$WORK_DIR/text_case.txt") && echo "Word count: $INITIAL_COUNT -> $FINAL_COUNT" \
&& mv $WORK_DIR/text_case.txt "$OUTPUT_FILE"
rm -rf $WORK_DIR
date


@@ -1,149 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`);
print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
print('Example UnicodeRange: U+900-U+97FU+200CU+200D');
print('Example ExcludeRange: U+950-U+957U+966-U+97F');
print('Example EraseRange: U+964U+965U+970U+971');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
if (!validateUnicodeRange(process.argv[3])) {
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
process.exit(2);
}
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
process.exit(2);
}
if (process.argv[5] && !validateUnicodeRange(process.argv[5])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[5]}".`);
process.exit(2);
}
return {
fileName: process.argv[2],
searchRegexString: process.argv[3],
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '',
eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : ''
};
}
function validateUnicodeRange(inputRange) {
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}
function URangeToXRange(range) {
if (range.length === 0) {
return null;
}
return range
.toUpperCase()
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(`${w.w}\t${w.f}`));
}
}
function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
const spacesOnly = /^\s+$/;
if (!line || !line.length || spacesOnly.test(line)) {
return [];
}
const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null;
return line
.replace(eraseRegex, ' ')
.split(' ')
.filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w)));
}
async function readWords(fileName, eraseRegex, excludeRegexString) {
const words = new Map();
if (!fileName) {
return words;
}
for await (const line of createInterface({ input: createReadStream(fileName) })) {
const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString);
parts.forEach(w => {
words.set(w, words.has(w) ? words.get(w) + 1 : 1);
});
}
return words;
}
function sortWords(wordsMap) {
const words = [];
for (let [w, f] of wordsMap) {
words.push({ w, f });
}
return words.sort((a, b) => {
if (a.f > b.f) {
return -1;
}
if (a.f < b.f) {
return 1;
}
if (a.w < b.w) {
return -1;
}
if (a.w > b.w) {
return 1;
}
return 0;
});
}
async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu");
return sortWords(
await readWords(fileName, eraseRegex, excludeRegexString)
);
}
/** main **/
work(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));


@@ -0,0 +1,47 @@
import sys
import re
from collections import Counter
from os.path import basename
def usage():
print(f"Usage: e{basename(__file__)} [--freq|-f] <allowed_letters> <file1> [file2 ...]")
sys.exit(1)
# Check and parse arguments
args = sys.argv[1:]
if not args or len(args) < 2:
usage()
show_freq = False
if args[0] in ("--freq", "-f"):
show_freq = True
args = args[1:]
if len(args) < 2:
usage()
allowed_letters = set(args[0])
file_paths = args[1:]
# Unicode word pattern
word_pattern = re.compile(r'\b\w+\b', re.UNICODE)
word_counts = Counter()
# Process files
for path in file_paths:
try:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
for word in word_pattern.findall(line):
if all(char in allowed_letters for char in word):
word_counts[word] += 1
except Exception as e:
print(f"Error reading {path}: {e}", file=sys.stderr)
# Output
if show_freq:
for word, count in sorted(word_counts.items()):
print(f"{word}\t{count}")
else:
for word in sorted(word_counts):
print(word)
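The diff header for the script above (and hence its file name) is not shown in this view. Assuming it were saved as, say, scripts/count-words.py (a hypothetical name), a typical invocation might look like this sketch:

python scripts/count-words.py --freq abcdefghijklmnopqrstuvwxyz corpus1.txt corpus2.txt > corpus-frequencies.txt

Only words consisting entirely of the allowed letters are counted; with --freq each output line is word<TAB>count, otherwise only the sorted unique words are printed.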

scripts/fix-text-case.py Normal file

@@ -0,0 +1,108 @@
import argparse
import os
import time
from multiprocessing import Pool, cpu_count, Manager
from collections import defaultdict
import hunspell
def load_unique_words(full_list_path):
words = dict()
with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
word_lower = word.lower()
if word_lower not in words or words[word_lower] == word_lower:
words[word_lower] = word
return words.values()
def init_hunspell_worker(aff_path, dic_path):
global hobj, hunspell_stems
hobj = hunspell.HunSpell(dic_path, aff_path)
with open(dic_path, "r") as f:
hunspell_stems = set({
line.split('/')[0].strip()
for line in f
if not line.startswith('#')
})
def fix_word_text_case(word):
word_lower = word.lower()
# check for direct matches to avoid expensive calls to HunSpell.suggest()
if word_lower != word and word_lower in hunspell_stems:
return word_lower
if word in hunspell_stems:
return word
# name -> Name
hunspell_variants = hobj.suggest(word_lower)
for variant in hunspell_variants:
if word_lower != variant and word_lower == variant.lower():
return variant
# if it can be either lowercase or uppercase, then we want to keep the lowercase
if word_lower in hunspell_variants:
return word_lower
# if it is an unknown word, keep it as-is
return word
def print_progress(current, total, start_time, interval):
if current % interval == 0 or current == total:
avg_time = (time.time() - start_time) / current
remaining_time = (total - current) * avg_time
HH, rem = divmod(int(remaining_time), 3600)
MM, SS = divmod(rem, 60)
print(f"\rFixing text case using hunspell... {current}/{total}, Remaining: {HH:02}:{MM:02}:{SS:02}", end=" ")
def run_hunspell_batch(words, aff_path, dic_path, num_workers):
total = len(words)
start_time = time.time()
with Pool(
processes=num_workers,
initializer=init_hunspell_worker,
initargs=(aff_path, dic_path)
) as pool:
for i, correct_word in enumerate (pool.imap_unordered(fix_word_text_case, words), 1):
print_progress(i, total, start_time, 300)
yield correct_word
def main():
parser = argparse.ArgumentParser(description="Correct the text case of a word list using Hunspell.")
parser.add_argument("word_list", help="Path to the full list of words.")
parser.add_argument("output", help="Path to save the corrected words.")
parser.add_argument("--aff", required=True, help="Path to Hunspell .aff file.")
parser.add_argument("--dic", required=True, help="Path to Hunspell .dic file.")
args = parser.parse_args()
if not os.path.exists(args.word_list):
print(f"Full word list not found: {args.word_list}")
return
if not os.path.exists(args.aff):
print(f"Hunspell .aff file not found: {args.aff}")
return
if not os.path.exists(args.dic):
print(f"Hunspell .dic file not found: {args.dic}")
return
all_words = load_unique_words(args.word_list)
print(f"Loaded {len(all_words)} candidate words.")
corrected_words = run_hunspell_batch(all_words, args.aff, args.dic, cpu_count())
with open(args.output, 'w', encoding='utf-8') as f:
for word in sorted(corrected_words):
f.write(word + '\n')
print(" ") # clear the '\r'
if __name__ == "__main__":
main()
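For reference, a minimal standalone run of scripts/fix-text-case.py might look like the sketch below. The file names are placeholders, the Slovak .aff/.dic pair is borrowed from the clean-raw-dictionary.sh example above, and the hunspell Python module from scripts/requirements.txt must be installed first (clean-raw-dictionary.sh does this inside the venv it creates):

pip install -r scripts/requirements.txt
python scripts/fix-text-case.py sk-words.txt sk-words-cased.txt --aff sk_SK.aff --dic sk_SK.dic

clean-raw-dictionary.sh makes the same call as the final processing step of its pipeline.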


@@ -0,0 +1,64 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict
def load_unique_words(word_list_path):
words = set()
with open(word_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
words.add(word.lower())
return words
def load_known_suffixes(suffix_file_path):
suffixes = set()
with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
suffix = line.strip()
if suffix:
suffixes.add(suffix)
return suffixes
def generate_from_args(args):
word, suffixes = args
return {word + suffix for suffix in suffixes}
def generate_words(words, suffixes, num_workers):
new_words = set()
with Pool(processes=num_workers) as pool:
for result in pool.imap_unordered(generate_from_args, ((word, suffixes) for word in words)):
new_words.update(result)
return new_words
def main():
parser = argparse.ArgumentParser(description="Naively generate new words using a list of stems and a list of suffixes. Note that, you will have to clean up the invalid words after that.")
parser.add_argument("word_list", help="Path to the full list of words to filter")
parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
parser.add_argument("output", help="Path to save the filtered output")
args = parser.parse_args()
if not os.path.exists(args.word_list):
print(f"Full word list not found: {args.word_list}")
return
if not os.path.exists(args.suffix_file):
print(f"Suffix file not found: {args.suffix_file}")
return
print("Generating new words...", end=' ')
all_words = load_unique_words(args.word_list)
known_suffixes = load_known_suffixes(args.suffix_file)
print(f"\rGenerating new words out of {len(all_words)} stems and {len(known_suffixes)} suffixes...", end=' ')
generated = generate_words(all_words, known_suffixes, cpu_count())
print(f"OK ({len(generated) - len(all_words)} new words)")
with open(args.output, 'w', encoding='utf-8') as f:
for word in generated:
f.write(word + '\n')
if __name__ == "__main__":
main()
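This appears to be the scripts/generate-words-from-suffixes.py helper that clean-raw-dictionary.sh invokes above (the diff header with the file name is not shown in this view). A standalone call, mirroring the wrapper script and using placeholder file names, might look like:

python scripts/generate-words-from-suffixes.py stems.txt suffixes.txt generated-raw.txt

As the help text notes, the output is generated naively (every stem combined with every suffix), so it still has to be validated afterwards, e.g. with hunspell -G as clean-raw-dictionary.sh does.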


@@ -1,111 +0,0 @@
const { basename } = require('path');
const { createReadStream, existsSync } = require('fs');
const { createInterface } = require('readline');
const { print, printError } = require('./_printers.js');
function printHelp() {
print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`);
print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
print('Example UnicodeRange: U+900-U+97F');
print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
if (!validateUnicodeRange(process.argv[3])) {
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
process.exit(2);
}
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
process.exit(2);
}
return {
fileName: process.argv[2],
searchRegexString: process.argv[3],
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : ''
};
}
function validateUnicodeRange(inputRange) {
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
}
function URangeToXRange(range) {
if (range.length === 0) {
return null;
}
return range
.toUpperCase()
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(w));
}
}
function cleanInvalidChars(line, searchRegex, excludeRegex) {
const spacesOnly = /^\s+$/;
if (!line || !line.length || spacesOnly.test(line)) {
return [];
}
const cleanLine = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;
return cleanLine
.replace(searchRegex, ' ')
.split(' ')
.filter(w => w.length > 1);
}
async function readWords(fileName, searchRegex, excludeRegex) {
const words = new Set();
if (!fileName) {
return words;
}
for await (const line of createInterface({ input: createReadStream(fileName) })) {
cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w));
}
return words;
}
async function work({ fileName, searchRegexString, excludeRegexString }) {
const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu");
const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null;
const words = Array.from(await readWords(fileName, searchRegex, excludeRegex));
return words.filter(word => word.length > 1).sort();
}
/** main **/
work(validateInput())
.then(words => printWords(words))
.catch(e => printError(e));


@@ -1,85 +0,0 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');;
const { print, printError } = require('./_printers.js')
function printHelp() {
print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
print('Normalizes the frequencies in a dictionary with transcriptions.');
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
process.exit(2);
}
return {
fileName: process.argv[2]
};
}
function printWords(wordList) {
if (Array.isArray(wordList)) {
wordList.forEach(w => print(
w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`
));
}
}
const { fileName } = validateInput();
const data = readFileSync(fileName, 'utf8');
const lines = data.trim().split('\n');
// Parse the data into an array of objects
let entries = lines.map(line => {
const parts = line.split('\t');
return {
original: line,
chinese: parts[0],
latin: parts[1],
number: parts[2] ? parseInt(parts[2], 10) : null
};
});
// Group entries by the Latin character sequence
const groups = {};
entries.forEach(entry => {
if (!groups[entry.latin]) {
groups[entry.latin] = [];
}
groups[entry.latin].push(entry);
});
// Process each group: sort by number (descending) and reassign ordinal numbers
let sortedEntries = [];
for (const key in groups) {
let group = groups[key];
// Separate entries with and without numbers
let withNumbers = group.filter(e => e.number !== null);
let withoutNumbers = group.filter(e => e.number === null);
// Sort by number in descending order
withNumbers.sort((a, b) => b.number - a.number);
// Assign ordinal rankings
for (let i = 0; i < withNumbers.length; i++) {
withNumbers[i].number = (withNumbers.length - i).toString();
}
// Preserve original order for entries without numbers
sortedEntries.push(...withNumbers, ...withoutNumbers);
}
printWords(sortedEntries);


@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict
def print_error(message):
print(message, file=sys.stderr)
def parse_args():
parser = argparse.ArgumentParser(
description="Normalizes the frequencies in a dictionary with transcriptions."
)
parser.add_argument(
"word_list",
help="Path to the word list file (e.g., WORD-LIST.txt)"
)
return parser.parse_args()
def validate_file(file_path):
if not os.path.isfile(file_path):
print_error(f'Failure! Could not find word list file "{file_path}".')
sys.exit(2)
def load_entries(file_path):
with open(file_path, encoding='utf-8') as f:
lines = [line.strip() for line in f if line.strip()]
entries = []
for line_num, line in enumerate(lines, start=1):
parts = line.split('\t')
if len(parts) < 2:
print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
sys.exit(3)
chinese, latin = parts[:2]
number = None
if len(parts) > 2:
try:
number = int(parts[2])
except ValueError:
print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
sys.exit(3)
entries.append({'chinese': chinese, 'latin': latin, 'number': number})
return entries
def group_entries(entries):
groups = defaultdict(list)
for entry in entries:
groups[entry['latin']].append(entry)
return groups
def normalize_frequencies(groups):
sorted_entries = []
for group in groups.values():
with_numbers = [e for e in group if e['number'] is not None]
without_numbers = [e for e in group if e['number'] is None]
with_numbers.sort(key=lambda e: e['number'], reverse=True)
for rank, entry in enumerate(with_numbers, start=1):
entry['number'] = str(len(with_numbers) - rank + 1)
sorted_entries.extend(with_numbers)
sorted_entries.extend(without_numbers)
return sorted_entries
def print_entries(entries):
for e in entries:
parts = [e['chinese'], e['latin']]
if e['number'] is not None:
parts.append(e['number'])
print('\t'.join(parts))
def main():
args = parse_args()
validate_file(args.word_list)
entries = load_entries(args.word_list)
groups = group_entries(entries)
sorted_entries = normalize_frequencies(groups)
print_entries(sorted_entries)
if __name__ == "__main__":
main()
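This appears to be the Python replacement for the normalize-transcribed script mentioned in the commit message (the diff header with the file name is not shown in this view). A small worked example may clarify the re-ranking; the entries and raw frequencies below are made up. Given a tab-separated input:

你	ni	873
尼	ni	41
妮	ni	12
呢	ne	99

the entries sharing the transcription "ni" are sorted by raw frequency and re-numbered so that the most frequent one gets the highest ordinal, while a single-entry group collapses to 1:

你	ni	3
尼	ni	2
妮	ni	1
呢	ne	1

Entries without a third field keep no number and are printed after the numbered ones in their group.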

scripts/remove-random-words.sh Executable file

@@ -0,0 +1,46 @@
#!/bin/bash
if [ $# -ne 7 ]; then
echo "Usage: $0 <locale> <all-words.txt> <courpus-words-with-frequencies.txt> <bad-combinations.txt> <output-file.txt> <vowels-list> <unpopular-max-length>"
echo "Example (Polish): $0 pl-PL pl.txt pl-corpus.txt pl-bad-combinations.txt pl-reduced.txt aąeęijoóuy 13"
exit 1
fi
LOCALE="$1"
ORIGINAL_WORDS="$2"
CORPUS_WORDS="$3"
BAD_COMB_FILE="$4"
OUTPUT_FILE="$5"
VOWELS="$6"
UNPOPULAR_MAX_LENGTH="$7"
if ! [[ -f "$ORIGINAL_WORDS" ]]; then
echo "All words file: '$ORIGINAL_WORDS' does not exist"
exit 2
fi
if ! [[ -f "$CORPUS_WORDS" ]]; then
echo "Corpus words file: '$CORPUS_WORDS' does not exist"
exit 2
fi
if ! [[ -f "$BAD_COMB_FILE" ]]; then
echo "Bad letter combinations file: '$BAD_COMB_FILE' does not exist"
exit 2
fi
BAD_LETTER_COMBINATIONS=$(paste -sd'|' "$BAD_COMB_FILE")
sed -E 's/^[^\t]+\t[12]$//g' "$CORPUS_WORDS" | grep . | sed -E 's/[\t0-9]+$//g' > __tmp__popular.txt &
grep -Ev "(.)\1{2,}" "$ORIGINAL_WORDS" | grep -Ev "($BAD_LETTER_COMBINATIONS)" | grep -Ev "[$VOWELS]{3,}" | sort -u | uniq > __tmp__reduced.txt &
wait
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__popular.txt | grep -Ev "(.)\1{2,}" > __tmp__unpopular.txt
node ~/src/tt9/scripts/remove-foreign-words.js --blacklist "$LOCALE" __tmp__reduced.txt "$LOCALE" __tmp__unpopular.txt > __tmp__popular_reduced.txt &
awk -v maxlen="$UNPOPULAR_MAX_LENGTH" 'length($0) <= maxlen' __tmp__unpopular.txt > __tmp__unpopular_reduced.txt &
wait
cat __tmp__popular_reduced.txt __tmp__unpopular_reduced.txt | sort -u | uniq > $OUTPUT_FILE
rm -f __tmp__*.txt

scripts/requirements.txt Normal file

@@ -0,0 +1 @@
hunspell==0.5.5

scripts/whitelist-filter.py Normal file

@@ -0,0 +1,102 @@
import argparse
import os
from multiprocessing import Pool, cpu_count
from collections import defaultdict
def load_stem_buckets(whitelist_path):
buckets = defaultdict(set)
with open(whitelist_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
word_lc = word.lower()
first_char = word_lc[0]
buckets[first_char].add(word_lc)
return dict(buckets)
def load_unique_words(full_list_path):
words = set()
with open(full_list_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
word = line.strip()
if '<EFBFBD>' in word:
continue
words.add(word)
return words
def load_known_suffixes(suffix_file_path):
suffixes = set()
with open(suffix_file_path, 'r', encoding='utf-8', errors='replace') as f:
for line in f:
suffix = line.strip()
if suffix:
suffixes.add(suffix)
return suffixes
def match_word(word, buckets, known_suffixes):
"""Return all valid combinations: base word and word+suffix if found in stems."""
word_lc = word.lower()
first_char = word_lc[0]
possible_stems = buckets.get(first_char, set())
matches = []
if word_lc in possible_stems:
matches.append(word)
for suffix in known_suffixes:
compound_word = word_lc + suffix
if compound_word in possible_stems:
matches.append(compound_word)
return matches
def filter_words_parallel(all_words, stem_buckets, known_suffixes, num_workers):
args = [(word, stem_buckets, known_suffixes) for word in all_words]
with Pool(processes=num_workers) as pool:
results = pool.starmap(match_word, args)
matched_words = set()
for match_list in results:
matched_words.update(match_list)
return matched_words
def main():
parser = argparse.ArgumentParser(description="Filter given words by a stem whitelist. The list of suffixes is used to generate more variants of the valid words.")
parser.add_argument("whitelist", help="Path to the whitelist file (with valid words)")
parser.add_argument("full_list", help="Path to the full list of words to filter")
parser.add_argument("suffix_file", help="Path to the file containing known suffixes")
parser.add_argument("output", help="Path to save the filtered output")
args = parser.parse_args()
if not os.path.exists(args.whitelist):
print(f"Whitelist file not found: {args.whitelist}")
return
if not os.path.exists(args.full_list):
print(f"Full word list not found: {args.full_list}")
return
if not os.path.exists(args.suffix_file):
print(f"Suffix file not found: {args.suffix_file}")
return
stem_buckets = load_stem_buckets(args.whitelist)
print(f"Loaded {sum(len(s) for s in stem_buckets.values())} valid stems across {len(stem_buckets)} buckets.")
all_words = load_unique_words(args.full_list)
print(f"Loaded {len(all_words)} candidate words.")
known_suffixes = load_known_suffixes(args.suffix_file)
print(f"Loaded {len(known_suffixes)} known suffixes.")
workers = cpu_count()
print(f"Filtering using {workers} threads...", end=' ')
filtered = filter_words_parallel(all_words, stem_buckets, known_suffixes, workers)
print(f"OK. Matched {len(filtered)} words.")
with open(args.output, 'w', encoding='utf-8') as f:
for word in sorted(filtered):
f.write(word + '\n')
if __name__ == "__main__":
main()
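A standalone invocation of scripts/whitelist-filter.py, with placeholder file names, might look like this sketch:

python scripts/whitelist-filter.py sk-whitelist.txt sk-raw-words.txt sk-suffix.txt sk-whitelisted.txt

A candidate word is kept when its lowercase form appears in the whitelist; additionally, any lowercase word+suffix combination found in the whitelist is emitted as well. The whitelist is loaded into per-letter buckets keyed by the word's first character before the parallel matching starts.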