New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists
* replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
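
For reference, the normalization re-ranks raw frequency counts within each group of entries sharing a transcription: the most frequent entry in a group gets the highest rank, the least frequent gets 1, and entries without a count are passed through unchanged. In this hypothetical three-line word list (words and counts made up for illustration), the raw counts 95321 and 87 become ranks 2 and 1:

明日	ashita	95321
足下	ashita	87
あした	ashita

becomes:

明日	ashita	2
足下	ashita	1
あした	ashita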
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions
scripts/normalize-transcribed.py (new file, 86 additions)
@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict


def print_error(message):
    print(message, file=sys.stderr)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Normalizes the frequencies in a dictionary with transcriptions."
    )
    parser.add_argument(
        "word_list",
        help="Path to the word list file (e.g., WORD-LIST.txt)"
    )
    return parser.parse_args()


def validate_file(file_path):
    if not os.path.isfile(file_path):
        print_error(f'Failure! Could not find word list file "{file_path}".')
        sys.exit(2)


def load_entries(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    entries = []
    # Keep blank lines in the enumeration so reported line numbers match the file.
    for line_num, line in enumerate(lines, start=1):
        if not line:
            continue

        parts = line.split('\t')
        if len(parts) < 2:
            print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
            sys.exit(3)

        chinese, latin = parts[:2]
        number = None
        if len(parts) > 2:
            try:
                number = int(parts[2])
            except ValueError:
                print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
                sys.exit(3)

        entries.append({'chinese': chinese, 'latin': latin, 'number': number})

    return entries


def group_entries(entries):
    # Bucket entries by transcription, so frequencies are ranked per transcription group.
    groups = defaultdict(list)
    for entry in entries:
        groups[entry['latin']].append(entry)
    return groups


def normalize_frequencies(groups):
    sorted_entries = []
    for group in groups.values():
        with_numbers = [e for e in group if e['number'] is not None]
        without_numbers = [e for e in group if e['number'] is None]

        with_numbers.sort(key=lambda e: e['number'], reverse=True)

        # Replace raw frequency counts with their rank within the group:
        # the most frequent entry gets len(with_numbers), the least frequent gets 1.
        for rank, entry in enumerate(with_numbers, start=1):
            entry['number'] = str(len(with_numbers) - rank + 1)

        sorted_entries.extend(with_numbers)
        sorted_entries.extend(without_numbers)

    return sorted_entries


def print_entries(entries):
    for e in entries:
        parts = [e['chinese'], e['latin']]
        if e['number'] is not None:
            parts.append(e['number'])
        print('\t'.join(parts))


def main():
    args = parse_args()
    validate_file(args.word_list)
    entries = load_entries(args.word_list)
    groups = group_entries(entries)
    sorted_entries = normalize_frequencies(groups)
    print_entries(sorted_entries)


if __name__ == "__main__":
    main()
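
A minimal invocation sketch: the script takes the word list path as its only argument and writes the normalized entries to stdout, so redirect the output to capture it (the output file name here is a placeholder):

python scripts/normalize-transcribed.py WORD-LIST.txt > WORD-LIST-normalized.txt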