1
0
Fork 0
tt9/scripts/normalize-transcribed.py
Dimo Karaivanov 62e8a08576
New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists

* replaced the normalize-transcribed script with a Python version, since the Javascript version was running out of memory for larger languages like Japanese
2025-06-27 17:01:42 +03:00

86 lines
2.5 KiB
Python

import sys
import os
import argparse
from collections import defaultdict
def print_error(message):
    """Write *message* to standard error, followed by a newline."""
    stream = sys.stderr
    print(message, file=stream)
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``word_list``: the path to the
    word list file.
    """
    cli = argparse.ArgumentParser(description="Normalizes the frequencies in a dictionary with transcriptions.")
    cli.add_argument("word_list", help="Path to the word list file (e.g., WORD-LIST.txt)")
    namespace = cli.parse_args()
    return namespace
def validate_file(file_path):
    """Exit with status 2 unless *file_path* names an existing file.

    On failure an error message is written to standard error before
    exiting. Returns ``None`` when the file exists.
    """
    if os.path.isfile(file_path):
        return

    print_error(f'Failure! Could not find word list file "{file_path}".')
    sys.exit(2)
def load_entries(file_path):
    """Read a tab-separated word list into a list of entry dicts.

    Every non-blank line must contain at least two tab-separated fields:
    the word and its Latin transcription. An optional third field is an
    integer frequency. Any fields after the third are ignored. Blank
    (whitespace-only) lines are skipped.

    Args:
        file_path: Path to a UTF-8 encoded word list. A leading byte
            order mark is tolerated.

    Returns:
        A list of dicts, in file order, with the keys ``'chinese'``
        (first field), ``'latin'`` (second field) and ``'number'``
        (the third field as ``int``, or ``None`` when it is absent).

    Exits:
        With status 3, after printing a message to standard error, when
        a line has fewer than two fields or a non-integer third field.
    """
    entries = []
    # 'utf-8-sig' also decodes plain UTF-8, but drops a leading BOM so it
    # does not end up glued to the first word of the file.
    with open(file_path, encoding='utf-8-sig') as f:
        # Count physical lines (blank ones included) so that the line
        # number in an error message matches what an editor shows.
        for line_num, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue

            parts = line.split('\t')
            if len(parts) < 2:
                print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
                sys.exit(3)

            chinese, latin = parts[:2]
            number = None
            if len(parts) > 2:
                try:
                    number = int(parts[2])
                except ValueError:
                    print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
                    sys.exit(3)

            entries.append({'chinese': chinese, 'latin': latin, 'number': number})
    return entries
def group_entries(entries):
    """Bucket entries by their Latin transcription.

    Returns a ``defaultdict(list)`` mapping each ``'latin'`` value to the
    entries that carry it. Keys appear in first-seen order and the
    entries inside each bucket keep their input order.
    """
    by_latin = defaultdict(list)
    for item in entries:
        transcription = item['latin']
        by_latin[transcription].append(item)
    return by_latin
def normalize_frequencies(groups):
    """Replace raw frequencies with per-group ranks and flatten the groups.

    Within each group, the entries that have a number are ordered from
    the highest number to the lowest (ties keep their input order) and
    are renumbered in place with decimal strings: the first one receives
    the count of numbered entries in the group, the last one "1".
    Entries without a number follow them, unchanged.

    Returns a single flat list, with the groups in the iteration order
    of *groups*.
    """
    flattened = []
    for members in groups.values():
        ranked = sorted(
            (m for m in members if m['number'] is not None),
            key=lambda m: m['number'],
            reverse=True,
        )
        unranked = [m for m in members if m['number'] is None]

        # Count down from the group size so the most frequent entry
        # keeps the largest value.
        top = len(ranked)
        for offset, member in enumerate(ranked):
            member['number'] = str(top - offset)

        flattened += ranked
        flattened += unranked
    return flattened
def print_entries(entries):
    """Print each entry to standard output as one tab-separated line.

    The line holds the ``'chinese'`` and ``'latin'`` fields, followed by
    ``'number'`` when it is not ``None``.

    Args:
        entries: Iterable of entry dicts with the keys ``'chinese'``,
            ``'latin'`` and ``'number'``. The number may be a ``str``
            or an ``int``.
    """
    for e in entries:
        fields = [e['chinese'], e['latin']]
        if e['number'] is not None:
            # str.join() rejects non-strings; coerce so that entries whose
            # number is still an int (not yet normalized) print as well.
            fields.append(str(e['number']))
        print('\t'.join(fields))
def main():
    """Run the tool end to end.

    Reads the word list named on the command line, groups the entries by
    transcription, normalizes the frequencies inside every group and
    prints the result to standard output.
    """
    word_list_path = parse_args().word_list
    validate_file(word_list_path)

    grouped = group_entries(load_entries(word_list_path))
    print_entries(normalize_frequencies(grouped))
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()