New dictionary tools (#830)
* new dictionary tools for generating an app dictionary from raw word lists
* replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese
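
For reference, the normalization re-ranks raw frequency counts within each group of entries sharing a transcription: the most frequent entry in a group gets the highest rank, the least frequent gets 1, and entries without a count are passed through unchanged. In this hypothetical three-line word list (words and counts made up for illustration), the raw counts 95321 and 87 become ranks 2 and 1:

明日	ashita	95321
足下	ashita	87
あした	ashita

becomes:

明日	ashita	2
足下	ashita	1
あした	ashita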
parent a34baef0f3
commit 62e8a08576
12 changed files with 533 additions and 439 deletions
scripts/normalize-transcribed.py (new file, 86 additions)
@@ -0,0 +1,86 @@
import sys
import os
import argparse
from collections import defaultdict


def print_error(message):
    print(message, file=sys.stderr)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Normalizes the frequencies in a dictionary with transcriptions."
    )
    parser.add_argument(
        "word_list",
        help="Path to the word list file (e.g., WORD-LIST.txt)"
    )
    return parser.parse_args()


def validate_file(file_path):
    if not os.path.isfile(file_path):
        print_error(f'Failure! Could not find word list file "{file_path}".')
        sys.exit(2)


def load_entries(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    entries = []
    # Keep blank lines in the enumeration so reported line numbers match the file.
    for line_num, line in enumerate(lines, start=1):
        if not line:
            continue

        parts = line.split('\t')
        if len(parts) < 2:
            print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
            sys.exit(3)

        chinese, latin = parts[:2]
        number = None
        if len(parts) > 2:
            try:
                number = int(parts[2])
            except ValueError:
                print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
                sys.exit(3)

        entries.append({'chinese': chinese, 'latin': latin, 'number': number})

    return entries


def group_entries(entries):
    # Bucket entries by transcription, so frequencies are ranked per transcription group.
    groups = defaultdict(list)
    for entry in entries:
        groups[entry['latin']].append(entry)
    return groups


def normalize_frequencies(groups):
    sorted_entries = []
    for group in groups.values():
        with_numbers = [e for e in group if e['number'] is not None]
        without_numbers = [e for e in group if e['number'] is None]

        with_numbers.sort(key=lambda e: e['number'], reverse=True)

        # Replace raw frequency counts with their rank within the group:
        # the most frequent entry gets len(with_numbers), the least frequent gets 1.
        for rank, entry in enumerate(with_numbers, start=1):
            entry['number'] = str(len(with_numbers) - rank + 1)

        sorted_entries.extend(with_numbers)
        sorted_entries.extend(without_numbers)

    return sorted_entries


def print_entries(entries):
    for e in entries:
        parts = [e['chinese'], e['latin']]
        if e['number'] is not None:
            parts.append(e['number'])
        print('\t'.join(parts))


def main():
    args = parse_args()
    validate_file(args.word_list)
    entries = load_entries(args.word_list)
    groups = group_entries(entries)
    sorted_entries = normalize_frequencies(groups)
    print_entries(sorted_entries)


if __name__ == "__main__":
    main()
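
A minimal invocation sketch: the script takes the word list path as its only argument and writes the normalized entries to stdout, so redirect the output to capture it (the output file name here is a placeholder):

python scripts/normalize-transcribed.py WORD-LIST.txt > WORD-LIST-normalized.txt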