New dictionary tools (#830)

* New dictionary tools for generating an app dictionary from raw word lists.
* Replaced the normalize-transcribed script with a Python version, since the JavaScript version was running out of memory for larger languages like Japanese.
2025-06-27 17:01:42 +03:00 · 2025-06-27 17:01:42 +03:00 · 62e8a08576
commit 62e8a08576
parent a34baef0f3
12 changed files with 533 additions and 439 deletions
--- a/scripts/extract-words-from-text.py
+++ b/scripts/extract-words-from-text.py
@@ -0,0 +1,47 @@
+import sys
+import re
+from collections import Counter
+from os.path import basename
+
+def usage():
+    """Print the command-line synopsis to stdout and exit with status 1."""
+    print(f"Usage: {basename(__file__)} [--freq|-f] <allowed_letters> <file1> [file2 ...]")
+    sys.exit(1)
+
+# Parse the command line: an optional leading --freq/-f flag, followed by the
+# string of allowed letters and one or more input files.
+args = sys.argv[1:]
+
+show_freq = bool(args) and args[0] in ("--freq", "-f")
+if show_freq:
+    args = args[1:]
+
+# After the optional flag, both the letter set and at least one file are required.
+if len(args) < 2:
+    usage()
+
+allowed_letters = set(args[0])
+file_paths = args[1:]
+
+# Word pattern; \w is already Unicode-aware for str patterns in Python 3,
+# so no explicit flag is needed.
+word_pattern = re.compile(r'\b\w+\b')
+word_counts = Counter()
+
+# Process files: count every word made up solely of allowed letters.
+for path in file_paths:
+    try:
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                word_counts.update(
+                    word
+                    for word in word_pattern.findall(line)
+                    if allowed_letters.issuperset(word)
+                )
+    except (OSError, UnicodeError) as e:
+        # Best effort: report the unreadable file and continue with the rest.
+        # Words counted from lines read before the failure are kept.
+        print(f"Error reading {path}: {e}", file=sys.stderr)
+
+# Output: words in sorted order, optionally with a tab-separated count.
+if show_freq:
+    for word, count in sorted(word_counts.items()):
+        print(f"{word}\t{count}")
+else:
+    for word in sorted(word_counts):
+        print(word)