A script for ingesting words by Unicode range

2024-12-08 15:12:21 +02:00 · 2024-12-08 15:12:21 +02:00 · e9b832c46c
commit e9b832c46c
parent 00e00e1802
1 changed file with 111 additions and 0 deletions
--- a/scripts/injest-script-words.js
+++ b/scripts/injest-script-words.js
@ -0,0 +1,111 @@
+const { basename } = require('path');
+const { createReadStream, existsSync } = require('fs');
+const { createInterface } = require('readline');
+const { print, printError } = require('./_printers.js');
+
+
/**
 * Prints the command line usage: the expected arguments, a short description
 * of what the script does and one example value for each range argument.
 */
function printHelp() {
	const scriptName = basename(process.argv[1]);
	const helpLines = [
		`Usage ${scriptName} word-list.txt UnicodeRange [ExcludeRange]`,
		'Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.',
		'Example UnicodeRange: U+900-U+97F',
		'Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D'
	];

	for (const helpLine of helpLines) {
		print(helpLine);
	}
}
+
+
+
/**
 * Checks the command line arguments and ends the process when they cannot
 * be used:
 *  - exit code 1, after printing the help, when no argument was given;
 *  - exit code 2 when the word list file does not exist, when the search
 *    range is missing or invalid, or when an exclude range is given but
 *    invalid.
 *
 * @returns {{fileName: string, searchRegexString: string, excludeRegexString: string}}
 *   the word list path, the search range and the exclude range (an empty
 *   string when the optional exclude range was not supplied)
 */
function validateInput() {
	const [, , wordListPath, searchRange, excludeRange] = process.argv;

	// Reports a fatal argument problem and stops with the "bad input" exit code.
	const fail = (message) => {
		printError(message);
		process.exit(2);
	};

	if (process.argv.length < 3) {
		printHelp();
		process.exit(1);
	}

	if (!existsSync(wordListPath)) {
		fail(`Failure! Could not find word list file "${wordListPath}".`);
	}

	// A missing search range is caught here as well, hence "Invalid or missing".
	if (!validateUnicodeRange(searchRange)) {
		fail(`Failure! Invalid or missing search Unicode range(s): "${searchRange}".`);
	}

	// The exclude range is optional, so it is only validated when present.
	if (excludeRange && !validateUnicodeRange(excludeRange)) {
		fail(`Failure! Invalid exclude range(s): "${excludeRange}".`);
	}

	let excludeRegexString = '';
	if (typeof excludeRange === 'string') {
		excludeRegexString = excludeRange;
	}

	return {
		fileName: wordListPath,
		searchRegexString: searchRange,
		excludeRegexString
	};
}
+
+
/**
 * Checks whether an argument is a usable list of Unicode code points and
 * ranges, written as "U+HEX" or "U+HEX-U+HEX" items placed back to back,
 * for example "U+950-U+954U+964-U+971U+200C". The "U" may be lower case.
 *
 * Besides the overall shape, every item is checked so that a character class
 * built from it with the "u" flag cannot be rejected by the RegExp
 * constructor: no code point may exceed U+10FFFF and no range may run
 * backwards.
 *
 * @param {*} inputRange - the raw argument; it is converted to a string first,
 *   so a missing argument (undefined) is simply reported as invalid
 * @returns {boolean} true when the argument is a well-formed range list
 */
function validateUnicodeRange(inputRange) {
	const MAX_CODE_POINT = 0x10FFFF;
	const rangeText = String(inputRange);

	// At most one dash may join two code points: "U+900--U+97F" is not a range.
	if (!/^[uU]\+[\da-fA-F]+(?:-?[uU]\+[\da-fA-F]+)*$/.test(rangeText)) {
		return false;
	}

	// Read the items the way a character class does: a code point, optionally
	// followed by a dash and the end of the range.
	const items = rangeText.matchAll(/[uU]\+([\da-fA-F]+)(?:-[uU]\+([\da-fA-F]+))?/g);
	for (const [, firstHex, lastHex = firstHex] of items) {
		const first = Number.parseInt(firstHex, 16);
		const last = Number.parseInt(lastHex, 16);

		if (first > last || last > MAX_CODE_POINT) {
			return false;
		}
	}

	return true;
}
+
+
/**
 * Rewrites a "U+HEX" range list as the body of a regular expression
 * character class: every "U+HEX" item becomes a "\u{HEX}" escape, while
 * everything else (such as the dashes) is kept. The whole text is upper-cased
 * before the conversion.
 *
 * @param {string} range - range list, e.g. "U+900-U+97F"
 * @returns {string|null} the character class body, or null for an empty string
 */
function URangeToXRange(range) {
	const isEmpty = range.length === 0;
	if (isEmpty) {
		return null;
	}

	const upperCased = range.toUpperCase();
	return upperCased.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
}
+
+
/**
 * Prints every entry of a word list with a separate print() call.
 * A value that is not an array is ignored.
 *
 * @param {string[]} wordList - the words to print
 */
function printWords(wordList) {
	if (!Array.isArray(wordList)) {
		return;
	}

	wordList.forEach((word) => {
		print(word);
	});
}
+
+
/**
 * Extracts the words of one line of text.
 *
 * Every match of excludeRegex (when given) and then every match of
 * searchRegex is replaced by a single space; the remaining text is split on
 * spaces and only pieces longer than one character are kept.
 *
 * @param {string} line - one line of the word list
 * @param {RegExp} searchRegex - matches the text that separates words; it is
 *   passed to String.prototype.replace, so it needs the "g" flag to replace
 *   every occurrence
 * @param {RegExp|null} excludeRegex - matches additional text to treat as a
 *   separator, or null to skip this step
 * @returns {string[]} the words in order of appearance, duplicates included;
 *   empty for an empty or whitespace-only line
 */
function cleanInvalidChars(line, searchRegex, excludeRegex) {
	if (!line || /^\s+$/.test(line)) {
		return [];
	}

	const withoutExcluded = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;

	return withoutExcluded
		.replace(searchRegex, ' ')
		.split(' ')
		// Count code points rather than UTF-16 units: a lone character outside
		// the BMP has a length of 2 and would otherwise pass as a word.
		.filter(word => [...word].length > 1);
}
+
+
/**
 * Reads a text file line by line and gathers the distinct words found by
 * cleanInvalidChars() on each line.
 *
 * @param {string} fileName - path of the word list; a falsy value yields an empty set
 * @param {RegExp} searchRegex - forwarded to cleanInvalidChars()
 * @param {RegExp|null} excludeRegex - forwarded to cleanInvalidChars()
 * @returns {Promise<Set<string>>} the unique words, in order of first appearance
 */
async function readWords(fileName, searchRegex, excludeRegex) {
	const uniqueWords = new Set();

	if (fileName) {
		const lineReader = createInterface({ input: createReadStream(fileName) });

		for await (const line of lineReader) {
			for (const word of cleanInvalidChars(line, searchRegex, excludeRegex)) {
				uniqueWords.add(word);
			}
		}
	}

	return uniqueWords;
}
+
+
/**
 * Builds the regular expressions for the requested ranges, reads the word
 * list and returns the sorted, de-duplicated words.
 *
 * @param {Object} options
 * @param {string} options.fileName - path of the word list file
 * @param {string} options.searchRegexString - "U+HEX" range list of the characters that form words
 * @param {string} options.excludeRegexString - "U+HEX" range list of characters to treat as
 *   separators, or an empty string for none
 * @returns {Promise<string[]>} the unique words longer than one character, in
 *   the default Array.prototype.sort() order
 */
async function work({ fileName, searchRegexString, excludeRegexString }) {
	// Negated class: any run of characters outside the searched range separates words.
	const searchRegex = new RegExp(`[^${URangeToXRange(searchRegexString)}]+`, 'gu');
	const excludeRegex = excludeRegexString.length > 0
		? new RegExp(`[${URangeToXRange(excludeRegexString)}]+`, 'gu')
		: null;

	const uniqueWords = await readWords(fileName, searchRegex, excludeRegex);

	return [...uniqueWords]
		// Count code points rather than UTF-16 units, so that a lone character
		// outside the BMP (length 2) is not mistaken for a word.
		.filter(word => [...word].length > 1)
		.sort();
}
+
+
+
/** main: validate the arguments, extract the words, then print them (or the error) **/
const cliArguments = validateInput();

work(cliArguments)
	.then(printWords)
	.catch(printError);