new word processing scripts

* a script for ingesting words by Unicode range * Devanagari validation script
2024-12-08 15:12:21 +02:00 · 2024-12-08 15:12:21 +02:00 · 622a954633
commit 622a954633
parent e9b832c46c
2 changed files with 245 additions and 0 deletions
--- a/scripts/extract-frequencies-from-text.js
+++ b/scripts/extract-frequencies-from-text.js
@ -0,0 +1,149 @@
 const { basename } = require('path');
 const { createReadStream, existsSync } = require('fs');
 const { createInterface } = require('readline');
 const { print, printError } = require('./_printers.js');
 /**
  * Prints the usage line, a short description of the script and one example
  * for each of the three range arguments.
  */
 function printHelp() {
 	const scriptName = basename(process.argv[1]);
 	const helpLines = [
 		`Usage ${scriptName} text.txt UnicodeRange [ExcludeRange] [EraseRange]`,
 		'From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.',
 		'Example UnicodeRange: U+900-U+97FU+200CU+200D',
 		'Example ExcludeRange: U+950-U+957U+966-U+97F',
 		'Example EraseRange: U+964U+965U+970U+971'
 	];
 	for (const helpLine of helpLines) {
 		print(helpLine);
 	}
 }
 /**
  * Validates the command-line arguments and packs them into an options object.
  *
  * Exits with code 1 (after printing the help) when no argument is given, and
  * with code 2 when the input file does not exist or when a range argument is
  * rejected by validateUnicodeRange(). The search range is mandatory; the
  * exclude and erase ranges are only validated when present.
  *
  * @returns {{fileName: string, searchRegexString: string, excludeRegexString: string, eraseRegexString: string}}
  *   the parsed arguments; omitted optional ranges are returned as empty strings.
  */
 function validateInput() {
 	if (process.argv.length < 3) {
 		printHelp();
 		process.exit(1);
 	}
 	const [, , fileName, searchRange, excludeRange, eraseRange] = process.argv;
 	if (!existsSync(fileName)) {
 		printError(`Failure! Could not find word list file "${fileName}".`);
 		process.exit(2);
 	}
 	if (!validateUnicodeRange(searchRange)) {
 		printError(`Failure! Invalid or missing search Unicode range(s): "${searchRange}".`);
 		process.exit(2);
 	}
 	if (excludeRange && !validateUnicodeRange(excludeRange)) {
 		printError(`Failure! Invalid exclude range(s): "${excludeRange}".`);
 		process.exit(2);
 	}
 	if (eraseRange && !validateUnicodeRange(eraseRange)) {
 		// Name the argument that actually failed; this used to repeat the "exclude" message.
 		printError(`Failure! Invalid erase range(s): "${eraseRange}".`);
 		process.exit(2);
 	}
 	return {
 		fileName,
 		searchRegexString: searchRange,
 		excludeRegexString: excludeRange ?? '',
 		eraseRegexString: eraseRange ?? ''
 	};
 }
 /**
  * Checks that a range argument can safely be turned into a regular expression
  * character class.
  *
  * The accepted syntax is one or more items written back to back, where an item
  * is either a single code point ("U+964") or an ascending range
  * ("U+900-U+97F"). Anything that would later break `new RegExp(..., "u")` or
  * silently change its meaning is rejected: repeated or chained hyphens,
  * descending ranges and code points above U+10FFFF.
  *
  * @param {string} inputRange the raw command-line argument.
  * @returns {boolean} true when the argument is a well-formed list of code points / ranges.
  */
 function validateUnicodeRange(inputRange) {
 	if (typeof inputRange !== 'string') {
 		return false;
 	}
 	// Overall shape: every item starts with "U+" and may carry exactly one "-U+..." end.
 	if (!/^(?:U\+[\dA-F]+(?:-U\+[\dA-F]+)?)+$/i.test(inputRange)) {
 		return false;
 	}
 	const MAX_CODE_POINT = 0x10FFFF;
 	// A single code point is treated as the range "start-start".
 	for (const [, startHex, endHex = startHex] of inputRange.matchAll(/U\+([\dA-F]+)(?:-U\+([\dA-F]+))?/gi)) {
 		const start = Number.parseInt(startHex, 16);
 		const end = Number.parseInt(endHex, 16);
 		if (start > end || end > MAX_CODE_POINT) {
 			return false;
 		}
 	}
 	return true;
 }
 /**
  * Rewrites every "U+XXXX" notation in a range string as the regular expression
  * escape "\u{XXXX}", leaving everything else (e.g. the hyphens) in place.
  *
  * @param {string} range a list of code points / ranges in "U+" notation.
  * @returns {string|null} the upper-cased, converted string, or null for an empty input.
  */
 function URangeToXRange(range) {
 	if (range.length === 0) {
 		return null;
 	}
 	const codePointNotation = /U\+([\dA-F]+)/g;
 	const upperCased = range.toUpperCase();
 	return upperCased.replace(codePointNotation, (_notation, hexDigits) => `\\u{${hexDigits}}`);
 }
 /**
  * Prints one "word<TAB>frequency" line per entry.
  *
  * @param {Array<{w: string, f: number}>} wordList the entries to print; any
  *   non-array value is silently ignored.
  */
 function printWords(wordList) {
 	if (!Array.isArray(wordList)) {
 		return;
 	}
 	const toLine = entry => `${entry.w}\t${entry.f}`;
 	wordList.forEach(entry => print(toLine(entry)));
 }
 /**
  * Splits one line of text into the words that should be counted.
  *
  * Every match of eraseRegex is replaced with a space, the result is split on
  * single spaces and only tokens longer than one UTF-16 code unit are kept. When
  * an exclude range is given, tokens containing any character of that range are
  * dropped as well.
  *
  * @param {string} line the raw line.
  * @param {RegExp} eraseRegex matches everything that acts as a word separator.
  * @param {string} excludeRegexString exclude range in "U+" notation, or '' for none.
  * @returns {string[]} the accepted words; empty for an empty or whitespace-only line.
  */
 function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
 	if (!line || !line.length || /^\s+$/.test(line)) {
 		return [];
 	}
 	let invalidWordRegex = null;
 	if (excludeRegexString.length > 0) {
 		invalidWordRegex = new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u");
 	}
 	const isCountable = token => {
 		if (token.length <= 1) {
 			return false;
 		}
 		return invalidWordRegex === null || !invalidWordRegex.test(token);
 	};
 	const tokens = line.replace(eraseRegex, ' ').split(' ');
 	return tokens.filter(isCountable);
 }
 /**
  * Reads a text file line by line and counts how often each word occurs.
  *
  * @param {string} fileName path of the file to read; a falsy value yields an empty Map.
  * @param {RegExp} eraseRegex passed through to cleanInvalidChars().
  * @param {string} excludeRegexString passed through to cleanInvalidChars().
  * @returns {Promise<Map<string, number>>} word -> number of occurrences, in order of first appearance.
  */
 async function readWords(fileName, eraseRegex, excludeRegexString) {
 	const frequencies = new Map();
 	if (!fileName) {
 		return frequencies;
 	}
 	const lineReader = createInterface({ input: createReadStream(fileName) });
 	for await (const line of lineReader) {
 		for (const word of cleanInvalidChars(line, eraseRegex, excludeRegexString)) {
 			frequencies.set(word, (frequencies.get(word) ?? 0) + 1);
 		}
 	}
 	return frequencies;
 }
 /**
  * Turns a word -> frequency Map into a sorted list of { w, f } records.
  *
  * Records are ordered by descending frequency; records with the same frequency
  * are ordered ascending by word, using the plain `<` / `>` string comparison.
  *
  * @param {Map<string, number>} wordsMap word -> frequency.
  * @returns {Array<{w: string, f: number}>} the sorted records.
  */
 function sortWords(wordsMap) {
 	const records = Array.from(wordsMap, ([w, f]) => ({ w, f }));
 	const compareRecords = (left, right) => {
 		const byFrequency = left.f > right.f ? -1 : (left.f < right.f ? 1 : 0);
 		if (byFrequency !== 0) {
 			return byFrequency;
 		}
 		return left.w < right.w ? -1 : (left.w > right.w ? 1 : 0);
 	};
 	return records.sort(compareRecords);
 }
 /**
  * Counts the words of a file and returns them sorted by frequency.
  *
  * Builds the separator regex handed to readWords(): any run of characters that
  * are neither in the search range nor a space, plus (only when an erase range
  * is given) any single character of the erase range.
  *
  * @param {object} options
  * @param {string} options.fileName the text file to read.
  * @param {string} options.searchRegexString range of characters that form words, in "U+" notation.
  * @param {string} options.excludeRegexString range that disqualifies a word, or ''.
  * @param {string} options.eraseRegexString range of extra separator characters, or ''.
  * @returns {Promise<Array<{w: string, f: number}>>} the result of sortWords().
  */
 async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
 	const separatorPatterns = ["[^" + URangeToXRange(searchRegexString) + " ]+"];
 	// URangeToXRange('') returns null; concatenating it used to produce the class
 	// "[null]", which erased the letters "n", "u" and "l" whenever no erase range was given.
 	if (eraseRegexString) {
 		separatorPatterns.push("[" + URangeToXRange(eraseRegexString) + "]");
 	}
 	const eraseRegex = new RegExp("(" + separatorPatterns.join("|") + ")", "gu");
 	return sortWords(
 		await readWords(fileName, eraseRegex, excludeRegexString)
 	);
 }
 /** main: validate the arguments, count the words, print them (or the error). **/
 const commandLineOptions = validateInput();
 work(commandLineOptions)
 	.then(printWords)
 	.catch(printError);
--- a/scripts/validate-devanagari.js
+++ b/scripts/validate-devanagari.js
@ -0,0 +1,96 @@
 const { basename } = require('path');
 const { existsSync, readFileSync } = require('fs');
 const { print, printError, printWordsWithFrequencies } = require('./_printers.js');
 /**
  * Prints the one-line usage text, using the name of the running script.
  */
 function printHelp() {
 	const scriptName = basename(process.argv[1]);
 	print(`Usage: node ${scriptName} <file>`);
 }
 /**
  * Validates the command-line arguments.
  *
  * Exits with code 1 (after printing the help) when no file argument is given
  * and with code 2 when the given file does not exist.
  *
  * @returns {{file: string}} the path of the input file.
  */
 function validateInput() {
 	const hasFileArgument = process.argv.length >= 3;
 	if (!hasFileArgument) {
 		printHelp();
 		process.exit(1);
 	}
 	const file = process.argv[2];
 	if (!existsSync(file)) {
 		printError(`Failure! Could not find the input file "${file}".`);
 		process.exit(2);
 	}
 	return { file };
 }
 /**
  * Reads a UTF-8 file that holds one word per line.
  *
  * @param {string} filename path of the file to read.
  * @returns {Set<string>} the distinct, trimmed, non-empty lines in order of first appearance.
  */
 function getWordsFromFile(filename) {
 	const uniqueWords = new Set();
 	for (const rawLine of readFileSync(filename, 'utf8').split('\n')) {
 		const word = rawLine.trim();
 		if (word.length > 0) {
 			uniqueWords.add(word);
 		}
 	}
 	return uniqueWords;
 }
 /**
  * Regular expression matching every forbidden code point sequence.
  *
  * It is assembled from two tables instead of one hand-written alternation, so
  * that each entry can be reviewed on its own:
  *  - fixed sequences, matched as they are;
  *  - bases that are forbidden when followed by U+094D, an optional U+200D
  *    (zero width joiner) and U+093E.
  */
 const INVALID_COMBINATION_REGEX = (() => {
 	const toPattern = codePoints => codePoints
 		.map(codePoint => `\\u{${codePoint.toString(16).toUpperCase()}}`)
 		.join('');
 	const fixedSequences = [
 		[0x905, 0x946], [0x905, 0x93E], [0x930, 0x94D, 0x907], [0x909, 0x941],
 		[0x90F, 0x945], [0x90F, 0x946], [0x90F, 0x947], [0x905, 0x949],
 		[0x906, 0x945], [0x905, 0x94A], [0x906, 0x946], [0x905, 0x94B],
 		[0x906, 0x947], [0x905, 0x94C], [0x906, 0x948], [0x905, 0x945],
 		[0x905, 0x93A], [0x905, 0x93B], [0x906, 0x93A], [0x905, 0x94F],
 		[0x905, 0x956], [0x905, 0x957]
 	];
 	const basesBeforeHalantAa = [
 		[0x916], [0x917], [0x918], [0x91A], [0x91C], [0x91D], [0x91E], [0x923],
 		[0x924], [0x925], [0x927], [0x928], [0x929], [0x928, 0x93C], [0x92A], [0x92C],
 		[0x92D], [0x92E], [0x92F], [0x932], [0x935], [0x936], [0x937], [0x938],
 		[0x959], [0x916, 0x93C], [0x95A], [0x917, 0x93C], [0x95B], [0x91C, 0x93C],
 		[0x95F], [0x92F, 0x93C],
 		[0x979], [0x97A], [0x97B], [0x97C], [0x97E], [0x97F],
 		[0x915, 0x94D, 0x91A], [0x915, 0x94D, 0x937], [0x924, 0x94D, 0x924], [0x928, 0x94D, 0x924]
 	];
 	const fixedPattern = fixedSequences.map(toPattern).join('|');
 	const basePattern = basesBeforeHalantAa.map(toPattern).join('|');
 	return new RegExp(`(?:${fixedPattern}|(?:${basePattern})\\u{94D}\\u{200D}?\\u{93E})`, 'u');
 })();
 /**
  * containsInvalidCombination
  * Based on the "do not use sequences" table from here: https://lontar.eu/en/notes/issues-in-devanagari-cluster-validation/
  *
  * @param {string} word the word to check.
  * @returns {boolean} true when the word contains at least one of the forbidden sequences.
  */
 function containsInvalidCombination(word) {
 	return INVALID_COMBINATION_REGEX.test(word);
 }
 /**
  * containsVowelMatra
  * Tells whether one of the independent vowels U+0905-U+090C, U+090F, U+0910,
  * U+0913, U+0914, U+0960 or U+0961 is immediately followed by one of the
  * dependent vowel signs U+093E-U+0944, U+0947, U+0948, U+094B, U+094C, U+0962
  * or U+0963.
  */
 function containsVowelMatra(word) {
 	const vowelFollowedByMatra = /[\u{905}-\u{90C}\u{90F}\u{910}\u{913}\u{914}\u{960}\u{961}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
 	return vowelFollowedByMatra.test(word);
 }
 /**
  * containsConsonantHalantMatra
  * Tells whether a consonant (U+0915-U+0939 or U+0958-U+095F) followed by
  * U+094D is immediately followed by a character from U+093E-U+094C or
  * U+0962-U+0963.
  */
 function containsConsonantHalantMatra(word) {
 	const consonantHalantMatra = /[\u0915-\u0939\u0958-\u095F]\u094D[\u093E-\u094C\u0962-\u0963]/u;
 	return consonantHalantMatra.test(word);
 }
 /**
  * containsInvalidZWJ
  * Tells whether a zero width joiner (U+200D) comes directly after a character
  * from U+0900-U+0914, U+093D, U+093E-U+094C or U+0962-U+0963.
  */
 function containsInvalidZWJ(word) {
 	const misplacedJoiner = /[\u0900-\u0903\u0904\u0905-\u0914\u093E-\u094C\u0962-\u0963\u093D]\u200D/u;
 	return misplacedJoiner.test(word);
 }
 /**
  * containsMultipleNasalizations
  * Tells whether two or more of the signs U+0900, U+0901, U+0902 and U+0903
  * appear directly one after another.
  */
 function containsMultipleNasalizations(word) {
 	const consecutiveSigns = /[\u{900}\u{901}\u{902}\u{903}]{2,}/u;
 	return consecutiveSigns.test(word);
 }
 /**
  * containsMultipleMatraNasalizations
  * Tells whether a pair made of one vowel sign (U+093E-U+0944, U+0947, U+0948,
  * U+094B, U+094C, U+0962, U+0963) and one of the signs U+0900-U+0903, in either
  * order, is directly followed by a third character from either of these two
  * groups.
  */
 function containsMultipleMatraNasalizations(word) {
 	const matraSignRun = /([\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}][\u{900}\u{901}\u{902}\u{903}]|[\u{900}\u{901}\u{902}\u{903}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}])[\u{900}\u{901}\u{902}\u{903}\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
 	return matraSignRun.test(word);
 }
 /**
  * containsTooManyRepeatedLetters
  * Tells whether the same character occurs three or more times in a row.
  *
  * The "u" flag makes "." match a whole code point, like every other check in
  * this file; without it a character outside the Basic Multilingual Plane is
  * seen as two different UTF-16 code units and its repetitions go unnoticed.
  *
  * @param {string} word the word to check.
  * @returns {boolean} true when some character is repeated at least three times consecutively.
  */
 function containsTooManyRepeatedLetters(word) {
 	return /(.)\1{2,}/u.test(word);
 }
 /**
  * isValid
  *
  * A word is valid when none of the "contains..." checks reports a problem. The
  * checks run in the listed order and stop at the first one that matches.
  *
  * Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
  */
 function isValid(word) {
 	const problemDetectors = [
 		containsInvalidCombination,
 		containsVowelMatra,
 		containsConsonantHalantMatra,
 		containsInvalidZWJ,
 		containsMultipleNasalizations,
 		containsMultipleMatraNasalizations,
 		containsTooManyRepeatedLetters
 	];
 	return !problemDetectors.some(detectsProblem => detectsProblem(word));
 }
 /**
  * Prints every distinct word of the given file that passes isValid(), in the
  * order returned by getWordsFromFile().
  *
  * @param {{file: string}} options the path of the word list to validate.
  */
 function work({ file }) {
 	const candidates = Array.from(getWordsFromFile(file));
 	for (const candidate of candidates) {
 		if (isValid(candidate)) {
 			print(candidate);
 		}
 	}
 }
 /** main: validate the arguments, then print the valid words of the given file. **/
 const validatedInput = validateInput();
 work(validatedInput);