// tt9/scripts/indic/validate-gujarati.js

const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');
const { print, printError, printWordsWithFrequencies } = require('../_printers.js');


/**
 * Prints a one-line usage hint containing the file name of the running script.
 */
function printHelp() {
	const scriptName = basename(process.argv[1]);
	print(`Usage: node ${scriptName} <file>`);
}


/**
 * Checks the command-line arguments of the current process.
 *
 * Exits with code 1 (after printing the usage hint) when no file argument was given, and with
 * code 2 (after printing an error) when the given path does not exist.
 *
 * @returns {{file: string}} the path passed as the first script argument
 */
function validateInput() {
	const args = process.argv;

	if (args.length < 3) {
		printHelp();
		process.exit(1);
	}

	const file = args[2];

	if (!existsSync(file)) {
		printError(`Failure! Could not find the input file "${file}".`);
		process.exit(2);
	}

	return { file };
}


/**
 * Reads a UTF-8 text file and collects its distinct non-empty lines.
 *
 * Each line is trimmed; lines that are empty after trimming are skipped.
 *
 * @param {string} filename path of the file to read
 * @returns {Set<string>} the trimmed lines, in order of first appearance
 */
function getWordsFromFile(filename) {
	const uniqueWords = new Set();

	for (const line of readFileSync(filename, 'utf8').split('\n')) {
		const word = line.trim();
		if (word.length > 0) {
			uniqueWords.add(word);
		}
	}

	return uniqueWords;
}

// Regex building blocks. Backslashes are doubled so the \u{...} escapes reach the RegExp
// constructor verbatim; every pattern below is compiled with the "u" flag, which enables them.

// A complete character class: anything outside U+0A80..U+0AFF (the Gujarati block).
const FOREIGN_CHARS = '[^\\u{A80}-\\u{AFF}]';

// The remaining constants are class *contents* (no surrounding brackets), meant to be placed
// inside [...]. The single code point ones are also used bare, outside of a class.

// Code points inside the Gujarati range that cause a word to be rejected outright.
const UNWANTED_CHARS = 'ૠ\\u{AC4}\\u{AE3}-\\u{AFF}\\u{AD1}-\\u{ADF}\\u{A80}\\u{A84}\\u{A8E}\\u{A92}\\u{AA9}\\u{AB1}\\u{AB4}\\u{ABA}\\u{ABB}\\u{AC6}\\u{ACA}\\u{ACE}\\u{ACF}';
const NUMBERS = '\\u{0AE6}-\\u{0AEF}';
const VOWELS = '\\u{0A85}-\\u{0A94}\\u{0AE0}\\u{0AE1}ૐ';
const CONSONANTS = '\\u{0A95}-\\u{0AB9}';
const VOWEL_MATRAS = '\\u{0ABE}-\\u{0AC5}\\u{0AC7}-\\u{0AC9}\\u{0ACB}-\\u{0ACC}\\u{0AE2}\\u{0AE3}';
const NASALIZATIONS = '\\u{0A81}-\\u{0A83}';
const HALANT = '\\u{0ACD}';
const NUQTA = '\\u{0ABC}';
const AVAGRAHA = '\\u{0ABD}';
const ZWJ = '\\u{200D}';


/**
 * Predicates that each return true when a word breaks one rule; a word is acceptable only if
 * every predicate returns false.
 *
 * Each pattern is compiled exactly once, when this module is loaded, instead of once per word
 * per rule. The regexes carry neither the "g" nor the "y" flag, so test() keeps no state
 * between calls and sharing one instance across words is safe.
 *
 * @type {Array<function(string): boolean>}
 */
const INVALIDATORS = [
    `(\\p{L}\\p{M}?)(?!${AVAGRAHA})\\1{2,}`, // too many repeated letters
    `^[${VOWEL_MATRAS}${NASALIZATIONS}${HALANT}${NUQTA}${AVAGRAHA}]`, // starts with a combining character
    `[${VOWELS}][${VOWEL_MATRAS}${NUQTA}${HALANT}]`, // independent vowel + matra, nuqta or halant
    `[${CONSONANTS}]${HALANT}[${VOWEL_MATRAS}]`, // consonant + halant + matra
    `[${NASALIZATIONS}${VOWELS}${VOWEL_MATRAS}${AVAGRAHA}]${ZWJ}`, // invalid ZWJ
    `([${VOWEL_MATRAS}]{2}|[${NASALIZATIONS}]{2}|${HALANT}{2}|${NUQTA}{2})`, // multiple combining
    `([${VOWEL_MATRAS}][${NASALIZATIONS}]|[${NASALIZATIONS}][${VOWEL_MATRAS}])[${VOWEL_MATRAS}${NASALIZATIONS}]`, // multiple matra nasalizations
    `[${NASALIZATIONS}${HALANT}][${VOWEL_MATRAS}]`, // modifier + matra
    `[^${CONSONANTS}][${NUQTA}]`, // non-consonant + nukta
    `[${UNWANTED_CHARS}]`, // any explicitly unwanted character
    `${FOREIGN_CHARS}`, // any character outside the Gujarati range
    `[${NUMBERS}]`, // any digit from the NUMBERS range
].map((source) => {
    const pattern = new RegExp(source, 'u');
    return (word) => pattern.test(word);
});


/**
 * isValid
 *
 * Runs the word through every entry of INVALIDATORS, stopping at the first one that reports a
 * problem.
 *
 * Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
 *
 * @param {string} word the word to check
 * @returns {boolean} true when no invalidator matched, false otherwise
 */
function isValid(word) {
	return !INVALIDATORS.some((isInvalid) => isInvalid(word));
}


/**
 * Hook for normalizing nuqta spellings before a word is printed.
 *
 * Currently a pass-through: the word is returned unchanged. (The only replacement that used to
 * live here was a disabled one for a Devanagari letter.)
 *
 * @param {string} word
 * @returns {string} the same word
 */
function fixNuqta(word) {
	return word;
}


/**
 * Collects the anusvara spelling of every word that contains a candrabindu.
 *
 * For each word containing U+0A81, all occurrences are replaced with U+0A82 and the result is
 * added to the returned set. Note that the set holds the converted spellings, not the original
 * words; words without U+0A81 contribute nothing.
 *
 * @param {{forEach: function}} allWords collection of words (anything exposing forEach)
 * @returns {Set<string>} the converted spellings
 */
function getWordsWithObsoleteCandrabinduInGujarati(allWords) {
	const anusvaraForms = new Set();

	allWords.forEach((candidate) => {
		if (!candidate.includes('\u{A81}')) {
			return;
		}

		anusvaraForms.add(candidate.replaceAll('\u{A81}', '\u{A82}'));
	});

	return anusvaraForms;
}


/**
 * Reads the word list from the given file and prints every word that passes isValid(), after
 * running it through fixNuqta().
 *
 * @param {{file: string}} options
 * @param {string} options.file path of the word list
 */
function work({ file }) {
	const words = [...getWordsFromFile(file)];

	// Detect obsolete candrabindu instead of anusvara in Gujarati
	const anusvaraForms = getWordsWithObsoleteCandrabinduInGujarati(words);

	for (const original of words) {
		// NOTE(review): this replacement can never change anything. `anusvaraForms` only holds
		// spellings in which every U+0A81 was already replaced, so a word found in it contains
		// no U+0A81. Words are therefore printed exactly as read. Presumably the candrabindu
		// spelling was meant to be converted or dropped here; confirm the intent before fixing.
		const candidate = anusvaraForms.has(original)
			? original.replaceAll('\u{A81}', '\u{A82}')
			: original;

		if (isValid(candidate)) {
			print(fixNuqta(candidate));
		}
	}
}

// Entry point: check the command-line arguments, then validate and print the words of the given file.
work(validateInput());