1
0
Fork 0
tt9/scripts/indic/validate-gujarati.js
2025-01-03 10:36:29 +02:00

107 lines
3.7 KiB
JavaScript

const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');
const { print, printError, printWordsWithFrequencies } = require('../_printers.js');
function printHelp() {
print(`Usage: node ${basename(process.argv[1])} <file>`);
}
function validateInput() {
if (process.argv.length < 3) {
printHelp();
process.exit(1);
}
if (!existsSync(process.argv[2])) {
printError(`Failure! Could not find the input file "${process.argv[2]}".`);
process.exit(2);
}
return { file: process.argv[2] };
}
function getWordsFromFile(filename) {
const content = readFileSync(filename, 'utf8');
return new Set(content.split('\n').map(word => word.trim()).filter(word => word.length > 0));
}
const FOREIGN_CHARS = '[^\\u{A80}-\\u{AFF}]';
const UNWANTED_CHARS = 'ૠ\\u{AC4}\\u{AE3}-\\u{AFF}\\u{AD1}-\\u{ADF}\\u{A80}\\u{A84}\\u{A8E}\\u{A92}\\u{AA9}\\u{AB1}\\u{AB4}\\u{ABA}\\u{ABB}\\u{AC6}\\u{ACA}\\u{ACE}\\u{ACF}';
const NUMBERS = '\\u{0AE6}-\\u{0AEF}';
const VOWELS = '\\u{0A85}-\\u{0A94}\\u{0AE0}\\u{0AE1}ૐ';
const CONSONANTS = '\\u{0A95}-\\u{0AB9}';
const VOWEL_MATRAS = '\\u{0ABE}-\\u{0AC5}\\u{0AC7}-\\u{0AC9}\\u{0ACB}-\\u{0ACC}\\u{0AE2}\\u{0AE3}';
const NASALIZATIONS = '\\u{0A81}-\\u{0A83}';
const HALANT = '\\u{0ACD}';
const NUQTA = '\\u{0ABC}';
const AVAGRAHA = '\\u{0ABD}';
const ZWJ = '\\u{200D}';
const INVALIDATORS = [
(word) => new RegExp(`(\\p{L}\\p{M}?)(?!${AVAGRAHA})\\1{2,}`, 'u').test(word), // too many repeated letters
(word) => new RegExp(`^[${VOWEL_MATRAS}${NASALIZATIONS}${HALANT}${NUQTA}${AVAGRAHA}]`, 'u').test(word), // starts with a combining character
(word) => new RegExp(`[${VOWELS}][${VOWEL_MATRAS}${NUQTA}${HALANT}]`, 'u').test(word),
(word) => new RegExp(`[${CONSONANTS}]${HALANT}[${VOWEL_MATRAS}]`, 'u').test(word),
(word) => new RegExp(`[${NASALIZATIONS}${VOWELS}${VOWEL_MATRAS}${AVAGRAHA}]${ZWJ}`, 'u').test(word), // invalid ZWJ
(word) => new RegExp(`([${VOWEL_MATRAS}]{2}|[${NASALIZATIONS}]{2}|${HALANT}{2}|${NUQTA}{2})`, 'u').test(word), // multiple combining
(word) => new RegExp(`([${VOWEL_MATRAS}][${NASALIZATIONS}]|[${NASALIZATIONS}][${VOWEL_MATRAS}])[${VOWEL_MATRAS}${NASALIZATIONS}]`, 'u').test(word), // multiple matra nasalizations
(word) => new RegExp(`[${NASALIZATIONS}${HALANT}][${VOWEL_MATRAS}]`, 'u').test(word), // modifier + matra
(word) => new RegExp(`[^${CONSONANTS}][${NUQTA}]`, 'u').test(word), // non-consonant + nukta
(word) => new RegExp(`[${UNWANTED_CHARS}]`, 'u').test(word),
(word) => new RegExp(`${FOREIGN_CHARS}`, 'u').test(word),
(word) => new RegExp(`[${NUMBERS}]`, 'u').test(word),
];
/**
* isValid
*
* Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
*/
function isValid(word) {
for (let i = 0; i < INVALIDATORS.length; i++) {
if (INVALIDATORS[i](word)) {
return false;
}
}
return true;
}
function fixNuqta(word) {
// return word.replaceAll('ऴ', '\u{933}\u{93c}');
return word;
}
function getWordsWithObsoleteCandrabinduInGujarati(allWords) {
const allWordsAnusvara = new Set();
allWords.forEach(w => {
const converted = w.replaceAll('\u{A81}', '\u{A82}');
if (converted !== w) {
allWordsAnusvara.add(converted);
}
});
return allWordsAnusvara;
}
function work({ file }) {
const allWords = Array.from(getWordsFromFile(file));
// Detect obsolete candrabindu instead of anusvara in Gujarati
const allWordsAnusvara = getWordsWithObsoleteCandrabinduInGujarati(allWords);
allWords.forEach(w => {
const word = allWordsAnusvara.has(w) ? w.replaceAll('\u{A81}', '\u{A82}') : w;
if (isValid(word)) print(fixNuqta(word));
});
}
work(validateInput());