107 lines
3.7 KiB
JavaScript
107 lines
3.7 KiB
JavaScript
const { basename } = require('path');
|
|
const { existsSync, readFileSync } = require('fs');
|
|
const { print, printError, printWordsWithFrequencies } = require('../_printers.js');
|
|
|
|
|
|
function printHelp() {
|
|
print(`Usage: node ${basename(process.argv[1])} <file>`);
|
|
}
|
|
|
|
|
|
function validateInput() {
|
|
if (process.argv.length < 3) {
|
|
printHelp();
|
|
process.exit(1);
|
|
}
|
|
|
|
if (!existsSync(process.argv[2])) {
|
|
printError(`Failure! Could not find the input file "${process.argv[2]}".`);
|
|
process.exit(2);
|
|
}
|
|
|
|
return { file: process.argv[2] };
|
|
}
|
|
|
|
|
|
function getWordsFromFile(filename) {
|
|
const content = readFileSync(filename, 'utf8');
|
|
return new Set(content.split('\n').map(word => word.trim()).filter(word => word.length > 0));
|
|
}
|
|
|
|
const FOREIGN_CHARS = '[^\\u{A80}-\\u{AFF}]';
|
|
const UNWANTED_CHARS = 'ૠ\\u{AC4}\\u{AE3}-\\u{AFF}\\u{AD1}-\\u{ADF}\\u{A80}\\u{A84}\\u{A8E}\\u{A92}\\u{AA9}\\u{AB1}\\u{AB4}\\u{ABA}\\u{ABB}\\u{AC6}\\u{ACA}\\u{ACE}\\u{ACF}';
|
|
const NUMBERS = '\\u{0AE6}-\\u{0AEF}';
|
|
const VOWELS = '\\u{0A85}-\\u{0A94}\\u{0AE0}\\u{0AE1}ૐ';
|
|
const CONSONANTS = '\\u{0A95}-\\u{0AB9}';
|
|
const VOWEL_MATRAS = '\\u{0ABE}-\\u{0AC5}\\u{0AC7}-\\u{0AC9}\\u{0ACB}-\\u{0ACC}\\u{0AE2}\\u{0AE3}';
|
|
const NASALIZATIONS = '\\u{0A81}-\\u{0A83}';
|
|
const HALANT = '\\u{0ACD}';
|
|
const NUQTA = '\\u{0ABC}';
|
|
const AVAGRAHA = '\\u{0ABD}';
|
|
const ZWJ = '\\u{200D}';
|
|
|
|
|
|
const INVALIDATORS = [
|
|
(word) => new RegExp(`(\\p{L}\\p{M}?)(?!${AVAGRAHA})\\1{2,}`, 'u').test(word), // too many repeated letters
|
|
(word) => new RegExp(`^[${VOWEL_MATRAS}${NASALIZATIONS}${HALANT}${NUQTA}${AVAGRAHA}]`, 'u').test(word), // starts with a combining character
|
|
(word) => new RegExp(`[${VOWELS}][${VOWEL_MATRAS}${NUQTA}${HALANT}]`, 'u').test(word),
|
|
(word) => new RegExp(`[${CONSONANTS}]${HALANT}[${VOWEL_MATRAS}]`, 'u').test(word),
|
|
(word) => new RegExp(`[${NASALIZATIONS}${VOWELS}${VOWEL_MATRAS}${AVAGRAHA}]${ZWJ}`, 'u').test(word), // invalid ZWJ
|
|
(word) => new RegExp(`([${VOWEL_MATRAS}]{2}|[${NASALIZATIONS}]{2}|${HALANT}{2}|${NUQTA}{2})`, 'u').test(word), // multiple combining
|
|
(word) => new RegExp(`([${VOWEL_MATRAS}][${NASALIZATIONS}]|[${NASALIZATIONS}][${VOWEL_MATRAS}])[${VOWEL_MATRAS}${NASALIZATIONS}]`, 'u').test(word), // multiple matra nasalizations
|
|
(word) => new RegExp(`[${NASALIZATIONS}${HALANT}][${VOWEL_MATRAS}]`, 'u').test(word), // modifier + matra
|
|
(word) => new RegExp(`[^${CONSONANTS}][${NUQTA}]`, 'u').test(word), // non-consonant + nukta
|
|
(word) => new RegExp(`[${UNWANTED_CHARS}]`, 'u').test(word),
|
|
(word) => new RegExp(`${FOREIGN_CHARS}`, 'u').test(word),
|
|
(word) => new RegExp(`[${NUMBERS}]`, 'u').test(word),
|
|
];
|
|
|
|
|
|
/**
|
|
* isValid
|
|
*
|
|
* Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
|
|
*/
|
|
function isValid(word) {
|
|
for (let i = 0; i < INVALIDATORS.length; i++) {
|
|
if (INVALIDATORS[i](word)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
function fixNuqta(word) {
|
|
// return word.replaceAll('ऴ', '\u{933}\u{93c}');
|
|
return word;
|
|
}
|
|
|
|
|
|
function getWordsWithObsoleteCandrabinduInGujarati(allWords) {
|
|
const allWordsAnusvara = new Set();
|
|
allWords.forEach(w => {
|
|
const converted = w.replaceAll('\u{A81}', '\u{A82}');
|
|
if (converted !== w) {
|
|
allWordsAnusvara.add(converted);
|
|
}
|
|
});
|
|
|
|
return allWordsAnusvara;
|
|
}
|
|
|
|
|
|
function work({ file }) {
|
|
const allWords = Array.from(getWordsFromFile(file));
|
|
|
|
// Detect obsolete candrabindu instead of anusvara in Gujarati
|
|
const allWordsAnusvara = getWordsWithObsoleteCandrabinduInGujarati(allWords);
|
|
|
|
allWords.forEach(w => {
|
|
const word = allWordsAnusvara.has(w) ? w.replaceAll('\u{A81}', '\u{A82}') : w;
|
|
if (isValid(word)) print(fixNuqta(word));
|
|
});
|
|
}
|
|
|
|
work(validateInput());
|