1
0
Fork 0
This commit is contained in:
sspanak 2024-12-28 14:07:57 +02:00 committed by Dimo Karaivanov
parent 240e5c444a
commit e3d0bac90f
13 changed files with 1380245 additions and 18 deletions

View file

@ -0,0 +1,112 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');
const { print, printError, printWordsWithFrequencies } = require('../_printers.js');
/**
 * printHelp
 * Prints a one-line usage hint that names the currently running script.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);
  print(`Usage: node ${scriptName} <file>`);
}
/**
 * validateInput
 * Checks the command line: the first user argument must be present and must
 * name an existing path. Prints the usage and exits with code 1 when the
 * argument is missing; prints an error and exits with code 2 when the path
 * does not exist.
 *
 * @returns {{ file: string }} the validated input file path
 */
function validateInput() {
  const args = process.argv;
  const hasFileArgument = args.length >= 3;
  if (!hasFileArgument) {
    printHelp();
    process.exit(1);
  }

  const file = args[2];
  if (existsSync(file) === false) {
    printError(`Failure! Could not find the input file "${file}".`);
    process.exit(2);
  }

  return { file };
}
/**
 * getWordsFromFile
 * Reads a UTF-8 text file and collects its lines as unique words. Every line
 * is trimmed, empty lines are dropped, and first-seen order is kept.
 *
 * @param {string} filename path of the file to read
 * @returns {Set<string>} the distinct non-empty trimmed lines
 */
function getWordsFromFile(filename) {
  const words = new Set();
  for (const line of readFileSync(filename, 'utf8').split('\n')) {
    const word = line.trim();
    if (word.length > 0) {
      words.add(word);
    }
  }
  return words;
}
/**
 * Fixed code point sequences (hex, space separated) that are always rejected.
 */
const INVALID_VOWEL_SEQUENCES = [
  '905 946', '905 93E', '930 94D 907', '909 941',
  '90F 945', '90F 946', '90F 947',
  '905 949', '906 945', '905 94A', '906 946',
  '905 94B', '906 947', '905 94C', '906 948',
  '905 945', '905 93A', '905 93B', '906 93A',
  '905 94F', '905 956', '905 957',
];

/**
 * Bases (hex, space separated) that are rejected when followed by
 * U+094D U+093E, or by U+094D U+200D U+093E.
 */
const HALANT_AA_BASES = [
  '916', '917', '918', '91A', '91C', '91D', '91E',
  '923', '924', '925', '927', '928', '929', '928 93C',
  '92A', '92C', '92D', '92E', '92F',
  '932', '935', '936', '937', '938',
  '959', '916 93C', '95A', '917 93C', '95B', '91C 93C', '95F', '92F 93C',
  '979', '97A', '97B', '97C', '97E', '97F',
  '915 94D 91A', '915 94D 937', '924 94D 924', '928 94D 924',
];

/** Turns '915 94D' into the regex source '\u{915}\u{94D}'. */
const codePointsToPattern = (sequence) => sequence
  .split(' ')
  .map((codePoint) => `\\u{${codePoint}}`)
  .join('');

// Compiled once. The optional U+200D covers both the plain and the ZWJ variant
// of every "base + halant + aa" entry.
const INVALID_COMBINATION_REGEX = new RegExp(
  [
    ...INVALID_VOWEL_SEQUENCES.map(codePointsToPattern),
    ...HALANT_AA_BASES.map((base) => `${codePointsToPattern(base)}\\u{94D}\\u{200D}?\\u{93E}`),
  ].join('|'),
  'u',
);

/**
 * containsInvalidCombination
 * Based on the "do not use sequences" table from here: https://lontar.eu/en/notes/issues-in-devanagari-cluster-validation/
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when any listed sequence occurs anywhere in the word
 */
function containsInvalidCombination(word) {
  return INVALID_COMBINATION_REGEX.test(word);
}
/**
 * containsVowelMatra
 * Detects one of the listed vowel letters (U+0905-U+090C, U+090F, U+0910,
 * U+0913, U+0914, U+0960, U+0961) immediately followed by one of the listed
 * vowel signs.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a pair occurs anywhere in the word
 */
function containsVowelMatra(word) {
  const vowelThenMatra = /[\u{905}-\u{90C}\u{90F}\u{910}\u{913}\u{914}\u{960}\u{961}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
  return vowelThenMatra.test(word);
}
/**
 * containsConsonantHalantMatra
 * Detects a letter from U+0915-U+0939 or U+0958-U+095F, followed by U+094D,
 * followed by a sign from U+093E-U+094C or U+0962-U+0963.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when that three-character sequence occurs
 */
function containsConsonantHalantMatra(word) {
  const consonantHalantMatra = /[\u0915-\u0939\u0958-\u095F]\u094D[\u093E-\u094C\u0962-\u0963]/u;
  return consonantHalantMatra.test(word);
}
/**
 * containsInvalidZWJ
 * Detects a zero-width joiner (U+200D) placed directly after a character from
 * U+0900-U+0914, U+093D, U+093E-U+094C or U+0962-U+0963.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a ZWJ placement occurs
 */
function containsInvalidZWJ(word) {
  const misplacedJoiner = /[\u0900-\u0903\u0904\u0905-\u0914\u093E-\u094C\u0962-\u0963\u093D]\u200D/u;
  return misplacedJoiner.test(word);
}
/**
 * containsMultipleNasalizations
 * Detects two or more consecutive characters from U+0900-U+0903.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a run occurs
 */
function containsMultipleNasalizations(word) {
  const stackedNasalizations = /[\u{900}\u{901}\u{902}\u{903}]{2,}/u;
  return stackedNasalizations.test(word);
}
/**
 * containsMultipleMatraNasalizations
 * Detects a vowel sign and a U+0900-U+0903 mark next to each other (in either
 * order) that are followed by yet another vowel sign or U+0900-U+0903 mark.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a three-character run occurs
 */
function containsMultipleMatraNasalizations(word) {
  const overloadedCluster = /([\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}][\u{900}\u{901}\u{902}\u{903}]|[\u{900}\u{901}\u{902}\u{903}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}])[\u{900}\u{901}\u{902}\u{903}\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
  return overloadedCluster.test(word);
}
/**
 * containsModifierMatra
 * Detects a character from U+0900-U+0903 or U+094D that is immediately
 * followed by one of the listed vowel signs.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a pair occurs
 */
function containsModifierMatra(word) {
  const modifierThenMatra = /[\u{900}-\u{903}\u{94d}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
  return modifierThenMatra.test(word);
}
/**
 * containsTooManyRepeatedLetters
 * Detects the same character repeated three or more times in a row.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when such a run occurs
 */
function containsTooManyRepeatedLetters(word) {
  const tripleRun = /(.)\1{2,}/;
  return tripleRun.test(word);
}
/**
 * containsForeignLetters
 * Detects any of the characters listed in the first character class below
 * (U+0944 plus five literal letters), optionally followed by a modifier or
 * vowel sign.
 *
 * @param {string} word the word to inspect
 * @returns {boolean} true when one of those characters occurs
 */
function containsForeignLetters(word) {
  const foreignLetter = /[\u{944}ऑऍऎऒॠ]+[\u{900}-\u{903}\u{94d}\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]?/u;
  return foreignLetter.test(word);
}
/**
 * fixNuqta
 * Rewrites every occurrence of the literal letter below as the two code
 * points U+0933 U+093C.
 *
 * @param {string} word the word to normalize
 * @returns {string} the word with all occurrences replaced
 */
function fixNuqta(word) {
  const decomposed = '\u{933}\u{93c}';
  return word.split('ऴ').join(decomposed);
}
/**
 * isValid
 *
 * Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
 *
 * Runs the rejection checks in a fixed order and stops at the first one that
 * flags the word.
 *
 * @param {string} word the word to validate
 * @returns {boolean} true when no check flags the word
 */
function isValid(word) {
  const rejections = [
    containsInvalidCombination,
    containsVowelMatra,
    containsConsonantHalantMatra,
    containsInvalidZWJ,
    containsMultipleNasalizations,
    containsMultipleMatraNasalizations,
    containsModifierMatra,
    containsTooManyRepeatedLetters,
    containsForeignLetters,
  ];

  return !rejections.some((rejects) => rejects(word));
}
/**
 * work
 * Reads the unique words from the input file and prints every word that passes
 * validation, after normalizing it with fixNuqta.
 *
 * @param {{ file: string }} options the validated command line input
 */
function work({ file }) {
  for (const candidate of getWordsFromFile(file)) {
    if (!isValid(candidate)) {
      continue;
    }
    print(fixNuqta(candidate));
  }
}
// Entry point: validate the command line first, then process the given file.
const validatedInput = validateInput();
work(validatedInput);

View file

@ -0,0 +1,107 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');
const { print, printError, printWordsWithFrequencies } = require('../_printers.js');
/**
 * printHelp
 * Prints the usage line, using the file name of the running script.
 */
function printHelp() {
  const [, scriptPath] = process.argv;
  const usage = `Usage: node ${basename(scriptPath)} <file>`;
  print(usage);
}
/**
 * validateInput
 * Validates the command line. Without a file argument it prints the usage and
 * exits with code 1; when the given path does not exist it prints an error and
 * exits with code 2.
 *
 * @returns {{ file: string }} the validated input file path
 */
function validateInput() {
  const argumentCount = process.argv.length;
  if (argumentCount < 3) {
    printHelp();
    process.exit(1);
  }

  const inputPath = process.argv[2];
  const inputExists = existsSync(inputPath);
  if (!inputExists) {
    printError(`Failure! Could not find the input file "${inputPath}".`);
    process.exit(2);
  }

  return { file: inputPath };
}
/**
 * getWordsFromFile
 * Loads a UTF-8 text file and returns its distinct lines, trimmed, with empty
 * lines removed. Insertion order follows the order of first appearance.
 *
 * @param {string} filename path of the file to read
 * @returns {Set<string>} the distinct non-empty trimmed lines
 */
function getWordsFromFile(filename) {
  const lines = readFileSync(filename, 'utf8').split('\n');
  const uniqueWords = new Set();

  lines.forEach((line) => {
    const trimmed = line.trim();
    if (trimmed.length === 0) {
      return;
    }
    uniqueWords.add(trimmed);
  });

  return uniqueWords;
}
// Regex source fragments. All of them except FOREIGN_CHARS are meant to be
// placed inside a character class ([...]); FOREIGN_CHARS is already a complete
// negated class matching anything outside U+0A80-U+0AFF.
const FOREIGN_CHARS = '[^\\u{A80}-\\u{AFF}]';
const UNWANTED_CHARS = 'ૠ\\u{AC4}\\u{AE3}-\\u{AFF}\\u{AD1}-\\u{ADF}\\u{A80}\\u{A84}\\u{A8E}\\u{A92}\\u{AA9}\\u{AB1}\\u{AB4}\\u{ABA}\\u{ABB}\\u{AC6}\\u{ACA}\\u{ACE}\\u{ACF}';
const NUMBERS = '\\u{0AE6}-\\u{0AEF}';
const VOWELS = '\\u{0A85}-\\u{0A94}\\u{0AE0}\\u{0AE1}ૐ';
const CONSONANTS = '\\u{0A95}-\\u{0AB9}';
const VOWEL_MATRAS = '\\u{0ABE}-\\u{0AC5}\\u{0AC7}-\\u{0AC9}\\u{0ACB}-\\u{0ACC}\\u{0AE2}\\u{0AE3}';
const NASALIZATIONS = '\\u{0A81}-\\u{0A83}';
const HALANT = '\\u{0ACD}';
const NUQTA = '\\u{0ABC}';
const AVAGRAHA = '\\u{0ABD}';
const ZWJ = '\\u{200D}';

/**
 * INVALIDATORS
 * Predicates of the form (word) => boolean; each returns true when the word
 * breaks the rule described next to its pattern.
 *
 * The patterns are compiled once here instead of on every call. None of them
 * uses the g or y flag, so the shared RegExp objects carry no state between
 * test() calls.
 *
 * NOTE(review): FOREIGN_CHARS also matches U+200D, so any word containing a
 * ZWJ is already rejected by the foreign-character rule - confirm whether the
 * dedicated ZWJ rule is still meant to be reachable.
 */
const INVALIDATORS = [
  `(\\p{L}\\p{M}?)(?!${AVAGRAHA})\\1{2,}`, // too many repeated letters
  `^[${VOWEL_MATRAS}${NASALIZATIONS}${HALANT}${NUQTA}${AVAGRAHA}]`, // starts with a combining character
  `[${VOWELS}][${VOWEL_MATRAS}${NUQTA}${HALANT}]`, // vowel + matra, nuqta or halant
  `[${CONSONANTS}]${HALANT}[${VOWEL_MATRAS}]`, // consonant + halant + matra
  `[${NASALIZATIONS}${VOWELS}${VOWEL_MATRAS}${AVAGRAHA}]${ZWJ}`, // invalid ZWJ
  `([${VOWEL_MATRAS}]{2}|[${NASALIZATIONS}]{2}|${HALANT}{2}|${NUQTA}{2})`, // multiple combining
  `([${VOWEL_MATRAS}][${NASALIZATIONS}]|[${NASALIZATIONS}][${VOWEL_MATRAS}])[${VOWEL_MATRAS}${NASALIZATIONS}]`, // multiple matra nasalizations
  `[${NASALIZATIONS}${HALANT}][${VOWEL_MATRAS}]`, // modifier + matra
  `[^${CONSONANTS}][${NUQTA}]`, // non-consonant + nukta
  `[${UNWANTED_CHARS}]`, // characters from the unwanted list
  `${FOREIGN_CHARS}`, // anything outside U+0A80-U+0AFF
  `[${NUMBERS}]`, // digits U+0AE6-U+0AEF
].map((source) => {
  const pattern = new RegExp(source, 'u');
  return (word) => pattern.test(word);
});
/**
 * isValid
 *
 * Most validation rules are based on the comments here: https://github.com/harfbuzz/harfbuzz/issues/2803.
 *
 * Applies the INVALIDATORS in order and stops at the first one that flags the
 * word.
 *
 * @param {string} word the word to validate
 * @returns {boolean} true when no invalidator flags the word
 */
function isValid(word) {
  return INVALIDATORS.every((rejects) => !rejects(word));
}
/**
 * fixNuqta
 * Currently a no-op: the word is returned unchanged. The function is kept so
 * that callers have one place to plug in a normalization step before printing.
 *
 * The replacement that used to be commented out here produced the code points
 * U+0933 U+093C, which lie outside the U+0A80-U+0AFF range this script
 * accepts, so it has been removed.
 *
 * @param {string} word the word to normalize
 * @returns {string} the same word
 */
function fixNuqta(word) {
  return word;
}
/**
 * getWordsWithObsoleteCandrabinduInGujarati
 * For every word that contains U+0A81, builds the variant in which each
 * U+0A81 is replaced by U+0A82 and collects those variants. Words without
 * U+0A81 contribute nothing.
 *
 * @param {{ forEach: Function }} allWords the words to scan
 * @returns {Set<string>} the converted variants (not the original words)
 */
function getWordsWithObsoleteCandrabinduInGujarati(allWords) {
  const candrabindu = '\u{A81}';
  const anusvara = '\u{A82}';
  const anusvaraVariants = new Set();

  allWords.forEach((original) => {
    if (!original.includes(candrabindu)) {
      return;
    }
    anusvaraVariants.add(original.replaceAll(candrabindu, anusvara));
  });

  return anusvaraVariants;
}
/**
 * work
 * Reads the unique words from the input file, optionally rewrites U+0A81 to
 * U+0A82 in words found in the "anusvara" set, and prints every resulting word
 * that passes validation (after fixNuqta).
 *
 * NOTE(review): getWordsWithObsoleteCandrabinduInGujarati, as written in this
 * file, returns only converted words, which no longer contain U+0A81. A word
 * found in that set therefore has nothing left to replace, and the rewrite
 * below looks like a no-op - confirm the intended behaviour.
 *
 * @param {{ file: string }} options the validated command line input
 */
function work({ file }) {
  const allWords = [...getWordsFromFile(file)];

  // Detect obsolete candrabindu instead of anusvara in Gujarati
  const allWordsAnusvara = getWordsWithObsoleteCandrabinduInGujarati(allWords);

  for (const w of allWords) {
    let word = w;
    if (allWordsAnusvara.has(w)) {
      word = w.replaceAll('\u{A81}', '\u{A82}');
    }

    if (isValid(word)) {
      print(fixNuqta(word));
    }
  }
}
// Entry point: validate the command line first, then process the given file.
const input = validateInput();
work(input);