1
0
Fork 0
tt9/app/validate-languages.gradle
Dimo Karaivanov 0ec912f9c9
Japanese (#770)
* added Japanese (Hiragana, Katakana, Kanji)

* improved dictionary validation: it is now possible to have the same ideogram with two different transcriptions

* fixed frequency updating not working sometimes (in Chinese too)
2025-04-12 11:59:13 +03:00

388 lines
13 KiB
Groovy
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

apply from: 'dictionary-tools.gradle'
/**
 * Validates every language definition found in `definitionsDir` together with its dictionary.
 *
 * For each definition, a marker file named after the definition is written to `validationDir`,
 * containing the language hash followed by "OK" or "INVALID". A definition whose marker file already
 * reads "<hash> OK" is not validated again.
 *
 * @param definitionsDir  directory containing the language definition (.yml) files
 * @param dictionariesDir directory containing the dictionary files referenced by the definitions
 * @param validationDir   directory where the per-language marker files are written
 * @throws GradleException with all collected error messages, when at least one error was reported
 */
ext.validateLanguageFiles = { definitionsDir, dictionariesDir, validationDir ->
    // The definitions are processed on a parallel stream, so the shared counter must be updated
    // atomically. A plain captured int with "+=" can lose updates between worker threads.
    final errorCount = new java.util.concurrent.atomic.AtomicInteger(0)

    def errorStream = fileTree(dir: definitionsDir).getFiles().parallelStream().map { definition ->
        if (errorCount.get() >= MAX_ERRORS) {
            return "Too many errors! Skipping: ${definition}\n"
        }

        def (alphabet, sounds, _, isAlphabeticLanguage, locale, dictionaryFile, langFileErrorCount, langFileErrorMsg) = parseLanguageDefintion(definition, dictionariesDir)

        def languageHash = DictionaryTools.getLanguageHash(definition, dictionaryFile)
        def validationFile = new File("${validationDir}/${definition.name.replace(".yml", "")}.txt")

        errorCount.addAndGet(langFileErrorCount)
        if (!langFileErrorMsg.isEmpty()) {
            validationFile.text = "${languageHash} INVALID"
            return langFileErrorMsg
        }

        // nothing has changed since the last successful validation
        if (validationFile.exists() && validationFile.text == "${languageHash} OK") {
            return ""
        }

        def (dictionaryErrorCount, dictionaryErrorMesages) = validateDictionary(dictionaryFile, alphabet, sounds, isAlphabeticLanguage, locale, MAX_ERRORS, CSV_DELIMITER, MAX_WORD_FREQUENCY)
        errorCount.addAndGet(dictionaryErrorCount)
        if (!dictionaryErrorMesages.isEmpty()) {
            validationFile.text = "${languageHash} INVALID"
            return dictionaryErrorMesages
        }

        validationFile.text = "${languageHash} OK"
        return ""
    }

    String errorsMsg = errorStream.reduce("", String::concat)
    if (errorsMsg) {
        throw new GradleException(errorsMsg)
    }
}
/**
 * Parses a language definition file line by line and validates its properties.
 *
 * @param languageFile    the language definition file to parse
 * @param dictionariesDir directory in which the file named by 'dictionaryFile' is looked up
 * @return a list of: [alphabet, sounds, filterBySounds, isAlphabeticLanguage (true when there is no
 *         'sounds' property), locale, dictionaryFile, errorCount, errorMsg]
 */
ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
    String alphabet = ''
    int layoutKey = 0

    HashMap<String, String> sounds = new HashMap<>()
    HashMap<String, String> layoutSounds = new HashMap<>()

    File dictionaryFile
    int errorCount = 0
    String errorMsg = ""

    String abcString = ""
    boolean hasABC = true
    boolean hasLayout = false
    boolean hasSounds = false
    boolean filterBySounds = false
    boolean areNumeralsValid = true
    String localeString = ""
    String dictionaryFileName = ""

    // Both lists are loop-invariant, so they are built once instead of once per line.
    final knownProperties = [
        "abcString", "currency", "dictionaryFile", "filterBySounds", "hasABC", "hasSpaceBetweenWords",
        "hasUpperCase", "layout", "locale", "name", "numerals", "sounds"
    ]
    final booleanProperties = ["hasUpperCase", "hasSpaceBetweenWords", "filterBySounds", "hasABC"]

    for (String rawLine : languageFile.readLines()) {
        // a line starting with a letter is a top-level property; it must be one of the known ones
        if (rawLine.matches("^[a-zA-Z].*") && !knownProperties.any { rawLine.startsWith(it) }) {
            def parts = rawLine.split(":")
            def property = parts.length > 0 ? parts[0] : rawLine

            errorCount++
            errorMsg += "Language '${languageFile.name}' is invalid. Found unknown property: '${property}'.\n"
        }

        // strip the trailing comment, if any
        String line = rawLine.replaceFirst("#[\\s\\S]+\$", "")

        for (String property : booleanProperties) {
            String booleanError = validateBooleanProperty(line, property, languageFile.name)
            if (booleanError) {
                errorCount++
                errorMsg += booleanError
            }
        }

        if (line.startsWith("abcString")) {
            abcString = line.replace("abcString:", "").trim()
        }

        if (line.startsWith("dictionaryFile")) {
            dictionaryFileName = line.replace("dictionaryFile:", "").trim()
        }

        if (line.startsWith("filterBySounds")) {
            filterBySounds = line.endsWith("yes")
        }

        if (line.startsWith("hasABC")) {
            hasABC = line.endsWith("yes")
        }

        if (line.startsWith("numerals")) {
            // exactly ten single characters, separated by commas, in square brackets
            areNumeralsValid = line.matches("^numerals:\\s*\\[(.,\\s*?){9}.\\]")
        }

        if (line.startsWith("layout")) {
            hasLayout = true
        }

        if (line.startsWith("sounds")) {
            hasSounds = true
        }

        if (line.startsWith("locale")) {
            localeString = line.replace("locale:", "").trim()
        }

        // alphabet string
        def lineCharacters = extractAlphabetCharsFromLine(line)
        lineCharacters = lineCharacters.isEmpty() ? extractAlphabetExtraCharsFromLine(languageFile.name, line) : lineCharacters
        alphabet += lineCharacters

        // sounds, single letters or special characters that are treated as letters
        if (lineCharacters) {
            lineCharacters.each { letter ->
                layoutSounds.put(letter, layoutKey.toString())
            }
        }

        // must happen after the characters of this line were mapped to the current key
        if (isLayoutLine(line)) {
            layoutKey++
        }

        // sounds, syllables
        def (sound, sequence) = extractSoundFromLine(line)
        if (!sound.isEmpty() && !sequence.isEmpty()) {
            sounds.put(sound, sequence)
        }
    }

    if (!hasABC && !abcString.isEmpty()) {
        errorCount++
        // boolean properties only accept 'yes' and 'no', so the message must not suggest 'true'
        errorMsg += "Language '${languageFile.name}' is invalid. hasABC must be 'yes' when abcString is provided.\n"
    }

    if (!hasLayout) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. Missing 'layout' property.\n"
    }

    if (alphabet.isEmpty()) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. No language characters found. Make sure 'layout' contains series of characters per each key in the format: ' - [a, b, c]' and so on\n"
    }

    if (hasSounds && sounds.isEmpty()) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'sounds' property must contain series of phonetic transcriptions per digit sequence in the format: ' - [Yae,1221]' and so on.\n"
    }

    if (filterBySounds && !hasSounds) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'filterBySounds' property can only be used with 'sounds' property.\n"
    }

    if (!localeString.matches("^[a-z]{2,3}(?:-[A-Z]{2})?\$")) {
        errorCount++
        def msg = localeString.isEmpty() ? "Missing 'locale' property." : "Unrecognized locale format: '${localeString}'"
        errorMsg += "Language '${languageFile.name}' is invalid. ${msg}\n"
    }

    if (!areNumeralsValid) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'numerals' property must contain a comma-separated list of 10 characters representing the digits from 0 to 9.\n"
    }

    dictionaryFile = new File("$dictionariesDir/${dictionaryFileName}")
    if (dictionaryFileName.isEmpty() || !dictionaryFile.exists()) {
        errorCount++
        errorMsg += "Could not find dictionary file: '${dictionaryFileName}' in: '${dictionariesDir}'. Make sure 'dictionaryFile' is set correctly in: '${languageFile.name}'.\n"
    }

    String[] localeParts = localeString.split(("[-_]"))
    Locale locale = new Locale(localeParts[0], localeParts.length > 1 ? localeParts[1] : "")

    // without a 'sounds' property, every (uppercased) layout character maps to the index of its key
    if (!hasSounds) {
        layoutSounds.forEach { sound, sequence ->
            sounds.put(sound.toUpperCase(locale), sequence)
        }
    }

    return [alphabet, sounds, filterBySounds, !hasSounds, locale, dictionaryFile, errorCount, errorMsg]
}
/**
 * Validates all lines of a dictionary file: whitespace, frequency range, word characters,
 * duplicates, and whether a digit sequence can be generated for the word.
 * Stops reading once `maxErrors` errors have been counted.
 *
 * This cannot be static, because DictionaryTools will not be visible.
 *
 * @param dictionaryFile       the dictionary to validate
 * @param alphabet             all characters allowed in the words
 * @param sounds               map of letters or syllables to digit sequences
 * @param isAlphabeticLanguage whether the words must consist of alphabet characters only
 * @param locale               locale used for case conversion
 * @param maxErrors            error count at which the validation is aborted
 * @param csvDelimiter         delimiter passed to DictionaryTools.getDictionaryLineData()
 * @param maxWordFrequency     highest allowed word frequency
 * @return a list of: [errorCount, errorMsg]
 */
def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, String> sounds, boolean isAlphabeticLanguage, Locale locale, int maxErrors, String csvDelimiter, int maxWordFrequency) {
    // brackets, dashes and dots are removed, then dot and dash are re-added in escaped form
    String regexSafeAlphabet = alphabet.replaceAll("([\\[\\]\\-\\.])", "")
    final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${regexSafeAlphabet}\\.\\-]+\$" : "^[${regexSafeAlphabet}${regexSafeAlphabet.toUpperCase(locale)}\\.\\-]+\$"

    int errorCount = 0
    StringBuilder errorMsg = new StringBuilder()
    Set<String> uniqueWords = new HashSet<>()

    List<String> fileContents = dictionaryFile.readLines()
    for (int lineNumber = 1; lineNumber <= fileContents.size() && errorCount < maxErrors; lineNumber++) {
        String line = fileContents.get(lineNumber - 1)
        boolean lineHasErrors = false

        String whiteSpaceError = validateNoWhitespace(line, lineNumber)
        if (whiteSpaceError) {
            lineHasErrors = true
            errorCount++
            errorMsg.append(whiteSpaceError)
        }

        def (word, transcription, frequency) = DictionaryTools.getDictionaryLineData(line, csvDelimiter)

        String frequencyError = validateFrequency(frequency, maxWordFrequency, dictionaryFile.name, lineNumber)
        if (frequencyError) {
            lineHasErrors = true
            errorCount++
            errorMsg.append(frequencyError)
        }

        def (wordErrorCount, wordErrors) = validateWord(word, VALID_CHARS, isAlphabeticLanguage, lineNumber, "Dictionary '${dictionaryFile.name}' is invalid")
        if (wordErrorCount > 0) {
            // an invalid word must also skip the digit sequence generation below
            lineHasErrors = true
            errorCount += wordErrorCount
            errorMsg.append(wordErrors)
        }

        // the same word is allowed more than once, as long as the transcriptions differ
        final uniqueWord = transcription + word
        if (uniqueWords.contains(uniqueWord)) {
            lineHasErrors = true
            errorCount++
            errorMsg.append("Dictionary '${dictionaryFile.name}' is invalid. Found duplicate word: '${word}${!transcription.isEmpty() ? ' [' + transcription + ']' : ''}' on line ${lineNumber}. Remove all duplicates.\n")
        } else {
            uniqueWords.add(uniqueWord)
        }

        if (lineHasErrors) {
            // the validations below make no sense if the previous ones have failed
            continue
        }

        try {
            def transcribedWord = transcription.isEmpty() ? word : transcription
            DictionaryTools.wordToDigitSequence(locale, transcribedWord, sounds, !transcription.isEmpty())
        } catch (IllegalArgumentException e) {
            errorCount++
            errorMsg.append("Dictionary '${dictionaryFile.name}' is invalid. Failed generating digit sequence for word '${word}' on line ${lineNumber}. ${e.message}\n")
        }
    }

    return [errorCount, errorMsg.toString()]
}
//////////////////// PARSING ////////////////////
/**
 * Returns the extra (non-letter) characters that a 'PUNCTUATION' layout line adds to the alphabet.
 *
 * @param languageName name of the language definition file; used to apply per-language rules
 * @param line         a single line of the language definition
 * @return an empty string when `languageName` is null, or the line is not a layout line containing
 *         'PUNCTUATION'; otherwise the default characters ('-.) plus any language-specific ones
 */
static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
    if (languageName == null || !line.contains('PUNCTUATION') || !isLayoutLine(line)) {
        return ''
    }

    final DEFAULT = "'-."

    // these languages use a fixed set, regardless of what else is listed on the line
    if (languageName.contains('Korean')) {
        return DEFAULT
    } else if (languageName.contains("Hebrew") || languageName.contains("Yiddish")) {
        return DEFAULT + '"'
    }

    // keep only the characters listed next to the PUNCTUATION* placeholder, inside the brackets
    String allChars = line
        .replaceFirst('\\].*', '')
        .replaceFirst('^\\s+- \\[', '')
        .replaceFirst("PUNCTUATION[^,\\s]*", '')
        .replace(',', '')
        .replace(' ', '')

    // add Zero-width non-joiner for some languages
    if (line.contains("PUNCTUATION_FA") || line.contains("PUNCTUATION_IN")) {
        allChars += '\u200C'
    }

    // NOTE(review): the literal appended below appears empty in this view, which would make this
    // branch a no-op. It may contain an invisible character - confirm against the raw file.
    if (line.contains("PUNCTUATION") && languageName.contains("Japanese")) {
        allChars += ''
    }

    return DEFAULT + allChars
}
/**
 * Extracts the characters of a regular layout line, for example ' - [a, b, c]' yields 'abc'.
 *
 * @param line a single line of the language definition
 * @return the characters without brackets, commas and spaces; or an empty string when the line
 *         contains 'PUNCTUATION' or 'SPECIAL', or is not a layout line
 */
static def extractAlphabetCharsFromLine(String line) {
    boolean isSpecialKey = line.contains('PUNCTUATION') || line.contains('SPECIAL')
    if (isSpecialKey || !isLayoutLine(line)) {
        return ''
    }

    String withoutPrefix = line.replaceFirst('^\\s+- \\[', '')
    String insideBrackets = withoutPrefix.replaceFirst('\\].*', '')

    return insideBrackets.replace(',', '').replace(' ', '')
}
/**
 * Extracts a sound and its digit sequence from a line in the format ' - [Yae,1221]'.
 *
 * @param line a single line of the language definition
 * @return a two-element list: [sound, sequence]; or ['', ''] when the line has a different format
 */
static def extractSoundFromLine(String line) {
    if (line.matches('\\s+- \\[\\w+\\s*,\\s*\\d+\\].*')) {
        String[] soundAndSequence = line
            .replaceFirst('^\\s+- \\[', '')
            .replaceFirst('\\].*', '')
            .replace(' ', '')
            .split(',')

        if (soundAndSequence.length > 1) {
            return [soundAndSequence[0], soundAndSequence[1]]
        }
    }

    return ['', '']
}
/**
 * Tells whether a line is a list item of the 'layout' property, for example ' - [a, b, c]'.
 * List items ending with digits before the closing bracket, like ' - [Yae,1221]', do not count.
 *
 * @param line a single line of the language definition
 * @return true for layout lines, false otherwise
 */
static def isLayoutLine(String line) {
    if (!line.matches('\\s+- \\[.*?\\].*')) {
        return false
    }

    // digits right before a closing bracket mean the line is not a layout line
    return line.find('\\d+]') == null
}
//////////////////// VALIDATION ////////////////////
/**
 * Validates that a boolean property is set to either 'yes' or 'no'.
 *
 * @param line             a single line of the language definition, without the trailing comment
 * @param propertyName     the boolean property to validate
 * @param languageFileName name of the language definition file; used in the error message
 * @return an error message when the line starts with `propertyName`, but does not end with 'yes' or
 *         'no'; an empty string otherwise
 */
static def validateBooleanProperty(String line, String propertyName, String languageFileName) {
    if (line.startsWith(propertyName) && !line.endsWith("yes") && !line.endsWith("no")) {
        def property = line.replaceAll(":.*\$", "")
        // strip the name of the property being validated, not a hard-coded one, so that the
        // reported value is correct for every boolean property
        def invalidVal = line.replace(propertyName + ":", "").trim()
        return "Language '${languageFileName}' is invalid. Unrecognized '${property}' value: '${invalidVal}'. Only 'yes' and 'no' are allowed.\n"
    }

    return ''
}
/**
 * Validates that a dictionary line is neither empty, nor contains spaces.
 *
 * @param line       a single line of the dictionary
 * @param lineNumber 1-based number of the line; used in the error message
 * @return an error message, or an empty string when the line is valid
 */
static def validateNoWhitespace(String line, int lineNumber) {
    if (line == "") {
        return "There is no word on line ${lineNumber}. Remove all empty lines.\n"
    }

    return line.contains(" ") ? "Found space on line ${lineNumber}. Make sure each word is on a new line. Phrases are not allowed.\n" : ''
}
/**
 * Validates that a word frequency is within the range [0, maxFrequency].
 *
 * @param frequency          the frequency to validate
 * @param maxFrequency       the highest allowed frequency
 * @param dictionaryFileName name of the dictionary file; used in the error message
 * @param lineNumber         1-based number of the line; used in the error message
 * @return an error message, or an empty string when the frequency is valid
 */
static def validateFrequency(int frequency, int maxFrequency, String dictionaryFileName, int lineNumber) {
    boolean isInRange = frequency >= 0 && frequency <= maxFrequency
    if (isInRange) {
        return ''
    }

    return "Dictionary '${dictionaryFileName}' is invalid. Found out-of-range word frequency: '${frequency}' on line ${lineNumber}. Frequency must be an integer between 0 and ${maxFrequency}.\n"
}
/**
 * Validates a single dictionary word.
 *
 * @param word                 the word to validate
 * @param validCharacters      regular expression that a word of an alphabetic language must match
 * @param isAlphabeticLanguage enables the single-letter and the alphabet checks
 * @param lineNumber           1-based number of the line; used in the error messages
 * @param errorMsgPrefix       text to put in front of every error message
 * @return a list of: [errorCount, errors]
 */
static def validateWord(String word, String validCharacters, boolean isAlphabeticLanguage, int lineNumber, String errorMsgPrefix) {
    int errorCount = 0
    def errors = ''

    // a digit at any position is an error, not only at the beginning or at the end of the word
    if (word.matches(".*\\d.*")) {
        errorCount++
        errors += "${errorMsgPrefix}. Found numbers on line ${lineNumber}. Remove all numbers.\n"
    }

    // words without any letters are allowed only if they appear in the valid characters string
    if (word.matches("^\\P{L}+\$") && !validCharacters.contains(word)) {
        errorCount++
        errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n"
    }

    if (isAlphabeticLanguage && word.trim().length() == 1) {
        errorCount++
        errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
    }

    // reported only when nothing else is wrong, to avoid two messages about the same problem
    if (errorCount == 0 && isAlphabeticLanguage && !word.matches(validCharacters)) {
        errorCount++
        errors += "${errorMsgPrefix}. Word '${word}' on line ${lineNumber} contains characters outside of the defined alphabet: $validCharacters.\n"
    }

    return [errorCount, errors]
}