apply from: 'dictionary-tools.gradle'

/**
 * Validates every language definition found under {@code definitionsDir} together with the
 * dictionary it points to.
 *
 * For each definition a "<hash> OK" or "<hash> INVALID" marker is written to
 * "{@code validationDir}/<definition name without .yml>.txt". When the marker file already contains
 * "<hash> OK" for the current hash, the dictionary validation is skipped.
 *
 * @param definitionsDir  directory containing the language definition files
 * @param dictionariesDir directory containing the dictionary files
 * @param validationDir   directory where the per-language marker files are written
 * @throws GradleException with all collected error messages, when at least one error was found
 */
ext.validateLanguageFiles = { definitionsDir, dictionariesDir, validationDir ->
    // The counter is updated from a parallel stream, so it must be atomic. A plain int with "+="
    // is a non-atomic read-modify-write and may lose updates.
    def errorCount = new java.util.concurrent.atomic.AtomicInteger(0)

    def errorStream = fileTree(dir: definitionsDir).getFiles().parallelStream().map { definition ->
        if (errorCount.get() >= MAX_ERRORS) {
            return "Too many errors! Skipping: ${definition}\n"
        }

        def (alphabet, sounds, _, isAlphabeticLanguage, locale, dictionaryFile, langFileErrorCount, langFileErrorMsg) = parseLanguageDefintion(definition, dictionariesDir)

        def languageHash = DictionaryTools.getLanguageHash(definition, dictionaryFile)
        def validationFile = new File("${validationDir}/${definition.name.replace(".yml", "")}.txt")

        errorCount.addAndGet(langFileErrorCount as int)
        if (!langFileErrorMsg.isEmpty()) {
            validationFile.text = "${languageHash} INVALID"
            return langFileErrorMsg
        }

        // nothing has changed since the last successful validation
        if (validationFile.exists() && validationFile.text == "${languageHash} OK") {
            return ""
        }

        def (dictionaryErrorCount, dictionaryErrorMessages) = validateDictionary(dictionaryFile, alphabet, sounds, isAlphabeticLanguage, locale, MAX_ERRORS, CSV_DELIMITER, MAX_WORD_FREQUENCY)
        errorCount.addAndGet(dictionaryErrorCount as int)
        if (!dictionaryErrorMessages.isEmpty()) {
            validationFile.text = "${languageHash} INVALID"
            return dictionaryErrorMessages
        }

        validationFile.text = "${languageHash} OK"
        return ""
    }

    String errorsMsg = errorStream.reduce("", String::concat)
    if (errorsMsg) {
        throw new GradleException(errorsMsg)
    }
}


/**
 * Reads a language definition file line by line, collects the alphabet and the sounds, and
 * validates the known properties.
 *
 * @param languageFile    the language definition file
 * @param dictionariesDir directory where the file named by 'dictionaryFile' is looked up
 * @return a list of: [alphabet, sounds, filterBySounds, !hasSounds, locale, dictionaryFile,
 *         errorCount, errorMsg]. When the definition has no 'sounds' property, "sounds" maps each
 *         upper-cased layout character to the index of its layout line.
 */
ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
    String alphabet = ''
    int layoutKey = 0
    HashMap sounds = new HashMap<>()
    HashMap layoutSounds = new HashMap<>()

    File dictionaryFile
    int errorCount = 0
    String errorMsg = ""

    String abcString = ""
    boolean hasABC = true
    boolean hasLayout = false
    boolean hasSounds = false
    boolean filterBySounds = false
    boolean areNumeralsValid = true
    String localeString = ""
    String dictionaryFileName = ""

    for (String rawLine : languageFile.readLines()) {
        // any line starting with a letter must be one of the known top-level properties
        if (
            rawLine.matches("^[a-zA-Z].*") &&
            !rawLine.startsWith("abcString") &&
            !rawLine.startsWith("currency") &&
            !rawLine.startsWith("dictionaryFile") &&
            !rawLine.startsWith("filterBySounds") &&
            !rawLine.startsWith("hasABC") &&
            !rawLine.startsWith("hasSpaceBetweenWords") &&
            !rawLine.startsWith("hasUpperCase") &&
            !rawLine.startsWith("layout") &&
            !rawLine.startsWith("locale") &&
            !rawLine.startsWith("name") &&
            !rawLine.startsWith("numerals") &&
            !rawLine.startsWith("sounds")
        ) {
            def parts = rawLine.split(":")
            def property = parts.length > 0 ? parts[0] : rawLine

            errorCount++
            errorMsg += "Language '${languageFile.name}' is invalid. Found unknown property: '${property}'.\n"
        }

        // drop the trailing comment, if any
        String line = rawLine.replaceFirst("#[\\s\\S]+\$", "")

        def booleanProperties = ["hasUpperCase", "hasSpaceBetweenWords", "filterBySounds", "hasABC"]
        for (String property : booleanProperties) {
            String booleanError = validateBooleanProperty(line, property, languageFile.name)
            if (booleanError) {
                errorCount++
                errorMsg += booleanError
            }
        }

        if (line.startsWith("abcString")) {
            abcString = line.replace("abcString:", "").trim()
        }

        if (line.startsWith("dictionaryFile")) {
            dictionaryFileName = line.replace("dictionaryFile:", "").trim()
        }

        if (line.startsWith("filterBySounds")) {
            filterBySounds = line.endsWith("yes")
        }

        if (line.startsWith("hasABC")) {
            hasABC = line.endsWith("yes")
        }

        if (line.startsWith("numerals")) {
            areNumeralsValid = line.matches("^numerals:\\s*\\[(.,\\s*?){9}.\\]")
        }

        if (line.startsWith("layout")) {
            hasLayout = true
        }

        if (line.startsWith("sounds")) {
            hasSounds = true
        }

        if (line.startsWith("locale")) {
            localeString = line.replace("locale:", "").trim()
        }

        // alphabet string
        def lineCharacters = extractAlphabetCharsFromLine(line)
        lineCharacters = lineCharacters.isEmpty() ? extractAlphabetExtraCharsFromLine(languageFile.name, line) : lineCharacters
        alphabet += lineCharacters

        // sounds, single letters or special characters that are treated as letters
        if (lineCharacters) {
            lineCharacters.each { letter ->
                layoutSounds.put(letter, layoutKey.toString())
            }
        }

        if (isLayoutLine(line)) {
            layoutKey++
        }

        // sounds, syllables
        def (sound, sequence) = extractSoundFromLine(line)
        if (!sound.isEmpty() && !sequence.isEmpty()) {
            sounds.put(sound, sequence)
        }
    }

    if (!hasABC && !abcString.isEmpty()) {
        errorCount++
        // boolean properties only accept 'yes' and 'no', so the hint must say 'yes'
        errorMsg += "Language '${languageFile.name}' is invalid. hasABC must be 'yes' when abcString is provided.\n"
    }

    if (!hasLayout) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. Missing 'layout' property.\n"
    }

    if (alphabet.isEmpty()) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. No language characters found. Make sure 'layout' contains series of characters per each key in the format: ' - [a, b, c]' and so on\n"
    }

    if (hasSounds && sounds.isEmpty()) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'sounds' property must contain series of phonetic transcriptions per digit sequence in the format: ' - [Yae,1221]' and so on.\n"
    }

    if (filterBySounds && !hasSounds) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'filterBySounds' property can only be used with 'sounds' property.\n"
    }

    if (!localeString.matches("^[a-z]{2,3}(?:-[A-Z]{2})?\$")) {
        errorCount++
        def msg = localeString.isEmpty() ? "Missing 'locale' property." : "Unrecognized locale format: '${localeString}'"
        errorMsg += "Language '${languageFile.name}' is invalid. ${msg}\n"
    }

    if (!areNumeralsValid) {
        errorCount++
        errorMsg += "Language '${languageFile.name}' is invalid. 'numerals' property must contain a comma-separated list of 10 characters representing the digits from 0 to 9.\n"
    }

    dictionaryFile = new File("$dictionariesDir/${dictionaryFileName}")
    if (dictionaryFileName.isEmpty() || !dictionaryFile.exists()) {
        errorCount++
        errorMsg += "Could not find dictionary file: '${dictionaryFileName}' in: '${dictionariesDir}'. Make sure 'dictionaryFile' is set correctly in: '${languageFile.name}'.\n"
    }

    String[] localeParts = localeString.split("[-_]")
    Locale locale = new Locale(localeParts[0], localeParts.length > 1 ? localeParts[1] : "")

    // without explicit 'sounds', every layout character maps to the index of its layout line
    if (!hasSounds && locale != null) {
        layoutSounds.forEach { sound, sequence ->
            sounds.put(sound.toUpperCase(locale), sequence)
        }
    }

    return [alphabet, sounds, filterBySounds, !hasSounds, locale, dictionaryFile, errorCount, errorMsg]
}


/**
 * Validates a dictionary file line by line: whitespace, frequency range, word characters,
 * duplicates and, for lines without earlier whitespace/frequency/duplicate errors, whether a digit
 * sequence can be generated for the word. Stops reading once {@code maxErrors} is reached.
 *
 * @return a list of: [errorCount, errorMsg]
 */
// this cannot be static, because DictionaryTools will not be visible
def validateDictionary(File dictionaryFile, String alphabet, HashMap sounds, boolean isAlphabeticLanguage, Locale locale, int maxErrors, String csvDelimiter, int maxWordFrequency) {
    // remove the characters that have a special meaning inside a regex character class;
    // '.' and '-' are appended back, escaped, below
    String regexSafeAlphabet = alphabet.replaceAll("([\\[\\]\\-\\.])", "")
    final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${regexSafeAlphabet}\\.\\-]+\$" : "^[${regexSafeAlphabet}${regexSafeAlphabet.toUpperCase(locale)}\\.\\-]+\$"

    int errorCount = 0
    String errorMsg = ''
    Set uniqueWords = new HashSet<>()

    List fileContents = dictionaryFile.readLines()
    for (int lineNumber = 1; lineNumber <= fileContents.size() && errorCount < maxErrors; lineNumber++) {
        String line = fileContents.get(lineNumber - 1)
        boolean lineHasErrors = false

        String whiteSpaceError = validateNoWhitespace(line, lineNumber)
        if (whiteSpaceError) {
            lineHasErrors = true
            errorCount++
            errorMsg += whiteSpaceError
        }

        def (word, transcription, frequency) = DictionaryTools.getDictionaryLineData(line, csvDelimiter)

        String frequencyError = validateFrequency(frequency, maxWordFrequency, dictionaryFile.name, lineNumber)
        if (frequencyError) {
            lineHasErrors = true
            errorCount++
            errorMsg += frequencyError
        }

        def (wordErrorCount, wordErrors) = validateWord(word, VALID_CHARS, isAlphabeticLanguage, lineNumber, "Dictionary '${dictionaryFile.name}' is invalid")
        if (wordErrorCount > 0) {
            errorCount += wordErrorCount
            errorMsg += wordErrors
        }

        // the same word may appear more than once only with different transcriptions
        final uniqueWord = transcription + word
        if (uniqueWords.contains(uniqueWord)) {
            lineHasErrors = true
            errorCount++
            errorMsg += "Dictionary '${dictionaryFile.name}' is invalid. Found duplicate word: '${word}${!transcription.isEmpty() ? ' [' + transcription + ']' : ''}' on line ${lineNumber}. Remove all duplicates.\n"
        } else {
            uniqueWords.add(uniqueWord)
        }

        if (lineHasErrors) {
            // the validations below make no sense if the previous ones have failed
            continue
        }

        try {
            def transcribedWord = transcription.isEmpty() ? word : transcription
            DictionaryTools.wordToDigitSequence(locale, transcribedWord, sounds, !transcription.isEmpty())
        } catch (IllegalArgumentException e) {
            errorCount++
            errorMsg += "Dictionary '${dictionaryFile.name}' is invalid. Failed generating digit sequence for word '${word}' on line ${lineNumber}. ${e.message}\n"
        }
    }

    return [errorCount, errorMsg]
}


//////////////////// PARSING ////////////////////

/**
 * Returns the extra alphabet characters contributed by a layout line that contains a
 * 'PUNCTUATION' placeholder, or an empty string for any other line.
 *
 * @param languageName the language file name; used for the per-language special cases
 * @param line         a line of the language definition
 */
static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
    if (languageName == null || !line.contains('PUNCTUATION') || !isLayoutLine(line)) {
        return ''
    }

    final DEFAULT = "'-."

    if (languageName.contains('Korean')) {
        return DEFAULT
    } else if (languageName.contains("Hebrew") || languageName.contains("Yiddish")) {
        return DEFAULT + '"'
    }

    // keep only the characters listed next to the PUNCTUATION placeholder
    String allChars = line
        .replaceFirst('\\].*', '')
        .replaceFirst('^\\s+- \\[', '')
        .replaceFirst("PUNCTUATION[^,\\s]*", '')
        .replace(',', '')
        .replace(' ', '')

    // add Zero-width non-joiner for some languages
    if (line.contains("PUNCTUATION_FA") || line.contains("PUNCTUATION_IN")) {
        allChars += '\u200C'
    }

    if (line.contains("PUNCTUATION") && languageName.contains("Japanese")) {
        allChars += '〇'
    }

    return DEFAULT + allChars
}


/**
 * Returns the characters of a layout line with the brackets, commas and spaces removed, or an
 * empty string when the line is not a layout line or contains 'PUNCTUATION' or 'SPECIAL'.
 */
static def extractAlphabetCharsFromLine(String line) {
    if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !isLayoutLine(line)) {
        return ''
    }

    return line.replaceFirst('^\\s+- \\[', '').replaceFirst('\\].*', '').replace(',', '').replace(' ', '')
}


/**
 * Parses a line in the format ' - [Sound,1234]'.
 *
 * @return [sound, digit sequence], or ['', ''] when the line does not match that format
 */
static def extractSoundFromLine(String line) {
    if (!line.matches('\\s+- \\[\\w+\\s*,\\s*\\d+\\].*')) {
        return ['', '']
    }

    def cleanLine = line.replaceFirst('^\\s+- \\[', '').replaceFirst('\\].*', '').replace(' ', '')
    def parts = cleanLine.split(',')
    return parts.length > 1 ? [parts[0], parts[1]] : ['', '']
}


/**
 * Tells whether the line is an indented ' - [...]' list item that does not contain digits
 * immediately before a closing bracket (such lines are treated as sounds, not layout).
 */
static def isLayoutLine(String line) {
    return line.matches('\\s+- \\[.*?\\].*') && !line.find('\\d+]')
}


//////////////////// VALIDATION ////////////////////

/**
 * Checks that a boolean property line ends with 'yes' or 'no'.
 *
 * @param line             a line of the language definition
 * @param propertyName     the property to validate; lines not starting with it are ignored
 * @param languageFileName the file name used in the error message
 * @return the error message, or an empty string when the line is valid or not applicable
 */
static def validateBooleanProperty(String line, String propertyName, String languageFileName) {
    if (line.startsWith(propertyName) && !line.endsWith("yes") && !line.endsWith("no")) {
        def property = line.replaceAll(":.*\$", "")
        // strip the prefix of the property being validated, not a hard-coded one
        def invalidVal = line.replace(propertyName + ":", "").trim()
        return "Language '${languageFileName}' is invalid. Unrecognized '${property}' value: '${invalidVal}'. Only 'yes' and 'no' are allowed.\n"
    }

    return ''
}


/**
 * Checks that a dictionary line is neither empty nor contains spaces.
 *
 * @return the error message, or an empty string when the line is valid
 */
static def validateNoWhitespace(String line, int lineNumber) {
    if (line == "") {
        return "There is no word on line ${lineNumber}. Remove all empty lines.\n"
    } else if (line.contains(" ")) {
        return "Found space on line ${lineNumber}. Make sure each word is on a new line. Phrases are not allowed.\n"
    }

    return ''
}


/**
 * Checks that the word frequency is within [0, maxFrequency].
 *
 * @return the error message, or an empty string when the frequency is valid
 */
static def validateFrequency(int frequency, int maxFrequency, String dictionaryFileName, int lineNumber) {
    if (frequency < 0 || frequency > maxFrequency) {
        return "Dictionary '${dictionaryFileName}' is invalid. Found out-of-range word frequency: '${frequency}' on line ${lineNumber}. Frequency must be an integer between 0 and ${maxFrequency}.\n"
    }

    return ''
}


/**
 * Validates a single dictionary word: no digits at the start or the end, no words made only of
 * non-letters (unless contained in {@code validCharacters}), no single-character words in
 * alphabetic languages, and no characters outside of the alphabet.
 *
 * @param word                 the word to validate
 * @param validCharacters      a regex the whole word must match in alphabetic languages
 * @param isAlphabeticLanguage enables the single-letter and the alphabet checks
 * @param lineNumber           the line number used in the error messages
 * @param errorMsgPrefix       text prepended to every error message
 * @return a list of: [errorCount, errors]
 */
static def validateWord(String word, String validCharacters, boolean isAlphabeticLanguage, int lineNumber, String errorMsgPrefix) {
    int errorCount = 0
    def errors = ''

    if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
        errorCount++
        errors += "${errorMsgPrefix}. Found numbers on line ${lineNumber}. Remove all numbers.\n"
    }

    if (word.matches("^\\P{L}+\$") && !validCharacters.contains(word)) {
        errorCount++
        errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n"
    }

    if (isAlphabeticLanguage && word.trim().length() == 1) {
        errorCount++
        errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
    }

    // reported only when nothing else is wrong, to avoid duplicating the errors above
    if (errorCount == 0 && isAlphabeticLanguage && !word.matches(validCharacters)) {
        errorCount++
        errors += "${errorMsgPrefix}. Word '${word}' on line ${lineNumber} contains characters outside of the defined alphabet: $validCharacters.\n"
    }

    return [errorCount, errors]
}