From 44ecb8999ea1f009f9fd35d36eeba13c43f03963 Mon Sep 17 00:00:00 2001 From: Dimo Karaivanov Date: Mon, 21 Aug 2023 15:29:30 +0300 Subject: [PATCH] Build scripts cleanup and dictionary loading optimization * moved the source languages out of assets/ into their own directory (#356) * split build.gradle into several smaller files * improved word frequency validation during build time * slightly optimized dictionary loading speed using pre-calculated file size * fixed a potential crash when loading invalid assets * fixed dictionary loading progress starting at 100% then jumping to 0% when manually loading two dictionaries one after another * documentation update --- .github/workflows/build.yml | 6 +- .gitignore | 1 + CONTRIBUTING.md | 9 +- build.gradle | 329 +++--------------- gradle/scripts/constants.gradle | 16 + gradle/scripts/dictionary-tools.gradle | 6 + gradle/scripts/validate-languages.gradle | 194 +++++++++++ gradle/scripts/version-tools.gradle | 62 ++++ .../definitions/BrazilianPortuguese.yml | 0 .../definitions/Bulgarian.yml | 0 .../definitions/Danish.yml | 0 .../definitions/Dutch.yml | 0 .../definitions/English.yml | 0 .../definitions/Finnish.yml | 0 .../definitions/French.yml | 0 .../definitions/German.yml | 0 .../definitions/Greek.yml | 0 .../definitions/Hebrew.yml | 0 .../definitions/Indonesian.yml | 0 .../definitions/Italian.yml | 0 .../definitions/Norwegian.yml | 0 .../definitions/Polish.yml | 0 .../definitions/Romanian.yml | 0 .../definitions/Russian.yml | 0 .../definitions/Spanish.yml | 0 .../definitions/Swedish.yml | 0 .../definitions/Ukrainian.yml | 0 .../definitions/Yiddish.yml | 0 .../dictionaries/bg-utf8.csv | 0 .../dictionaries/da-utf8.csv | 0 .../dictionaries/de-utf8.csv | 0 .../dictionaries/en-utf8.csv | 0 .../dictionaries/es-utf8.csv | 0 .../dictionaries/fi-utf8.csv | 0 .../dictionaries/fr-utf8.csv | 0 .../dictionaries/gr-utf8.csv | 0 .../dictionaries/he-utf8.csv | 0 .../dictionaries/id-utf8.csv | 0 .../dictionaries/it-utf8.csv | 0 
.../dictionaries/ji-utf8.csv | 0 .../dictionaries/nb-utf8.csv | 0 .../dictionaries/nl-utf8.csv | 0 .../dictionaries/pl-utf8.csv | 0 .../dictionaries/pt-BR-utf8.csv | 0 .../dictionaries/ro-utf8.csv | 0 .../dictionaries/ru-utf8.csv | 0 .../dictionaries/sv-utf8.csv | 0 .../dictionaries/uk-utf8.csv | 0 .../sspanak/tt9/db/DictionaryLoader.java | 62 ++-- .../tt9/languages/LanguageDefinition.java | 2 +- 50 files changed, 367 insertions(+), 320 deletions(-) create mode 100644 gradle/scripts/constants.gradle create mode 100644 gradle/scripts/dictionary-tools.gradle create mode 100644 gradle/scripts/validate-languages.gradle create mode 100644 gradle/scripts/version-tools.gradle rename {assets/languages => languages}/definitions/BrazilianPortuguese.yml (100%) rename {assets/languages => languages}/definitions/Bulgarian.yml (100%) rename {assets/languages => languages}/definitions/Danish.yml (100%) rename {assets/languages => languages}/definitions/Dutch.yml (100%) rename {assets/languages => languages}/definitions/English.yml (100%) rename {assets/languages => languages}/definitions/Finnish.yml (100%) rename {assets/languages => languages}/definitions/French.yml (100%) rename {assets/languages => languages}/definitions/German.yml (100%) rename {assets/languages => languages}/definitions/Greek.yml (100%) rename {assets/languages => languages}/definitions/Hebrew.yml (100%) rename {assets/languages => languages}/definitions/Indonesian.yml (100%) rename {assets/languages => languages}/definitions/Italian.yml (100%) rename {assets/languages => languages}/definitions/Norwegian.yml (100%) rename {assets/languages => languages}/definitions/Polish.yml (100%) rename {assets/languages => languages}/definitions/Romanian.yml (100%) rename {assets/languages => languages}/definitions/Russian.yml (100%) rename {assets/languages => languages}/definitions/Spanish.yml (100%) rename {assets/languages => languages}/definitions/Swedish.yml (100%) rename {assets/languages => 
languages}/definitions/Ukrainian.yml (100%) rename {assets/languages => languages}/definitions/Yiddish.yml (100%) rename {assets/languages => languages}/dictionaries/bg-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/da-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/de-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/en-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/es-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/fi-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/fr-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/gr-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/he-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/id-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/it-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/ji-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/nb-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/nl-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/pl-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/pt-BR-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/ro-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/ru-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/sv-utf8.csv (100%) rename {assets/languages => languages}/dictionaries/uk-utf8.csv (100%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 15f3a86a..bf2dbda6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,10 +23,12 @@ jobs: - name: Setup Gradle uses: gradle/gradle-build-action@v2 - # validation + # validate and build - name: Validate Dictionaries run: ./gradlew validateLanguages + - name: Build Languages + run: ./gradlew copyLanguages calculateDictionarySizes - name: Lint - run: 
./gradlew lint + run: ./gradlew lint # this actually runs mergeResources, so it must come after the dictionary tasks - name: Build Release APK run: ./gradlew build diff --git a/.gitignore b/.gitignore index ad2c9e8a..6c2841e6 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ gen/ # Gradle/build files .gradle/ +assets/ build/ release/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6403af06..019f72cb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,11 +48,11 @@ Make sure you have a signing key. If you don't have one, follow the [official ma ## Adding a New Language To support a new language one needs to: -- Find a suitable dictionary and add it to the `assets/languages/dictionaries/` folder. Two file formats are supported, [see below](#dictionary-formats). +- Find a suitable dictionary and add it to the `languages/dictionaries/` folder. Two file formats are supported, [see below](#dictionary-formats). - Do not forget to include the dictionary license (or readme) file in the `docs/` folder. -- Create a new `.yml` file in `assets/languages/definitions/` and define the language properties. +- Create a new `.yml` file in `languages/definitions/` and define the language properties. - `locale` contains the language and the country codes (e.g. "en-US", "es-AR", "it-IT"). Refer to the list of [supported locales in Java](https://www.oracle.com/java/technologies/javase/jdk8-jre8-suported-locales.html#util-text). - - `dictionaryFile` is the name of the dictionary in `assets/languages/dictionaries/` folder. + - `dictionaryFile` is the name of the dictionary in `languages/dictionaries/` folder. - `layout` contains the letters and punctuation marks associated with each key. - For 0-key `[SPECIAL]`, will be fine in most languages, but you could define your own set of special characters, for example: `[@, #, $]`. 
- For 1-key, you could use `[PUNCTUATION]` and have standard English/computer punctuation; or `[PUNCTUATION_FR]` that includes the French quotation marks: `«`, `»`; or `[PUNCTUATION_DE]` that includes the German quotation marks: `„`, `“`. And if the language has extra punctuation marks, like Spanish, you could complement the list like this: `[PUNCTUATION, ¡, ¿]`. Or you could define your own list, like for 0-key. @@ -70,6 +70,7 @@ Constraints: - No single lowercase letters. They will be added automatically. - No repeating words. - No digits or garbage characters as part of the words. +- The words must consist only of the letters defined in the respective YML definition file. _The constraints will be verified automatically upon building._ @@ -88,7 +89,7 @@ Constraints: - No header. - The separator is `TAB`. - The frequency is optional. If missing, it is assumed to be 0. -- The frequency must be a non-negative integer, when present. +- The frequency must be an integer between 0 and 255, when present. 
_The TXT format constraints listed above also apply._ diff --git a/build.gradle b/build.gradle index 13c45968..a4a86769 100644 --- a/build.gradle +++ b/build.gradle @@ -14,13 +14,18 @@ buildscript { apply plugin: 'com.android.application' apply plugin: 'at.zierler.yamlvalidator' +apply from: 'gradle/scripts/constants.gradle' +apply from: 'gradle/scripts/dictionary-tools.gradle' +apply from: 'gradle/scripts/validate-languages.gradle' +apply from: 'gradle/scripts/version-tools.gradle' + configurations.configureEach { // fixes 'duplicate class error', when using these combine: androidx.core:1.10.1, androidx.preference:1.2.0 and androidx.room:2.5.1 // see: https://stackoverflow.com/questions/75274720/a-failure-occurred-while-executing-appcheckdebugduplicateclasses/75315276#75315276 exclude group: 'org.jetbrains.kotlin', module: 'kotlin-stdlib-jdk8' yamlValidator { - searchPaths = ['assets/languages/definitions'] + searchPaths = ['languages/definitions'] } } @@ -39,277 +44,55 @@ repositories { } } -def execThing ( String cmdStr ) { - def stdout = new ByteArrayOutputStream() - String prefix = System.getenv("GITCMDPREFIX") - if (prefix != null) { - String cmd = prefix + cmdStr - exec { - commandLine cmd.tokenize() - standardOutput = stdout - } - } else { - exec { - commandLine cmdStr.tokenize() - standardOutput = stdout - } - } - return stdout.toString().trim() -} - -def getCurrentGitHash = { -> - return execThing('git log -1 --format=%h') -} - -def getVersionCode = { -> - String commitsCount = execThing("git rev-list --count HEAD") - return Integer.valueOf(commitsCount) -} - -def getVersionName = { -> - // major version - String versionTagsRaw = execThing('git tag --list v[0-9]*') - int versionTagsCount = versionTagsRaw == "" ? 
0 : versionTagsRaw.split('\n').size() - - // minor version - String commitsSinceLastTag = "0" - if (versionTagsCount > 1) { - String lastVersionTag = execThing('git describe --match v[0-9]* --tags --abbrev=0') - String gitLogResult = execThing("git log $lastVersionTag..HEAD --oneline") - commitsSinceLastTag = gitLogResult == '' ? "0" : gitLogResult.split('\n').size() - } - - - // the commit we are building from - - // beta string, if this is a beta - String lastTagName = (execThing('git tag --list') == "") ? "" : execThing('git describe --tags --abbrev=0') - String lastTagHash = (lastTagName == "") ? "" : execThing("git log -1 --format=%h $lastTagName") - String betaString = lastTagHash == getCurrentGitHash() && lastTagName.contains("-beta") ? '-beta' : '' - - return "$versionTagsCount.$commitsSinceLastTag$betaString" -} - -def getDebugVersion = { -> - return "git-${getCurrentGitHash()} (debug)" -} - -def getReleaseVersion = { -> - return "${getVersionName()} (${getCurrentGitHash()})" -} - -static def validateDictionaryLine(String line, int lineNumber) { - if (line == "") { - return "There is no word on line ${lineNumber}. Remove all empty lines." - } else if (line.contains(" ")) { - return "Found space on line ${lineNumber}. Make sure each word is on a new line. Phrases are not allowed." - } - - return '' -} - -static def extractAlphabetCharsFromLine(String line) { - if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !line.matches('\\s+- \\[.+?\\].*')) { - return '' - } - - return line.replaceFirst('^\\s+- \\[', '').replaceFirst('\\].*', '').replace(',', '').replace(' ', '') -} - -static def validateDictionaryWord(String word, int lineNumber, String validCharacters, String errorMsgPrefix) { - int errorCount = 0 - def errors = '' - - if (word.matches("(\\d.+?|.+?\\d|\\d)")) { - errorCount++ - errors += "${errorMsgPrefix}. Found numbers on line ${lineNumber}. 
Remove all numbers.\n" - } - - if (word.matches("^\\P{L}+\$")) { - errorCount++ - errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n" - } - - if (word.matches("^.\$")) { - errorCount++ - errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n" - } - - if (errorCount == 0 && !word.matches(validCharacters)) { - errorCount++ - errors += "${errorMsgPrefix}. Word '${word}' on line ${lineNumber} contain characters outside of the defined alphabet: $validCharacters.\n" - } - - return [errorCount, errors] -} - -task validateLanguages { - final baseDir = "${project.rootDir}/assets/languages" - final definitionsDir = "${baseDir}/definitions" - final dictionariesDir = "${baseDir}/dictionaries" - - inputs.dir fileTree(dir:baseDir, excludes:['dict.properties']) +tasks.register('validateLanguages') { + mustRunAfter(validateYaml) + inputs.dir fileTree(dir: LANGUAGES_INPUT_DIR) outputs.file "${project.buildDir}/lang.validation.txt" doLast { - final String CSV_DELIMITER = ' ' // TAB - final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+" - - - final MAX_ERRORS = 50 - String errors = "" - int errorCount = 0 - - outputs.files.singleFile.text = "" - - fileTree(definitionsDir).getFiles().each { File languageFile -> - if (errorCount >= MAX_ERRORS) { - return - } - - println "Validating language: ${languageFile.name}" - - boolean isFileValid = true - - boolean hasLayout = false - boolean isLocaleValid = false - String localeString = '' - String dictionaryFileName = '' - - String alphabet = languageFile.name.contains("Hebrew") ? 
'"' : '' - - languageFile.eachLine { line -> - if ( - line.matches("^[a-zA-Z].*") - && !line.startsWith("abcString") - && !line.startsWith("dictionaryFile") - && !line.startsWith("hasUpperCase") - && !line.startsWith("layout") - && !line.startsWith("locale") - && !line.startsWith("name") - ) { - isFileValid = false - def parts = line.split(":") - def property = parts.length > 0 ? parts[0] : line - - errorCount++ - errors += "Language '${languageFile.name}' is invalid. Found unknown property: '${property}'.\n" - } - - if (line.startsWith("hasUpperCase") && !line.endsWith("yes") && !line.endsWith("no")) { - def invalidVal = line.replace("hasUpperCase:", "").trim() - isFileValid = false - errorCount++ - errors += "Language '${languageFile.name}' is invalid. Unrecognized 'hasUpperCase' value: '${invalidVal}'. Only 'yes' and 'no' are allowed.\n" - } - - if (line.startsWith("layout")) { - hasLayout = true - } - - if (line.startsWith("locale")) { - localeString = line.replace("locale:", "").trim() - isLocaleValid = line.matches("^locale:\\s*[a-z]{2}(?:-[A-Z]{2})?") - } - - if (line.startsWith("dictionaryFile")) { - dictionaryFileName = line.replace("dictionaryFile:", "").trim() - } - - alphabet += extractAlphabetCharsFromLine(line) - } - - if (!hasLayout) { - isFileValid = false - errorCount++ - errors += "Language '${languageFile.name}' is invalid. Missing 'layout' property.\n" - } - - if (alphabet.isEmpty()) { - isFileValid = false - errorCount++ - errors += "Language '${languageFile.name}' is invalid. No language characters found. Make sure 'layout' contains series of characters per each key in the format: ' - [a, b, c]' and so on\n" - } - - if (!isLocaleValid) { - isFileValid = false - errorCount++ - def msg = localeString.isEmpty() ? "Missing 'locale' property." : "Unrecognized locale format: '${localeString}'" - errors += "Language '${languageFile.name}' is invalid. 
${msg}\n" - } - - def dictionaryFile = new File("$dictionariesDir/${dictionaryFileName}") - if (dictionaryFileName.isEmpty() || !dictionaryFile.exists()) { - errorCount++ - errors += "Could not find dictionary file: '${dictionaryFileName}' in: '${dictionariesDir}'. Make sure 'dictionaryFile' is set correctly in: '${languageFile.name}'.\n" - - outputs.files.singleFile.text += "${languageFile.name} INVALID \n" - return - } - - def validChars = alphabet.toUpperCase() == alphabet ? "^[${alphabet}\\-']+\$" : "^[${alphabet}${alphabet.toUpperCase()}\\-']+\$" - def uniqueWords = [:] - int lineNumber = 0 - - dictionaryFile.eachLine {line -> - if (errorCount >= MAX_ERRORS) { - return - } - - lineNumber++ - - String error = validateDictionaryLine(line, lineNumber) - if (!error.isEmpty()) { - isFileValid = false - errorCount++ - errors += "Dictionary '${dictionaryFile.name}' is invalid. ${error}.\n" - return - } - - String[] parts = line.split(CSV_DELIMITER, 2) - String word = parts[0] - String frequency = parts.length > 1 ? parts[1] : "" - - if (frequency.length() > 0 && !frequency.matches("^\\d+\$")) { - isFileValid = false - errorCount++ - errors += "Dictionary '${dictionaryFile.name}' is invalid. Found out-of-range word frequency: '${frequency}' on line ${lineNumber}. Frequency must be a non-negative integer.\n" - } - - def (wordErrorCount, wordErrors) = validateDictionaryWord(word, lineNumber, validChars, "Dictionary '${dictionaryFile.name}' is invalid") - isFileValid = wordErrorCount > 0 ? false : isFileValid - errorCount += wordErrorCount - errors += wordErrors - - String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase() - if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) { - isFileValid = false - errorCount++ - errors += "Dictionary '${dictionaryFile.name}' is invalid. Found a repeating word: '${word}' on line ${lineNumber}. 
Ensure all words appear only once.\n" - } else { - uniqueWords[uniqueWordKey] = true - } - - if (errorCount >= MAX_ERRORS) { - errors += "Too many errors! Aborting.\n" - } - } - - outputs.files.singleFile.text += "${languageFile.name} ${isFileValid ? 'OK' : 'INVALID'}\n" - } - - if (errors != "") { - throw new GradleException(errors) - } + validateLanguageFiles(DEFINITIONS_INPUT_DIR, DICTIONARIES_INPUT_DIR, outputs.files.singleFile) } } -android { - defaultConfig { - compileSdk 33 +tasks.register('copyLanguages', Copy) { + from LANGUAGES_INPUT_DIR + include '**/*.csv' + include '**/*.txt' + include '**/*.yml' + into LANGUAGES_OUTPUT_DIR +} + +tasks.register('calculateDictionarySizes') { + inputs.dir fileTree(dir: DICTIONARIES_INPUT_DIR) + outputs.dir DICTIONARIES_OUTPUT_DIR + + doLast { + getDictionarySizes(DICTIONARIES_INPUT_DIR, DICTIONARIES_OUTPUT_DIR) } +} + +clean { + delete LANGUAGES_OUTPUT_DIR +} + + +// using the exported Closures directly causes weird values, hence the extra wrappers here +def getVerCode = { -> return getVersionCode() } +def getVerName = { -> return getVersionName() } +def getVersionString = { flavor -> return flavor == 'debug' ? 
getDebugVersion() : getReleaseVersion() } + +android { namespace "io.github.sspanak.tt9" + defaultConfig { + minSdkVersion 19 + //noinspection ExpiredTargetSdkVersion + targetSdk 30 + compileSdk 33 + versionCode getVerCode() + versionName getVerName() + } + sourceSets { main { manifest.srcFile 'AndroidManifest.xml' @@ -330,21 +113,13 @@ android { release.setRoot('build-types/release') } - defaultConfig { - minSdkVersion 19 - //noinspection ExpiredTargetSdkVersion - targetSdk 30 - versionCode getVersionCode() - versionName getVersionName() - } - buildTypes { debug { data -> - data.buildConfigField 'String', 'VERSION_FULL', "\"${getDebugVersion()}\"" + data.buildConfigField 'String', 'VERSION_FULL', "\"${getVersionString('debug')}\"" } release { data -> - data.buildConfigField 'String', 'VERSION_FULL', "\"${getReleaseVersion()}\"" + data.buildConfigField 'String', 'VERSION_FULL', "\"${getVersionString('release')}\"" debuggable false jniDebuggable false @@ -360,8 +135,12 @@ android { applicationVariants.configureEach { variant -> - tasks["merge${variant.name.capitalize()}Assets"] - .dependsOn(validateYaml) + tasks["generate${variant.name.capitalize()}Assets"] .dependsOn(validateLanguages) + .dependsOn(copyLanguages) + .dependsOn(calculateDictionarySizes) + + tasks.findByName('lintVitalAnalyzeRelease')?.mustRunAfter(copyLanguages)?.mustRunAfter(calculateDictionarySizes) + tasks.findByName('lintAnalyzeDebug')?.mustRunAfter(copyLanguages)?.mustRunAfter(calculateDictionarySizes) } } diff --git a/gradle/scripts/constants.gradle b/gradle/scripts/constants.gradle new file mode 100644 index 00000000..da7ebffa --- /dev/null +++ b/gradle/scripts/constants.gradle @@ -0,0 +1,16 @@ +ext.LANGUAGES_DIR_NAME = 'languages' +ext.DEFINITIONS_DIR_NAME = 'definitions' +ext.DICTIONARIES_DIR_NAME = 'dictionaries' +ext.DICTIONARY_SIZES_DIR_NAME = 'dictionary-sizes' + +ext.LANGUAGES_INPUT_DIR = "${project.rootDir}/${LANGUAGES_DIR_NAME}" +ext.DEFINITIONS_INPUT_DIR = 
"${LANGUAGES_INPUT_DIR}/${DEFINITIONS_DIR_NAME}" +ext.DICTIONARIES_INPUT_DIR = "${LANGUAGES_INPUT_DIR}/${DICTIONARIES_DIR_NAME}" + +ext.LANGUAGES_OUTPUT_DIR = "${LANGUAGES_INPUT_DIR}".replace("${project.rootDir}", "${project.rootDir}/assets") +ext.DEFINITIONS_OUTPUT_DIR = "${DEFINITIONS_INPUT_DIR}".replace("${project.rootDir}", "${project.rootDir}/assets") +ext.DICTIONARIES_OUTPUT_DIR = "${DICTIONARIES_INPUT_DIR}".replace("${project.rootDir}", "${project.rootDir}/assets") + +ext.CSV_DELIMITER = ' ' // TAB +ext.MAX_WORD_FREQUENCY = 255 +ext.MAX_ERRORS = 50 diff --git a/gradle/scripts/dictionary-tools.gradle b/gradle/scripts/dictionary-tools.gradle new file mode 100644 index 00000000..e1305bdd --- /dev/null +++ b/gradle/scripts/dictionary-tools.gradle @@ -0,0 +1,6 @@ +ext.getDictionarySizes = { dictionariesDir, sizesDir -> + fileTree(dir: dictionariesDir).forEach {dictionary -> + def dictionarySize = dictionary.exists() ? dictionary.text.split("\n").length : 0 + new File(sizesDir, "${dictionary.getName()}.size").text = dictionarySize + } +} \ No newline at end of file diff --git a/gradle/scripts/validate-languages.gradle b/gradle/scripts/validate-languages.gradle new file mode 100644 index 00000000..a08fbb4e --- /dev/null +++ b/gradle/scripts/validate-languages.gradle @@ -0,0 +1,194 @@ +static def validateDictionaryLine(String line, int lineNumber) { + if (line == "") { + return "There is no word on line ${lineNumber}. Remove all empty lines." + } else if (line.contains(" ")) { + return "Found space on line ${lineNumber}. Make sure each word is on a new line. Phrases are not allowed." 
+ } + + return '' +} + +static def extractAlphabetCharsFromLine(String line) { + if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !line.matches('\\s+- \\[.+?\\].*')) { + return '' + } + + return line.replaceFirst('^\\s+- \\[', '').replaceFirst('\\].*', '').replace(',', '').replace(' ', '') +} + +static def validateDictionaryWord(String word, int lineNumber, String validCharacters, String errorMsgPrefix) { + int errorCount = 0 + def errors = '' + + if (word.matches("(\\d.+?|.+?\\d|\\d)")) { + errorCount++ + errors += "${errorMsgPrefix}. Found numbers on line ${lineNumber}. Remove all numbers.\n" + } + + if (word.matches("^\\P{L}+\$")) { + errorCount++ + errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n" + } + + if (word.matches("^.\$")) { + errorCount++ + errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n" + } + + if (errorCount == 0 && !word.matches(validCharacters)) { + errorCount++ + errors += "${errorMsgPrefix}. Word '${word}' on line ${lineNumber} contain characters outside of the defined alphabet: $validCharacters.\n" + } + + return [errorCount, errors] +} + +ext.validateLanguageFiles = { definitionsDir, dictionariesDir, outputFile -> + final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+" + + String errors = "" + int errorCount = 0 + + outputFile.text = "" + + fileTree(definitionsDir).getFiles().each { File languageFile -> + if (errorCount >= MAX_ERRORS) { + return + } + + println "Validating language: ${languageFile.name}" + + boolean isFileValid = true + + boolean hasLayout = false + boolean isLocaleValid = false + String localeString = '' + String dictionaryFileName = '' + + String alphabet = languageFile.name.contains("Hebrew") ? 
'"' : '' + + languageFile.eachLine { line -> + if ( + line.matches("^[a-zA-Z].*") + && !line.startsWith("abcString") + && !line.startsWith("dictionaryFile") + && !line.startsWith("hasUpperCase") + && !line.startsWith("layout") + && !line.startsWith("locale") + && !line.startsWith("name") + ) { + isFileValid = false + def parts = line.split(":") + def property = parts.length > 0 ? parts[0] : line + + errorCount++ + errors += "Language '${languageFile.name}' is invalid. Found unknown property: '${property}'.\n" + } + + if (line.startsWith("hasUpperCase") && !line.endsWith("yes") && !line.endsWith("no")) { + def invalidVal = line.replace("hasUpperCase:", "").trim() + isFileValid = false + errorCount++ + errors += "Language '${languageFile.name}' is invalid. Unrecognized 'hasUpperCase' value: '${invalidVal}'. Only 'yes' and 'no' are allowed.\n" + } + + if (line.startsWith("layout")) { + hasLayout = true + } + + if (line.startsWith("locale")) { + localeString = line.replace("locale:", "").trim() + isLocaleValid = line.matches("^locale:\\s*[a-z]{2}(?:-[A-Z]{2})?") + } + + if (line.startsWith("dictionaryFile")) { + dictionaryFileName = line.replace("dictionaryFile:", "").trim() + } + + def lineCharacters = extractAlphabetCharsFromLine(line) + alphabet += lineCharacters + } + + if (!hasLayout) { + isFileValid = false + errorCount++ + errors += "Language '${languageFile.name}' is invalid. Missing 'layout' property.\n" + } + + if (alphabet.isEmpty()) { + isFileValid = false + errorCount++ + errors += "Language '${languageFile.name}' is invalid. No language characters found. Make sure 'layout' contains series of characters per each key in the format: ' - [a, b, c]' and so on\n" + } + + if (!isLocaleValid) { + isFileValid = false + errorCount++ + def msg = localeString.isEmpty() ? "Missing 'locale' property." : "Unrecognized locale format: '${localeString}'" + errors += "Language '${languageFile.name}' is invalid. 
${msg}\n"
+		}
+
+		// every language definition must point to an existing dictionary file
+		def dictionaryFile = new File("$dictionariesDir/${dictionaryFileName}")
+		if (dictionaryFileName.isEmpty() || !dictionaryFile.exists()) {
+			errorCount++
+			errors += "Could not find dictionary file: '${dictionaryFileName}' in: '${dictionariesDir}'. Make sure 'dictionaryFile' is set correctly in: '${languageFile.name}'.\n"
+
+			outputFile.text += "${languageFile.name} INVALID \n"
+			return
+		}
+
+		// words may contain only the layout letters (both cases, when the language has them), '-' and '
+		def validChars = alphabet.toUpperCase() == alphabet ? "^[${alphabet}\\-']+\$" : "^[${alphabet}${alphabet.toUpperCase()}\\-']+\$"
+		def uniqueWords = [:]
+		int lineNumber = 0
+
+		dictionaryFile.eachLine {line ->
+			if (errorCount >= MAX_ERRORS) {
+				return
+			}
+
+			lineNumber++
+
+			String error = validateDictionaryLine(line, lineNumber)
+			if (!error.isEmpty()) {
+				isFileValid = false
+				errorCount++
+				errors += "Dictionary '${dictionaryFile.name}' is invalid. ${error}.\n"
+				return
+			}
+
+			String[] parts = line.split(CSV_DELIMITER, 2)
+			String word = parts[0]
+
+			// The frequency is optional: a missing or empty value counts as 0.
+			// Text that is not an integer becomes -1, so it is reported as a regular
+			// validation error below, instead of aborting the build with a NumberFormatException.
+			String frequencyText = parts.length > 1 ? parts[1] : ""
+			int frequency = 0
+			if (!frequencyText.isEmpty()) {
+				frequency = frequencyText.isInteger() ? frequencyText.toInteger() : -1
+			}
+
+			if (frequency < 0 || frequency > MAX_WORD_FREQUENCY) {
+				isFileValid = false
+				errorCount++
+				errors += "Dictionary '${dictionaryFile.name}' is invalid. Found out-of-range word frequency: '${frequencyText}' on line ${lineNumber}. Frequency must be an integer between 0 and ${MAX_WORD_FREQUENCY}.\n"
+			}
+
+			def (wordErrorCount, wordErrors) = validateDictionaryWord(word, lineNumber, validChars, "Dictionary '${dictionaryFile.name}' is invalid")
+			isFileValid = wordErrorCount > 0 ? false : isFileValid
+			errorCount += wordErrorCount
+			errors += wordErrors
+
+			// words matching GEOGRAPHICAL_NAME are compared case-sensitively, everything else case-insensitively
+			String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase()
+			if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
+				isFileValid = false
+				errorCount++
+				errors += "Dictionary '${dictionaryFile.name}' is invalid. Found a repeating word: '${word}' on line ${lineNumber}.
Ensure all words appear only once.\n" + } else { + uniqueWords[uniqueWordKey] = true + } + + if (errorCount >= MAX_ERRORS) { + errors += "Too many errors! Aborting.\n" + } + } + + outputFile.text += "${languageFile.name} ${isFileValid ? 'OK' : 'INVALID'}\n" + } + + if (errors != "") { + throw new GradleException(errors) + } +} \ No newline at end of file diff --git a/gradle/scripts/version-tools.gradle b/gradle/scripts/version-tools.gradle new file mode 100644 index 00000000..8deaf023 --- /dev/null +++ b/gradle/scripts/version-tools.gradle @@ -0,0 +1,62 @@ +def execThing (String cmdStr) { + def stdout = new ByteArrayOutputStream() + String prefix = System.getenv("GITCMDPREFIX") + if (prefix != null) { + String cmd = prefix + cmdStr + exec { + commandLine cmd.tokenize() + standardOutput = stdout + } + } else { + exec { + commandLine cmdStr.tokenize() + standardOutput = stdout + } + } + return stdout.toString().trim() +} + +def getCurrentGitHash() { + return execThing('git log -1 --format=%h') +} + +def generateVersionName() { + // major version + String versionTagsRaw = execThing('git tag --list v[0-9]*') + int versionTagsCount = versionTagsRaw == "" ? 0 : versionTagsRaw.split('\n').size() + + // minor version + String commitsSinceLastTag = "0" + if (versionTagsCount > 1) { + String lastVersionTag = execThing('git describe --match v[0-9]* --tags --abbrev=0') + String gitLogResult = execThing("git log $lastVersionTag..HEAD --oneline") + commitsSinceLastTag = gitLogResult == '' ? "0" : gitLogResult.split('\n').size() + } + + + // the commit we are building from + + // beta string, if this is a beta + String lastTagName = (execThing('git tag --list') == "") ? "" : execThing('git describe --tags --abbrev=0') + String lastTagHash = (lastTagName == "") ? "" : execThing("git log -1 --format=%h $lastTagName") + String betaString = lastTagHash == getCurrentGitHash() && lastTagName.contains("-beta") ? 
'-beta' : '' + + return "$versionTagsCount.$commitsSinceLastTag$betaString" +} + +ext.getVersionName = { -> + return generateVersionName() +} + +ext.getVersionCode = { -> + String commitsCount = execThing("git rev-list --count HEAD") + return Integer.valueOf(commitsCount) +} + +ext.getDebugVersion = { -> + return "git-${getCurrentGitHash()} (debug)" +} + +ext.getReleaseVersion = { -> + return "${generateVersionName()} (${getCurrentGitHash()})" +} diff --git a/assets/languages/definitions/BrazilianPortuguese.yml b/languages/definitions/BrazilianPortuguese.yml similarity index 100% rename from assets/languages/definitions/BrazilianPortuguese.yml rename to languages/definitions/BrazilianPortuguese.yml diff --git a/assets/languages/definitions/Bulgarian.yml b/languages/definitions/Bulgarian.yml similarity index 100% rename from assets/languages/definitions/Bulgarian.yml rename to languages/definitions/Bulgarian.yml diff --git a/assets/languages/definitions/Danish.yml b/languages/definitions/Danish.yml similarity index 100% rename from assets/languages/definitions/Danish.yml rename to languages/definitions/Danish.yml diff --git a/assets/languages/definitions/Dutch.yml b/languages/definitions/Dutch.yml similarity index 100% rename from assets/languages/definitions/Dutch.yml rename to languages/definitions/Dutch.yml diff --git a/assets/languages/definitions/English.yml b/languages/definitions/English.yml similarity index 100% rename from assets/languages/definitions/English.yml rename to languages/definitions/English.yml diff --git a/assets/languages/definitions/Finnish.yml b/languages/definitions/Finnish.yml similarity index 100% rename from assets/languages/definitions/Finnish.yml rename to languages/definitions/Finnish.yml diff --git a/assets/languages/definitions/French.yml b/languages/definitions/French.yml similarity index 100% rename from assets/languages/definitions/French.yml rename to languages/definitions/French.yml diff --git 
a/assets/languages/definitions/German.yml b/languages/definitions/German.yml similarity index 100% rename from assets/languages/definitions/German.yml rename to languages/definitions/German.yml diff --git a/assets/languages/definitions/Greek.yml b/languages/definitions/Greek.yml similarity index 100% rename from assets/languages/definitions/Greek.yml rename to languages/definitions/Greek.yml diff --git a/assets/languages/definitions/Hebrew.yml b/languages/definitions/Hebrew.yml similarity index 100% rename from assets/languages/definitions/Hebrew.yml rename to languages/definitions/Hebrew.yml diff --git a/assets/languages/definitions/Indonesian.yml b/languages/definitions/Indonesian.yml similarity index 100% rename from assets/languages/definitions/Indonesian.yml rename to languages/definitions/Indonesian.yml diff --git a/assets/languages/definitions/Italian.yml b/languages/definitions/Italian.yml similarity index 100% rename from assets/languages/definitions/Italian.yml rename to languages/definitions/Italian.yml diff --git a/assets/languages/definitions/Norwegian.yml b/languages/definitions/Norwegian.yml similarity index 100% rename from assets/languages/definitions/Norwegian.yml rename to languages/definitions/Norwegian.yml diff --git a/assets/languages/definitions/Polish.yml b/languages/definitions/Polish.yml similarity index 100% rename from assets/languages/definitions/Polish.yml rename to languages/definitions/Polish.yml diff --git a/assets/languages/definitions/Romanian.yml b/languages/definitions/Romanian.yml similarity index 100% rename from assets/languages/definitions/Romanian.yml rename to languages/definitions/Romanian.yml diff --git a/assets/languages/definitions/Russian.yml b/languages/definitions/Russian.yml similarity index 100% rename from assets/languages/definitions/Russian.yml rename to languages/definitions/Russian.yml diff --git a/assets/languages/definitions/Spanish.yml b/languages/definitions/Spanish.yml similarity index 100% rename from 
assets/languages/definitions/Spanish.yml rename to languages/definitions/Spanish.yml diff --git a/assets/languages/definitions/Swedish.yml b/languages/definitions/Swedish.yml similarity index 100% rename from assets/languages/definitions/Swedish.yml rename to languages/definitions/Swedish.yml diff --git a/assets/languages/definitions/Ukrainian.yml b/languages/definitions/Ukrainian.yml similarity index 100% rename from assets/languages/definitions/Ukrainian.yml rename to languages/definitions/Ukrainian.yml diff --git a/assets/languages/definitions/Yiddish.yml b/languages/definitions/Yiddish.yml similarity index 100% rename from assets/languages/definitions/Yiddish.yml rename to languages/definitions/Yiddish.yml diff --git a/assets/languages/dictionaries/bg-utf8.csv b/languages/dictionaries/bg-utf8.csv similarity index 100% rename from assets/languages/dictionaries/bg-utf8.csv rename to languages/dictionaries/bg-utf8.csv diff --git a/assets/languages/dictionaries/da-utf8.csv b/languages/dictionaries/da-utf8.csv similarity index 100% rename from assets/languages/dictionaries/da-utf8.csv rename to languages/dictionaries/da-utf8.csv diff --git a/assets/languages/dictionaries/de-utf8.csv b/languages/dictionaries/de-utf8.csv similarity index 100% rename from assets/languages/dictionaries/de-utf8.csv rename to languages/dictionaries/de-utf8.csv diff --git a/assets/languages/dictionaries/en-utf8.csv b/languages/dictionaries/en-utf8.csv similarity index 100% rename from assets/languages/dictionaries/en-utf8.csv rename to languages/dictionaries/en-utf8.csv diff --git a/assets/languages/dictionaries/es-utf8.csv b/languages/dictionaries/es-utf8.csv similarity index 100% rename from assets/languages/dictionaries/es-utf8.csv rename to languages/dictionaries/es-utf8.csv diff --git a/assets/languages/dictionaries/fi-utf8.csv b/languages/dictionaries/fi-utf8.csv similarity index 100% rename from assets/languages/dictionaries/fi-utf8.csv rename to languages/dictionaries/fi-utf8.csv 
diff --git a/assets/languages/dictionaries/fr-utf8.csv b/languages/dictionaries/fr-utf8.csv similarity index 100% rename from assets/languages/dictionaries/fr-utf8.csv rename to languages/dictionaries/fr-utf8.csv diff --git a/assets/languages/dictionaries/gr-utf8.csv b/languages/dictionaries/gr-utf8.csv similarity index 100% rename from assets/languages/dictionaries/gr-utf8.csv rename to languages/dictionaries/gr-utf8.csv diff --git a/assets/languages/dictionaries/he-utf8.csv b/languages/dictionaries/he-utf8.csv similarity index 100% rename from assets/languages/dictionaries/he-utf8.csv rename to languages/dictionaries/he-utf8.csv diff --git a/assets/languages/dictionaries/id-utf8.csv b/languages/dictionaries/id-utf8.csv similarity index 100% rename from assets/languages/dictionaries/id-utf8.csv rename to languages/dictionaries/id-utf8.csv diff --git a/assets/languages/dictionaries/it-utf8.csv b/languages/dictionaries/it-utf8.csv similarity index 100% rename from assets/languages/dictionaries/it-utf8.csv rename to languages/dictionaries/it-utf8.csv diff --git a/assets/languages/dictionaries/ji-utf8.csv b/languages/dictionaries/ji-utf8.csv similarity index 100% rename from assets/languages/dictionaries/ji-utf8.csv rename to languages/dictionaries/ji-utf8.csv diff --git a/assets/languages/dictionaries/nb-utf8.csv b/languages/dictionaries/nb-utf8.csv similarity index 100% rename from assets/languages/dictionaries/nb-utf8.csv rename to languages/dictionaries/nb-utf8.csv diff --git a/assets/languages/dictionaries/nl-utf8.csv b/languages/dictionaries/nl-utf8.csv similarity index 100% rename from assets/languages/dictionaries/nl-utf8.csv rename to languages/dictionaries/nl-utf8.csv diff --git a/assets/languages/dictionaries/pl-utf8.csv b/languages/dictionaries/pl-utf8.csv similarity index 100% rename from assets/languages/dictionaries/pl-utf8.csv rename to languages/dictionaries/pl-utf8.csv diff --git a/assets/languages/dictionaries/pt-BR-utf8.csv 
b/languages/dictionaries/pt-BR-utf8.csv similarity index 100% rename from assets/languages/dictionaries/pt-BR-utf8.csv rename to languages/dictionaries/pt-BR-utf8.csv diff --git a/assets/languages/dictionaries/ro-utf8.csv b/languages/dictionaries/ro-utf8.csv similarity index 100% rename from assets/languages/dictionaries/ro-utf8.csv rename to languages/dictionaries/ro-utf8.csv diff --git a/assets/languages/dictionaries/ru-utf8.csv b/languages/dictionaries/ru-utf8.csv similarity index 100% rename from assets/languages/dictionaries/ru-utf8.csv rename to languages/dictionaries/ru-utf8.csv diff --git a/assets/languages/dictionaries/sv-utf8.csv b/languages/dictionaries/sv-utf8.csv similarity index 100% rename from assets/languages/dictionaries/sv-utf8.csv rename to languages/dictionaries/sv-utf8.csv diff --git a/assets/languages/dictionaries/uk-utf8.csv b/languages/dictionaries/uk-utf8.csv similarity index 100% rename from assets/languages/dictionaries/uk-utf8.csv rename to languages/dictionaries/uk-utf8.csv diff --git a/src/io/github/sspanak/tt9/db/DictionaryLoader.java b/src/io/github/sspanak/tt9/db/DictionaryLoader.java index eaa9cc1b..5ea35650 100644 --- a/src/io/github/sspanak/tt9/db/DictionaryLoader.java +++ b/src/io/github/sspanak/tt9/db/DictionaryLoader.java @@ -7,7 +7,6 @@ import android.os.Handler; import java.io.BufferedReader; import java.io.InputStreamReader; -import java.io.LineNumberReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Locale; @@ -25,6 +24,7 @@ import io.github.sspanak.tt9.preferences.SettingsStore; public class DictionaryLoader { private static DictionaryLoader self; + private final String logTag = "DictionaryLoader"; private final AssetManager assets; private final SettingsStore settings; @@ -70,7 +70,7 @@ public class DictionaryLoader { } if (languages.size() == 0) { - Logger.d("DictionaryLoader", "Nothing to do"); + Logger.d(logTag, "Nothing to do"); return; } @@ -108,8 +108,6 @@ public class 
DictionaryLoader { private void importAll(Language language) { - final String logTag = "tt9.DictionaryLoader.importAll"; - if (language == null) { Logger.e(logTag, "Failed loading a dictionary for NULL language."); sendError(InvalidLanguageException.class.getSimpleName(), -1); @@ -119,7 +117,7 @@ public class DictionaryLoader { DictionaryDb.runInTransaction(() -> { try { long start = System.currentTimeMillis(); - importWords(language); + importWords(language, language.getDictionaryFile()); Logger.i( logTag, "Dictionary: '" + language.getDictionaryFile() + "'" + @@ -190,22 +188,16 @@ public class DictionaryLoader { } - private void importWords(Language language) throws Exception { - importWords(language, language.getDictionaryFile()); - } - - private void importWords(Language language, String dictionaryFile) throws Exception { - long totalWords = countWords(dictionaryFile); - - BufferedReader br = new BufferedReader(new InputStreamReader(assets.open(dictionaryFile), StandardCharsets.UTF_8)); - - ArrayList dbWords = new ArrayList<>(); - long lineCount = 0; - sendProgressMessage(language, 0, 0); - for (String line; (line = br.readLine()) != null; lineCount++) { + long currentLine = 0; + long totalLines = getFileSize(dictionaryFile); + + BufferedReader br = new BufferedReader(new InputStreamReader(assets.open(dictionaryFile), StandardCharsets.UTF_8)); + ArrayList dbWords = new ArrayList<>(); + + for (String line; (line = br.readLine()) != null; currentLine++) { if (loadThread.isInterrupted()) { br.close(); sendProgressMessage(language, -1, 0); @@ -219,16 +211,17 @@ public class DictionaryLoader { try { dbWords.add(stringToWord(language, word, frequency)); } catch (InvalidLanguageCharactersException e) { - throw new DictionaryImportException(word, lineCount); + br.close(); + throw new DictionaryImportException(word, currentLine); } - if (lineCount % settings.getDictionaryImportWordChunkSize() == 0 || lineCount == totalWords - 1) { + if (dbWords.size() >= 
settings.getDictionaryImportWordChunkSize() || currentLine >= totalLines - 1) { /* NOTE(review): when getFileSize() falls back to 0 this check is true on every line, so words are upserted one at a time; and if totalLines overstates the real line count the final partial chunk is not flushed by this check - confirm a flush follows the loop */ DictionaryDb.upsertWordsSync(dbWords); dbWords.clear(); } - if (totalWords > 0) { - int progress = (int) Math.floor(100.0 * lineCount / totalWords); + if (totalLines > 0) { + int progress = (int) Math.floor(100.0 * currentLine / totalLines); sendProgressMessage(language, progress, settings.getDictionaryImportProgressUpdateInterval()); } } @@ -255,16 +248,13 @@ public class DictionaryLoader { } - private long countWords(String filename) { - try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(assets.open(filename), StandardCharsets.UTF_8))) { - //noinspection ResultOfMethodCallIgnored - reader.skip(Long.MAX_VALUE); - long lines = reader.getLineNumber(); - reader.close(); + private long getFileSize(String filename) { /* Reads the pre-computed size of the asset filename from the first line of its companion asset, named filename plus the .size suffix. Returns 0 and logs a warning when that asset cannot be opened or parsed. NOTE(review): importWords() uses the result as a line count - confirm the .size asset stores lines, not bytes. */ + String sizeFilename = filename + ".size"; - return lines; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(assets.open(sizeFilename), StandardCharsets.UTF_8))) { + return Integer.parseInt(reader.readLine()); /* a missing or non-numeric first line throws here and is handled by the catch below; the value is limited to the int range although the method returns long */ } catch (Exception e) { - Logger.w("DictionaryLoader.countWords", "Could not count the lines of file: " + filename + ". " + e.getMessage()); + Logger.w(logTag, "Could not read the size of: " + filename + " from: " + sizeFilename + ". " + e.getMessage()); return 0; } } @@ -293,9 +283,7 @@ public class DictionaryLoader { private void sendFileCount(int fileCount) { if (onStatusChange == null) { - Logger.w( - "DictionaryLoader.sendFileCount", - "Cannot send file count without a status Handler. Ignoring message."); + Logger.w(logTag, "Cannot send file count without a status Handler. Ignoring message."); return; } @@ -307,9 +295,7 @@ public class DictionaryLoader { private void sendProgressMessage(Language language, int progress, int progressUpdateInterval) { if (onStatusChange == null) { - Logger.w( - "DictionaryLoader.sendProgressMessage", - "Cannot send progress without a status Handler. 
Ignoring message."); + Logger.w(logTag, "Cannot send progress without a status Handler. Ignoring message."); return; } @@ -331,7 +317,7 @@ public class DictionaryLoader { private void sendError(String message, int langId) { if (onStatusChange == null) { - Logger.w("DictionaryLoader.sendError", "Cannot send an error without a status Handler. Ignoring message."); + Logger.w(logTag, "Cannot send an error without a status Handler. Ignoring message."); return; } @@ -344,7 +330,7 @@ public class DictionaryLoader { private void sendImportError(String message, int langId, long fileLine, String word) { if (onStatusChange == null) { - Logger.w("DictionaryLoader.sendError", "Cannot send an import error without a status Handler. Ignoring message."); + Logger.w(logTag, "Cannot send an import error without a status Handler. Ignoring message."); return; } diff --git a/src/io/github/sspanak/tt9/languages/LanguageDefinition.java b/src/io/github/sspanak/tt9/languages/LanguageDefinition.java index 5b896b79..6f2d2226 100644 --- a/src/io/github/sspanak/tt9/languages/LanguageDefinition.java +++ b/src/io/github/sspanak/tt9/languages/LanguageDefinition.java @@ -38,7 +38,7 @@ public class LanguageDefinition { } Logger.d("LanguageDefinition", "Found: " + files.size() + " languages."); - } catch (IOException e) { + } catch (IOException | NullPointerException e) { Logger.e("tt9.LanguageDefinition", "Failed reading language definitions from: '" + definitionsDir + "'. " + e.getMessage()); }