1
0
Fork 0

Improved dictionary validation (#195)

* more dictionary validations during build time (empty line and punctuation presence)

* slightly improved dictionary loading speed

* removed a blank line from Ukrainian
This commit is contained in:
Dimo Karaivanov 2023-02-27 21:26:10 +02:00 committed by GitHub
parent 9b0a3c64ef
commit 935ca590c9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 24 deletions

View file

@ -4332,7 +4332,6 @@
маті 62 маті 62
махи 38 махи 38
маху 49 маху 49
маца 61 маца 61
маци 44 маци 44
мацу 51 мацу 51

Can't render this file because it is too large.

View file

@ -80,16 +80,41 @@ def getReleaseVersion = { ->
return "${getVersionName()} (${getCurrentGitHash()})" return "${getVersionName()} (${getCurrentGitHash()})"
} }
/**
 * Tells whether the language that uses the given dictionary permits punctuation inside words.
 *
 * Every file in the language definitions directory is inspected. A file is treated as the
 * definition of the dictionary when any of its lines contains the dictionary file name.
 * Punctuation is allowed when that same file also has a line that sets
 * "isPunctuationPartOfWords = true".
 *
 * @param dictionaryFile the dictionary file name to look for inside the definition files
 * @return true when a matching definition enables punctuation in words, false otherwise
 */
def isPunctuationInWordsAllowed (String dictionaryFile) {
	// Compiled once instead of on every String.matches() call.
	final punctuationEnabled = ~".+?isPunctuationPartOfWords\\s*=\\s*true.+?"
	final definitionsDir = file("${project.projectDir}/src/io/github/sspanak/tt9/languages/definitions")

	// listFiles() returns null for a missing directory and may include sub-directories,
	// which cannot be read as text; keep regular files only.
	def definitions = definitionsDir.listFiles()?.findAll { it.isFile() } ?: []

	// any() stops at the first definition that satisfies both conditions.
	return definitions.any { definition ->
		// Read the file once and run both checks on the in-memory lines.
		def lines = definition.readLines()
		lines.any { it.contains(dictionaryFile) } && lines.any { it ==~ punctuationEnabled }
	}
}
task validateDictionaries { task validateDictionaries {
inputs.dir fileTree(dir:'assets', excludes:['dict.properties']) inputs.dir fileTree(dir:'assets', excludes:['dict.properties'])
outputs.file "${project.buildDir}/dict.validation.txt" outputs.file "${project.buildDir}/dict.validation.txt"
doLast { doLast {
final String csvDelimiter = ' ' // TAB final String CSV_DELIMITER = ' ' // TAB
final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+"
final PUNCTUATION_CHARS = ~".*?\\p{Punct}(?<!-).*?"
final MAX_ERRORS = 50
String errors = "" String errors = ""
int errorCount = 0 int errorCount = 0
final MAX_ERRORS = 50
outputs.files.singleFile.text = "" outputs.files.singleFile.text = ""
@ -100,7 +125,7 @@ task validateDictionaries {
println "Validating dictionary: " + file.name println "Validating dictionary: " + file.name
def geographicalName = ~"[A-Z]\\w+-[^\\n]+" def isPunctuationAllowed = isPunctuationInWordsAllowed(file.name)
def uniqueWords = [:] def uniqueWords = [:]
int lineNumber = 0 int lineNumber = 0
@ -113,7 +138,14 @@ task validateDictionaries {
lineNumber++ lineNumber++
String[] parts = line.split(csvDelimiter, 2) if (line == "") {
isFileValid = false
errorCount++
errors += "Dictionary '" + file.name + "' is invalid. There is no word on line " + lineNumber + ". Remove all empty lines.\n"
return
}
String[] parts = line.split(CSV_DELIMITER, 2)
String word = parts[0] String word = parts[0]
String frequency = parts.length > 1 ? parts[1] : "" String frequency = parts.length > 1 ? parts[1] : ""
@ -126,7 +158,7 @@ task validateDictionaries {
if (word.matches("(\\d.+?|.+?\\d|\\d)")) { if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
isFileValid = false isFileValid = false
errorCount++ errorCount++
errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Please, remove all numbers.\n" errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Remove all numbers.\n"
} }
if (word.matches("^\\P{L}+\$")) { if (word.matches("^\\P{L}+\$")) {
@ -141,7 +173,13 @@ task validateDictionaries {
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n" errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
} }
String uniqueWordKey = word ==~ geographicalName ? word : word.toLowerCase() if (!isPunctuationAllowed && word.matches(PUNCTUATION_CHARS)) {
isFileValid = false
errorCount++
errors += "Dictionary '" + file.name + "' is invalid. Found a punctuation mark in word: '" + word + "' on line " + lineNumber + ". Remove all punctuation characters when the language definition disallows them or update the definition.\n"
}
String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase()
if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) { if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
isFileValid = false isFileValid = false
errorCount++ errorCount++

View file

@ -25,7 +25,6 @@ public class DictionaryLoader {
private final AssetManager assets; private final AssetManager assets;
private final SettingsStore settings; private final SettingsStore settings;
private final Pattern containsPunctuation = Pattern.compile("\\p{Punct}(?<!-)");
private Handler statusHandler = null; private Handler statusHandler = null;
private Thread loadThread; private Thread loadThread;
@ -208,8 +207,8 @@ public class DictionaryLoader {
} }
String[] parts = splitLine(line); String[] parts = splitLine(line);
String word = validateWord(language, parts, lineCount); String word = parts[0];
int frequency = validateFrequency(parts); int frequency = getFrequency(parts);
try { try {
dbWords.add(stringToWord(language, word, frequency)); dbWords.add(stringToWord(language, word, frequency));
@ -265,18 +264,7 @@ public class DictionaryLoader {
} }
private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException { private int getFrequency(String[] lineParts) {
String word = lineParts[0];
if (!language.isPunctuationPartOfWords() && containsPunctuation.matcher(word).find()) {
throw new DictionaryImportException(language.getDictionaryFile(), word, line);
}
return word;
}
private int validateFrequency(String[] lineParts) {
try { try {
return Integer.parseInt(lineParts[1]); return Integer.parseInt(lineParts[1]);
} catch (Exception e) { } catch (Exception e) {