Improved dictionary validation (#195)
* more dictionary validations during build time (empty line and punctuation presence) * slightly improved dictionary loading speed * removed a blank line from Ukrainian
This commit is contained in:
parent
9b0a3c64ef
commit
935ca590c9
3 changed files with 49 additions and 24 deletions
|
|
@ -4332,7 +4332,6 @@
|
|||
маті 62
|
||||
махи 38
|
||||
маху 49
|
||||
|
||||
маца 61
|
||||
маци 44
|
||||
мацу 51
|
||||
|
|
|
|||
|
Can't render this file because it is too large.
|
54
build.gradle
54
build.gradle
|
|
@ -80,16 +80,41 @@ def getReleaseVersion = { ->
|
|||
return "${getVersionName()} (${getCurrentGitHash()})"
|
||||
}
|
||||
|
||||
/**
 * Tells whether the language that uses the given dictionary permits punctuation inside words.
 *
 * Scans the language definition sources for a file that mentions {@code dictionaryFile}
 * and also contains the text {@code isPunctuationPartOfWords = true}.
 *
 * @param dictionaryFile the dictionary file name to look for inside the definition sources
 * @return true when at least one definition file references the dictionary and sets
 *         {@code isPunctuationPartOfWords} to true; false otherwise
 */
def isPunctuationInWordsAllowed (String dictionaryFile) {
	final definitionsDir = file("${project.projectDir}/src/io/github/sspanak/tt9/languages/definitions")

	// find() semantics: the flag is detected wherever it appears on the line, including at the
	// very start or the very end, which the former ".+?…​.+?" + matches() form could not do.
	final punctuationFlag = ~/isPunctuationPartOfWords\s*=\s*true/

	// listFiles() yields null for a missing/unreadable directory; also skip sub-directories,
	// because they cannot be read line by line.
	final definitionFiles = (definitionsDir.listFiles() ?: []).findAll { it.isFile() }

	// any() stops at the first matching definition instead of scanning the remaining files.
	return definitionFiles.any { definition ->
		// Read each file only once and run both checks against the same lines.
		final lines = definition.readLines()

		lines.any { it.contains(dictionaryFile) } &&
			lines.any { punctuationFlag.matcher(it).find() }
	}
}
|
||||
|
||||
task validateDictionaries {
|
||||
inputs.dir fileTree(dir:'assets', excludes:['dict.properties'])
|
||||
outputs.file "${project.buildDir}/dict.validation.txt"
|
||||
|
||||
doLast {
|
||||
final String csvDelimiter = ' ' // TAB
|
||||
final String CSV_DELIMITER = ' ' // TAB
|
||||
final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+"
|
||||
final PUNCTUATION_CHARS = ~".*?\\p{Punct}(?<!-).*?"
|
||||
|
||||
final MAX_ERRORS = 50
|
||||
String errors = ""
|
||||
int errorCount = 0
|
||||
final MAX_ERRORS = 50
|
||||
|
||||
outputs.files.singleFile.text = ""
|
||||
|
||||
|
|
@ -100,7 +125,7 @@ task validateDictionaries {
|
|||
|
||||
println "Validating dictionary: " + file.name
|
||||
|
||||
def geographicalName = ~"[A-Z]\\w+-[^\\n]+"
|
||||
def isPunctuationAllowed = isPunctuationInWordsAllowed(file.name)
|
||||
def uniqueWords = [:]
|
||||
|
||||
int lineNumber = 0
|
||||
|
|
@ -113,20 +138,27 @@ task validateDictionaries {
|
|||
|
||||
lineNumber++
|
||||
|
||||
String[] parts = line.split(csvDelimiter, 2)
|
||||
if (line == "") {
|
||||
isFileValid = false
|
||||
errorCount++
|
||||
errors += "Dictionary '" + file.name + "' is invalid. There is no word on line " + lineNumber + ". Remove all empty lines.\n"
|
||||
return
|
||||
}
|
||||
|
||||
String[] parts = line.split(CSV_DELIMITER, 2)
|
||||
String word = parts[0]
|
||||
String frequency = parts.length > 1 ? parts[1] : ""
|
||||
|
||||
if (frequency.length() > 0 && !frequency.matches("^\\d+\$")) {
|
||||
isFileValid = false
|
||||
errorCount++
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found out-of-range word frequency: '" + frequency + "' on line " + lineNumber + ". Frequency must be a non-negative integer. \n"
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found out-of-range word frequency: '" + frequency + "' on line " + lineNumber + ". Frequency must be a non-negative integer.\n"
|
||||
}
|
||||
|
||||
if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
|
||||
isFileValid = false
|
||||
errorCount++
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Please, remove all numbers.\n"
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Remove all numbers.\n"
|
||||
}
|
||||
|
||||
if (word.matches("^\\P{L}+\$")) {
|
||||
|
|
@ -141,7 +173,13 @@ task validateDictionaries {
|
|||
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
|
||||
}
|
||||
|
||||
String uniqueWordKey = word ==~ geographicalName ? word : word.toLowerCase()
|
||||
if (!isPunctuationAllowed && word.matches(PUNCTUATION_CHARS)) {
|
||||
isFileValid = false
|
||||
errorCount++
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found a punctuation mark in word: '" + word + "' on line " + lineNumber + ". Remove all punctuation characters when the language definition disallows them or update the definition.\n"
|
||||
}
|
||||
|
||||
String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase()
|
||||
if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
|
||||
isFileValid = false
|
||||
errorCount++
|
||||
|
|
@ -150,7 +188,7 @@ task validateDictionaries {
|
|||
uniqueWords[uniqueWordKey] = true
|
||||
}
|
||||
|
||||
if (errorCount >= MAX_ERRORS ) {
|
||||
if (errorCount >= MAX_ERRORS) {
|
||||
errors += "Too many errors! Aborting.\n"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@ public class DictionaryLoader {
|
|||
private final AssetManager assets;
|
||||
private final SettingsStore settings;
|
||||
|
||||
private final Pattern containsPunctuation = Pattern.compile("\\p{Punct}(?<!-)");
|
||||
private Handler statusHandler = null;
|
||||
private Thread loadThread;
|
||||
|
||||
|
|
@ -208,8 +207,8 @@ public class DictionaryLoader {
|
|||
}
|
||||
|
||||
String[] parts = splitLine(line);
|
||||
String word = validateWord(language, parts, lineCount);
|
||||
int frequency = validateFrequency(parts);
|
||||
String word = parts[0];
|
||||
int frequency = getFrequency(parts);
|
||||
|
||||
try {
|
||||
dbWords.add(stringToWord(language, word, frequency));
|
||||
|
|
@ -265,18 +264,7 @@ public class DictionaryLoader {
|
|||
}
|
||||
|
||||
|
||||
private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException {
|
||||
String word = lineParts[0];
|
||||
|
||||
if (!language.isPunctuationPartOfWords() && containsPunctuation.matcher(word).find()) {
|
||||
throw new DictionaryImportException(language.getDictionaryFile(), word, line);
|
||||
}
|
||||
|
||||
return word;
|
||||
}
|
||||
|
||||
|
||||
private int validateFrequency(String[] lineParts) {
|
||||
private int getFrequency(String[] lineParts) {
|
||||
try {
|
||||
return Integer.parseInt(lineParts[1]);
|
||||
} catch (Exception e) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue