Improved dictionary validation (#195)
* Added more build-time dictionary validations (empty lines and punctuation inside words) * Slightly improved the dictionary loading speed * Removed a blank line from the Ukrainian dictionary
This commit is contained in:
parent
9b0a3c64ef
commit
935ca590c9
3 changed files with 49 additions and 24 deletions
|
|
@ -4332,7 +4332,6 @@
|
||||||
маті 62
|
маті 62
|
||||||
махи 38
|
махи 38
|
||||||
маху 49
|
маху 49
|
||||||
|
|
||||||
маца 61
|
маца 61
|
||||||
маци 44
|
маци 44
|
||||||
мацу 51
|
мацу 51
|
||||||
|
|
|
||||||
|
Can't render this file because it is too large.
|
50
build.gradle
50
build.gradle
|
|
@ -80,16 +80,41 @@ def getReleaseVersion = { ->
|
||||||
return "${getVersionName()} (${getCurrentGitHash()})"
|
return "${getVersionName()} (${getCurrentGitHash()})"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether punctuation is allowed inside the words of the given dictionary.
//
// Scans every file in the language definitions directory. A file counts as "the
// definition" of the dictionary when any of its lines contains `dictionaryFile`
// as a substring. For each such file, the result becomes true if any line sets
// isPunctuationPartOfWords = true.
//
// @param dictionaryFile the dictionary file name to look for inside the definitions
// @return true when a matching definition enables punctuation in words; false
//         otherwise, including when no definition mentions `dictionaryFile`
def isPunctuationInWordsAllowed (String dictionaryFile) {
|
||||||
|
boolean isAllowed = false
|
||||||
|
|
||||||
|
// NOTE(review): the closure parameter is also named `file`, like the file() call that starts this statement.
file("${project.projectDir}/src/io/github/sspanak/tt9/languages/definitions").listFiles().each { file ->
|
||||||
|
boolean isTheDefinitionFile = false
|
||||||
|
// First pass: does this definition mention the dictionary file name anywhere?
// This is a plain substring test, so a name that is a substring of another
// dictionary's name would also match that other definition.
file.eachLine {line ->
|
||||||
|
if (line.contains(dictionaryFile)) {
|
||||||
|
isTheDefinitionFile = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isTheDefinitionFile) {
|
||||||
|
// Second pass over the same file: look for the punctuation flag.
file.eachLine {line ->
|
||||||
|
// matches() must consume the whole line, and each ".+?" needs at least one
// character, so the assignment must have text both before and after it on
// the same line.
if (line.matches(".+?isPunctuationPartOfWords\\s*=\\s*true.+?")) {
|
||||||
|
isAllowed = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return isAllowed
|
||||||
|
}
|
||||||
|
|
||||||
task validateDictionaries {
|
task validateDictionaries {
|
||||||
inputs.dir fileTree(dir:'assets', excludes:['dict.properties'])
|
inputs.dir fileTree(dir:'assets', excludes:['dict.properties'])
|
||||||
outputs.file "${project.buildDir}/dict.validation.txt"
|
outputs.file "${project.buildDir}/dict.validation.txt"
|
||||||
|
|
||||||
doLast {
|
doLast {
|
||||||
final String csvDelimiter = ' ' // TAB
|
final String CSV_DELIMITER = ' ' // TAB
|
||||||
|
final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+"
|
||||||
|
final PUNCTUATION_CHARS = ~".*?\\p{Punct}(?<!-).*?"
|
||||||
|
|
||||||
|
final MAX_ERRORS = 50
|
||||||
String errors = ""
|
String errors = ""
|
||||||
int errorCount = 0
|
int errorCount = 0
|
||||||
final MAX_ERRORS = 50
|
|
||||||
|
|
||||||
outputs.files.singleFile.text = ""
|
outputs.files.singleFile.text = ""
|
||||||
|
|
||||||
|
|
@ -100,7 +125,7 @@ task validateDictionaries {
|
||||||
|
|
||||||
println "Validating dictionary: " + file.name
|
println "Validating dictionary: " + file.name
|
||||||
|
|
||||||
def geographicalName = ~"[A-Z]\\w+-[^\\n]+"
|
def isPunctuationAllowed = isPunctuationInWordsAllowed(file.name)
|
||||||
def uniqueWords = [:]
|
def uniqueWords = [:]
|
||||||
|
|
||||||
int lineNumber = 0
|
int lineNumber = 0
|
||||||
|
|
@ -113,7 +138,14 @@ task validateDictionaries {
|
||||||
|
|
||||||
lineNumber++
|
lineNumber++
|
||||||
|
|
||||||
String[] parts = line.split(csvDelimiter, 2)
|
if (line == "") {
|
||||||
|
isFileValid = false
|
||||||
|
errorCount++
|
||||||
|
errors += "Dictionary '" + file.name + "' is invalid. There is no word on line " + lineNumber + ". Remove all empty lines.\n"
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] parts = line.split(CSV_DELIMITER, 2)
|
||||||
String word = parts[0]
|
String word = parts[0]
|
||||||
String frequency = parts.length > 1 ? parts[1] : ""
|
String frequency = parts.length > 1 ? parts[1] : ""
|
||||||
|
|
||||||
|
|
@ -126,7 +158,7 @@ task validateDictionaries {
|
||||||
if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
|
if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
|
||||||
isFileValid = false
|
isFileValid = false
|
||||||
errorCount++
|
errorCount++
|
||||||
errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Please, remove all numbers.\n"
|
errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Remove all numbers.\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
if (word.matches("^\\P{L}+\$")) {
|
if (word.matches("^\\P{L}+\$")) {
|
||||||
|
|
@ -141,7 +173,13 @@ task validateDictionaries {
|
||||||
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
|
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
String uniqueWordKey = word ==~ geographicalName ? word : word.toLowerCase()
|
if (!isPunctuationAllowed && word.matches(PUNCTUATION_CHARS)) {
|
||||||
|
isFileValid = false
|
||||||
|
errorCount++
|
||||||
|
errors += "Dictionary '" + file.name + "' is invalid. Found a punctuation mark in word: '" + word + "' on line " + lineNumber + ". Remove all punctuation characters when the language definition disallows them or update the definition.\n"
|
||||||
|
}
|
||||||
|
|
||||||
|
String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase()
|
||||||
if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
|
if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
|
||||||
isFileValid = false
|
isFileValid = false
|
||||||
errorCount++
|
errorCount++
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,6 @@ public class DictionaryLoader {
|
||||||
private final AssetManager assets;
|
private final AssetManager assets;
|
||||||
private final SettingsStore settings;
|
private final SettingsStore settings;
|
||||||
|
|
||||||
private final Pattern containsPunctuation = Pattern.compile("\\p{Punct}(?<!-)");
|
|
||||||
private Handler statusHandler = null;
|
private Handler statusHandler = null;
|
||||||
private Thread loadThread;
|
private Thread loadThread;
|
||||||
|
|
||||||
|
|
@ -208,8 +207,8 @@ public class DictionaryLoader {
|
||||||
}
|
}
|
||||||
|
|
||||||
String[] parts = splitLine(line);
|
String[] parts = splitLine(line);
|
||||||
String word = validateWord(language, parts, lineCount);
|
String word = parts[0];
|
||||||
int frequency = validateFrequency(parts);
|
int frequency = getFrequency(parts);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
dbWords.add(stringToWord(language, word, frequency));
|
dbWords.add(stringToWord(language, word, frequency));
|
||||||
|
|
@ -265,18 +264,7 @@ public class DictionaryLoader {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException {
|
private int getFrequency(String[] lineParts) {
|
||||||
String word = lineParts[0];
|
|
||||||
|
|
||||||
if (!language.isPunctuationPartOfWords() && containsPunctuation.matcher(word).find()) {
|
|
||||||
throw new DictionaryImportException(language.getDictionaryFile(), word, line);
|
|
||||||
}
|
|
||||||
|
|
||||||
return word;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private int validateFrequency(String[] lineParts) {
|
|
||||||
try {
|
try {
|
||||||
return Integer.parseInt(lineParts[1]);
|
return Integer.parseInt(lineParts[1]);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue