Improved dictionary validation (#195)

* more dictionary validations during build time (empty line and punctuation presence)
* slightly improved dictionary loading speed
* removed a blank line from the Ukrainian dictionary
2023-02-27 21:26:10 +02:00 · 2023-02-27 21:26:10 +02:00 · 935ca590c9
commit 935ca590c9
parent 9b0a3c64ef
3 changed files with 49 additions and 24 deletions
--- a/assets/uk-utf8.csv
+++ b/assets/uk-utf8.csv
@ -4332,7 +4332,6 @@
 маті	62
 махи	38
 маху	49
 маца	61
 маци	44
 мацу	51
--- a/build.gradle
+++ b/build.gradle
@ -80,16 +80,41 @@ def getReleaseVersion = { ->
 	return "${getVersionName()} (${getCurrentGitHash()})"
 }
 /**
  * Reports whether the language definition that references the given dictionary
  * permits punctuation inside words.
  *
  * Every file in the language definitions directory is scanned. A file is treated
  * as the matching definition when any of its lines contains dictionaryFile.
  *
  * @param dictionaryFile dictionary file name to search for in the definition sources
  * @return true when a matching definition file also has a line that sets
  *         isPunctuationPartOfWords to true; false otherwise (including when no
  *         definition file mentions the dictionary)
  */
 def isPunctuationInWordsAllowed (String dictionaryFile) {
 	boolean isAllowed = false
 	file("${project.projectDir}/src/io/github/sspanak/tt9/languages/definitions").listFiles().each { file ->
 		// First pass: decide whether this definition file refers to the dictionary.
 		// Closures passed to each/eachLine cannot break out early, hence the flag.
 		boolean isTheDefinitionFile = false
 		file.eachLine {line ->
 			if (line.contains(dictionaryFile)) {
 				isTheDefinitionFile = true
 			}
 		}
 		if (isTheDefinitionFile) {
 			// Second pass: look for the assignment that enables punctuation in words.
 			// matches() tests the whole line, and the pattern requires at least one
 			// character both before the property name and after "true".
 			file.eachLine {line ->
 				if (line.matches(".+?isPunctuationPartOfWords\\s*=\\s*true.+?")) {
 					isAllowed = true
 				}
 			}
 		}
 	}
 	return isAllowed
 }
 task validateDictionaries {
 	inputs.dir fileTree(dir:'assets', excludes:['dict.properties'])
 	outputs.file "${project.buildDir}/dict.validation.txt"
 	doLast {
-		final String csvDelimiter = '	' // TAB
+		final String CSV_DELIMITER = '	' // TAB
 		final GEOGRAPHICAL_NAME = ~"[A-Z]\\w+-[^\\n]+"
 		final PUNCTUATION_CHARS = ~".*?\\p{Punct}(?<!-).*?"
 		final MAX_ERRORS = 50
 		String errors = ""
 		int errorCount = 0
 		final MAX_ERRORS = 50
 		outputs.files.singleFile.text = ""
@ -100,7 +125,7 @@ task validateDictionaries {
 			println "Validating dictionary: " + file.name
-			def geographicalName = ~"[A-Z]\\w+-[^\\n]+"
+			def isPunctuationAllowed = isPunctuationInWordsAllowed(file.name)
 			def uniqueWords = [:]
 			int lineNumber = 0
@ -113,7 +138,14 @@ task validateDictionaries {
 				lineNumber++
-				String[] parts = line.split(csvDelimiter, 2)
+				if (line == "") {
 					isFileValid = false
 					errorCount++
 					errors += "Dictionary '" + file.name + "' is invalid. There is no word on line " + lineNumber + ". Remove all empty lines.\n"
 					return
 				}
 				String[] parts = line.split(CSV_DELIMITER, 2)
 				String word = parts[0]
 				String frequency = parts.length > 1 ? parts[1] : ""
@ -126,7 +158,7 @@ task validateDictionaries {
 				if (word.matches("(\\d.+?|.+?\\d|\\d)")) {
 					isFileValid = false
 					errorCount++
-					errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Please, remove all numbers.\n"
+					errors += "Dictionary '" + file.name + "' is invalid. Found numbers on line " + lineNumber + ". Remove all numbers.\n"
 				}
 				if (word.matches("^\\P{L}+\$")) {
@ -141,7 +173,13 @@ task validateDictionaries {
 					errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + word + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
 				}
-				String uniqueWordKey = word ==~ geographicalName ? word : word.toLowerCase()
+				if (!isPunctuationAllowed && word.matches(PUNCTUATION_CHARS)) {
 					isFileValid = false
 					errorCount++
 					errors += "Dictionary '" + file.name + "' is invalid. Found a punctuation mark in word: '" + word + "' on line " + lineNumber + ". Remove all punctuation characters when the language definition disallows them or update the definition.\n"
 				}
 				String uniqueWordKey = word ==~ GEOGRAPHICAL_NAME ? word : word.toLowerCase()
 				if (uniqueWords[uniqueWordKey] != null && uniqueWords[uniqueWordKey] == true) {
 					isFileValid = false
 					errorCount++
--- a/src/io/github/sspanak/tt9/db/DictionaryLoader.java
+++ b/src/io/github/sspanak/tt9/db/DictionaryLoader.java
@ -25,7 +25,6 @@ public class DictionaryLoader {
 	private final AssetManager assets;
 	private final SettingsStore settings;
 	private final Pattern containsPunctuation = Pattern.compile("\\p{Punct}(?<!-)");
 	private Handler statusHandler = null;
 	private Thread loadThread;
@ -208,8 +207,8 @@ public class DictionaryLoader {
 			}
 			String[] parts = splitLine(line);
-			String word = validateWord(language, parts, lineCount);
+			String word = parts[0];
-			int frequency = validateFrequency(parts);
+			int frequency = getFrequency(parts);
 			try {
 				dbWords.add(stringToWord(language, word, frequency));
@ -265,18 +264,7 @@ public class DictionaryLoader {
 	}
-	private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException {
+	private int getFrequency(String[] lineParts) {
 		String word = lineParts[0];
 		if (!language.isPunctuationPartOfWords() && containsPunctuation.matcher(word).find()) {
 			throw new DictionaryImportException(language.getDictionaryFile(), word, line);
 		}
 		return word;
 	}
 	private int validateFrequency(String[] lineParts) {
 		try {
 			return Integer.parseInt(lineParts[1]);
 		} catch (Exception e) {