Language improvements

* fixed some English words always appearing in lowercase when they should be capitalized (I, Friday, December, etc.) * fixed Bulgarian words that should have been capitalized, but were not * fixed mixed case suggestions priority not updating * optimized dictionary loading (it's more than 2x faster now) * fixed the last words of dictionaries not being loaded
2022-12-15 11:27:01 +02:00 · 2022-12-15 11:27:01 +02:00 · 759317dce1
commit 759317dce1
parent 8888485f70
11 changed files with 25342 additions and 23472 deletions
--- a/assets/bg-utf8.txt
+++ b/assets/bg-utf8.txt
--- a/assets/en-utf8.txt
+++ b/assets/en-utf8.txt
--- a/build.gradle
+++ b/build.gradle
@ -173,9 +173,9 @@ task validateDictionaries {
 					errors += "Dictionary '" + file.name + "' is invalid. Found a garbage word: '" + line + "' on line " + lineNumber + ".\n"
 				}
-				if (line.matches("^.\$")) {
+				if (line.matches("^.\$") && !Character.isUpperCase(line.charAt(0))) {
 					errorCount++
-					errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Remove all single letters. The alphabet will be added automatically.\n"
+					errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
 				}
 				String uniqueWordKey = line ==~ geographicalName ? line : line.toLowerCase()
--- a/docs/dictionaries/bgWordlistReadme.txt
+++ b/docs/dictionaries/bgWordlistReadme.txt
@ -4,3 +4,8 @@ Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-w
 License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE
 Additionally cleaned up repeating words and added some missing ones.
 Also used wooorm's Hunspell-compatible dictionary to determine which words need to start with a capital letter.
 Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg
 License: MIT (available in the link)
 Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
--- a/docs/dictionaries/enWordlistReadme.txt
+++ b/docs/dictionaries/enWordlistReadme.txt
@ -18,6 +18,14 @@ Using Git Commit From: Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
 =====
 Also used wooorm's Hunspell-compatible dictionary to determine
 which words need to start with a capital letter.
 Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/en
 License: MIT (available in the link)
 Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
 =====
 Spell Checking Oriented Word Lists (SCOWL)
 Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
--- a/scripts/capitalize-dictionary-words.js
+++ b/scripts/capitalize-dictionary-words.js
@ -0,0 +1,95 @@
 const { basename } = require('path');
 const { createReadStream, existsSync } = require('fs');
 const { createInterface } = require('readline');
 /**
  * Prints the command-line usage of this script to stdout.
  *
  * The script name shown in the usage line is taken from process.argv[1].
  */
 function printHelp() {
 	console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
 	console.log('Capitalizes a word list using capitalized words in another list.');
 	console.log('\nMIN-WORD-LENGTH must be a positive number.');
 	console.log('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
 }
 /**
  * Validates the command-line arguments and converts them into the options object
  * consumed by capitalize().
  *
  * Expected arguments (after the node binary and the script path):
  *   DICTIONARY-FILE-NAME LIST-OF-CAPITALIZED-WORDS MIN-WORD-LENGTH LOCALE
  *
  * Terminates the process with exit code 1 (after printing the help) when arguments are
  * missing, or with exit code 2 when a file does not exist or the minimum word length
  * is not a number or is negative.
  *
  * @returns {{capitalsFileName: string, dictionaryFileName: string, locale: string, minWordLength: number}}
  */
 function validateInput() {
 	if (process.argv.length < 6) {
 		printHelp();
 		process.exit(1);
 	}
 	const [, , dictionaryFileName, capitalsFileName, rawMinWordLength, locale] = process.argv;
 	if (!existsSync(capitalsFileName)) {
 		console.error(`Failure! Could not find list-of-capitals file "${capitalsFileName}."`);
 		process.exit(2);
 	}
 	if (!existsSync(dictionaryFileName)) {
 		console.error(`Failure! Could not find dictionary file "${dictionaryFileName}."`);
 		process.exit(2);
 	}
 	// Always parse as decimal; without an explicit radix a value such as "0x10" is read as hexadecimal.
 	const minWordLength = Number.parseInt(rawMinWordLength, 10);
 	if (Number.isNaN(minWordLength) || minWordLength < 0) {
 		console.error(`Failure! The minimum word length must be a positive number.`);
 		process.exit(2);
 	}
 	return { capitalsFileName, dictionaryFileName, locale, minWordLength };
 }
 /**
  * Capitalizes the words of a dictionary using a second list of correctly capitalized words.
  *
  * Every line of the dictionary file is loaded as a word. Then, for every line of the
  * capitals file that is at least `minWordLength` characters long, the lowercase form of
  * that line (and its "'s" possessive form) is looked up in the dictionary. A match is
  * removed and the capitalized spelling is appended in its place at the end of the list.
  *
  * @param {object} options
  * @param {string} options.dictionaryFileName - path to the dictionary, one word per line
  * @param {string} options.capitalsFileName - path to the list of capitalized words, one per line
  * @param {string} options.locale - locale passed to String.prototype.toLocaleLowerCase()
  * @param {number} options.minWordLength - shorter lines of the capitals file are ignored
  * @returns {Promise<string[]>} the unique resulting words
  */
 async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
 	// A Set is used instead of a plain object on purpose: it never reports inherited
 	// keys such as "constructor" as present, and it keeps insertion order even for
 	// words that look like integers (a plain object lists those first).
 	const words = new Set();
 	// read the dictionary
 	const dictionaryReader = createInterface({ input: createReadStream(dictionaryFileName) });
 	for await (const line of dictionaryReader) {
 		words.add(line);
 	}
 	// Swaps a word for its capitalized spelling, only if the word is in the dictionary.
 	const replaceIfPresent = (lowercaseWord, capitalizedWord) => {
 		if (words.delete(lowercaseWord)) {
 			words.add(capitalizedWord);
 		}
 	};
 	// convert the dictionary words using the second file
 	const capitalsReader = createInterface({ input: createReadStream(capitalsFileName) });
 	for await (const capitalizedWord of capitalsReader) {
 		if (capitalizedWord.length < minWordLength) {
 			continue;
 		}
 		const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
 		replaceIfPresent(lowercaseWord, capitalizedWord);
 		replaceIfPresent(`${lowercaseWord}'s`, `${capitalizedWord}'s`);
 	}
 	return [...words];
 }
 /**
  * Writes every word of the list to stdout, one word per line.
  * Anything that is not an array is silently ignored.
  *
  * @param {string[]} wordList - the words to print
  */
 function printWords(wordList) {
 	if (Array.isArray(wordList)) {
 		wordList.forEach((word) => {
 			console.log(word);
 		});
 	}
 }
 /** main **/
 // Validate the arguments, build the capitalized word list and print it to stdout.
 capitalize(validateInput())
 	.then(words => printWords(words))
 	.catch(e => {
 		console.error(e);
 		// Report the failure to the caller, consistent with the non-zero exit codes
 		// used for invalid input; otherwise a failed run would look successful.
 		process.exitCode = 1;
 	});
--- a/scripts/remove-dictionary-repeating-words.js
+++ b/scripts/remove-dictionary-repeating-words.js
@ -8,7 +8,7 @@ const GEO_NAME = /[A-Z]\w+\-[^\n]+/;
 function printHelp() {
 	console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `);
 	console.log('Removes repeating words from a word list');
-	console.log('\nLocale could any valid JS locale, for exmaple: en, en-US, etc...');
+	console.log('\nLocale could be any valid JS locale, for exmaple: en, en-US, etc...');
 }
--- a/src/io/github/sspanak/tt9/db/DictionaryDb.java
+++ b/src/io/github/sspanak/tt9/db/DictionaryDb.java
@ -81,6 +81,15 @@ public class DictionaryDb {
 	}
 	/**
 	 * Synchronously checks whether the given word is stored for the given language.
 	 * Returns false without querying when the language is null or the word is null or empty.
 	 */
 	public static boolean doesWordExistSync(Language language, String word) {
 		boolean hasValidInput = language != null && word != null && !word.equals("");
 		return hasValidInput && getInstance().wordsDao().doesWordExist(language.getId(), word) > 0;
 	}
 	public static void truncateWords(Handler handler) {
 		new Thread() {
 			@Override
@ -155,7 +164,13 @@ public class DictionaryDb {
 			@Override
 			public void run() {
 				try {
-					int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
+					int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word, sequence);
 					if (affectedRows == 0) {
 						// If the user has changed the case manually, so there would be no matching word.
 						// In this case, try again with the lowercase equivalent.
 						affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
 					}
 					Logger.d("incrementWordFrequency", "Affected rows: " + affectedRows);
 				} catch (Exception e) {
 					Logger.e(
--- a/src/io/github/sspanak/tt9/db/DictionaryLoader.java
+++ b/src/io/github/sspanak/tt9/db/DictionaryLoader.java
@ -108,19 +108,19 @@ public class DictionaryLoader {
 		DictionaryDb.runInTransaction(() -> {
 			try {
 				long start = System.currentTimeMillis();
 				importLetters(language);
 				Logger.i(
 					logTag,
 					"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
 				);
 				start = System.currentTimeMillis();
 				importWords(language);
 				Logger.i(
 					logTag,
 					"Dictionary: '" + language.getDictionaryFile() + "'" +
 						" processing time: " + (System.currentTimeMillis() - start) + " ms"
 				);
 				start = System.currentTimeMillis();
 				importLetters(language);
 				Logger.i(
 					logTag,
 					"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
 				);
 			} catch (DictionaryImportAbortedException e) {
 				stop();
@ -167,6 +167,10 @@ public class DictionaryLoader {
 					continue;
 				}
 				if (DictionaryDb.doesWordExistSync(language, langChar.toUpperCase(language.getLocale()))) {
 					continue;
 				}
 				Word word = new Word();
 				word.langId = language.getId();
 				word.frequency = 0;
@ -210,7 +214,7 @@ public class DictionaryLoader {
 				throw new DictionaryImportException(dictionaryFile, word, line);
 			}
-			if (line % settings.getDictionaryImportWordChunkSize() == 0) {
+			if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
 				DictionaryDb.insertWordsSync(dbWords);
 				dbWords.clear();
 			}
--- a/src/io/github/sspanak/tt9/db/WordsDao.java
+++ b/src/io/github/sspanak/tt9/db/WordsDao.java
@ -12,6 +12,9 @@ interface WordsDao {
 	/**
 	 * Counts the rows of the "words" table that have a non-null id.
 	 * A negative langId counts the rows of all languages (the ":langId < 0" clause);
 	 * otherwise only rows whose "lang" column equals langId are counted.
 	 */
 	@Query("SELECT COUNT(id) FROM words WHERE :langId < 0 OR lang = :langId")
 	int count(int langId);
 	/**
 	 * Returns the number of rows in "words" whose "lang" column equals langId and whose
 	 * "word" column equals the given word; zero means there is no such row.
 	 * NOTE(review): whether the "=" comparison is case-sensitive depends on the column
 	 * collation, which is not visible here - confirm against the entity definition.
 	 */
 	@Query("SELECT COUNT(id) FROM words WHERE lang = :langId AND word = :word")
 	int doesWordExist(int langId, String word);
 	@Query(
 		"SELECT * " +
 		"FROM words " +
--- a/src/io/github/sspanak/tt9/languages/Language.java
+++ b/src/io/github/sspanak/tt9/languages/Language.java
@ -3,6 +3,7 @@ package io.github.sspanak.tt9.languages;
 import androidx.annotation.NonNull;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Locale;
@ -15,6 +16,7 @@ public class Language {
 	protected int abcLowerCaseIcon;
 	protected int abcUpperCaseIcon;
 	protected ArrayList<ArrayList<String>> characterMap = new ArrayList<>();
 	private final HashMap<Character, String> reverseCharacterMap = new HashMap<>();
 	// settings
 	protected boolean isPunctuationPartOfWords; // see the getter for more info
@ -65,12 +67,26 @@ public class Language {
 	/************* utility *************/
 	/**
 	 * Rebuilds the character-to-digit lookup table from scratch.
 	 * For each key 0-9, the first character of every string returned by
 	 * getKeyCharacters() is mapped to that key's digit, stored as a String.
 	 */
 	private void generateReverseCharacterMap() {
 		reverseCharacterMap.clear();
 		int digit = 0;
 		while (digit <= 9) {
 			final String digitAsText = String.valueOf(digit);
 			for (String keyChar : getKeyCharacters(digit)) {
 				reverseCharacterMap.put(keyChar.charAt(0), digitAsText);
 			}
 			digit++;
 		}
 	}
 	/**
 	 * Returns the word with its first character in upper case and the remaining characters
 	 * in lower case, using this language's locale for the case conversion.
 	 *
 	 * @param word the word to capitalize; may be null or empty
 	 * @return the capitalized word; null or the empty string are returned unchanged
 	 */
 	public String capitalize(String word) {
 		// substring(0, 1) throws StringIndexOutOfBoundsException on an empty string,
 		// so there is nothing to capitalize in that case.
 		if (word == null || word.isEmpty()) {
 			return word;
 		}
 		return word.substring(0, 1).toUpperCase(locale) + word.substring(1).toLowerCase(locale);
 	}
 	public boolean isMixedCaseWord(String word) {
-		return word != null && !word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word);
+		return word != null
 			&& (
 				(word.length() == 1 && word.toUpperCase(locale).equals(word))
 				|| (!word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word))
 			);
 	}
 	public ArrayList<String> getKeyCharacters(int key) {
@ -90,16 +106,17 @@ public class Language {
 		StringBuilder sequence = new StringBuilder();
 		String lowerCaseWord = word.toLowerCase(locale);
-		for (int i = 0; i < lowerCaseWord.length(); i++) {
+		if (reverseCharacterMap.isEmpty()) {
-			for (int key = 0; key <= 9; key++) {
+			generateReverseCharacterMap();
 				if (getKeyCharacters(key).contains(Character.toString(lowerCaseWord.charAt(i)))) {
 					sequence.append(key);
 				}
 			}
 		}
-		if (word.length() != sequence.length()) {
+		for (int i = 0; i < lowerCaseWord.length(); i++) {
-			throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
+			char letter = lowerCaseWord.charAt(i);
 			if (!reverseCharacterMap.containsKey(letter)) {
 				throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
 			}
 			sequence.append(reverseCharacterMap.get(letter));
 		}
 		return sequence.toString();