Language improvements

* fixed some English words always appearing in lowercase when they should be capitalized (I, Friday, December, etc.) * fixed Bulgarian words that should have been capitalized, but were not * fixed the priority of mixed-case suggestions not updating * optimized dictionary loading (it's more than 2x faster now) * fixed the last words of dictionaries not being loaded
2022-12-15 11:27:01 +02:00 · 2022-12-15 11:27:01 +02:00 · 759317dce1
commit 759317dce1
parent 8888485f70
11 changed files with 25342 additions and 23472 deletions
--- a/assets/bg-utf8.txt
+++ b/assets/bg-utf8.txt
--- a/assets/en-utf8.txt
+++ b/assets/en-utf8.txt
--- a/build.gradle
+++ b/build.gradle
@ -173,9 +173,9 @@ task validateDictionaries {
 					errors += "Dictionary '" + file.name + "' is invalid. Found a garbage word: '" + line + "' on line " + lineNumber + ".\n"
 				}

-				if (line.matches("^.\$")) {
+				if (line.matches("^.\$") && !Character.isUpperCase(line.charAt(0))) {
 					errorCount++
-					errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Remove all single letters. The alphabet will be added automatically.\n"
+					errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
 				}

 				String uniqueWordKey = line ==~ geographicalName ? line : line.toLowerCase()
--- a/docs/dictionaries/bgWordlistReadme.txt
+++ b/docs/dictionaries/bgWordlistReadme.txt
@ -3,4 +3,9 @@ Version: f46eff1 (2022-04-26)
 Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-words-validated-cyrillic.txt
 License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE

-Additionally cleaned up repeating words and added some missing ones.
+Additionally cleaned up repeating words and added some missing ones.
+
+Also used wooorm's Hunspell-compatible dictionary to determine which words must start with a capital letter.
+Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg
+License: MIT (available in the link)
+Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
--- a/docs/dictionaries/enWordlistReadme.txt
+++ b/docs/dictionaries/enWordlistReadme.txt
@ -18,6 +18,14 @@ Using Git Commit From: Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]

 =====

+Also used wooorm's Hunspell-compatible dictionary to determine
+which words must start with a capital letter.
+Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/en
+License: MIT (available in the link)
+Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
+
+=====
+
 Spell Checking Oriented Word Lists (SCOWL)

 Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
--- a/scripts/capitalize-dictionary-words.js
+++ b/scripts/capitalize-dictionary-words.js
@ -0,0 +1,95 @@
+const { basename } = require('path');
+const { createReadStream, existsSync } = require('fs');
+
+
+function printHelp() {
+	console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
+	console.log('Capitalizes a word list using capitalized words in another list.');
+	console.log('\nMIN-WORD-LENGTH must be a positive number.');
+	console.log('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
+}
+
+
+
+function validateInput() {
+	if (process.argv.length < 6) {
+		printHelp();
+		process.exit(1);
+	}
+
+
+	if (!existsSync(process.argv[3])) {
+		console.error(`Failure! Could not find list-of-capitals file "${process.argv[3]}."`);
+		process.exit(2);
+	}
+
+
+	if (!existsSync(process.argv[2])) {
+		console.error(`Failure! Could not find dictionary file "${process.argv[2]}."`);
+		process.exit(2);
+	}
+
+	const minWordLength = Number.parseInt(process.argv[4]);
+	if (Number.isNaN(minWordLength) || minWordLength < 0) {
+		console.error(`Failure! The minimum word length must be a positive number.`);
+		process.exit(2);
+	}
+
+	return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
+}
+
+
+async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
+	// read the dictionary
+	let lineReader = require('readline').createInterface({
+	  input: createReadStream(dictionaryFileName)
+	});
+
+	const words = {};
+	for await (const line of lineReader) {
+		words[line] = true;
+	}
+
+
+	// convert the dictionary words using the second file
+	lineReader = require('readline').createInterface({
+	  input: createReadStream(capitalsFileName)
+	});
+
+	for await (const capitalizedWord of lineReader) {
+		if (capitalizedWord.length < minWordLength) {
+			continue;
+		}
+
+		const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
+		if (words[lowercaseWord]) {
+			delete words[lowercaseWord];
+			words[capitalizedWord] = true;
+		}
+
+		const possessiveLowercaseWord = `${lowercaseWord}'s`;
+		if (words[possessiveLowercaseWord]) {
+			delete words[possessiveLowercaseWord];
+			words[`${capitalizedWord}'s`] = true;
+		}
+	}
+
+	return Object.keys(words);
+}
+
+
+
+function printWords(wordList) {
+	if (!Array.isArray(wordList)) {
+		return;
+	}
+
+	wordList.forEach(w => console.log(w));
+}
+
+
+
+/** main **/
+capitalize(validateInput())
+	.then(words => printWords(words))
+	.catch(e => console.error(e));
--- a/scripts/remove-dictionary-repeating-words.js
+++ b/scripts/remove-dictionary-repeating-words.js
@ -8,7 +8,7 @@ const GEO_NAME = /[A-Z]\w+\-[^\n]+/;
 function printHelp() {
 	console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `);
 	console.log('Removes repeating words from a word list');
-	console.log('\nLocale could any valid JS locale, for exmaple: en, en-US, etc...');
+	console.log('\nLocale could be any valid JS locale, for exmaple: en, en-US, etc...');
 }


--- a/src/io/github/sspanak/tt9/db/DictionaryDb.java
+++ b/src/io/github/sspanak/tt9/db/DictionaryDb.java
@ -81,6 +81,15 @@ public class DictionaryDb {
 	}


+	/**
+	 * Checks on the calling thread whether the given word is stored for the given language.
+	 *
+	 * @param language the language whose ID is used for the lookup
+	 * @param word the word to look for
+	 * @return false when the language is null or the word is null or empty;
+	 *         otherwise true when the DAO reports at least one matching row
+	 */
+	public static boolean doesWordExistSync(Language language, String word) {
+		final boolean isInputUsable = language != null && word != null && !word.isEmpty();
+		if (!isInputUsable) {
+			return false;
+		}
+
+		final int matchingRows = getInstance().wordsDao().doesWordExist(language.getId(), word);
+		return matchingRows > 0;
+	}
+
+
 	public static void truncateWords(Handler handler) {
 		new Thread() {
 			@Override
@ -155,7 +164,13 @@ public class DictionaryDb {
 			@Override
 			public void run() {
 				try {
-					int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
+					int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word, sequence);
+					if (affectedRows == 0) {
+						// If the user has changed the case manually, so there would be no matching word.
+						// In this case, try again with the lowercase equivalent.
+						affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
+					}
+
 					Logger.d("incrementWordFrequency", "Affected rows: " + affectedRows);
 				} catch (Exception e) {
 					Logger.e(
--- a/src/io/github/sspanak/tt9/db/DictionaryLoader.java
+++ b/src/io/github/sspanak/tt9/db/DictionaryLoader.java
@ -108,19 +108,19 @@ public class DictionaryLoader {
 		DictionaryDb.runInTransaction(() -> {
 			try {
 				long start = System.currentTimeMillis();
-				importLetters(language);
-				Logger.i(
-					logTag,
-					"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
-				);
-
-				start = System.currentTimeMillis();
 				importWords(language);
 				Logger.i(
 					logTag,
 					"Dictionary: '" + language.getDictionaryFile() + "'" +
 						" processing time: " + (System.currentTimeMillis() - start) + " ms"
 				);
+
+				start = System.currentTimeMillis();
+				importLetters(language);
+				Logger.i(
+					logTag,
+					"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
+				);
 			} catch (DictionaryImportAbortedException e) {
 				stop();

@ -167,6 +167,10 @@ public class DictionaryLoader {
 					continue;
 				}

+				if (DictionaryDb.doesWordExistSync(language, langChar.toUpperCase(language.getLocale()))) {
+					continue;
+				}
+
 				Word word = new Word();
 				word.langId = language.getId();
 				word.frequency = 0;
@ -210,7 +214,7 @@ public class DictionaryLoader {
 				throw new DictionaryImportException(dictionaryFile, word, line);
 			}

-			if (line % settings.getDictionaryImportWordChunkSize() == 0) {
+			if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
 				DictionaryDb.insertWordsSync(dbWords);
 				dbWords.clear();
 			}
--- a/src/io/github/sspanak/tt9/db/WordsDao.java
+++ b/src/io/github/sspanak/tt9/db/WordsDao.java
@ -12,6 +12,9 @@ interface WordsDao {
 	@Query("SELECT COUNT(id) FROM words WHERE :langId < 0 OR lang = :langId")
 	int count(int langId);

+	/**
+	 * Counts the rows of the "words" table whose "lang" column equals langId and whose
+	 * "word" column equals the given word. Despite the name, the result is a row count,
+	 * not a boolean.
+	 */
+	@Query("SELECT COUNT(id) FROM words WHERE lang = :langId AND word = :word")
+	int doesWordExist(int langId, String word);
+
 	@Query(
 		"SELECT * " +
 		"FROM words " +
--- a/src/io/github/sspanak/tt9/languages/Language.java
+++ b/src/io/github/sspanak/tt9/languages/Language.java
@ -3,6 +3,7 @@ package io.github.sspanak.tt9.languages;
 import androidx.annotation.NonNull;

 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Locale;


@ -15,6 +16,7 @@ public class Language {
 	protected int abcLowerCaseIcon;
 	protected int abcUpperCaseIcon;
 	protected ArrayList<ArrayList<String>> characterMap = new ArrayList<>();
+	private final HashMap<Character, String> reverseCharacterMap = new HashMap<>();

 	// settings
 	protected boolean isPunctuationPartOfWords; // see the getter for more info
@ -65,12 +67,26 @@ public class Language {

 	/************* utility *************/

+	private void generateReverseCharacterMap() {
+		reverseCharacterMap.clear();
+		for (int digit = 0; digit <= 9; digit++) {
+			for (String keyChar : getKeyCharacters(digit)) {
+				reverseCharacterMap.put(keyChar.charAt(0), String.valueOf(digit));
+			}
+		}
+	}
+
+
 	public String capitalize(String word) {
 		return word != null ? word.substring(0, 1).toUpperCase(locale) + word.substring(1).toLowerCase(locale) : null;
 	}

 	public boolean isMixedCaseWord(String word) {
-		return word != null && !word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word);
+		return word != null
+			&& (
+				(word.length() == 1 && word.toUpperCase(locale).equals(word))
+				|| (!word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word))
+			);
 	}

 	public ArrayList<String> getKeyCharacters(int key) {
@ -90,16 +106,17 @@ public class Language {
 		StringBuilder sequence = new StringBuilder();
 		String lowerCaseWord = word.toLowerCase(locale);

-		for (int i = 0; i < lowerCaseWord.length(); i++) {
-			for (int key = 0; key <= 9; key++) {
-				if (getKeyCharacters(key).contains(Character.toString(lowerCaseWord.charAt(i)))) {
-					sequence.append(key);
-				}
-			}
+		if (reverseCharacterMap.isEmpty()) {
+			generateReverseCharacterMap();
 		}

-		if (word.length() != sequence.length()) {
-			throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
+		for (int i = 0; i < lowerCaseWord.length(); i++) {
+			char letter = lowerCaseWord.charAt(i);
+			if (!reverseCharacterMap.containsKey(letter)) {
+				throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
+			}
+
+			sequence.append(reverseCharacterMap.get(letter));
 		}

 		return sequence.toString();