1
0
Fork 0

Language improvements

* fixed some English words always appearing in lowercase, when some of them should be capitalized (I, Friday, December, etc.)

* fixed Bulgarian words that should have been capitalized, but were not

* fixed mixed case suggestions priority not updating

* optimized dictionary loading (it's more than 2x faster now)

* fixed the last words of dictionaries not being loaded
This commit is contained in:
sspanak 2022-12-15 11:27:01 +02:00
parent 8888485f70
commit 759317dce1
11 changed files with 25342 additions and 23472 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -173,9 +173,9 @@ task validateDictionaries {
errors += "Dictionary '" + file.name + "' is invalid. Found a garbage word: '" + line + "' on line " + lineNumber + ".\n" errors += "Dictionary '" + file.name + "' is invalid. Found a garbage word: '" + line + "' on line " + lineNumber + ".\n"
} }
if (line.matches("^.\$")) { if (line.matches("^.\$") && !Character.isUpperCase(line.charAt(0))) {
errorCount++ errorCount++
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Remove all single letters. The alphabet will be added automatically.\n" errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
} }
String uniqueWordKey = line ==~ geographicalName ? line : line.toLowerCase() String uniqueWordKey = line ==~ geographicalName ? line : line.toLowerCase()

View file

@ -4,3 +4,8 @@ Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-w
License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE
Additionally cleaned up repeating words and added some missing ones. Additionally cleaned up repeating words and added some missing ones.
Also, used the wooorm's hunspell-compatible dictionary to determine which words need to start with a capital letter
Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg
License: MIT (available in the link)
Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]

View file

@ -18,6 +18,14 @@ Using Git Commit From: Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
===== =====
Also, used the wooorm's hunspell-compatible dictionary to determine
which words need to start with a capital letter
Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/en
License: MIT (available in the link)
Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
=====
Spell Checking Oriented Word Lists (SCOWL) Spell Checking Oriented Word Lists (SCOWL)
Mon Dec 7 20:14:35 2020 -0500 [5ef55f9] Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]

View file

@ -0,0 +1,95 @@
const { createReadStream, existsSync } = require('fs');
const { basename } = require('path');
const { createInterface } = require('readline');
/**
 * Prints the command-line usage instructions to stdout.
 *
 * The script name shown in the usage line is derived from process.argv[1].
 */
function printHelp() {
  console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
  console.log('Capitalizes a word list using capitalized words in another list.');
  console.log('\nMIN-WORD-LENGTH must be a positive number.');
  console.log('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
/**
 * Validates the command-line arguments and turns them into an options object.
 *
 * Expected arguments (after the node binary and the script name):
 *   DICTIONARY-FILE  CAPITALS-FILE  MIN-WORD-LENGTH  LOCALE
 *
 * Terminates the process with exit code 1 (after printing the help) when there
 * are too few arguments, and with exit code 2 when a file does not exist or the
 * minimum word length is not a non-negative decimal integer.
 *
 * @returns {{capitalsFileName: string, dictionaryFileName: string, locale: string, minWordLength: number}}
 */
function validateInput() {
  if (process.argv.length < 6) {
    printHelp();
    process.exit(1);
  }

  const [, , dictionaryFileName, capitalsFileName, rawMinWordLength, locale] = process.argv;

  if (!existsSync(capitalsFileName)) {
    console.error(`Failure! Could not find list-of-capitals file "${capitalsFileName}".`);
    process.exit(2);
  }

  if (!existsSync(dictionaryFileName)) {
    console.error(`Failure! Could not find dictionary file "${dictionaryFileName}".`);
    process.exit(2);
  }

  // Accept plain decimal digits only. A bare parseInt() would silently accept
  // trailing garbage ("3abc" -> 3) and hexadecimal notation ("0x10" -> 16).
  const minWordLength = /^\d+$/.test(rawMinWordLength) ? Number.parseInt(rawMinWordLength, 10) : Number.NaN;
  if (Number.isNaN(minWordLength) || minWordLength < 0) {
    console.error(`Failure! The minimum word length must be a positive number.`);
    process.exit(2);
  }

  return { capitalsFileName, dictionaryFileName, locale, minWordLength };
}
/**
 * Capitalizes the words of a dictionary using a second list of correctly cased words.
 *
 * Every line of the dictionary file is treated as one word. For each word in the
 * capitals file that is at least `minWordLength` characters long, its lowercase
 * form (and the lowercase form followed by "'s") is looked up in the dictionary;
 * when found, it is removed and the cased spelling is added instead. Replaced
 * words therefore move to the end of the returned list.
 *
 * @param {object} options
 * @param {string} options.dictionaryFileName - path of the word list to convert
 * @param {string} options.capitalsFileName - path of the list of correctly cased words
 * @param {string} options.locale - locale passed to String.prototype.toLocaleLowerCase()
 * @param {number} options.minWordLength - shorter words from the capitals file are ignored
 * @returns {Promise<string[]>} the converted word list
 */
async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
  // A Set, unlike a plain object, preserves insertion order for every key
  // (including digit-only words) and cannot be fooled by words that collide
  // with Object.prototype members such as "constructor" or "__proto__".
  const words = new Set();

  // read the dictionary
  const dictionaryLines = createInterface({
    input: createReadStream(dictionaryFileName),
    crlfDelay: Infinity, // always treat "\r\n" as a single line break
  });
  for await (const line of dictionaryLines) {
    words.add(line);
  }

  // convert the dictionary words using the second file
  const capitalsLines = createInterface({
    input: createReadStream(capitalsFileName),
    crlfDelay: Infinity,
  });
  for await (const capitalizedWord of capitalsLines) {
    if (capitalizedWord.length < minWordLength) {
      continue;
    }

    const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
    // Set.delete() reports whether the word was present, so lookup and removal are one step.
    if (words.delete(lowercaseWord)) {
      words.add(capitalizedWord);
    }

    if (words.delete(`${lowercaseWord}'s`)) {
      words.add(`${capitalizedWord}'s`);
    }
  }

  return [...words];
}
/**
 * Writes every word of the list to stdout, one word per line.
 * Anything that is not an array is silently ignored.
 *
 * @param {string[]} wordList - the words to print
 */
function printWords(wordList) {
  if (Array.isArray(wordList)) {
    for (const word of wordList) {
      console.log(word);
    }
  }
}
/** main **/
// Validate the CLI arguments, convert the dictionary and print the result to stdout.
capitalize(validateInput())
  .then(words => printWords(words))
  .catch(e => {
    console.error(e);
    // Report the failure to the calling shell; otherwise the script would exit
    // with status 0 and a broken run could go unnoticed in a pipeline.
    process.exitCode = 1;
  });

View file

@ -8,7 +8,7 @@ const GEO_NAME = /[A-Z]\w+\-[^\n]+/;
function printHelp() { function printHelp() {
console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `); console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `);
console.log('Removes repeating words from a word list'); console.log('Removes repeating words from a word list');
console.log('\nLocale could any valid JS locale, for exmaple: en, en-US, etc...'); console.log('\nLocale could be any valid JS locale, for exmaple: en, en-US, etc...');
} }

View file

@ -81,6 +81,15 @@ public class DictionaryDb {
} }
public static boolean doesWordExistSync(Language language, String word) {
if (language == null || word == null || word.equals("")) {
return false;
}
return getInstance().wordsDao().doesWordExist(language.getId(), word) > 0;
}
public static void truncateWords(Handler handler) { public static void truncateWords(Handler handler) {
new Thread() { new Thread() {
@Override @Override
@ -155,7 +164,13 @@ public class DictionaryDb {
@Override @Override
public void run() { public void run() {
try { try {
int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence); int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word, sequence);
if (affectedRows == 0) {
// If the user has changed the case manually, so there would be no matching word.
// In this case, try again with the lowercase equivalent.
affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
}
Logger.d("incrementWordFrequency", "Affected rows: " + affectedRows); Logger.d("incrementWordFrequency", "Affected rows: " + affectedRows);
} catch (Exception e) { } catch (Exception e) {
Logger.e( Logger.e(

View file

@ -108,19 +108,19 @@ public class DictionaryLoader {
DictionaryDb.runInTransaction(() -> { DictionaryDb.runInTransaction(() -> {
try { try {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
importLetters(language);
Logger.i(
logTag,
"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
);
start = System.currentTimeMillis();
importWords(language); importWords(language);
Logger.i( Logger.i(
logTag, logTag,
"Dictionary: '" + language.getDictionaryFile() + "'" + "Dictionary: '" + language.getDictionaryFile() + "'" +
" processing time: " + (System.currentTimeMillis() - start) + " ms" " processing time: " + (System.currentTimeMillis() - start) + " ms"
); );
start = System.currentTimeMillis();
importLetters(language);
Logger.i(
logTag,
"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
);
} catch (DictionaryImportAbortedException e) { } catch (DictionaryImportAbortedException e) {
stop(); stop();
@ -167,6 +167,10 @@ public class DictionaryLoader {
continue; continue;
} }
if (DictionaryDb.doesWordExistSync(language, langChar.toUpperCase(language.getLocale()))) {
continue;
}
Word word = new Word(); Word word = new Word();
word.langId = language.getId(); word.langId = language.getId();
word.frequency = 0; word.frequency = 0;
@ -210,7 +214,7 @@ public class DictionaryLoader {
throw new DictionaryImportException(dictionaryFile, word, line); throw new DictionaryImportException(dictionaryFile, word, line);
} }
if (line % settings.getDictionaryImportWordChunkSize() == 0) { if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
DictionaryDb.insertWordsSync(dbWords); DictionaryDb.insertWordsSync(dbWords);
dbWords.clear(); dbWords.clear();
} }

View file

@ -12,6 +12,9 @@ interface WordsDao {
@Query("SELECT COUNT(id) FROM words WHERE :langId < 0 OR lang = :langId") @Query("SELECT COUNT(id) FROM words WHERE :langId < 0 OR lang = :langId")
int count(int langId); int count(int langId);
@Query("SELECT COUNT(id) FROM words WHERE lang = :langId AND word = :word")
int doesWordExist(int langId, String word);
@Query( @Query(
"SELECT * " + "SELECT * " +
"FROM words " + "FROM words " +

View file

@ -3,6 +3,7 @@ package io.github.sspanak.tt9.languages;
import androidx.annotation.NonNull; import androidx.annotation.NonNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
@ -15,6 +16,7 @@ public class Language {
protected int abcLowerCaseIcon; protected int abcLowerCaseIcon;
protected int abcUpperCaseIcon; protected int abcUpperCaseIcon;
protected ArrayList<ArrayList<String>> characterMap = new ArrayList<>(); protected ArrayList<ArrayList<String>> characterMap = new ArrayList<>();
private final HashMap<Character, String> reverseCharacterMap = new HashMap<>();
// settings // settings
protected boolean isPunctuationPartOfWords; // see the getter for more info protected boolean isPunctuationPartOfWords; // see the getter for more info
@ -65,12 +67,26 @@ public class Language {
/************* utility *************/ /************* utility *************/
private void generateReverseCharacterMap() {
reverseCharacterMap.clear();
for (int digit = 0; digit <= 9; digit++) {
for (String keyChar : getKeyCharacters(digit)) {
reverseCharacterMap.put(keyChar.charAt(0), String.valueOf(digit));
}
}
}
public String capitalize(String word) { public String capitalize(String word) {
return word != null ? word.substring(0, 1).toUpperCase(locale) + word.substring(1).toLowerCase(locale) : null; return word != null ? word.substring(0, 1).toUpperCase(locale) + word.substring(1).toLowerCase(locale) : null;
} }
public boolean isMixedCaseWord(String word) { public boolean isMixedCaseWord(String word) {
return word != null && !word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word); return word != null
&& (
(word.length() == 1 && word.toUpperCase(locale).equals(word))
|| (!word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word))
);
} }
public ArrayList<String> getKeyCharacters(int key) { public ArrayList<String> getKeyCharacters(int key) {
@ -90,16 +106,17 @@ public class Language {
StringBuilder sequence = new StringBuilder(); StringBuilder sequence = new StringBuilder();
String lowerCaseWord = word.toLowerCase(locale); String lowerCaseWord = word.toLowerCase(locale);
for (int i = 0; i < lowerCaseWord.length(); i++) { if (reverseCharacterMap.isEmpty()) {
for (int key = 0; key <= 9; key++) { generateReverseCharacterMap();
if (getKeyCharacters(key).contains(Character.toString(lowerCaseWord.charAt(i)))) {
sequence.append(key);
}
}
} }
if (word.length() != sequence.length()) { for (int i = 0; i < lowerCaseWord.length(); i++) {
throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word); char letter = lowerCaseWord.charAt(i);
if (!reverseCharacterMap.containsKey(letter)) {
throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
}
sequence.append(reverseCharacterMap.get(letter));
} }
return sequence.toString(); return sequence.toString();