Language improvements
* fixed some English words always appearing in lowercase when they should be capitalized (I, Friday, December, etc.) * fixed Bulgarian words that should have been capitalized, but were not * fixed mixed-case suggestion priority not updating * optimized dictionary loading (it's more than 2x faster now) * fixed the last words of dictionaries not being loaded
This commit is contained in:
parent
8888485f70
commit
759317dce1
11 changed files with 25342 additions and 23472 deletions
10173
assets/bg-utf8.txt
10173
assets/bg-utf8.txt
File diff suppressed because it is too large
Load diff
38450
assets/en-utf8.txt
38450
assets/en-utf8.txt
File diff suppressed because it is too large
Load diff
|
|
@ -173,9 +173,9 @@ task validateDictionaries {
|
|||
errors += "Dictionary '" + file.name + "' is invalid. Found a garbage word: '" + line + "' on line " + lineNumber + ".\n"
|
||||
}
|
||||
|
||||
if (line.matches("^.\$")) {
|
||||
if (line.matches("^.\$") && !Character.isUpperCase(line.charAt(0))) {
|
||||
errorCount++
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Remove all single letters. The alphabet will be added automatically.\n"
|
||||
errors += "Dictionary '" + file.name + "' is invalid. Found a single letter: '" + line + "' on line " + lineNumber + ". Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
|
||||
}
|
||||
|
||||
String uniqueWordKey = line ==~ geographicalName ? line : line.toLowerCase()
|
||||
|
|
|
|||
|
|
@ -3,4 +3,9 @@ Version: f46eff1 (2022-04-26)
|
|||
Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-words-validated-cyrillic.txt
|
||||
License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE
|
||||
|
||||
Additionally cleaned up repeating words and added some missing ones.
|
||||
Additionally cleaned up repeating words and added some missing ones.
|
||||
|
||||
Also, used wooorm's Hunspell-compatible dictionary to determine which words need to start with a capital letter.
|
||||
Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg
|
||||
License: MIT (available in the link)
|
||||
Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
|
||||
|
|
|
|||
|
|
@ -18,6 +18,14 @@ Using Git Commit From: Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
|
|||
|
||||
=====
|
||||
|
||||
Also, used wooorm's Hunspell-compatible dictionary to determine
|
||||
which words need to start with a capital letter
|
||||
Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/en
|
||||
License: MIT (available in the link)
|
||||
Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
|
||||
|
||||
=====
|
||||
|
||||
Spell Checking Oriented Word Lists (SCOWL)
|
||||
|
||||
Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
|
||||
|
|
|
|||
95
scripts/capitalize-dictionary-words.js
Normal file
95
scripts/capitalize-dictionary-words.js
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
const { createReadStream, existsSync } = require('fs');
const { basename } = require('path');
const { createInterface } = require('readline');
|
||||
|
||||
|
||||
/**
 * Prints the command-line usage instructions to standard output.
 *
 * The script name shown in the usage line is the base name of process.argv[1].
 */
function printHelp() {
  console.log(`Usage ${basename(process.argv[1])} DICTIONARY-FILE-NAME.txt LIST-OF-CAPITALIZED-WORDS.txt MIN-WORD-LENGTH LOCALE`);
  console.log('Capitalizes a word list using capitalized words in another list.');
  console.log('\nMIN-WORD-LENGTH must be a positive number.');
  // Fixed the "exmaple" typo in the user-facing help text.
  console.log('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
|
||||
|
||||
|
||||
|
||||
/**
 * Validates the command-line arguments and converts them into an options object.
 *
 * Expected arguments (after the node binary and the script path):
 *   process.argv[2] - dictionary file name
 *   process.argv[3] - file containing the capitalized words
 *   process.argv[4] - minimum word length
 *   process.argv[5] - locale
 *
 * Terminates the process with exit code 1 (after printing the help) when there
 * are fewer than four arguments, and with exit code 2 when either file does not
 * exist or the minimum word length does not parse to a non-negative integer.
 *
 * @returns {{capitalsFileName: string, dictionaryFileName: string, locale: string, minWordLength: number}}
 */
function validateInput() {
  if (process.argv.length < 6) {
    printHelp();
    process.exit(1);
  }

  if (!existsSync(process.argv[3])) {
    console.error(`Failure! Could not find list-of-capitals file "${process.argv[3]}."`);
    process.exit(2);
  }

  if (!existsSync(process.argv[2])) {
    console.error(`Failure! Could not find dictionary file "${process.argv[2]}."`);
    process.exit(2);
  }

  // Always parse as decimal; without an explicit radix a "0x" prefix is read as hexadecimal.
  const minWordLength = Number.parseInt(process.argv[4], 10);
  if (Number.isNaN(minWordLength) || minWordLength < 0) {
    console.error(`Failure! The minimum word length must be a positive number.`);
    process.exit(2);
  }

  return { capitalsFileName: process.argv[3], dictionaryFileName: process.argv[2], locale: process.argv[5], minWordLength };
}
|
||||
|
||||
|
||||
/**
 * Capitalizes the words of a dictionary using a list of correctly capitalized words.
 *
 * Every line of the dictionary file is treated as one word. For each word in the
 * capitals file that is at least `minWordLength` characters long, its lowercase
 * form (and its lowercase possessive form, "<word>'s") is replaced in the
 * dictionary by the capitalized spelling. Replaced words are moved to the end of
 * the resulting list; untouched words keep their original order.
 *
 * @param {object} options
 * @param {string} options.dictionaryFileName - path to the word list to convert
 * @param {string} options.capitalsFileName - path to the list of capitalized words
 * @param {string} options.locale - locale passed to String.prototype.toLocaleLowerCase
 * @param {number} options.minWordLength - capitalized words shorter than this are ignored
 * @returns {Promise<string[]>} the converted word list
 */
async function capitalize({ dictionaryFileName, capitalsFileName, locale, minWordLength }) {
  // A Set (instead of a plain object) keeps membership checks free of inherited
  // Object.prototype names such as "constructor", and stores "__proto__" like any other word.
  const words = new Set();

  // read the dictionary
  const dictionaryReader = createInterface({
    input: createReadStream(dictionaryFileName),
  });

  for await (const line of dictionaryReader) {
    words.add(line);
  }

  // convert the dictionary words using the second file
  const capitalsReader = createInterface({
    input: createReadStream(capitalsFileName),
  });

  for await (const capitalizedWord of capitalsReader) {
    if (capitalizedWord.length < minWordLength) {
      continue;
    }

    const lowercaseWord = capitalizedWord.toLocaleLowerCase(locale);
    // Set.prototype.delete() returns true only when the word was actually present.
    if (words.delete(lowercaseWord)) {
      words.add(capitalizedWord);
    }

    if (words.delete(`${lowercaseWord}'s`)) {
      words.add(`${capitalizedWord}'s`);
    }
  }

  return [...words];
}
|
||||
|
||||
|
||||
|
||||
/**
 * Writes every entry of the given list to standard output, one per line.
 * Anything that is not an array is silently ignored.
 *
 * @param {string[]} wordList - the words to print
 */
function printWords(wordList) {
  if (Array.isArray(wordList)) {
    wordList.forEach((word) => {
      console.log(word);
    });
  }
}
|
||||
|
||||
|
||||
|
||||
/** main **/
// Validate the arguments, convert the dictionary and print the result.
capitalize(validateInput())
  .then(words => printWords(words))
  .catch(e => {
    console.error(e);
    // Signal the failure to the calling shell; previously the script exited with code 0 on errors.
    process.exitCode = 1;
  });
|
||||
|
|
@ -8,7 +8,7 @@ const GEO_NAME = /[A-Z]\w+\-[^\n]+/;
|
|||
function printHelp() {
|
||||
console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `);
|
||||
console.log('Removes repeating words from a word list');
|
||||
console.log('\nLocale could any valid JS locale, for exmaple: en, en-US, etc...');
|
||||
console.log('\nLocale could be any valid JS locale, for exmaple: en, en-US, etc...');
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -81,6 +81,15 @@ public class DictionaryDb {
|
|||
}
|
||||
|
||||
|
||||
public static boolean doesWordExistSync(Language language, String word) {
|
||||
if (language == null || word == null || word.equals("")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return getInstance().wordsDao().doesWordExist(language.getId(), word) > 0;
|
||||
}
|
||||
|
||||
|
||||
public static void truncateWords(Handler handler) {
|
||||
new Thread() {
|
||||
@Override
|
||||
|
|
@ -155,7 +164,13 @@ public class DictionaryDb {
|
|||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
|
||||
int affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word, sequence);
|
||||
if (affectedRows == 0) {
|
||||
// If the user has changed the case manually, so there would be no matching word.
|
||||
// In this case, try again with the lowercase equivalent.
|
||||
affectedRows = getInstance().wordsDao().incrementFrequency(language.getId(), word.toLowerCase(language.getLocale()), sequence);
|
||||
}
|
||||
|
||||
Logger.d("incrementWordFrequency", "Affected rows: " + affectedRows);
|
||||
} catch (Exception e) {
|
||||
Logger.e(
|
||||
|
|
|
|||
|
|
@ -108,19 +108,19 @@ public class DictionaryLoader {
|
|||
DictionaryDb.runInTransaction(() -> {
|
||||
try {
|
||||
long start = System.currentTimeMillis();
|
||||
importLetters(language);
|
||||
Logger.i(
|
||||
logTag,
|
||||
"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
|
||||
);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
importWords(language);
|
||||
Logger.i(
|
||||
logTag,
|
||||
"Dictionary: '" + language.getDictionaryFile() + "'" +
|
||||
" processing time: " + (System.currentTimeMillis() - start) + " ms"
|
||||
);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
importLetters(language);
|
||||
Logger.i(
|
||||
logTag,
|
||||
"Loaded letters for '" + language.getName() + "' language in: " + (System.currentTimeMillis() - start) + " ms"
|
||||
);
|
||||
} catch (DictionaryImportAbortedException e) {
|
||||
stop();
|
||||
|
||||
|
|
@ -167,6 +167,10 @@ public class DictionaryLoader {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (DictionaryDb.doesWordExistSync(language, langChar.toUpperCase(language.getLocale()))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Word word = new Word();
|
||||
word.langId = language.getId();
|
||||
word.frequency = 0;
|
||||
|
|
@ -210,7 +214,7 @@ public class DictionaryLoader {
|
|||
throw new DictionaryImportException(dictionaryFile, word, line);
|
||||
}
|
||||
|
||||
if (line % settings.getDictionaryImportWordChunkSize() == 0) {
|
||||
if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
|
||||
DictionaryDb.insertWordsSync(dbWords);
|
||||
dbWords.clear();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,6 +12,9 @@ interface WordsDao {
|
|||
@Query("SELECT COUNT(id) FROM words WHERE :langId < 0 OR lang = :langId")
|
||||
int count(int langId);
|
||||
|
||||
@Query("SELECT COUNT(id) FROM words WHERE lang = :langId AND word = :word")
|
||||
int doesWordExist(int langId, String word);
|
||||
|
||||
@Query(
|
||||
"SELECT * " +
|
||||
"FROM words " +
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package io.github.sspanak.tt9.languages;
|
|||
import androidx.annotation.NonNull;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
|
||||
|
||||
|
|
@ -15,6 +16,7 @@ public class Language {
|
|||
protected int abcLowerCaseIcon;
|
||||
protected int abcUpperCaseIcon;
|
||||
protected ArrayList<ArrayList<String>> characterMap = new ArrayList<>();
|
||||
private final HashMap<Character, String> reverseCharacterMap = new HashMap<>();
|
||||
|
||||
// settings
|
||||
protected boolean isPunctuationPartOfWords; // see the getter for more info
|
||||
|
|
@ -65,12 +67,26 @@ public class Language {
|
|||
|
||||
/************* utility *************/
|
||||
|
||||
private void generateReverseCharacterMap() {
|
||||
reverseCharacterMap.clear();
|
||||
for (int digit = 0; digit <= 9; digit++) {
|
||||
for (String keyChar : getKeyCharacters(digit)) {
|
||||
reverseCharacterMap.put(keyChar.charAt(0), String.valueOf(digit));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String capitalize(String word) {
|
||||
return word != null ? word.substring(0, 1).toUpperCase(locale) + word.substring(1).toLowerCase(locale) : null;
|
||||
}
|
||||
|
||||
public boolean isMixedCaseWord(String word) {
|
||||
return word != null && !word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word);
|
||||
return word != null
|
||||
&& (
|
||||
(word.length() == 1 && word.toUpperCase(locale).equals(word))
|
||||
|| (!word.toLowerCase(locale).equals(word) && !word.toUpperCase(locale).equals(word))
|
||||
);
|
||||
}
|
||||
|
||||
public ArrayList<String> getKeyCharacters(int key) {
|
||||
|
|
@ -90,16 +106,17 @@ public class Language {
|
|||
StringBuilder sequence = new StringBuilder();
|
||||
String lowerCaseWord = word.toLowerCase(locale);
|
||||
|
||||
for (int i = 0; i < lowerCaseWord.length(); i++) {
|
||||
for (int key = 0; key <= 9; key++) {
|
||||
if (getKeyCharacters(key).contains(Character.toString(lowerCaseWord.charAt(i)))) {
|
||||
sequence.append(key);
|
||||
}
|
||||
}
|
||||
if (reverseCharacterMap.isEmpty()) {
|
||||
generateReverseCharacterMap();
|
||||
}
|
||||
|
||||
if (word.length() != sequence.length()) {
|
||||
throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
|
||||
for (int i = 0; i < lowerCaseWord.length(); i++) {
|
||||
char letter = lowerCaseWord.charAt(i);
|
||||
if (!reverseCharacterMap.containsKey(letter)) {
|
||||
throw new InvalidLanguageCharactersException(this, "Failed generating digit sequence for word: '" + word);
|
||||
}
|
||||
|
||||
sequence.append(reverseCharacterMap.get(letter));
|
||||
}
|
||||
|
||||
return sequence.toString();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue