1
0
Fork 0

Csv dictionary support (#145)

* the dictionary loader now supports word frequencies

* word frequency validation upon building

* added default word frequencies to all dictionaries

* updated documentation
This commit is contained in:
Dimo Karaivanov 2023-01-26 11:47:34 +02:00 committed by GitHub
parent b5cd92f1f7
commit 2510aba58a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 1175323 additions and 1175101 deletions

View file

@ -196,31 +196,34 @@ public class DictionaryLoader {
BufferedReader br = new BufferedReader(new InputStreamReader(assets.open(dictionaryFile), StandardCharsets.UTF_8));
ArrayList<Word> dbWords = new ArrayList<>();
long line = 0;
long lineCount = 0;
sendProgressMessage(language, 0, 0);
for (String word; (word = br.readLine()) != null; line++) {
for (String line; (line = br.readLine()) != null; lineCount++) {
if (loadThread.isInterrupted()) {
br.close();
sendProgressMessage(language, -1, 0);
throw new DictionaryImportAbortedException();
}
validateWord(language, word, line);
String[] parts = splitLine(line);
String word = validateWord(language, parts, lineCount);
int frequency = validateFrequency(parts);
try {
dbWords.add(stringToWord(language, word));
dbWords.add(stringToWord(language, word, frequency));
} catch (InvalidLanguageCharactersException e) {
throw new DictionaryImportException(dictionaryFile, word, line);
throw new DictionaryImportException(dictionaryFile, word, lineCount);
}
if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
if (lineCount % settings.getDictionaryImportWordChunkSize() == 0 || lineCount == totalWords - 1) {
DictionaryDb.insertWordsSync(dbWords);
dbWords.clear();
}
if (totalWords > 0) {
int progress = (int) Math.floor(100.0 * line / totalWords);
int progress = (int) Math.floor(100.0 * lineCount / totalWords);
sendProgressMessage(language, progress, settings.getDictionaryImportProgressUpdateInterval());
}
}
@ -230,6 +233,23 @@ public class DictionaryLoader {
}
/**
 * Splits one dictionary line into its word and its optional frequency column.
 *
 * The columns are separated by a TAB character. Only the first TAB is treated as the
 * delimiter; everything after it (including any further TABs) is returned as the second part.
 *
 * @param line a single line read from the dictionary file
 * @return a two-element array: [0] is the text before the first TAB (or the whole line when
 *         there is no TAB), [1] is the text after it (or "" when there is nothing after it)
 */
private String[] splitLine(String line) {
	// The delimiter is written as an explicit escape, so it cannot be mistaken for (or silently
	// converted to) a space, which would break words that legitimately contain spaces.
	// This runs once per dictionary line, so it stays a single scan with no regex involved,
	// unlike String.split().
	int delimiterAt = line.indexOf('\t');
	if (delimiterAt < 0) {
		return new String[] { line, "" };
	}

	// substring(length) yields "", which covers a line that ends with the delimiter.
	return new String[] { line.substring(0, delimiterAt), line.substring(delimiterAt + 1) };
}
private long countWords(String filename) {
try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(assets.open(filename), StandardCharsets.UTF_8))) {
//noinspection ResultOfMethodCallIgnored
@ -245,17 +265,30 @@ public class DictionaryLoader {
}
private void validateWord(Language language, String word, long line) throws DictionaryImportException {
/**
 * Extracts the word from an already split dictionary line and checks it for punctuation.
 *
 * @param language  the language whose dictionary is being imported
 * @param lineParts the split line; element 0 is taken as the word
 * @param line      the number of the line being processed, passed on to the exception
 * @return the word, unchanged
 * @throws DictionaryImportException when the language does not treat punctuation as part of
 *         words and the word matches the containsPunctuation pattern
 */
private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException {
	final String candidate = lineParts[0];

	// Languages that allow punctuation inside words skip the pattern check entirely.
	if (language.isPunctuationPartOfWords()) {
		return candidate;
	}

	if (containsPunctuation.matcher(candidate).find()) {
		throw new DictionaryImportException(language.getDictionaryFile(), candidate, line);
	}

	return candidate;
}
/**
 * Reads the frequency column of an already split dictionary line.
 *
 * A missing, empty or non-numeric frequency is not an error; it yields 0.
 *
 * @param lineParts the split line; element 1 is taken as the frequency text
 * @return the parsed frequency, or 0 when it is absent or cannot be parsed as an int
 */
private int validateFrequency(String[] lineParts) {
	// Lines without a frequency column are handled up front, so no exception is thrown
	// and caught for each of them.
	if (lineParts == null || lineParts.length < 2) {
		return 0;
	}

	String rawFrequency = lineParts[1];
	if (rawFrequency == null || rawFrequency.isEmpty()) {
		return 0;
	}

	try {
		return Integer.parseInt(rawFrequency);
	} catch (NumberFormatException ignored) {
		// Malformed or out-of-range values deliberately fall back to 0.
		return 0;
	}
}
private Word stringToWord(Language language, String word) throws InvalidLanguageCharactersException {
private Word stringToWord(Language language, String word, int frequency) throws InvalidLanguageCharactersException {
Word dbWord = new Word();
dbWord.langId = language.getId();
dbWord.frequency = 0;
dbWord.frequency = frequency;
dbWord.sequence = language.getDigitSequenceForWord(word);
dbWord.word = word;

View file

@ -13,7 +13,7 @@ public class Bulgarian extends Language {
id = 7;
name = "Български";
locale = new Locale("bg","BG");
dictionaryFile = "bg-utf8.txt";
dictionaryFile = "bg-utf8.csv";
icon = R.drawable.ime_lang_bg;
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;

View file

@ -12,7 +12,7 @@ public class Dutch extends English {
id = 8;
name = "Nederlands";
locale = new Locale("nl","NL");
dictionaryFile = "nl-utf8.txt";
dictionaryFile = "nl-utf8.csv";
icon = R.drawable.ime_lang_nl;
isPunctuationPartOfWords = true;

View file

@ -13,7 +13,7 @@ public class English extends Language {
id = 1;
name = "English";
locale = Locale.ENGLISH;
dictionaryFile = "en-utf8.txt";
dictionaryFile = "en-utf8.csv";
icon = R.drawable.ime_lang_en;
abcLowerCaseIcon = R.drawable.ime_lang_latin_lower;
abcUpperCaseIcon = R.drawable.ime_lang_latin_upper;

View file

@ -12,7 +12,7 @@ public class French extends English {
id = 4;
name = "Français";
locale = Locale.FRENCH;
dictionaryFile = "fr-utf8.txt";
dictionaryFile = "fr-utf8.csv";
icon = R.drawable.ime_lang_fr;
isPunctuationPartOfWords = false;

View file

@ -11,7 +11,7 @@ public class German extends English {
id = 3;
name = "Deutsch";
locale = Locale.GERMAN;
dictionaryFile = "de-utf8.txt";
dictionaryFile = "de-utf8.csv";
icon = R.drawable.ime_lang_de;
isPunctuationPartOfWords = false;

View file

@ -12,7 +12,7 @@ public class Italian extends English {
id = 5;
name = "Italiano";
locale = Locale.ITALIAN;
dictionaryFile = "it-utf8.txt";
dictionaryFile = "it-utf8.csv";
icon = R.drawable.ime_lang_it;
isPunctuationPartOfWords = false;

View file

@ -13,7 +13,7 @@ public class Russian extends Language {
id = 2;
name = "Русский";
locale = new Locale("ru","RU");
dictionaryFile = "ru-utf8.txt";
dictionaryFile = "ru-utf8.csv";
icon = R.drawable.ime_lang_ru;
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;

View file

@ -15,7 +15,7 @@ public class Spanish extends English {
id = 9;
name = "Español";
locale = new Locale("es", "ES");
dictionaryFile = "es-utf8.txt";
dictionaryFile = "es-utf8.csv";
icon = R.drawable.ime_lang_es;
isPunctuationPartOfWords = false;

View file

@ -13,7 +13,7 @@ public class Ukrainian extends Language {
id = 6;
name = "Українська";
locale = new Locale("uk","UA");
dictionaryFile = "uk-utf8.txt";
dictionaryFile = "uk-utf8.csv";
icon = R.drawable.ime_lang_uk;
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;