Csv dictionary support (#145)
* The dictionary loader now supports word frequencies.
* Word frequencies are validated when the dictionaries are built.
* Added default word frequencies to all dictionaries.
* Updated the documentation.
This commit is contained in:
parent
b5cd92f1f7
commit
2510aba58a
30 changed files with 1175323 additions and 1175101 deletions
|
|
@ -196,31 +196,34 @@ public class DictionaryLoader {
|
|||
BufferedReader br = new BufferedReader(new InputStreamReader(assets.open(dictionaryFile), StandardCharsets.UTF_8));
|
||||
|
||||
ArrayList<Word> dbWords = new ArrayList<>();
|
||||
long line = 0;
|
||||
long lineCount = 0;
|
||||
|
||||
sendProgressMessage(language, 0, 0);
|
||||
|
||||
for (String word; (word = br.readLine()) != null; line++) {
|
||||
for (String line; (line = br.readLine()) != null; lineCount++) {
|
||||
if (loadThread.isInterrupted()) {
|
||||
br.close();
|
||||
sendProgressMessage(language, -1, 0);
|
||||
throw new DictionaryImportAbortedException();
|
||||
}
|
||||
|
||||
validateWord(language, word, line);
|
||||
String[] parts = splitLine(line);
|
||||
String word = validateWord(language, parts, lineCount);
|
||||
int frequency = validateFrequency(parts);
|
||||
|
||||
try {
|
||||
dbWords.add(stringToWord(language, word));
|
||||
dbWords.add(stringToWord(language, word, frequency));
|
||||
} catch (InvalidLanguageCharactersException e) {
|
||||
throw new DictionaryImportException(dictionaryFile, word, line);
|
||||
throw new DictionaryImportException(dictionaryFile, word, lineCount);
|
||||
}
|
||||
|
||||
if (line % settings.getDictionaryImportWordChunkSize() == 0 || line == totalWords - 1) {
|
||||
if (lineCount % settings.getDictionaryImportWordChunkSize() == 0 || lineCount == totalWords - 1) {
|
||||
DictionaryDb.insertWordsSync(dbWords);
|
||||
dbWords.clear();
|
||||
}
|
||||
|
||||
if (totalWords > 0) {
|
||||
int progress = (int) Math.floor(100.0 * line / totalWords);
|
||||
int progress = (int) Math.floor(100.0 * lineCount / totalWords);
|
||||
sendProgressMessage(language, progress, settings.getDictionaryImportProgressUpdateInterval());
|
||||
}
|
||||
}
|
||||
|
|
@ -230,6 +233,23 @@ public class DictionaryLoader {
|
|||
}
|
||||
|
||||
|
||||
private String[] splitLine(String line) {
|
||||
String[] parts = { line, "" };
|
||||
|
||||
// This is faster than String.split() by around 10%, so it's worth having it.
|
||||
// It runs very often, so any other optimizations are welcome.
|
||||
for (int i = 0 ; i < line.length(); i++) {
|
||||
if (line.charAt(i) == ' ') { // the delimiter is TAB
|
||||
parts[0] = line.substring(0, i);
|
||||
parts[1] = i < line.length() - 1 ? line.substring(i + 1) : "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return parts;
|
||||
}
|
||||
|
||||
|
||||
private long countWords(String filename) {
|
||||
try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(assets.open(filename), StandardCharsets.UTF_8))) {
|
||||
//noinspection ResultOfMethodCallIgnored
|
||||
|
|
@ -245,17 +265,30 @@ public class DictionaryLoader {
|
|||
}
|
||||
|
||||
|
||||
private void validateWord(Language language, String word, long line) throws DictionaryImportException {
|
||||
private String validateWord(Language language, String[] lineParts, long line) throws DictionaryImportException {
|
||||
String word = lineParts[0];
|
||||
|
||||
if (!language.isPunctuationPartOfWords() && containsPunctuation.matcher(word).find()) {
|
||||
throw new DictionaryImportException(language.getDictionaryFile(), word, line);
|
||||
}
|
||||
|
||||
return word;
|
||||
}
|
||||
|
||||
|
||||
private int validateFrequency(String[] lineParts) {
|
||||
try {
|
||||
return Integer.parseInt(lineParts[1]);
|
||||
} catch (Exception e) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Word stringToWord(Language language, String word) throws InvalidLanguageCharactersException {
|
||||
private Word stringToWord(Language language, String word, int frequency) throws InvalidLanguageCharactersException {
|
||||
Word dbWord = new Word();
|
||||
dbWord.langId = language.getId();
|
||||
dbWord.frequency = 0;
|
||||
dbWord.frequency = frequency;
|
||||
dbWord.sequence = language.getDigitSequenceForWord(word);
|
||||
dbWord.word = word;
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ public class Bulgarian extends Language {
|
|||
id = 7;
|
||||
name = "Български";
|
||||
locale = new Locale("bg","BG");
|
||||
dictionaryFile = "bg-utf8.txt";
|
||||
dictionaryFile = "bg-utf8.csv";
|
||||
icon = R.drawable.ime_lang_bg;
|
||||
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
|
||||
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ public class Dutch extends English {
|
|||
id = 8;
|
||||
name = "Nederlands";
|
||||
locale = new Locale("nl","NL");
|
||||
dictionaryFile = "nl-utf8.txt";
|
||||
dictionaryFile = "nl-utf8.csv";
|
||||
icon = R.drawable.ime_lang_nl;
|
||||
|
||||
isPunctuationPartOfWords = true;
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ public class English extends Language {
|
|||
id = 1;
|
||||
name = "English";
|
||||
locale = Locale.ENGLISH;
|
||||
dictionaryFile = "en-utf8.txt";
|
||||
dictionaryFile = "en-utf8.csv";
|
||||
icon = R.drawable.ime_lang_en;
|
||||
abcLowerCaseIcon = R.drawable.ime_lang_latin_lower;
|
||||
abcUpperCaseIcon = R.drawable.ime_lang_latin_upper;
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ public class French extends English {
|
|||
id = 4;
|
||||
name = "Français";
|
||||
locale = Locale.FRENCH;
|
||||
dictionaryFile = "fr-utf8.txt";
|
||||
dictionaryFile = "fr-utf8.csv";
|
||||
icon = R.drawable.ime_lang_fr;
|
||||
|
||||
isPunctuationPartOfWords = false;
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ public class German extends English {
|
|||
id = 3;
|
||||
name = "Deutsch";
|
||||
locale = Locale.GERMAN;
|
||||
dictionaryFile = "de-utf8.txt";
|
||||
dictionaryFile = "de-utf8.csv";
|
||||
icon = R.drawable.ime_lang_de;
|
||||
|
||||
isPunctuationPartOfWords = false;
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ public class Italian extends English {
|
|||
id = 5;
|
||||
name = "Italiano";
|
||||
locale = Locale.ITALIAN;
|
||||
dictionaryFile = "it-utf8.txt";
|
||||
dictionaryFile = "it-utf8.csv";
|
||||
icon = R.drawable.ime_lang_it;
|
||||
|
||||
isPunctuationPartOfWords = false;
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ public class Russian extends Language {
|
|||
id = 2;
|
||||
name = "Русский";
|
||||
locale = new Locale("ru","RU");
|
||||
dictionaryFile = "ru-utf8.txt";
|
||||
dictionaryFile = "ru-utf8.csv";
|
||||
icon = R.drawable.ime_lang_ru;
|
||||
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
|
||||
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ public class Spanish extends English {
|
|||
id = 9;
|
||||
name = "Español";
|
||||
locale = new Locale("es", "ES");
|
||||
dictionaryFile = "es-utf8.txt";
|
||||
dictionaryFile = "es-utf8.csv";
|
||||
icon = R.drawable.ime_lang_es;
|
||||
|
||||
isPunctuationPartOfWords = false;
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ public class Ukrainian extends Language {
|
|||
id = 6;
|
||||
name = "Українська";
|
||||
locale = new Locale("uk","UA");
|
||||
dictionaryFile = "uk-utf8.txt";
|
||||
dictionaryFile = "uk-utf8.csv";
|
||||
icon = R.drawable.ime_lang_uk;
|
||||
abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
|
||||
abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue