1
0
Fork 0

Bulgarian update (#268)

* fixed Bulgarian layout: moved 'ь' to the 8 key

* added a migration for removing all Bulgarian words, since the digit sequences are no longer compatible with the new layout

* fixed incorrect text case of some words

* removed some nonsense words

* added new Bulgarian words
This commit is contained in:
Dimo Karaivanov 2023-07-13 14:33:54 +03:00 committed by GitHub
parent c4a78c1931
commit 0aa934cebd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 251186 additions and 230906 deletions

View file

@ -10,5 +10,5 @@ layout:
- [м, н, о, п] # 5 - [м, н, о, п] # 5
- [р, с, т, у] # 6 - [р, с, т, у] # 6
- [ф, х, ц, ч] # 7 - [ф, х, ц, ч] # 7
- [ш, щ, ъ] # 8 - [ш, щ, ъ, ь] # 8
- [ь, ю, я] # 9 - [ю, я] # 9

File diff suppressed because it is too large Load diff

View file

@ -1,13 +1,18 @@
Bulgarian wordlist by: Miglen Georgiev Bulgarian wordlist 1 by Miglen Georgiev
Version: f46eff1 (2022-04-26) Version: f46eff1 (2022-04-26)
Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-words-validated-cyrillic.txt Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-words-validated-cyrillic.txt
License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE
Additionally cleaned up repeating words and added some missing ones. Bulgarian wordlist 2 by michmech
Version: 9c91fe4
Source: https://github.com/michmech/lemmatization-lists/blob/master/lemmatization-bg.txt
License: https://github.com/michmech/lemmatization-lists/blob/master/LICENCE
Also, used the wooorm's hunspell-compatible dictionary to determine which words need to start with a capital letter Also, used the wooorm's hunspell-compatible dictionary to determine which words need to start with a capital letter
Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg Link: https://github.com/wooorm/dictionaries/tree/main/dictionaries/bg
Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8] Git commit: 13 Apr 2022 [0c78cc810c8aafb2e6f5140bb6dcd4026b247eb8]
Additionally cleaned up duplicate words and added some missing ones manually.
Word frequencies obtained from the "General" word frequency dictionary by the Department of Computational Linguistics of the Bulgarian Academy of Sciences. Word frequencies obtained from the "General" word frequency dictionary by the Department of Computational Linguistics of the Bulgarian Academy of Sciences.
Link: https://dcl.bas.bg/frequency.html Link: https://dcl.bas.bg/frequency.html

View file

@ -33,7 +33,8 @@ public class DB11 {
assert Dutch != null; assert Dutch != null;
database.beginTransaction(); database.beginTransaction();
database.execSQL(getDeleteEnglishSwordsQuery(English)); database.execSQL(getTruncateBulgarianQuery());
database.execSQL(getDeleteEnglishSwordsQuery());
database.execSQL(getDeleteWordsQuery(English.getId(), enWords)); database.execSQL(getDeleteWordsQuery(English.getId(), enWords));
database.execSQL(getDeleteWordsQuery(Dutch.getId(), nlWords)); database.execSQL(getDeleteWordsQuery(Dutch.getId(), nlWords));
database.setTransactionSuccessful(); database.setTransactionSuccessful();
@ -45,11 +46,19 @@ public class DB11 {
} }
}; };
private String getDeleteEnglishSwordsQuery(Language English) { private String getDeleteEnglishSwordsQuery() {
Language English = LanguageCollection.getByLocale(ctx, Locale.ENGLISH.toString());
assert English != null;
return "DELETE FROM words WHERE lang=" + English.getId() + " AND word LIKE '%''s'"; return "DELETE FROM words WHERE lang=" + English.getId() + " AND word LIKE '%''s'";
} }
private String getDeleteWordsQuery(int langId, String wordList) { private String getDeleteWordsQuery(int langId, String wordList) {
return "DELETE FROM words WHERE lang=" + langId + " AND word IN(" + wordList + ")"; return "DELETE FROM words WHERE lang=" + langId + " AND word IN(" + wordList + ")";
} }
/**
 * Builds the SQL statement that deletes every row of the "words" table
 * belonging to the Bulgarian language.
 *
 * The language is looked up with the "bg_BG" locale and its ID is used as
 * the only filter, so all Bulgarian words are removed regardless of content.
 *
 * NOTE(review): the null check below is a plain {@code assert}; whether it
 * is enforced depends on assertions being enabled at runtime — confirm.
 *
 * @return a "DELETE FROM words" statement restricted to the Bulgarian language ID
 */
private String getTruncateBulgarianQuery() {
    final String deleteByLanguagePrefix = "DELETE FROM words WHERE lang=";

    final Language bulgarian = LanguageCollection.getByLocale(ctx, "bg_BG");
    assert bulgarian != null;

    return deleteByLanguagePrefix + bulgarian.getId();
}
} }